masster 0.4.19__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +0 -1
- masster/_version.py +1 -1
- masster/logger.py +42 -0
- masster/sample/load.py +6 -5
- masster/sample/sample.py +0 -9
- masster/study/helpers.py +52 -11
- masster/study/merge.py +886 -58
- masster/study/plot.py +11 -5
- masster/study/study.py +18 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +1199 -27
- {masster-0.4.19.dist-info → masster-0.4.20.dist-info}/METADATA +1 -1
- {masster-0.4.19.dist-info → masster-0.4.20.dist-info}/RECORD +16 -17
- masster/wizard.py +0 -1175
- {masster-0.4.19.dist-info → masster-0.4.20.dist-info}/WHEEL +0 -0
- {masster-0.4.19.dist-info → masster-0.4.20.dist-info}/entry_points.txt +0 -0
- {masster-0.4.19.dist-info → masster-0.4.20.dist-info}/licenses/LICENSE +0 -0
masster/wizard/wizard.py
CHANGED
@@ -44,7 +44,9 @@ wizard.export_results()
 from __future__ import annotations
 
 import os
+import sys
 import time
+import importlib
 import multiprocessing
 from pathlib import Path
 from typing import Optional, Any, Dict, List
@@ -124,17 +126,18 @@ class wizard_def:
     checkpoint_interval: int = 10  # Save progress every N files
 
     # === Study Assembly ===
-    min_samples_for_merge: int =
+    min_samples_for_merge: int = 2
     rt_tolerance: float = 1.5
-
+    mz_max_diff: float = 0.01
    alignment_algorithm: str = "kd"
     merge_method: str = "chunked"
 
     # === Feature Detection ===
-    chrom_fwhm: float = 0.
-    noise_threshold: float =
+    chrom_fwhm: float = 0.5
+    noise_threshold: float = 200.0
     chrom_peak_snr: float = 5.0
     tol_ppm: float = 10.0
+    detector_type: str = "unknown"  # Detected detector type ("orbitrap", "quadrupole", "unknown")
 
     # === Output & Export ===
     generate_plots: bool = True
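This hunk pins the study-assembly and feature-detection defaults (`min_samples_for_merge=2`, `mz_max_diff=0.01`, `chrom_fwhm=0.5`, `noise_threshold=200.0`) and adds a `detector_type` field. Assuming the package re-exports `wizard_def` (as the `create_script` docstring example further down suggests), overriding the new fields is plain dataclass usage; a minimal sketch with placeholder paths:

```python
from masster.wizard import wizard_def

# Placeholder paths; only fields visible in this diff are touched.
params = wizard_def(
    data_source=r"D:\Data\raw_files",
    study_folder=r"D:\Data\output",
)
params.min_samples_for_merge = 3  # default per this hunk: 2
params.mz_max_diff = 0.005        # default per this hunk: 0.01 Da
```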
@@ -232,10 +235,16 @@ class Wizard:
     of mass spectrometry files from raw data to final study results, including:
 
     1. Raw data discovery and batch conversion to sample5 format
-    2.
-    3.
-    4.
-    5.
+    2. Automatic detector type detection and parameter optimization
+    3. Study assembly with feature alignment and merging
+    4. Automated plot generation and result export
+    5. Intelligent resume capability for interrupted processes
+    6. Adaptive optimization based on study size and system resources
+
+    The wizard automatically detects the type of MS detector using simplified rules:
+    - .raw files: Assume Orbitrap (noise threshold = 1e5)
+    - .wiff files: Assume Quadrupole (noise threshold = 200)
+    - .mzML files: Check metadata for Orbitrap detection
 
     The wizard handles the complete workflow with minimal user intervention
     while providing comprehensive logging and progress tracking.
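The docstring's detection rules amount to a lookup from file extension to an assumed detector and noise threshold, with mzML as the only case that needs a metadata check. A condensed sketch of that mapping (hypothetical helper; the real logic is in `_detect_detector_type` below):

```python
from pathlib import Path

# Hypothetical condensation of the rules listed in the docstring above.
DETECTOR_RULES = {
    ".raw": ("orbitrap", 1e5),       # Thermo -> assume Orbitrap, high background noise
    ".wiff": ("quadrupole", 200.0),  # SCIEX -> assume Quadrupole, lower noise
}

def guess_detector(first_file: Path) -> tuple[str, float]:
    """Return (detector_type, noise_threshold); .mzML needs a metadata check instead."""
    return DETECTOR_RULES.get(first_file.suffix.lower(), ("unknown", 200.0))
```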
@@ -262,6 +271,10 @@ class Wizard:
             **kwargs: Additional parameters (see wizard_def for full list)
         """
 
+        # Auto-detect optimal number of cores (75% of total)
+        if num_cores <= 0:
+            num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
+
         # Create parameters instance
         if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
             self.params = kwargs.pop("params")
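The core auto-detection deliberately leaves a quarter of the machine free for the OS and disk I/O: on a 16-core machine, `max(1, int(16 * 0.75))` yields 12 workers, and the `max(1, ...)` guard keeps single-core machines usable.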
@@ -293,6 +306,14 @@ class Wizard:
         self.study_folder_path = Path(self.params.study_folder)
         self.study_folder_path.mkdir(parents=True, exist_ok=True)
 
+        # Auto-infer polarity from the first file if not explicitly set by user
+        if polarity == "positive" and "polarity" not in kwargs:
+            inferred_polarity = self._infer_polarity_from_first_file()
+            if inferred_polarity:
+                self.params.polarity = inferred_polarity
+                # Update adducts based on inferred polarity
+                self.params.__post_init__()
+
         # Setup logging
         self._setup_logging()
 
@@ -315,6 +336,47 @@ class Wizard:
         if self.params.resume_enabled:
             self._load_checkpoint()
 
+    def _infer_polarity_from_first_file(self) -> str:
+        """
+        Infer polarity from the first available raw data file.
+
+        Returns:
+            Inferred polarity string ("positive" or "negative") or None if detection fails
+        """
+        try:
+            # Find first file
+            for extension in ['.wiff', '.raw', '.mzML', '.d']:
+                pattern = f"**/*{extension}" if True else f"*{extension}"  # search_subfolders=True
+                files = list(self.data_source_path.rglob(pattern))
+                if files:
+                    first_file = files[0]
+                    break
+            else:
+                return None
+
+            # Only implement for .wiff files initially (most common format)
+            if first_file.suffix.lower() == '.wiff':
+                from masster.sample.load import _wiff_to_dict
+
+                # Extract metadata from first file
+                metadata_df = _wiff_to_dict(str(first_file))
+
+                if not metadata_df.empty and 'polarity' in metadata_df.columns:
+                    # Get polarity from first experiment
+                    first_polarity = metadata_df['polarity'].iloc[0]
+
+                    # Convert numeric polarity codes to string
+                    if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
+                        return "positive"
+                    elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
+                        return "negative"
+
+        except Exception:
+            # Silently fall back to default if inference fails
+            pass
+
+        return None
+
     @property
     def polarity(self) -> str:
         """Get the polarity setting."""
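`_infer_polarity_from_first_file` normalizes both SCIEX numeric polarity codes (1/-1) and common string spellings before falling back to the constructor default. The conversion step in isolation (hypothetical standalone helper):

```python
def normalize_polarity(value) -> str | None:
    """Map 1/-1 or 'pos'/'+' style codes to 'positive'/'negative'."""
    if value == 1 or str(value).lower() in ('positive', 'pos', '+'):
        return "positive"
    if value == -1 or str(value).lower() in ('negative', 'neg', '-'):
        return "negative"
    return None  # unknown -> caller keeps its default
```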
@@ -324,6 +386,93 @@ class Wizard:
     def adducts(self) -> List[str]:
         """Get the adducts list."""
         return self.params.adducts
+
+    def _reload(self):
+        """
+        Reloads all masster modules to pick up any changes to their source code,
+        and updates the instance's class reference to the newly reloaded class version.
+        This ensures that the instance uses the latest implementation without restarting the interpreter.
+        """
+        # Reset logger configuration flags to allow proper reconfiguration after reload
+        try:
+            import masster.logger as logger_module
+
+            if hasattr(logger_module, "_WIZARD_LOGGER_CONFIGURED"):
+                logger_module._WIZARD_LOGGER_CONFIGURED = False
+        except Exception:
+            pass
+
+        # Get the base module name (masster)
+        base_modname = self.__class__.__module__.split(".")[0]
+        current_module = self.__class__.__module__
+
+        # Dynamically find all wizard submodules
+        wizard_modules = []
+        wizard_module_prefix = f"{base_modname}.wizard."
+
+        # Get all currently loaded modules that are part of the wizard package
+        for module_name in sys.modules:
+            if (
+                module_name.startswith(wizard_module_prefix)
+                and module_name != current_module
+            ):
+                wizard_modules.append(module_name)
+
+        # Add core masster modules
+        core_modules = [
+            f"{base_modname}._version",
+            f"{base_modname}.chromatogram",
+            f"{base_modname}.spectrum",
+            f"{base_modname}.logger",
+        ]
+
+        # Add sample submodules
+        sample_modules = []
+        sample_module_prefix = f"{base_modname}.sample."
+        for module_name in sys.modules:
+            if (
+                module_name.startswith(sample_module_prefix)
+                and module_name != current_module
+            ):
+                sample_modules.append(module_name)
+
+        # Add study submodules
+        study_modules = []
+        study_module_prefix = f"{base_modname}.study."
+        for module_name in sys.modules:
+            if (
+                module_name.startswith(study_module_prefix)
+                and module_name != current_module
+            ):
+                study_modules.append(module_name)
+
+        all_modules_to_reload = (
+            core_modules + wizard_modules + sample_modules + study_modules
+        )
+
+        # Reload all discovered modules
+        for full_module_name in all_modules_to_reload:
+            try:
+                if full_module_name in sys.modules:
+                    mod = sys.modules[full_module_name]
+                    importlib.reload(mod)
+                    self.logger.debug(f"Reloaded module: {full_module_name}")
+            except Exception as e:
+                self.logger.warning(f"Failed to reload module {full_module_name}: {e}")
+
+        # Finally, reload the current module (wizard.py)
+        try:
+            mod = __import__(current_module, fromlist=[current_module.split(".")[0]])
+            importlib.reload(mod)
+
+            # Get the updated class reference from the reloaded module
+            new = getattr(mod, self.__class__.__name__)
+            # Update the class reference of the instance
+            self.__class__ = new
+
+            self.logger.debug("Module reload completed")
+        except Exception as e:
+            self.logger.error(f"Failed to reload current module {current_module}: {e}")
 
     def _setup_logging(self):
         """Setup comprehensive logging system."""
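`_reload` is the standard `importlib.reload` hot-swap pattern: reload every dependent module, then rebind the live instance's `__class__` to the class object from the freshly loaded module so existing objects pick up new method implementations. The core of the pattern, reduced to a sketch (hypothetical helper name):

```python
import importlib
import sys

def hot_swap(instance):
    """Reload the instance's defining module and rebind it to the new class."""
    mod = importlib.reload(sys.modules[type(instance).__module__])
    instance.__class__ = getattr(mod, type(instance).__name__)
```

As in `_reload`, this only rebinds the class; module-level state and objects created elsewhere are untouched.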
@@ -526,6 +675,15 @@ class Wizard:
                 chrom_fwhm=2.0  # Wider peaks
             )
 
+            # STEP 3.5: Validate feature detection results
+            if not hasattr(sample, 'features_df') or sample.features_df is None or len(sample.features_df) == 0:
+                self.logger.warning(f"No features detected in {file_path.name} - skipping additional processing")
+                # Still save the sample5 file for record keeping
+                sample.save(filename=str(output_file))
+                return output_file.stem
+
+            self.logger.info(f"Detected {len(sample.features_df)} features in {file_path.name}")
+
             # STEP 4: Adduct detection
             sample.find_adducts(adducts=self.adducts)
 
@@ -535,17 +693,11 @@ class Wizard:
             # STEP 6: Save processed data
             sample.save(filename=str(output_file))
 
-            # STEP 7: Generate additional outputs
-
-            csv_file = output_file.with_suffix('.features.csv')
-            sample.export_features(filename=str(csv_file))
-
-            if "mgf" in self.params.export_formats:
-                mgf_file = output_file.with_suffix('.mgf')
-                sample.export_mgf(filename=str(mgf_file), use_cache=False)
+            # STEP 7: Generate additional outputs (only for samples with features)
+            # Skip CSV export and individual MGF export as requested
 
             if self.params.generate_plots:
-                plot_file = output_file.
+                plot_file = output_file.parent / (output_file.stem + "_2d.html")
                 sample.plot_2d(filename=str(plot_file), markersize=4)
 
             # Memory cleanup
@@ -592,6 +744,10 @@ class Wizard:
             self.logger.warning("No files found for conversion")
             return False
 
+        # Detect detector type and adjust parameters before processing
+        detector_type = self._detect_detector_type()
+        self._adjust_parameters_for_detector(detector_type)
+
         # Filter out already processed files if resuming
         if self.params.resume_enabled and self.processed_files:
             remaining_files = []
@@ -705,6 +861,125 @@ class Wizard:
         self._save_checkpoint()
         return successful_count > 0
 
+    def _detect_detector_type(self) -> str:
+        """
+        Detect the type of MS detector from the first available file.
+
+        Simplified detection rules:
+        - .raw files: Assume Orbitrap (Thermo instruments)
+        - .wiff files: Assume Quadrupole (SCIEX instruments)
+        - .mzML files: Check metadata for Orbitrap detection
+
+        Returns:
+            String indicating detector type ("orbitrap", "quadrupole", "unknown")
+        """
+        try:
+            # Find first raw file to analyze
+            for extension in ['.raw', '.wiff', '.mzML', '.d']:
+                if self.params.search_subfolders:
+                    pattern = f"**/*{extension}"
+                    files = list(self.data_source_path.rglob(pattern))
+                else:
+                    pattern = f"*{extension}"
+                    files = list(self.data_source_path.glob(pattern))
+                if files:
+                    first_file = files[0]
+                    break
+            else:
+                self.logger.warning("No raw files found for detector detection")
+                return "unknown"
+
+            self.logger.info(f"Detecting detector type from: {first_file.name}")
+
+            # Simplified detection rules
+            if first_file.suffix.lower() == '.raw':
+                # RAW files are Thermo -> assume Orbitrap
+                detector_type = "orbitrap"
+                self.logger.info("Detected .raw file -> Thermo Orbitrap detector")
+                return detector_type
+
+            elif first_file.suffix.lower() in ['.wiff', '.wiff2']:
+                # WIFF files are SCIEX -> assume Quadrupole
+                detector_type = "quadrupole"
+                self.logger.info("Detected .wiff file -> SCIEX Quadrupole detector")
+                return detector_type
+
+            elif first_file.suffix.lower() == '.mzml':
+                # For mzML files, check metadata for Orbitrap detection
+                try:
+                    import warnings
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
+                        import pyopenms as oms
+
+                    exp = oms.MSExperiment()
+                    oms.MzMLFile().load(str(first_file), exp)
+
+                    # Check instrument metadata for Orbitrap keywords
+                    instrument_info = []
+                    if hasattr(exp, 'getExperimentalSettings'):
+                        settings = exp.getExperimentalSettings()
+                        if hasattr(settings, 'getInstrument'):
+                            instrument = settings.getInstrument()
+                            if hasattr(instrument, 'getName'):
+                                name = instrument.getName().decode() if hasattr(instrument.getName(), 'decode') else str(instrument.getName())
+                                instrument_info.append(name.lower())
+                            if hasattr(instrument, 'getModel'):
+                                model = instrument.getModel().decode() if hasattr(instrument.getModel(), 'decode') else str(instrument.getModel())
+                                instrument_info.append(model.lower())
+
+                    # Check for Orbitrap keywords in instrument info
+                    orbitrap_keywords = ['orbitrap', 'exactive', 'q-exactive', 'exploris', 'fusion', 'lumos', 'velos', 'elite']
+                    instrument_text = ' '.join(instrument_info)
+
+                    if any(keyword in instrument_text for keyword in orbitrap_keywords):
+                        detector_type = "orbitrap"
+                        self.logger.info(f"Detected mzML with Orbitrap instrument: {instrument_text}")
+                    else:
+                        detector_type = "unknown"
+                        self.logger.info(f"Detected mzML with unknown instrument: {instrument_text}")
+
+                    return detector_type
+
+                except Exception as e:
+                    self.logger.warning(f"Failed to analyze mzML file for detector type: {e}")
+                    return "unknown"
+
+        except Exception as e:
+            self.logger.warning(f"Detector type detection failed: {e}")
+
+        return "unknown"
+
+    def _adjust_parameters_for_detector(self, detector_type: str):
+        """
+        Adjust processing parameters based on detected detector type.
+
+        Simplified rules:
+        - "orbitrap": Use 1e5 noise threshold (high background noise)
+        - "quadrupole": Use 200 noise threshold (default, lower noise)
+        - "unknown": Use 200 noise threshold (default)
+
+        Parameters:
+            detector_type: Type of detector detected ("orbitrap", "quadrupole", "unknown")
+        """
+        original_noise = self.params.noise_threshold
+        self.params.detector_type = detector_type  # Store the detected type
+
+        if detector_type == "orbitrap":
+            # Orbitraps have much higher background noise, use 1e5 threshold
+            self.params.noise_threshold = 1e5
+            self._log_progress(f"Detector: Orbitrap detected - adjusted noise threshold: {original_noise} -> {self.params.noise_threshold}")
+
+        elif detector_type == "quadrupole":
+            # Quadrupole instruments have lower noise, use default threshold
+            self.params.noise_threshold = 200.0
+            self._log_progress(f"Detector: Quadrupole detected - noise threshold: {self.params.noise_threshold}")
+
+        else:
+            # Unknown detector type, keep default
+            self.params.noise_threshold = 200.0
+            self._log_progress(f"Detector: Unknown type detected - using default noise threshold: {self.params.noise_threshold}")
+
     def assemble_study(self) -> bool:
         """
         Assemble processed sample5 files into a study.
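For mzML data the decision reduces to a case-insensitive keyword scan over the instrument name and model strings pulled from the file metadata. That check in isolation (hypothetical helper; keyword list copied from the hunk above):

```python
ORBITRAP_KEYWORDS = ('orbitrap', 'exactive', 'q-exactive', 'exploris',
                     'fusion', 'lumos', 'velos', 'elite')

def looks_like_orbitrap(instrument_fields: list[str]) -> bool:
    """True if any Orbitrap-family keyword appears in the metadata strings."""
    text = ' '.join(field.lower() for field in instrument_fields)
    return any(keyword in text for keyword in ORBITRAP_KEYWORDS)
```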
@@ -725,6 +1000,10 @@ class Wizard:
         self._log_progress(f"Assembling study from {len(sample5_files)} sample5 files")
 
         try:
+            # Detect detector type and adjust parameters if needed
+            detector_type = self._detect_detector_type()
+            self._adjust_parameters_for_detector(detector_type)
+
             # Create study with optimized settings
             import masster
             study_params = study_defaults(
@@ -779,25 +1058,39 @@ class Wizard:
             return False
 
         try:
+            # Determine optimal algorithms based on study size
+            num_samples = len(self.study.samples_df)
+
+            if num_samples < 500:
+                # For smaller studies: use qt for both alignment and merge
+                alignment_algorithm = "qt"
+                merge_method = "qt"
+                self.logger.info(f"Small study ({num_samples} samples) - using qt algorithms")
+            else:
+                # For larger studies: use kd for alignment and qt-chunked for merge
+                alignment_algorithm = "kd"
+                merge_method = "qt-chunked"
+                self.logger.info(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")
+
             # Align features across samples
             align_params = align_defaults(
                 rt_tol=self.params.rt_tolerance,
-
-                algorithm=
+                mz_max_diff=self.params.mz_max_diff,
+                algorithm=alignment_algorithm
             )
-
-            self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z
+
+            self.logger.info(f"Aligning features with RT tolerance {self.params.rt_tolerance}s, m/z max diff {self.params.mz_max_diff} Da, algorithm: {alignment_algorithm}")
             self.study.align(params=align_params)
 
             # Merge aligned features
             merge_params = merge_defaults(
-                method=
+                method=merge_method,
                 rt_tol=self.params.rt_tolerance,
-                mz_tol=self.params.
+                mz_tol=self.params.mz_max_diff,
                 min_samples=self.params.min_samples_for_merge
             )
 
-            self.logger.info(f"Merging features using {
+            self.logger.info(f"Merging features using {merge_method} method")
             self.study.merge(params=merge_params)
 
             # Log results
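The 500-sample cutoff makes the algorithm choice a pure function of study size, presumably because exhaustive QT clustering is affordable for small studies while KD-tree alignment plus chunked QT merging scales better for large ones. As a sketch:

```python
def choose_algorithms(num_samples: int) -> tuple[str, str]:
    """Return (alignment_algorithm, merge_method) per the cutoff above."""
    if num_samples < 500:
        return "qt", "qt"          # small study: QT for alignment and merge
    return "kd", "qt-chunked"      # large study: KD alignment, chunked QT merge
```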
@@ -920,12 +1213,47 @@ class Wizard:
 
             self._log_progress(f"Completed {exports_completed} exports")
             self._save_checkpoint()
+
+            # Always perform additional export methods as requested
+            self._export_additional_formats()
+
             return True
 
         except Exception as e:
             self.logger.error(f"Failed to export results: {e}")
             return False
 
+    def _export_additional_formats(self):
+        """Export additional formats: xlsx, parquet, save, and mgf."""
+        self.logger.info("=== Exporting Additional Formats ===")
+
+        try:
+            # Force export xlsx (study results in Excel format)
+            xlsx_file = self.study_folder_path / "study_results.xlsx"
+            if hasattr(self.study, 'export_xlsx'):
+                self.study.export_xlsx(filename=str(xlsx_file))
+                self.logger.info(f"Exported Excel: {xlsx_file}")
+
+            # Force export parquet (efficient binary format)
+            parquet_file = self.study_folder_path / "study_data.parquet"
+            if hasattr(self.study, 'export_parquet'):
+                self.study.export_parquet(filename=str(parquet_file))
+                self.logger.info(f"Exported Parquet: {parquet_file}")
+
+            # Force save the study in study5 format
+            study_file = self.study_folder_path / "final_study.study5"
+            self.study.save(filename=str(study_file))
+            self.logger.info(f"Saved study: {study_file}")
+
+            # Force export MGF for MS2 spectra
+            mgf_file = self.study_folder_path / "consensus_ms2.mgf"
+            if hasattr(self.study, 'export_mgf'):
+                self.study.export_mgf(filename=str(mgf_file))
+                self.logger.info(f"Exported MGF: {mgf_file}")
+
+        except Exception as e:
+            self.logger.warning(f"Some additional exports failed: {e}")
+
     def save_study(self) -> bool:
         """
         Save the final study in optimized format.
@@ -977,7 +1305,7 @@ class Wizard:
                 f.write(f"Successful Files: {len(self.processed_files)}\n")
                 f.write(f"Failed Files: {len(self.failed_files)}\n")
                 f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
-                f.write(f"m/z
+                f.write(f"m/z Max Diff: {self.params.mz_max_diff} Da\n")
                 f.write(f"Merge Method: {self.params.merge_method}\n")
                 f.write(f"Processing Time: {self._get_total_processing_time()}\n")
 
@@ -1144,6 +1472,753 @@ class Wizard:
             }
         }
 
+    def execute(self) -> bool:
+        """
+        Execute the complete automated processing pipeline.
+
+        This is a convenience method that runs the full pipeline with the wizard's
+        current configuration. It performs standalone analysis of the samples/studies
+        as proposed by the Wizard.
+
+        Returns:
+            True if execution completed successfully, False otherwise
+        """
+        self._log_progress("Executing Wizard automated processing...")
+        return self.run_full_pipeline()
+
+    def export_script(self, filename: str) -> bool:
+        """
+        Generate a Python script that replicates the wizard's processing steps.
+
+        Creates a standalone Python script that can be executed independently
+        to perform the same analysis as the wizard with the current configuration.
+        The script will be saved in the study folder.
+
+        This is useful for:
+        - Creating reproducible analysis scripts
+        - Customizing processing steps
+        - Running analysis in different environments
+        - Batch processing automation
+
+        Parameters:
+            filename: Filename for the script (should end with .py). Script will be saved in the study folder.
+
+        Returns:
+            True if script was generated successfully, False otherwise
+        """
+        self._log_progress("Generating analysis script...")
+
+        try:
+            # Ensure the filename is just a filename, not a full path
+            script_filename = Path(filename).name
+            if not script_filename.endswith('.py'):
+                script_filename = script_filename.replace(Path(script_filename).suffix, '') + '.py'
+
+            # Place the script in the study folder
+            script_path = self.study_folder_path / script_filename
+
+            # Generate the script content
+            script_content = self._generate_script_content()
+
+            # Write the script
+            with open(script_path, 'w', encoding='utf-8') as f:
+                f.write(script_content)
+
+            self._log_progress(f"Analysis script saved: {os.path.abspath(script_path)}")
+            self.logger.info(f"Generated standalone analysis script: {os.path.abspath(script_path)}")
+
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Failed to generate script: {e}")
+            return False
+
+    def to_script(self, filename: str) -> bool:
+        """
+        [DEPRECATED] Use export_script() instead.
+
+        Backward compatibility alias for export_script().
+        """
+        return self.export_script(filename)
+
+    def _generate_script_content(self) -> str:
+        """
+        Generate the content for the standalone analysis script.
+
+        Returns:
+            Complete Python script content as string
+        """
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Create script header
+        script_lines = [
+            '#!/usr/bin/env python3',
+            '"""',
+            'Standalone Mass Spectrometry Analysis Script',
+            f'Generated by masster.Wizard on {timestamp}',
+            '',
+            'This script replicates the automated processing pipeline configured',
+            'in the Wizard with the following settings:',
+            f'- Data Source: {self.params.data_source.replace(chr(92), chr(92)*2)}',
+            f'- Study Folder: {self.params.study_folder.replace(chr(92), chr(92)*2)}',
+            f'- Polarity: {self.params.polarity}',
+            f'- Number of Cores: {self.params.num_cores}',
+            f'- Adducts: {", ".join(self.params.adducts)}',
+            f'- RT Tolerance: {self.params.rt_tolerance}s',
+            f'- m/z Max Diff: {self.params.mz_max_diff} Da',
+            f'- Merge Method: {self.params.merge_method}',
+            '"""',
+            '',
+            'import os',
+            'import sys',
+            'import time',
+            'import multiprocessing',
+            'from pathlib import Path',
+            'from typing import List, Optional',
+            'import concurrent.futures',
+            'from datetime import datetime',
+            '',
+            '# Add error handling for masster import',
+            '# First, try to add the masster directory to the Python path',
+            'try:',
+            '    # Try to find masster by looking for it in common development locations',
+            '    possible_paths = [',
+            '        Path(__file__).parent.parent,  # Script is in masster subfolder',
+            '        Path(__file__).parent.parent.parent,  # Script is in study folder',
+            '        Path(os.getcwd()),  # Current working directory',
+            '        Path(os.getcwd()).parent,  # Parent of current directory',
+            '        Path(r"D:\\SW\\massistant"),  # Specific development path',
+            '        Path.home() / "massistant",  # Home directory',
+            '        Path.home() / "SW" / "massistant",  # Common dev location',
+            '    ]',
+            '    ',
+            '    masster_found = False',
+            '    for possible_path in possible_paths:',
+            '        masster_dir = possible_path / "masster"',
+            '        if masster_dir.exists() and (masster_dir / "__init__.py").exists():',
+            '            if str(possible_path) not in sys.path:',
+            '                sys.path.insert(0, str(possible_path))',
+            '            masster_found = True',
+            '            print(f"Found masster at: {possible_path}")',
+            '            break',
+            '    ',
+            '    if not masster_found:',
+            '        # Try adding current directory to path as fallback',
+            '        current_dir = Path(os.getcwd())',
+            '        if str(current_dir) not in sys.path:',
+            '            sys.path.insert(0, str(current_dir))',
+            '    ',
+            '    import masster',
+            'except ImportError as e:',
+            '    print(f"Error: masster library not found. {e}")',
+            '    print("Please ensure masster is installed or run this script from the masster directory.")',
+            '    print("You can install masster with: pip install -e .")',
+            '    sys.exit(1)',
+            '',
+            '',
+            'def infer_polarity_from_first_file():',
+            '    """Infer polarity from the first available raw data file."""',
+            '    try:',
+            '        data_source_path = Path(DATA_SOURCE)',
+            '        # Find first file',
+            '        for extension in [\'.wiff\', \'.raw\', \'.mzML\', \'.d\']:',
+            '            pattern = f"**/*{extension}"',
+            '            files = list(data_source_path.rglob(pattern))',
+            '            if files:',
+            '                first_file = files[0]',
+            '                break',
+            '        else:',
+            '            return None',
+            '        ',
+            '        # Only implement for .wiff files initially',
+            '        if first_file.suffix.lower() == \'.wiff\':',
+            '            from masster.sample.load import _wiff_to_dict',
+            '            ',
+            '            # Extract metadata from first file',
+            '            metadata_df = _wiff_to_dict(str(first_file))',
+            '            ',
+            '            if not metadata_df.empty and \'polarity\' in metadata_df.columns:',
+            '                # Get polarity from first experiment',
+            '                first_polarity = metadata_df[\'polarity\'].iloc[0]',
+            '                ',
+            '                # Convert numeric polarity codes to string',
+            '                if first_polarity == 1 or str(first_polarity).lower() in [\'positive\', \'pos\', \'+\']:',
+            '                    return "positive"',
+            '                elif first_polarity == -1 or str(first_polarity).lower() in [\'negative\', \'neg\', \'-\']:',
+            '                    return "negative"',
+            '    except Exception:',
+            '        pass',
+            '    return None',
+            '',
+            '',
+            '# Configuration Parameters',
+            f'DATA_SOURCE = r"{self.params.data_source}"',
+            f'STUDY_FOLDER = r"{self.params.study_folder}"',
+            '',
+            '# Auto-infer polarity from first file, fall back to default',
+            'detected_polarity = infer_polarity_from_first_file()',
+            f'POLARITY = detected_polarity or "{self.params.polarity}"',
+            'NUM_CORES = max(1, int(multiprocessing.cpu_count() * 0.75))  # Auto-detect 75% of cores',
+            '',
+            '# Set adducts based on detected polarity',
+            'if POLARITY.lower() in ["positive", "pos"]:',
+            '    ADDUCTS = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]',
+            'elif POLARITY.lower() in ["negative", "neg"]:',
+            '    ADDUCTS = ["H-1:-:1.0", "CH2O2:0:0.5"]',
+            'else:',
+            f'    ADDUCTS = {self.params.adducts!r}  # Fall back to original',
+            f'RT_TOLERANCE = {self.params.rt_tolerance}',
+            f'MZ_TOLERANCE = {self.params.mz_max_diff}',
+            f'MERGE_METHOD = "{self.params.merge_method}"',
+            f'BATCH_SIZE = {self.params.batch_size}',
+            f'CHROM_FWHM = {self.params.chrom_fwhm}',
+            f'NOISE_THRESHOLD = {self.params.noise_threshold}',
+            f'CHROM_PEAK_SNR = {self.params.chrom_peak_snr}',
+            f'TOL_PPM = {self.params.tol_ppm}',
+            f'MIN_SAMPLES_FOR_MERGE = {self.params.min_samples_for_merge}',
+            '',
+            '# File discovery settings',
+            "FILE_EXTENSIONS = ['.wiff', '.raw', '.mzML']",
+            f'SEARCH_SUBFOLDERS = {self.params.search_subfolders}',
+            "SKIP_PATTERNS = []",
+            f'MAX_FILE_SIZE_GB = {self.params.max_file_size_gb}',
+            '',
+            '# Output settings',
+            f'GENERATE_PLOTS = {self.params.generate_plots}',
+            f'EXPORT_FORMATS = {self.params.export_formats!r}',
+            f'COMPRESS_OUTPUT = {self.params.compress_output}',
+            f'CLEANUP_TEMP_FILES = {self.params.cleanup_temp_files}',
+            '',
+            '',
+            'def log_progress(message: str):',
+            '    """Log progress message with timestamp."""',
+            '    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")',
+            '    print(f"[{timestamp}] {message}")',
+            '',
+            '',
+            'def discover_files() -> List[Path]:',
+            '    """Discover raw data files in the source directory."""',
+            '    log_progress("Discovering raw data files...")',
+            '    data_source_path = Path(DATA_SOURCE)',
+            '    found_files = []',
+            '    ',
+            '    for extension in FILE_EXTENSIONS:',
+            '        if SEARCH_SUBFOLDERS:',
+            '            pattern = f"**/*{extension}"',
+            '            files = list(data_source_path.rglob(pattern))',
+            '        else:',
+            '            pattern = f"*{extension}"',
+            '            files = list(data_source_path.glob(pattern))',
+            '        ',
+            '        # Filter out files matching skip patterns',
+            '        filtered_files = []',
+            '        for file_path in files:',
+            '            skip_file = False',
+            '            for pattern in SKIP_PATTERNS:',
+            '                if pattern.lower() in file_path.name.lower():',
+            '                    skip_file = True',
+            '                    print(f"Skipping file (matches pattern \'{pattern}\'): {file_path.name}")',
+            '                    break',
+            '            ',
+            '            if not skip_file:',
+            '                # Check file size',
+            '                try:',
+            '                    file_size_gb = file_path.stat().st_size / (1024**3)',
+            '                    if file_size_gb > MAX_FILE_SIZE_GB:',
+            '                        print(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")',
+            '                    filtered_files.append(file_path)',
+            '                except Exception as e:',
+            '                    print(f"Could not check file size for {file_path}: {e}")',
+            '                    filtered_files.append(file_path)',
+            '        ',
+            '        found_files.extend(filtered_files)',
+            '        log_progress(f"Found {len(filtered_files)} {extension} files")',
+            '    ',
+            '    # Remove duplicates and sort',
+            '    found_files = sorted(list(set(found_files)))',
+            '    log_progress(f"Total files discovered: {len(found_files)}")',
+            '    return found_files',
+            '',
+            '',
+            'def process_single_file(file_path: Path) -> Optional[str]:',
+            '    """Process a single file to sample5 format."""',
+            '    import gc',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    ',
+            '    # Generate output filename',
+            '    file_out = file_path.stem + ".sample5"',
+            '    output_file = study_folder_path / file_out',
+            '    ',
+            '    # Check if file already exists',
+            '    if output_file.exists():',
+            '        try:',
+            '            # Try to load existing file to verify it\'s valid',
+            '            sample = masster.Sample(log_level="ERROR")',
+            '            sample.load(str(output_file))',
+            '            print(f"Skipping {file_path.name} (already processed)")',
+            '            return output_file.stem',
+            '        except Exception:',
+            '            # If loading fails, file needs to be reprocessed',
+            '            pass',
+            '    ',
+            '    print(f"Processing {file_path.name}")',
+            '    ',
+            '    try:',
+            '        # Initialize sample',
+            '        sample = masster.Sample(',
+            '            log_label=file_path.name,',
+            '            log_level="ERROR"  # Reduce logging overhead',
+            '        )',
+            '        ',
+            '        # STEP 1: Load raw data',
+            '        sample.load(str(file_path))',
+            '        ',
+            '        # STEP 2: Feature detection - First pass (strict parameters)',
+            '        sample.find_features(',
+            '            chrom_fwhm=CHROM_FWHM,',
+            '            noise=NOISE_THRESHOLD,',
+            '            tol_ppm=TOL_PPM,',
+            '            chrom_peak_snr=CHROM_PEAK_SNR,',
+            '            min_trace_length_multiplier=0.5,',
+            '            chrom_fwhm_min=CHROM_FWHM',
+            '        )',
+            '        ',
+            '        # STEP 3: Feature detection - Second pass (relaxed parameters)',
+            '        sample.find_features(',
+            '            chrom_peak_snr=CHROM_PEAK_SNR,',
+            '            noise=NOISE_THRESHOLD / 10,  # Lower noise threshold',
+            '            chrom_fwhm=2.0  # Wider peaks',
+            '        )',
+            '        ',
+            '        # STEP 3.5: Validate feature detection results',
+            '        if not hasattr(sample, "features_df") or sample.features_df is None or len(sample.features_df) == 0:',
+            '            print(f"WARNING: No features detected in {file_path.name} - skipping additional processing")',
+            '            # Still save the sample5 file for record keeping',
+            '            sample.save(filename=str(output_file))',
+            '            return output_file.stem',
+            '        ',
+            '        print(f"Detected {len(sample.features_df)} features in {file_path.name}")',
+            '        ',
+            '        # STEP 4: Adduct detection',
+            '        sample.find_adducts(adducts=ADDUCTS)',
+            '        ',
+            '        # STEP 5: MS2 spectrum identification',
+            '        sample.find_ms2()',
+            '        ',
+            '        # STEP 6: Save processed data',
+            '        sample.save(filename=str(output_file))',
+            '        ',
+            '        # STEP 7: Generate additional outputs (only for samples with features)',
+            '        # Skip CSV export and individual MGF export as requested',
+            '        ',
+            '        if GENERATE_PLOTS:',
+            '            plot_file = output_file.parent / (output_file.stem + "_2d.html")',
+            '            sample.plot_2d(filename=str(plot_file), markersize=4)',
+            '        ',
+            '        # Memory cleanup',
+            '        result = output_file.stem',
+            '        del sample',
+            '        gc.collect()',
+            '        return result',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"Error processing {file_path.name}: {e}")',
+            '        gc.collect()',
+            '        return None',
+            '',
+            '',
+            'def convert_to_sample5(file_list: List[Path]) -> bool:',
+            '    """Convert raw data files to sample5 format in parallel."""',
+            '    log_progress("=== Starting Sample5 Conversion ===")',
+            '    log_progress(f"Converting {len(file_list)} files to sample5 format")',
+            '    ',
+            '    conversion_start = time.time()',
+            '    successful_count = 0',
+            '    failed_count = 0',
+            '    ',
+            '    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_CORES) as executor:',
+            '        futures = [executor.submit(process_single_file, file_path) for file_path in file_list]',
+            '        ',
+            '        for i, future in enumerate(concurrent.futures.as_completed(futures)):',
+            '            result = future.result()',
+            '            if result:',
+            '                successful_count += 1',
+            '            else:',
+            '                failed_count += 1',
+            '            ',
+            '            # Progress update',
+            '            if (successful_count + failed_count) % 10 == 0:',
+            '                progress = (successful_count + failed_count) / len(file_list) * 100',
+            '                log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")',
+            '    ',
+            '    conversion_time = time.time() - conversion_start',
+            '    log_progress("=== Sample5 Conversion Complete ===")',
+            '    log_progress(f"Successful: {successful_count}")',
+            '    log_progress(f"Failed: {failed_count}")',
+            '    log_progress(f"Total time: {conversion_time:.1f} seconds")',
+            '    ',
+            '    return successful_count > 0',
+            '',
+            '',
+            'def assemble_study() -> masster.Study:',
+            '    """Assemble processed sample5 files into a study."""',
+            '    log_progress("=== Starting Study Assembly ===")',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    ',
+            '    # Find all sample5 files',
+            '    sample5_files = list(study_folder_path.glob("*.sample5"))',
+            '    if not sample5_files:',
+            '        raise RuntimeError("No sample5 files found for study assembly")',
+            '    ',
+            '    log_progress(f"Assembling study from {len(sample5_files)} sample5 files")',
+            '    ',
+            '    # Create study with optimized settings',
+            '    from masster.study.defaults.study_def import study_defaults',
+            '    study_params = study_defaults(',
+            '        folder=str(study_folder_path),',
+            '        polarity=POLARITY,',
+            '        log_level="INFO",',
+            f'        log_label="Study-{self.params.polarity}",',
+            '        adducts=ADDUCTS',
+            '    )',
+            '    ',
+            '    study = masster.Study(params=study_params)',
+            '    ',
+            '    # Add all sample5 files',
+            '    sample5_pattern = str(study_folder_path / "*.sample5")',
+            '    study.add(sample5_pattern)',
+            '    log_progress(f"Added {len(study.samples_df)} samples to study")',
+            '    ',
+            '    # Filter features based on quality criteria',
+            '    if hasattr(study, "features_filter"):',
+            '        initial_features = len(study.features_df) if hasattr(study, "features_df") else 0',
+            '        feature_selection = study.features_select(',
+            '            chrom_coherence=0.3,',
+            '            chrom_prominence_scaled=1',
+            '        )',
+            '        study.features_filter(feature_selection)',
+            '        final_features = len(study.features_df) if hasattr(study, "features_df") else 0',
+            '        log_progress(f"Feature filtering: {initial_features} -> {final_features} features")',
+            '    ',
+            '    return study',
+            '',
+            '',
+            'def align_and_merge(study: masster.Study) -> masster.Study:',
+            '    """Perform feature alignment and merging."""',
+            '    log_progress("=== Starting Feature Alignment and Merging ===")',
+            '    ',
+            '    # Import alignment and merge defaults',
+            '    from masster.study.defaults.align_def import align_defaults',
+            '    from masster.study.defaults.merge_def import merge_defaults',
+            '    ',
+            '    # Determine optimal algorithms based on study size',
+            '    num_samples = len(study.samples_df)',
+            '    ',
+            '    if num_samples < 500:',
+            '        # For smaller studies: use qt for both alignment and merge',
+            '        alignment_algorithm = "qt"',
+            '        merge_method = "qt"',
+            '        log_progress(f"Small study ({num_samples} samples) - using qt algorithms")',
+            '    else:',
+            '        # For larger studies: use kd for alignment and qt-chunked for merge',
+            '        alignment_algorithm = "kd"',
+            '        merge_method = "qt-chunked"',
+            '        log_progress(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")',
+            '    ',
+            '    # Align features across samples',
+            '    align_params = align_defaults(',
+            '        rt_tol=RT_TOLERANCE,',
+            '        mz_max_diff=MZ_TOLERANCE,',
+            '        algorithm=alignment_algorithm',
+            '    )',
+            '    ',
+            '    log_progress(f"Aligning features with RT tolerance {RT_TOLERANCE}s, m/z tolerance {MZ_TOLERANCE} Da, algorithm: {alignment_algorithm}")',
+            '    study.align(params=align_params)',
+            '    ',
+            '    # Merge aligned features',
+            '    merge_params = merge_defaults(',
+            '        method=merge_method,',
+            '        rt_tol=RT_TOLERANCE,',
+            '        mz_tol=MZ_TOLERANCE,',
+            '        min_samples=MIN_SAMPLES_FOR_MERGE',
+            '    )',
+            '    ',
+            '    log_progress(f"Merging features using {merge_method} method")',
+            '    study.merge(params=merge_params)',
+            '    ',
+            '    # Log results',
+            '    num_consensus = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
+            '    log_progress(f"Generated {num_consensus} consensus features")',
+            '    ',
+            '    # Get study info',
+            '    if hasattr(study, "info"):',
+            '        study.info()',
+            '    ',
+            '    return study',
+            '',
+            '',
+            'def generate_plots(study: masster.Study) -> bool:',
+            '    """Generate visualization plots for the study."""',
+            '    if not GENERATE_PLOTS:',
+            '        log_progress("Plot generation disabled, skipping...")',
+            '        return True',
+            '    ',
+            '    log_progress("=== Generating Visualization Plots ===")',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    plots_generated = 0',
+            '    ',
+            '    try:',
+            '        # Alignment plot',
+            '        if hasattr(study, "plot_alignment"):',
+            '            alignment_plot = study_folder_path / "alignment_plot.html"',
+            '            study.plot_alignment(filename=str(alignment_plot))',
+            '            plots_generated += 1',
+            '            log_progress(f"Generated alignment plot: {alignment_plot}")',
+            '        ',
+            '        # Consensus 2D plot',
+            '        if hasattr(study, "plot_consensus_2d"):',
+            '            consensus_2d_plot = study_folder_path / "consensus_2d.html"',
+            '            study.plot_consensus_2d(filename=str(consensus_2d_plot))',
+            '            plots_generated += 1',
+            '            log_progress(f"Generated consensus 2D plot: {consensus_2d_plot}")',
+            '        ',
+            '        # PCA plot',
+            '        if hasattr(study, "plot_pca"):',
+            '            pca_plot = study_folder_path / "pca_plot.html"',
+            '            study.plot_pca(filename=str(pca_plot))',
+            '            plots_generated += 1',
+            '            log_progress(f"Generated PCA plot: {pca_plot}")',
+            '        ',
+            '        # Consensus statistics',
+            '        if hasattr(study, "plot_consensus_stats"):',
+            '            stats_plot = study_folder_path / "consensus_stats.html"',
+            '            study.plot_consensus_stats(filename=str(stats_plot))',
+            '            plots_generated += 1',
+            '            log_progress(f"Generated statistics plot: {stats_plot}")',
+            '        ',
+            '        log_progress(f"Generated {plots_generated} visualization plots")',
+            '        return True',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"Failed to generate plots: {e}")',
+            '        return False',
+            '',
+            '',
+            'def export_results(study: masster.Study) -> bool:',
+            '    """Export study results in requested formats."""',
+            '    log_progress("=== Exporting Study Results ===")',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    exports_completed = 0',
+            '    ',
+            '    try:',
+            '        # Skip CSV export as requested',
+            '        ',
+            '        # Export as Excel',
+            '        if "xlsx" in EXPORT_FORMATS and hasattr(study, "export_xlsx"):',
+            '            xlsx_file = study_folder_path / "study_results.xlsx"',
+            '            study.export_xlsx(filename=str(xlsx_file))',
+            '            exports_completed += 1',
+            '            log_progress(f"Exported Excel: {xlsx_file}")',
+            '        ',
+            '        # Export MGF for MS2 spectra',
+            '        if "mgf" in EXPORT_FORMATS and hasattr(study, "export_mgf"):',
+            '            mgf_file = study_folder_path / "consensus_ms2.mgf"',
+            '            study.export_mgf(filename=str(mgf_file))',
+            '            exports_completed += 1',
+            '            log_progress(f"Exported MGF: {mgf_file}")',
+            '        ',
+            '        # Export as Parquet for efficient storage',
+            '        if "parquet" in EXPORT_FORMATS and hasattr(study, "export_parquet"):',
+            '            parquet_file = study_folder_path / "study_data.parquet"',
+            '            study.export_parquet(filename=str(parquet_file))',
+            '            exports_completed += 1',
+            '            log_progress(f"Exported Parquet: {parquet_file}")',
+            '        ',
+            '        log_progress(f"Completed {exports_completed} exports")',
+            '        ',
+            '        # Always perform additional exports as requested',
+            '        log_progress("=== Exporting Additional Formats ===")',
+            '        ',
+            '        try:',
+            '            # Force export xlsx (study results in Excel format)',
+            '            xlsx_file = study_folder_path / "study_results.xlsx"',
+            '            if hasattr(study, "export_xlsx"):',
+            '                study.export_xlsx(filename=str(xlsx_file))',
+            '                log_progress(f"Exported Excel: {xlsx_file}")',
+            '            ',
+            '            # Force export parquet (efficient binary format)',
+            '            parquet_file = study_folder_path / "study_data.parquet"',
+            '            if hasattr(study, "export_parquet"):',
+            '                study.export_parquet(filename=str(parquet_file))',
+            '                log_progress(f"Exported Parquet: {parquet_file}")',
+            '            ',
+            '            # Force save the study in study5 format',
+            '            study_file = study_folder_path / "final_study.study5"',
+            '            study.save(filename=str(study_file))',
+            '            log_progress(f"Saved study: {study_file}")',
+            '            ',
+            '            # Force export MGF for MS2 spectra',
+            '            mgf_file = study_folder_path / "consensus_ms2.mgf"',
+            '            if hasattr(study, "export_mgf"):',
+            '                study.export_mgf(filename=str(mgf_file))',
+            '                log_progress(f"Exported MGF: {mgf_file}")',
+            '        ',
+            '        except Exception as e:',
+            '            print(f"Some additional exports failed: {e}")',
+            '        ',
+            '        return True',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"Failed to export results: {e}")',
+            '        return False',
+            '',
+            '',
+            'def save_study(study: masster.Study) -> bool:',
+            '    """Save the final study in optimized format."""',
+            '    log_progress("=== Saving Final Study ===")',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    ',
+            '    try:',
+            '        study_file = study_folder_path / "final_study.study5"',
+            '        ',
+            '        # Determine optimal save format based on study size',
+            '        num_samples = len(study.samples_df)',
+            '        num_features = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
+            '        ',
+            '        if num_samples > 50 or num_features > 10000:',
+            '            log_progress(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")',
+            '            compress_output = True',
+            '        else:',
+            '            log_progress(f"Small study ({num_samples} samples, {num_features} features) - using standard format")',
+            '            compress_output = False',
+            '        ',
+            '        # Save study',
+            '        if compress_output and hasattr(study, "save_compressed"):',
+            '            study.save_compressed(filename=str(study_file))',
+            '            log_progress(f"Saved compressed study: {study_file}")',
+            '        else:',
+            '            study.save(filename=str(study_file))',
+            '            log_progress(f"Saved study: {study_file}")',
+            '        ',
+            '        # Save metadata summary',
+            '        metadata_file = study_folder_path / "study_metadata.txt"',
+            '        with open(metadata_file, "w") as f:',
+            '            f.write("Study Processing Summary\\n")',
+            '            f.write("========================\\n")',
+            '            f.write(f"Processing Date: {datetime.now().strftime(\'%Y-%m-%d %H:%M:%S\')}\\n")',
+            '            f.write(f"Polarity: {POLARITY}\\n")',
+            '            f.write(f"Adducts: {\', \'.join(ADDUCTS)}\\n")',
+            '            f.write(f"Number of Samples: {num_samples}\\n")',
+            '            f.write(f"Number of Consensus Features: {num_features}\\n")',
+            '            f.write(f"RT Tolerance: {RT_TOLERANCE}s\\n")',
+            '            f.write(f"m/z Tolerance: {MZ_TOLERANCE} Da\\n")',
+            '            f.write(f"Merge Method: {MERGE_METHOD}\\n")',
+            '        ',
+            '        log_progress(f"Saved study metadata: {metadata_file}")',
+            '        return True',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"Failed to save study: {e}")',
+            '        return False',
+            '',
+            '',
+            'def cleanup_temp_files() -> bool:',
+            '    """Clean up temporary files if requested."""',
+            '    if not CLEANUP_TEMP_FILES:',
+            '        return True',
+            '    ',
+            '    log_progress("=== Cleaning Up Temporary Files ===")',
+            '    study_folder_path = Path(STUDY_FOLDER)',
+            '    ',
+            '    try:',
+            '        cleaned_count = 0',
+            '        ',
+            '        # Remove individual sample plots if study plots were generated',
+            '        if GENERATE_PLOTS:',
+            '            temp_plots = list(study_folder_path.glob("*_2d.html"))',
+            '            for plot_file in temp_plots:',
+            '                if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:',
+            '                    plot_file.unlink()',
+            '                    cleaned_count += 1',
+            '        ',
+            '        log_progress(f"Cleaned up {cleaned_count} temporary files")',
+            '        return True',
+            '        ',
+            '    except Exception as e:',
+            '        print(f"Failed to cleanup temp files: {e}")',
+            '        return False',
+            '',
+            '',
+            'def main():',
+            '    """Main execution function."""',
+            '    print("=" * 70)',
+            '    print("AUTOMATED MASS SPECTROMETRY ANALYSIS SCRIPT")',
+            f'    print("Generated by masster.Wizard on {timestamp}")',
+            '    print("=" * 70)',
+            '    ',
+            '    start_time = time.time()',
+            '    ',
+            '    try:',
+            '        # Ensure output directory exists',
+            '        Path(STUDY_FOLDER).mkdir(parents=True, exist_ok=True)',
+            '        ',
+            '        # Step 1: Discover files',
+            '        file_list = discover_files()',
+            '        if not file_list:',
+            '            print("No files found for processing")',
+            '            return False',
+            '        ',
+            '        # Step 2: Convert to sample5',
+            '        if not convert_to_sample5(file_list):',
+            '            print("Sample5 conversion failed")',
+            '            return False',
+            '        ',
+            '        # Step 3: Assemble study',
+            '        study = assemble_study()',
+            '        ',
+            '        # Step 4: Align and merge',
+            '        study = align_and_merge(study)',
+            '        ',
+            '        # Step 5: Generate plots',
+            '        generate_plots(study)',
+            '        ',
+            '        # Step 6: Export results',
+            '        export_results(study)',
+            '        ',
+            '        # Step 7: Save study',
+            '        save_study(study)',
+            '        ',
+            '        # Step 8: Cleanup',
+            '        cleanup_temp_files()',
+            '        ',
+            '        # Final summary',
+            '        total_time = time.time() - start_time',
+            '        print("=" * 70)',
+            '        print("ANALYSIS COMPLETED SUCCESSFULLY")',
+            '        print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
+            '        if hasattr(study, "consensus_df"):',
+            '            print(f"Consensus features generated: {len(study.consensus_df)}")',
+            '        print("=" * 70)',
+            '        ',
+            '        return True',
+            '        ',
+            '    except KeyboardInterrupt:',
+            '        print("\\nAnalysis interrupted by user")',
+            '        return False',
+            '    except Exception as e:',
+            '        print(f"Analysis failed with error: {e}")',
+            '        import traceback',
+            '        traceback.print_exc()',
+            '        return False',
+            '',
+            '',
+            'if __name__ == "__main__":',
+            '    success = main()',
+            '    sys.exit(0 if success else 1)',
+        ]
+
+        return '\n'.join(script_lines)
+
     def info(self):
         """Print comprehensive wizard status information."""
         status = self.get_status()
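With `execute()` and `export_script()` the wizard gains two symmetric entry points: run the pipeline in-process, or emit an equivalent standalone script into the study folder. A hedged usage sketch (placeholder paths; the `Wizard(params=...)` construction mirrors what `create_script` below does):

```python
from masster.wizard import Wizard, wizard_def

params = wizard_def(
    data_source=r"D:\Data\raw_files",  # placeholder
    study_folder=r"D:\Data\output",    # placeholder
)
wizard = Wizard(params=params)
wizard.export_script("run_masster.py")  # writes run_masster.py into the study folder
wizard.execute()                        # or run the full pipeline directly
```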
@@ -1157,6 +2232,8 @@ class Wizard:
         print(f"Polarity: {status['parameters']['polarity']}")
         print(f"CPU Cores: {status['parameters']['num_cores']}")
         print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
+        print(f"Detector Type: {self.params.detector_type}")
+        print(f"Noise Threshold: {self.params.noise_threshold}")
         print(f"Processing Time: {status['processing_time']}")
         print(f"Files Processed: {status['processed_files']}")
         print(f"Files Failed: {status['failed_files']}")
@@ -1171,5 +2248,100 @@ class Wizard:
         print("=" * 50)
 
 
-
-
+def create_script(
+    source: str,
+    study_folder: str,
+    filename: str,
+    polarity: str = "positive",
+    adducts: Optional[List[str]] = None,
+    params: Optional[wizard_def] = None,
+    num_cores: int = 0,
+    **kwargs
+) -> bool:
+    """
+    Create a standalone analysis script without initializing a Wizard instance.
+
+    This function generates a Python script that replicates automated processing
+    steps with the specified configuration. The script can be executed independently
+    to perform the same analysis.
+
+    Parameters:
+        source: Directory containing raw data files
+        study_folder: Output directory for processed study
+        filename: Filename for the generated script (should end with .py)
+        polarity: Ion polarity mode ("positive" or "negative")
+        adducts: List of adduct specifications (auto-set if None)
+        params: Custom wizard_def parameters (optional)
+        num_cores: Number of CPU cores (0 = auto-detect)
+        **kwargs: Additional parameters to override defaults
+
+    Returns:
+        True if script was generated successfully, False otherwise
+
+    Example:
+        >>> from masster.wizard import create_script
+        >>> create_script(
+        ...     source=r'D:\\Data\\raw_files',
+        ...     study_folder=r'D:\\Data\\output',
+        ...     filename='run_masster.py',
+        ...     polarity='positive'
+        ... )
+    """
+
+    try:
+        # Create parameters
+        if params is not None:
+            # Use provided params as base
+            wizard_params = params
+            # Update with provided values
+            wizard_params.data_source = source
+            wizard_params.study_folder = study_folder
+            if polarity != "positive":  # Only override if explicitly different
+                wizard_params.polarity = polarity
+            if num_cores > 0:
+                wizard_params.num_cores = num_cores
+            if adducts is not None:
+                wizard_params.adducts = adducts
+        else:
+            # Create new params with provided values
+            wizard_params = wizard_def(
+                data_source=source,
+                study_folder=study_folder,
+                polarity=polarity,
+                num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
+            )
+
+            if adducts is not None:
+                wizard_params.adducts = adducts
+
+        # Apply any additional kwargs
+        for key, value in kwargs.items():
+            if hasattr(wizard_params, key):
+                setattr(wizard_params, key, value)
+
+        # Ensure study folder exists
+        study_path = Path(study_folder)
+        study_path.mkdir(parents=True, exist_ok=True)
+
+        # Create a temporary Wizard instance to generate the script
+        temp_wizard = Wizard(params=wizard_params)
+
+        # Generate the script using the existing method
+        success = temp_wizard.export_script(filename)
+
+        if success:
+            script_path = study_path / Path(filename).name
+            print(f"Analysis script created: {script_path.absolute()}")
+            print(f"Run with: python \"{script_path}\"")
+
+        return success
+
+    except Exception as e:
+        print(f"Failed to create script: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+# Export the main classes and functions
+__all__ = ["Wizard", "wizard_def", "create_script"]