masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/wizard/wizard.py
CHANGED
|
@@ -34,6 +34,7 @@ wizard = Wizard(
|
|
|
34
34
|
|
|
35
35
|
from __future__ import annotations
|
|
36
36
|
|
|
37
|
+
from math import e
|
|
37
38
|
import os
|
|
38
39
|
import sys
|
|
39
40
|
import time
|
|
@@ -58,99 +59,99 @@ from masster._version import __version__ as version
|
|
|
58
59
|
class wizard_def:
|
|
59
60
|
"""
|
|
60
61
|
Default parameters for the Wizard automated processing system.
|
|
61
|
-
|
|
62
|
+
|
|
62
63
|
This class provides comprehensive configuration for all stages of automated
|
|
63
64
|
mass spectrometry data processing from raw files to final results.
|
|
64
|
-
|
|
65
|
+
|
|
65
66
|
Attributes:
|
|
66
67
|
# Core Configuration
|
|
67
68
|
source (str): Path to directory containing raw data files
|
|
68
69
|
folder (str): Output directory for processed study
|
|
69
70
|
polarity (Optional[str]): Ion polarity mode ("positive", "negative", or None for auto-detection)
|
|
70
71
|
num_cores (int): Number of CPU cores to use for parallel processing
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
# File Discovery
|
|
73
74
|
file_extensions (List[str]): File extensions to search for
|
|
74
75
|
search_subfolders (bool): Whether to search subdirectories
|
|
75
76
|
skip_patterns (List[str]): Filename patterns to skip
|
|
76
|
-
|
|
77
|
+
|
|
77
78
|
# Processing Parameters
|
|
78
79
|
adducts (List[str]): Adduct specifications for given polarity
|
|
79
80
|
batch_size (int): Number of files to process per batch
|
|
80
81
|
memory_limit_gb (float): Memory limit for processing (GB)
|
|
81
|
-
|
|
82
|
+
|
|
82
83
|
# Resume & Recovery
|
|
83
84
|
resume_enabled (bool): Enable automatic resume capability
|
|
84
85
|
force_reprocess (bool): Force reprocessing of existing files
|
|
85
86
|
backup_enabled (bool): Create backups of intermediate results
|
|
86
|
-
|
|
87
|
+
|
|
87
88
|
# Output & Export
|
|
88
89
|
generate_plots (bool): Generate visualization plots
|
|
89
90
|
export_formats (List[str]): Output formats to generate
|
|
90
91
|
compress_output (bool): Compress final study file
|
|
91
|
-
|
|
92
|
+
|
|
92
93
|
# Logging
|
|
93
94
|
log_level (str): Logging detail level
|
|
94
95
|
log_to_file (bool): Save logs to file
|
|
95
96
|
progress_interval (int): Progress update interval (seconds)
|
|
96
97
|
"""
|
|
97
|
-
|
|
98
|
+
|
|
98
99
|
# === Core Configuration ===
|
|
99
100
|
source: str = ""
|
|
100
|
-
folder: str = ""
|
|
101
|
+
folder: str = ""
|
|
101
102
|
polarity: Optional[str] = None
|
|
102
103
|
num_cores: int = 4
|
|
103
|
-
|
|
104
|
+
|
|
104
105
|
# === File Discovery ===
|
|
105
106
|
file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
|
|
106
107
|
search_subfolders: bool = True
|
|
107
108
|
skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
|
|
108
|
-
|
|
109
|
+
|
|
109
110
|
# === Processing Parameters ===
|
|
110
111
|
adducts: List[str] = field(default_factory=list) # Will be set based on polarity
|
|
111
112
|
batch_size: int = 8
|
|
112
113
|
memory_limit_gb: float = 16.0
|
|
113
114
|
max_file_size_gb: float = 4.0
|
|
114
|
-
|
|
115
|
+
|
|
115
116
|
# === Resume & Recovery ===
|
|
116
117
|
resume_enabled: bool = True
|
|
117
118
|
force_reprocess: bool = False
|
|
118
119
|
backup_enabled: bool = True
|
|
119
120
|
checkpoint_interval: int = 10 # Save progress every N files
|
|
120
|
-
|
|
121
|
+
|
|
121
122
|
# === Study Assembly ===
|
|
122
123
|
min_samples_for_merge: int = 2
|
|
123
124
|
rt_tolerance: float = 1.5
|
|
124
125
|
mz_max_diff: float = 0.01
|
|
125
126
|
alignment_algorithm: str = "kd"
|
|
126
127
|
merge_method: str = "qt"
|
|
127
|
-
|
|
128
|
+
|
|
128
129
|
# === Feature Detection ===
|
|
129
|
-
chrom_fwhm: float =
|
|
130
|
-
noise: float =
|
|
130
|
+
chrom_fwhm: float | None = None
|
|
131
|
+
noise: float | None = None
|
|
131
132
|
chrom_peak_snr: float = 5.0
|
|
132
133
|
tol_ppm: float = 10.0
|
|
133
134
|
detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")
|
|
134
|
-
|
|
135
|
+
|
|
135
136
|
# === Output & Export ===
|
|
136
137
|
generate_plots: bool = True
|
|
137
138
|
generate_interactive: bool = True
|
|
138
139
|
export_formats: List[str] = field(default_factory=lambda: ["csv", "mgf", "xlsx"])
|
|
139
140
|
compress_output: bool = True
|
|
140
141
|
adaptive_compression: bool = True # Adapt based on study size
|
|
141
|
-
|
|
142
|
+
|
|
142
143
|
# === Logging ===
|
|
143
144
|
log_level: str = "INFO"
|
|
144
145
|
log_to_file: bool = True
|
|
145
146
|
progress_interval: int = 30 # seconds
|
|
146
147
|
verbose_progress: bool = True
|
|
147
|
-
|
|
148
|
+
|
|
148
149
|
# === Advanced Options ===
|
|
149
150
|
use_process_pool: bool = True # vs ThreadPoolExecutor
|
|
150
151
|
optimize_memory: bool = True
|
|
151
152
|
cleanup_temp_files: bool = True
|
|
152
153
|
validate_outputs: bool = True
|
|
153
|
-
|
|
154
|
+
|
|
154
155
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
155
156
|
default_factory=lambda: {
|
|
156
157
|
"source": {
|
|
@@ -159,7 +160,7 @@ class wizard_def:
|
|
|
159
160
|
"required": True,
|
|
160
161
|
},
|
|
161
162
|
"folder": {
|
|
162
|
-
"dtype": str,
|
|
163
|
+
"dtype": str,
|
|
163
164
|
"description": "Output directory for processed study",
|
|
164
165
|
"required": True,
|
|
165
166
|
},
|
|
@@ -193,26 +194,26 @@ class wizard_def:
|
|
|
193
194
|
},
|
|
194
195
|
repr=False,
|
|
195
196
|
)
|
|
196
|
-
|
|
197
|
+
|
|
197
198
|
def __post_init__(self):
|
|
198
199
|
"""Set polarity-specific defaults after initialization."""
|
|
199
200
|
# Set default adducts based on polarity if not provided
|
|
200
201
|
if not self.adducts:
|
|
201
202
|
if self.polarity and self.polarity.lower() in ["positive", "pos"]:
|
|
202
203
|
self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
|
|
203
|
-
elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
|
|
204
|
+
elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
|
|
204
205
|
self.adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
|
|
205
206
|
else:
|
|
206
207
|
# Default to positive if polarity is None or unknown
|
|
207
208
|
self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
|
|
208
|
-
|
|
209
|
+
|
|
209
210
|
# Validate num_cores
|
|
210
211
|
max_cores = multiprocessing.cpu_count()
|
|
211
212
|
if self.num_cores <= 0:
|
|
212
213
|
self.num_cores = max_cores
|
|
213
214
|
elif self.num_cores > max_cores:
|
|
214
215
|
self.num_cores = max_cores
|
|
215
|
-
|
|
216
|
+
|
|
216
217
|
# Ensure paths are absolute
|
|
217
218
|
if self.source:
|
|
218
219
|
self.source = os.path.abspath(self.source)
|
|
@@ -223,24 +224,24 @@ class wizard_def:
|
|
|
223
224
|
class Wizard:
|
|
224
225
|
"""
|
|
225
226
|
Simplified Wizard for automated mass spectrometry data processing.
|
|
226
|
-
|
|
227
|
+
|
|
227
228
|
The Wizard provides a clean interface for creating and executing analysis scripts
|
|
228
229
|
that process raw MS data through the complete pipeline: file discovery, feature
|
|
229
230
|
detection, sample processing, study assembly, alignment, merging, and export.
|
|
230
|
-
|
|
231
|
+
|
|
231
232
|
Core functions:
|
|
232
233
|
- create_scripts(): Generate standalone analysis scripts
|
|
233
234
|
- test_only(): Process only one file for parameter validation
|
|
234
235
|
- test_and_run(): Test with single file, then run full batch if successful
|
|
235
236
|
- run(): Execute full batch processing on all files
|
|
236
|
-
|
|
237
|
+
|
|
237
238
|
Recommended workflow:
|
|
238
239
|
1. wizard = Wizard(source="raw_data", folder="output")
|
|
239
240
|
2. wizard.create_scripts() # Generate analysis scripts
|
|
240
241
|
3. wizard.test_only() # Validate with single file
|
|
241
242
|
4. wizard.run() # Process all files
|
|
242
243
|
"""
|
|
243
|
-
|
|
244
|
+
|
|
244
245
|
def __init__(
|
|
245
246
|
self,
|
|
246
247
|
source: str = "",
|
|
@@ -248,11 +249,11 @@ class Wizard:
|
|
|
248
249
|
polarity: Optional[str] = None,
|
|
249
250
|
adducts: Optional[List[str]] = None,
|
|
250
251
|
num_cores: int = 6,
|
|
251
|
-
**kwargs
|
|
252
|
+
**kwargs,
|
|
252
253
|
):
|
|
253
254
|
"""
|
|
254
255
|
Initialize the Wizard with analysis parameters.
|
|
255
|
-
|
|
256
|
+
|
|
256
257
|
Parameters:
|
|
257
258
|
source: Directory containing raw data files
|
|
258
259
|
folder: Output directory for processed study
|
|
@@ -261,145 +262,43 @@ class Wizard:
|
|
|
261
262
|
num_cores: Number of CPU cores (0 = auto-detect 75% of available)
|
|
262
263
|
**kwargs: Additional parameters (see wizard_def for full list)
|
|
263
264
|
"""
|
|
264
|
-
|
|
265
|
+
|
|
265
266
|
# Auto-detect optimal number of cores if not specified
|
|
266
267
|
if num_cores <= 0:
|
|
267
268
|
num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
|
|
268
|
-
|
|
269
|
+
|
|
269
270
|
# Create parameters instance
|
|
270
271
|
if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
|
|
271
272
|
self.params = kwargs.pop("params")
|
|
272
273
|
else:
|
|
273
274
|
# Create default parameters
|
|
274
|
-
self.params = wizard_def(
|
|
275
|
-
|
|
276
|
-
folder=folder,
|
|
277
|
-
polarity=polarity,
|
|
278
|
-
num_cores=num_cores
|
|
279
|
-
)
|
|
280
|
-
|
|
275
|
+
self.params = wizard_def(source=source, folder=folder, polarity=polarity, num_cores=num_cores)
|
|
276
|
+
|
|
281
277
|
# Set adducts if provided
|
|
282
278
|
if adducts is not None:
|
|
283
279
|
self.params.adducts = adducts
|
|
284
|
-
|
|
280
|
+
|
|
285
281
|
# Update with any additional parameters
|
|
286
282
|
for key, value in kwargs.items():
|
|
287
283
|
if hasattr(self.params, key):
|
|
288
284
|
setattr(self.params, key, value)
|
|
289
|
-
|
|
285
|
+
|
|
290
286
|
# Validate required parameters
|
|
291
287
|
if not self.params.source:
|
|
292
288
|
raise ValueError("source is required")
|
|
293
289
|
if not self.params.folder:
|
|
294
290
|
raise ValueError("folder is required")
|
|
295
|
-
|
|
291
|
+
|
|
296
292
|
# Create and validate paths
|
|
297
293
|
self.source_path = Path(self.params.source)
|
|
298
|
-
self.folder_path = Path(self.params.folder)
|
|
294
|
+
self.folder_path = Path(self.params.folder)
|
|
299
295
|
self.folder_path.mkdir(parents=True, exist_ok=True)
|
|
300
|
-
|
|
301
|
-
# Auto-infer polarity from the first file if polarity is None
|
|
302
|
-
if self.params.polarity is None:
|
|
303
|
-
inferred_polarity = self._infer_polarity_from_first_file()
|
|
304
|
-
if inferred_polarity:
|
|
305
|
-
self.params.polarity = inferred_polarity
|
|
306
|
-
# Update adducts based on inferred polarity
|
|
307
|
-
self.params.__post_init__()
|
|
308
296
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
Inferred polarity string ("positive" or "negative") or "positive" as fallback
|
|
315
|
-
"""
|
|
316
|
-
try:
|
|
317
|
-
# Find first file
|
|
318
|
-
for extension in ['.wiff', '.raw', '.mzML']:
|
|
319
|
-
pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
|
|
320
|
-
files = list(self.source_path.rglob(pattern))
|
|
321
|
-
if files:
|
|
322
|
-
first_file = files[0]
|
|
323
|
-
break
|
|
324
|
-
else:
|
|
325
|
-
return 'positive'
|
|
326
|
-
|
|
327
|
-
# Handle different file formats
|
|
328
|
-
if first_file.suffix.lower() == '.wiff':
|
|
329
|
-
return self._infer_polarity_from_wiff(str(first_file))
|
|
330
|
-
elif first_file.suffix.lower() == '.raw':
|
|
331
|
-
return self._infer_polarity_from_raw(str(first_file))
|
|
332
|
-
elif first_file.suffix.lower() == '.mzml':
|
|
333
|
-
return self._infer_polarity_from_mzml(str(first_file))
|
|
334
|
-
|
|
335
|
-
except Exception:
|
|
336
|
-
# Silently fall back to default if inference fails
|
|
337
|
-
pass
|
|
338
|
-
|
|
339
|
-
return 'positive'
|
|
340
|
-
|
|
341
|
-
def _infer_polarity_from_wiff(self, filename: str) -> str:
|
|
342
|
-
"""Infer polarity from WIFF file."""
|
|
343
|
-
try:
|
|
344
|
-
from masster.sample.load import _wiff_to_dict
|
|
345
|
-
|
|
346
|
-
# Extract metadata from first file
|
|
347
|
-
metadata_df = _wiff_to_dict(filename)
|
|
348
|
-
|
|
349
|
-
if not metadata_df.empty and 'polarity' in metadata_df.columns:
|
|
350
|
-
# Get polarity from first experiment
|
|
351
|
-
first_polarity = metadata_df['polarity'].iloc[0]
|
|
352
|
-
|
|
353
|
-
# Convert numeric polarity codes to string
|
|
354
|
-
if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
|
|
355
|
-
return "positive"
|
|
356
|
-
elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
|
|
357
|
-
return "negative"
|
|
358
|
-
except Exception:
|
|
359
|
-
pass
|
|
360
|
-
return 'positive'
|
|
361
|
-
|
|
362
|
-
def _infer_polarity_from_raw(self, filename: str) -> str:
|
|
363
|
-
"""Infer polarity from Thermo RAW file."""
|
|
364
|
-
try:
|
|
365
|
-
from masster.sample.thermo import ThermoRawFileReader
|
|
366
|
-
|
|
367
|
-
with ThermoRawFileReader(filename) as raw_reader:
|
|
368
|
-
# Get polarity from first scan
|
|
369
|
-
first_scan = 1
|
|
370
|
-
polarity = raw_reader.get_polarity_from_scan_event(first_scan)
|
|
371
|
-
if polarity in ['positive', 'negative']:
|
|
372
|
-
return polarity
|
|
373
|
-
except Exception:
|
|
374
|
-
pass
|
|
375
|
-
return 'positive'
|
|
376
|
-
|
|
377
|
-
def _infer_polarity_from_mzml(self, filename: str) -> str:
|
|
378
|
-
"""Infer polarity from mzML file."""
|
|
379
|
-
try:
|
|
380
|
-
# Import pyopenms with warnings suppression
|
|
381
|
-
import warnings
|
|
382
|
-
with warnings.catch_warnings():
|
|
383
|
-
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
|
|
384
|
-
import pyopenms as oms
|
|
385
|
-
|
|
386
|
-
# Load the first few spectra to check polarity
|
|
387
|
-
omsexp = oms.MSExperiment()
|
|
388
|
-
oms.MzMLFile().load(filename, omsexp)
|
|
389
|
-
|
|
390
|
-
if omsexp.getNrSpectra() > 0:
|
|
391
|
-
first_spectrum = omsexp.getSpectra()[0]
|
|
392
|
-
try:
|
|
393
|
-
pol = first_spectrum.getInstrumentSettings().getPolarity()
|
|
394
|
-
if pol == 1:
|
|
395
|
-
return "positive"
|
|
396
|
-
elif pol == 2:
|
|
397
|
-
return "negative"
|
|
398
|
-
except Exception:
|
|
399
|
-
pass
|
|
400
|
-
except Exception:
|
|
401
|
-
pass
|
|
402
|
-
return 'positive'
|
|
297
|
+
# Set default polarity if not specified
|
|
298
|
+
if self.params.polarity is None:
|
|
299
|
+
self.params.polarity = "positive"
|
|
300
|
+
# Update adducts based on default polarity
|
|
301
|
+
self.params.__post_init__()
|
|
403
302
|
|
|
404
303
|
@property
|
|
405
304
|
def polarity(self) -> Optional[str]:
|
|
@@ -414,13 +313,13 @@ class Wizard:
|
|
|
414
313
|
def create_scripts(self) -> Dict[str, Any]:
|
|
415
314
|
"""
|
|
416
315
|
Generate analysis scripts based on source file analysis.
|
|
417
|
-
|
|
316
|
+
|
|
418
317
|
This method:
|
|
419
318
|
1. Analyzes the source files to extract metadata
|
|
420
319
|
2. Creates 1_masster_workflow.py with sample processing logic
|
|
421
320
|
3. Creates 2_interactive_analysis.py marimo notebook for study exploration
|
|
422
321
|
4. Returns instructions for next steps
|
|
423
|
-
|
|
322
|
+
|
|
424
323
|
Returns:
|
|
425
324
|
Dictionary containing:
|
|
426
325
|
- status: "success" or "error"
|
|
@@ -432,50 +331,79 @@ class Wizard:
|
|
|
432
331
|
try:
|
|
433
332
|
# Step 1: Analyze source files to extract metadata
|
|
434
333
|
source_info = self._analyze_source_files()
|
|
435
|
-
|
|
334
|
+
|
|
335
|
+
# Report extracted information from first file
|
|
336
|
+
print("\nInformation retrieval complete:")
|
|
337
|
+
print(f" Number of files found: {source_info.get('number_of_files', 0)}")
|
|
338
|
+
print(f" File types detected: {', '.join(source_info.get('file_types', []))}")
|
|
339
|
+
#print(f" First file: {source_info.get('first_file', 'None')}")
|
|
340
|
+
if source_info.get('first_file'):
|
|
341
|
+
print(f" Detected polarity: {source_info.get('polarity', 'unknown')}")
|
|
342
|
+
print(f" Detected detector type: {source_info.get('detector_type', 'unknown')}")
|
|
343
|
+
if source_info.get('baseline', 0) > 0:
|
|
344
|
+
print(f" Estimated baseline intensity: {source_info.get('baseline', 0):.1f}")
|
|
345
|
+
if source_info.get('length_minutes', 0) > 0:
|
|
346
|
+
print(f" Run length: {source_info.get('length_minutes', 0):.1f} minutes")
|
|
347
|
+
else:
|
|
348
|
+
print(" Run length: Could not determine")
|
|
349
|
+
if source_info.get('ms1_scans_per_second', 0) > 0:
|
|
350
|
+
print(f" MS1 scans per second: {source_info.get('ms1_scans_per_second', 0):.2f}")
|
|
351
|
+
print()
|
|
352
|
+
|
|
436
353
|
# Update wizard parameters based on detected metadata
|
|
437
|
-
if source_info.get(
|
|
438
|
-
self.params.polarity = source_info[
|
|
439
|
-
|
|
354
|
+
if source_info.get("polarity") and source_info["polarity"] != "positive":
|
|
355
|
+
self.params.polarity = source_info["polarity"]
|
|
356
|
+
|
|
440
357
|
files_created = []
|
|
441
|
-
|
|
358
|
+
|
|
442
359
|
# Step 2: Create 1_masster_workflow.py
|
|
443
360
|
workflow_script_path = self.folder_path / "1_masster_workflow.py"
|
|
361
|
+
print(f"Creating workflow script: {workflow_script_path}")
|
|
444
362
|
workflow_content = self._generate_workflow_script_content(source_info)
|
|
445
363
|
|
|
446
364
|
# Apply test mode modifications
|
|
447
365
|
workflow_content = self._add_test_mode_support(workflow_content)
|
|
448
|
-
|
|
449
|
-
with open(workflow_script_path,
|
|
366
|
+
|
|
367
|
+
with open(workflow_script_path, "w", encoding="utf-8") as f:
|
|
450
368
|
f.write(workflow_content)
|
|
451
369
|
files_created.append(str(workflow_script_path))
|
|
452
|
-
|
|
370
|
+
|
|
453
371
|
# Step 3: Create 2_interactive_analysis.py marimo notebook
|
|
454
372
|
notebook_path = self.folder_path / "2_interactive_analysis.py"
|
|
373
|
+
print(f"Creating interactive analysis notebook: {notebook_path}")
|
|
455
374
|
notebook_content = self._generate_interactive_notebook_content(source_info)
|
|
456
|
-
|
|
457
|
-
with open(notebook_path,
|
|
375
|
+
|
|
376
|
+
with open(notebook_path, "w", encoding="utf-8") as f:
|
|
458
377
|
f.write(notebook_content)
|
|
459
378
|
files_created.append(str(notebook_path))
|
|
460
|
-
|
|
379
|
+
|
|
461
380
|
# Step 4: Generate instructions
|
|
462
381
|
instructions = self._generate_instructions(source_info, files_created)
|
|
463
|
-
|
|
382
|
+
|
|
464
383
|
return {
|
|
465
384
|
"status": "success",
|
|
466
385
|
"message": f"Successfully created {len(files_created)} script files",
|
|
467
386
|
"instructions": instructions,
|
|
468
387
|
"files_created": files_created,
|
|
469
|
-
"source_info": source_info
|
|
388
|
+
"source_info": source_info,
|
|
470
389
|
}
|
|
471
|
-
|
|
390
|
+
|
|
472
391
|
except Exception as e:
|
|
392
|
+
import traceback
|
|
393
|
+
tb = traceback.extract_tb(e.__traceback__)
|
|
394
|
+
if tb:
|
|
395
|
+
line_number = tb[-1].lineno
|
|
396
|
+
function_name = tb[-1].name
|
|
397
|
+
error_location = f" (at line {line_number} in {function_name})"
|
|
398
|
+
else:
|
|
399
|
+
error_location = ""
|
|
400
|
+
|
|
473
401
|
return {
|
|
474
|
-
"status": "error",
|
|
475
|
-
"message": f"Failed to create scripts: {e}",
|
|
402
|
+
"status": "error",
|
|
403
|
+
"message": f"Failed to create scripts: {e}{error_location}",
|
|
476
404
|
"instructions": [],
|
|
477
405
|
"files_created": [],
|
|
478
|
-
"source_info": {}
|
|
406
|
+
"source_info": {},
|
|
479
407
|
}
|
|
480
408
|
|
|
481
409
|
def _analyze_source_files(self) -> Dict[str, Any]:
|
|
@@ -483,16 +411,20 @@ class Wizard:
|
|
|
483
411
|
result = {
|
|
484
412
|
"number_of_files": 0,
|
|
485
413
|
"file_types": [],
|
|
486
|
-
"
|
|
414
|
+
"detector_type": "tof",
|
|
415
|
+
"polarity": None,
|
|
416
|
+
"baseline": None,
|
|
487
417
|
"length_minutes": 0.0,
|
|
488
|
-
"
|
|
418
|
+
"ms1_scans_per_second": 0.0,
|
|
419
|
+
"first_file": None,
|
|
420
|
+
|
|
489
421
|
}
|
|
490
|
-
|
|
422
|
+
|
|
491
423
|
try:
|
|
492
424
|
# Find raw data files
|
|
493
425
|
extensions = [".wiff", ".raw", ".mzML"]
|
|
494
426
|
raw_files = []
|
|
495
|
-
|
|
427
|
+
|
|
496
428
|
for ext in extensions:
|
|
497
429
|
pattern = f"**/*{ext}"
|
|
498
430
|
files = list(self.source_path.rglob(pattern))
|
|
@@ -500,246 +432,284 @@ class Wizard:
|
|
|
500
432
|
raw_files.extend(files)
|
|
501
433
|
if ext not in result["file_types"]:
|
|
502
434
|
result["file_types"].append(ext)
|
|
503
|
-
|
|
435
|
+
|
|
504
436
|
result["number_of_files"] = len(raw_files)
|
|
505
|
-
|
|
437
|
+
|
|
506
438
|
if raw_files:
|
|
507
439
|
result["first_file"] = str(raw_files[0])
|
|
508
|
-
#
|
|
509
|
-
|
|
510
|
-
|
|
440
|
+
# load first file to infer polarity and length
|
|
441
|
+
print(f"Analyzing first file: {raw_files[0]}")
|
|
442
|
+
from masster import Sample
|
|
443
|
+
sample = Sample(filename=result["first_file"], logging_level='WARNING')
|
|
444
|
+
result['polarity'] = sample.polarity
|
|
445
|
+
# take max from polars ms1_df['rt']
|
|
446
|
+
if sample.ms1_df is not None:
|
|
447
|
+
if not sample.ms1_df.is_empty() and 'rt' in sample.ms1_df.columns:
|
|
448
|
+
max_rt = sample.ms1_df['rt'].max()
|
|
449
|
+
if max_rt is not None and isinstance(max_rt, (int, float)) and max_rt > 0:
|
|
450
|
+
result["length_minutes"] = float(max_rt) / 60.0
|
|
451
|
+
result["ms1_scans_per_second"] = len(sample.ms1_df) / float(max_rt) / 60.0
|
|
452
|
+
|
|
453
|
+
baseline = sample.ms1_df['inty'].quantile(0.001)
|
|
454
|
+
if baseline is not None and isinstance(baseline, (int, float)):
|
|
455
|
+
result["baseline"] = float(baseline)
|
|
456
|
+
if baseline > 5e3:
|
|
457
|
+
result["detector_type"] = "orbitrap"
|
|
458
|
+
else:
|
|
459
|
+
result["detector_type"] = "tof"
|
|
460
|
+
|
|
511
461
|
except Exception as e:
|
|
512
462
|
print(f"Warning: Could not analyze source files: {e}")
|
|
513
|
-
|
|
463
|
+
|
|
514
464
|
return result
|
|
515
465
|
|
|
516
466
|
def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
|
|
517
467
|
"""Generate the content for 1_masster_workflow.py script."""
|
|
518
|
-
|
|
468
|
+
|
|
469
|
+
# Logic
|
|
470
|
+
noise = self.params.noise
|
|
471
|
+
if noise is None:
|
|
472
|
+
if source_info.get("detector_type") == "orbitrap":
|
|
473
|
+
noise = max(self.params.noise or 50.0, 5e4)
|
|
474
|
+
elif source_info.get("detector_type") == "tof":
|
|
475
|
+
default_noise = self.params.noise or 50.0
|
|
476
|
+
baseline = source_info.get("baseline", default_noise / 2.0)
|
|
477
|
+
noise = baseline * 2
|
|
478
|
+
|
|
479
|
+
chrom_fwhm = self.params.chrom_fwhm
|
|
480
|
+
if chrom_fwhm is None:
|
|
481
|
+
if source_info.get("length_minutes", 0) > 0:
|
|
482
|
+
if source_info["length_minutes"] < 10:
|
|
483
|
+
chrom_fwhm = 0.5
|
|
484
|
+
else:
|
|
485
|
+
chrom_fwhm = 2.0
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
# Generate script content
|
|
519
489
|
script_lines = [
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
490
|
+
"#!/usr/bin/env python3",
|
|
491
|
+
'"""',
|
|
492
|
+
"Automated Mass Spectrometry Data Analysis Pipeline",
|
|
493
|
+
"Generated by masster wizard",
|
|
524
494
|
'"""',
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
'
|
|
537
|
-
'
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
495
|
+
"",
|
|
496
|
+
"import os",
|
|
497
|
+
"import sys",
|
|
498
|
+
"import time",
|
|
499
|
+
"from pathlib import Path",
|
|
500
|
+
"",
|
|
501
|
+
"# Import masster modules",
|
|
502
|
+
"from masster.study import Study",
|
|
503
|
+
"from masster import __version__",
|
|
504
|
+
"",
|
|
505
|
+
"# Test mode configuration",
|
|
506
|
+
'TEST = os.environ.get("MASSTER_TEST", "0") == "1"',
|
|
507
|
+
'STOP_AFTER_TEST = os.environ.get("MASSTER_STOP_AFTER_TEST", "0") == "1" # Only run test, don\'t continue to full batch',
|
|
508
|
+
"",
|
|
509
|
+
"# Analysis parameters",
|
|
510
|
+
"PARAMS = {",
|
|
511
|
+
" # === Core Configuration ===",
|
|
542
512
|
f' "source": {str(self.source_path)!r}, # Directory containing raw data files',
|
|
543
513
|
f' "folder": {str(self.folder_path)!r}, # Output directory for processed study',
|
|
544
514
|
f' "polarity": {self.params.polarity!r}, # Ion polarity mode ("positive" or "negative")',
|
|
545
515
|
f' "num_cores": {self.params.num_cores}, # Number of CPU cores for parallel processing',
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
' "
|
|
549
|
-
' "
|
|
550
|
-
|
|
551
|
-
|
|
516
|
+
"",
|
|
517
|
+
" # === Test Mode ===",
|
|
518
|
+
' "test": TEST, # Process only first file for testing',
|
|
519
|
+
' "stop_after_test": STOP_AFTER_TEST, # Stop after test, don\'t run full batch',
|
|
520
|
+
"",
|
|
521
|
+
" # === File Discovery ===",
|
|
552
522
|
f' "file_extensions": {self.params.file_extensions!r}, # File extensions to search for',
|
|
553
523
|
f' "search_subfolders": {self.params.search_subfolders}, # Whether to search subdirectories recursively',
|
|
554
524
|
f' "skip_patterns": {self.params.skip_patterns!r}, # Filename patterns to skip',
|
|
555
|
-
|
|
556
|
-
|
|
525
|
+
"",
|
|
526
|
+
" # === Processing Parameters ===",
|
|
557
527
|
f' "adducts": {self.params.adducts!r}, # Adduct specifications for feature detection and annotation',
|
|
558
|
-
f' "noise": {
|
|
559
|
-
f' "chrom_fwhm": {
|
|
528
|
+
f' "noise": {noise}, # Noise threshold for feature detection',
|
|
529
|
+
f' "chrom_fwhm": {chrom_fwhm}, # Chromatographic peak full width at half maximum (seconds)',
|
|
560
530
|
f' "chrom_peak_snr": {self.params.chrom_peak_snr}, # Minimum signal-to-noise ratio for chromatographic peaks',
|
|
561
|
-
|
|
562
|
-
|
|
531
|
+
"",
|
|
532
|
+
" # === Alignment & Merging ===",
|
|
563
533
|
f' "rt_tol": {self.params.rt_tolerance}, # Retention time tolerance for alignment (seconds)',
|
|
564
534
|
f' "mz_tol": {self.params.mz_max_diff}, # Mass-to-charge ratio tolerance for alignment (Da)',
|
|
565
535
|
f' "alignment_method": {self.params.alignment_algorithm!r}, # Algorithm for sample alignment',
|
|
566
536
|
f' "min_samples_per_feature": {self.params.min_samples_for_merge}, # Minimum samples required per consensus feature',
|
|
567
537
|
f' "merge_method": {self.params.merge_method!r}, # Method for merging consensus features',
|
|
568
|
-
|
|
569
|
-
|
|
538
|
+
"",
|
|
539
|
+
" # === Sample Processing (used in add_samples_from_folder) ===",
|
|
570
540
|
f' "batch_size": {self.params.batch_size}, # Number of files to process per batch',
|
|
571
541
|
f' "memory_limit_gb": {self.params.memory_limit_gb}, # Memory limit for processing (GB)',
|
|
572
|
-
|
|
573
|
-
|
|
542
|
+
"",
|
|
543
|
+
" # === Script Options ===",
|
|
574
544
|
f' "resume_enabled": {self.params.resume_enabled}, # Enable automatic resume capability',
|
|
575
545
|
f' "force_reprocess": {self.params.force_reprocess}, # Force reprocessing of existing files',
|
|
576
546
|
f' "cleanup_temp_files": {self.params.cleanup_temp_files}, # Clean up temporary files after processing',
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
547
|
+
"}",
|
|
548
|
+
"",
|
|
549
|
+
"",
|
|
550
|
+
"def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
|
|
581
551
|
' """Discover raw data files in the source folder."""',
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
552
|
+
" source_path = Path(source_folder)",
|
|
553
|
+
" raw_files = []",
|
|
554
|
+
" ",
|
|
555
|
+
" for ext in file_extensions:",
|
|
556
|
+
" if search_subfolders:",
|
|
587
557
|
' pattern = f"**/*{ext}"',
|
|
588
|
-
|
|
589
|
-
|
|
558
|
+
" files = list(source_path.rglob(pattern))",
|
|
559
|
+
" else:",
|
|
590
560
|
' pattern = f"*{ext}"',
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
561
|
+
" files = list(source_path.glob(pattern))",
|
|
562
|
+
" raw_files.extend(files)",
|
|
563
|
+
" ",
|
|
564
|
+
" return raw_files",
|
|
565
|
+
"",
|
|
566
|
+
"",
|
|
567
|
+
"def process_single_file(args):",
|
|
598
568
|
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
569
|
+
" raw_file, output_folder = args",
|
|
570
|
+
" from masster.sample import Sample",
|
|
571
|
+
" ",
|
|
572
|
+
" try:",
|
|
573
|
+
" # Create sample5 filename",
|
|
574
|
+
" sample_name = raw_file.stem",
|
|
605
575
|
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
606
|
-
|
|
607
|
-
|
|
576
|
+
" ",
|
|
577
|
+
" # Skip if sample5 already exists",
|
|
608
578
|
' if sample5_path.exists() and not PARAMS["force_reprocess"]:',
|
|
609
579
|
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
610
|
-
|
|
611
|
-
|
|
580
|
+
" return str(sample5_path)",
|
|
581
|
+
" ",
|
|
612
582
|
' print(f" Converting {raw_file.name}...")',
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
583
|
+
" ",
|
|
584
|
+
" # Load and process raw file with full pipeline",
|
|
585
|
+
" sample = Sample(log_label=sample_name)",
|
|
586
|
+
" sample.load(filename=str(raw_file))",
|
|
587
|
+
" sample.find_features(",
|
|
618
588
|
' noise=PARAMS["noise"],',
|
|
619
589
|
' chrom_fwhm=PARAMS["chrom_fwhm"],',
|
|
620
590
|
' chrom_peak_snr=PARAMS["chrom_peak_snr"]',
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
591
|
+
" )",
|
|
592
|
+
" sample.find_ms2()",
|
|
593
|
+
" sample.find_iso()",
|
|
594
|
+
" # sample.export_mgf()",
|
|
625
595
|
' # sample.plot_2d(filename=f"{sample5_path.replace(".sample5", ".html")}")',
|
|
626
|
-
|
|
627
|
-
|
|
596
|
+
" sample.save(str(sample5_path))",
|
|
597
|
+
" ",
|
|
628
598
|
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
599
|
+
" return str(sample5_path)",
|
|
600
|
+
" ",
|
|
601
|
+
" except Exception as e:",
|
|
632
602
|
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
603
|
+
" return None",
|
|
604
|
+
"",
|
|
605
|
+
"",
|
|
606
|
+
"def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
|
|
637
607
|
' """Convert raw data files to sample5 format."""',
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
608
|
+
" import concurrent.futures",
|
|
609
|
+
" import os",
|
|
610
|
+
" ",
|
|
611
|
+
" # Create output directory",
|
|
612
|
+
" os.makedirs(output_folder, exist_ok=True)",
|
|
613
|
+
" ",
|
|
614
|
+
" # Prepare arguments for multiprocessing",
|
|
615
|
+
" file_args = [(raw_file, output_folder) for raw_file in raw_files]",
|
|
616
|
+
" ",
|
|
617
|
+
" # Process files in parallel",
|
|
618
|
+
" sample5_files = []",
|
|
619
|
+
" with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
|
|
620
|
+
" futures = [executor.submit(process_single_file, args) for args in file_args]",
|
|
621
|
+
" ",
|
|
622
|
+
" for future in concurrent.futures.as_completed(futures):",
|
|
623
|
+
" result = future.result()",
|
|
624
|
+
" if result:",
|
|
625
|
+
" sample5_files.append(result)",
|
|
626
|
+
" ",
|
|
627
|
+
" return sample5_files",
|
|
628
|
+
"",
|
|
629
|
+
"",
|
|
630
|
+
"def main():",
|
|
661
631
|
' """Main analysis pipeline."""',
|
|
662
|
-
|
|
632
|
+
" try:",
|
|
663
633
|
' print("=" * 70)',
|
|
664
634
|
f' print("masster {version} - Automated MS Data Analysis")',
|
|
665
635
|
' print("=" * 70)',
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
636
|
+
" print(f\"Source: {PARAMS['source']}\")",
|
|
637
|
+
" print(f\"Output: {PARAMS['folder']}\")",
|
|
638
|
+
" print(f\"Polarity: {PARAMS['polarity']}\")",
|
|
639
|
+
" print(f\"CPU Cores: {PARAMS['num_cores']}\")",
|
|
670
640
|
' print("=" * 70)',
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
641
|
+
" ",
|
|
642
|
+
" start_time = time.time()",
|
|
643
|
+
" ",
|
|
644
|
+
" # Step 1: Discover raw data files",
|
|
675
645
|
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
646
|
+
" raw_files = discover_raw_files(",
|
|
647
|
+
" PARAMS['source'],",
|
|
648
|
+
" PARAMS['file_extensions'],",
|
|
649
|
+
" PARAMS['search_subfolders']",
|
|
650
|
+
" )",
|
|
651
|
+
" ",
|
|
652
|
+
" if not raw_files:",
|
|
683
653
|
' print("No raw data files found!")',
|
|
684
|
-
|
|
685
|
-
|
|
654
|
+
" return False",
|
|
655
|
+
" ",
|
|
686
656
|
' print(f"Found {len(raw_files)} raw data files")',
|
|
687
|
-
|
|
657
|
+
" for f in raw_files[:5]: # Show first 5 files",
|
|
688
658
|
' print(f" {f.name}")',
|
|
689
|
-
|
|
659
|
+
" if len(raw_files) > 5:",
|
|
690
660
|
' print(f" ... and {len(raw_files) - 5} more")',
|
|
691
|
-
|
|
692
|
-
|
|
661
|
+
" ",
|
|
662
|
+
" # Step 2: Process raw files",
|
|
693
663
|
' print("\\nStep 2/7: Processing raw files...")',
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
664
|
+
" sample5_files = convert_raw_to_sample5(",
|
|
665
|
+
" raw_files,",
|
|
666
|
+
" PARAMS['folder'],",
|
|
667
|
+
" PARAMS['polarity'],",
|
|
668
|
+
" PARAMS['num_cores']",
|
|
669
|
+
" )",
|
|
670
|
+
" ",
|
|
671
|
+
" if not sample5_files:",
|
|
702
672
|
' print("No sample5 files were created!")',
|
|
703
|
-
|
|
704
|
-
|
|
673
|
+
" return False",
|
|
674
|
+
" ",
|
|
705
675
|
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
706
|
-
|
|
707
|
-
|
|
676
|
+
" ",
|
|
677
|
+
" # Step 3: Create and configure study",
|
|
708
678
|
' print("\\nStep 3/7: Initializing study...")',
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
679
|
+
" study = Study(folder=PARAMS['folder'])",
|
|
680
|
+
" study.polarity = PARAMS['polarity']",
|
|
681
|
+
" study.adducts = PARAMS['adducts']",
|
|
682
|
+
" ",
|
|
683
|
+
" # Step 4: Add sample5 files to study",
|
|
714
684
|
' print("\\nStep 4/7: Adding samples to study...")',
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
685
|
+
" study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
|
|
686
|
+
" study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
|
|
687
|
+
" ",
|
|
688
|
+
" # Step 5: Core processing",
|
|
719
689
|
' print("\\nStep 5/7: Processing...")',
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
690
|
+
" study.align(",
|
|
691
|
+
" algorithm=PARAMS['alignment_method'],",
|
|
692
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
693
|
+
" )",
|
|
694
|
+
" ",
|
|
695
|
+
" study.merge(",
|
|
726
696
|
' method="qt",',
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
697
|
+
" min_samples=PARAMS['min_samples_per_feature'],",
|
|
698
|
+
" threads=PARAMS['num_cores'],",
|
|
699
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
700
|
+
" )",
|
|
701
|
+
" study.find_iso()",
|
|
702
|
+
" study.fill()",
|
|
703
|
+
" study.integrate()",
|
|
704
|
+
" ",
|
|
705
|
+
" # Step 6/7: Saving results",
|
|
736
706
|
' print("\\nStep 6/7: Saving results...")',
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
707
|
+
" study.save()",
|
|
708
|
+
" study.export_xlsx()",
|
|
709
|
+
" study.export_mgf()",
|
|
710
|
+
" study.export_mztab()",
|
|
711
|
+
" ",
|
|
712
|
+
" # Step 7: Plots",
|
|
743
713
|
' print("\\nStep 7/7: Exporting plots...")',
|
|
744
714
|
' study.plot_consensus_2d(filename="consensus.html")',
|
|
745
715
|
' study.plot_consensus_2d(filename="consensus.png")',
|
|
@@ -751,10 +721,10 @@ class Wizard:
|
|
|
751
721
|
' study.plot_bpc(filename="bpc.png")',
|
|
752
722
|
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
753
723
|
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
724
|
+
" ",
|
|
725
|
+
" # Print summary",
|
|
726
|
+
" study.info()",
|
|
727
|
+
" total_time = time.time() - start_time",
|
|
758
728
|
' print("\\n" + "=" * 70)',
|
|
759
729
|
' print("ANALYSIS COMPLETE")',
|
|
760
730
|
' print("=" * 70)',
|
|
@@ -764,50 +734,50 @@ class Wizard:
|
|
|
764
734
|
' if hasattr(study, "consensus_df"):',
|
|
765
735
|
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
766
736
|
' print("=" * 70)',
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
737
|
+
" ",
|
|
738
|
+
" return True",
|
|
739
|
+
" ",
|
|
740
|
+
" except KeyboardInterrupt:",
|
|
771
741
|
' print("\\nAnalysis interrupted by user")',
|
|
772
|
-
|
|
773
|
-
|
|
742
|
+
" return False",
|
|
743
|
+
" except Exception as e:",
|
|
774
744
|
' print(f"Analysis failed with error: {e}")',
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
745
|
+
" import traceback",
|
|
746
|
+
" traceback.print_exc()",
|
|
747
|
+
" return False",
|
|
748
|
+
"",
|
|
749
|
+
"",
|
|
780
750
|
'if __name__ == "__main__":',
|
|
781
|
-
|
|
782
|
-
|
|
751
|
+
" success = main()",
|
|
752
|
+
" sys.exit(0 if success else 1)",
|
|
783
753
|
]
|
|
784
|
-
|
|
785
|
-
return
|
|
754
|
+
|
|
755
|
+
return "\n".join(script_lines)
|
|
786
756
|
|
|
787
757
|
def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
|
|
788
758
|
"""Generate the content for 2_interactive_analysis.py marimo notebook."""
|
|
789
|
-
|
|
759
|
+
|
|
790
760
|
notebook_lines = [
|
|
791
|
-
|
|
792
|
-
|
|
761
|
+
"import marimo",
|
|
762
|
+
"",
|
|
793
763
|
'__generated_with = "0.9.14"',
|
|
794
764
|
'app = marimo.App(width="medium")',
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
765
|
+
"",
|
|
766
|
+
"@app.cell",
|
|
767
|
+
"def __():",
|
|
768
|
+
" import marimo as mo",
|
|
769
|
+
" return (mo,)",
|
|
770
|
+
"",
|
|
771
|
+
"@app.cell",
|
|
772
|
+
"def __(mo):",
|
|
803
773
|
' mo.md(r"""',
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
f
|
|
807
|
-
f
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
774
|
+
" # MASSter Interactive Analysis",
|
|
775
|
+
" ",
|
|
776
|
+
f" **Source:** {source_info.get('number_of_files', 0)} files detected",
|
|
777
|
+
f" **Polarity:** {source_info.get('polarity', 'unknown')}",
|
|
778
|
+
" ",
|
|
779
|
+
" This notebook provides interactive exploration of your processed study.",
|
|
780
|
+
" Make sure you have run `python 1_masster_workflow.py` first.",
|
|
811
781
|
' """)',
|
|
812
782
|
' return ()',
|
|
813
783
|
'',
|
|
@@ -828,18 +798,20 @@ class Wizard:
|
|
|
828
798
|
' return ()',
|
|
829
799
|
'',
|
|
830
800
|
'if __name__ == "__main__":',
|
|
831
|
-
|
|
801
|
+
" app.run()",
|
|
832
802
|
]
|
|
833
|
-
|
|
834
|
-
return
|
|
803
|
+
|
|
804
|
+
return "\n".join(notebook_lines)
|
|
835
805
|
|
|
836
806
|
def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
|
|
837
807
|
"""Generate usage instructions for the created scripts."""
|
|
838
|
-
instructions = [
|
|
808
|
+
instructions = [
|
|
809
|
+
f"Source analysis: {source_info.get('number_of_files', 0)} files found",
|
|
839
810
|
f"Polarity detected: {source_info.get('polarity', 'unknown')}",
|
|
840
|
-
"Files created:"
|
|
811
|
+
"Files created:",
|
|
812
|
+
]
|
|
841
813
|
for file_path in files_created:
|
|
842
|
-
instructions.append(f"
|
|
814
|
+
instructions.append(f" {str(Path(file_path).resolve())}")
|
|
843
815
|
|
|
844
816
|
# Find the workflow script name from created files
|
|
845
817
|
workflow_script_name = "1_masster_workflow.py"
|
|
@@ -847,7 +819,7 @@ class Wizard:
|
|
|
847
819
|
if Path(file_path).name == "1_masster_workflow.py":
|
|
848
820
|
workflow_script_name = Path(file_path).name
|
|
849
821
|
break
|
|
850
|
-
|
|
822
|
+
|
|
851
823
|
instructions.extend([
|
|
852
824
|
"",
|
|
853
825
|
"Next steps:",
|
|
@@ -864,87 +836,127 @@ class Wizard:
|
|
|
864
836
|
"",
|
|
865
837
|
"4. INTERACTIVE ANALYSIS:",
|
|
866
838
|
f" uv run marimo edit {Path('2_interactive_analysis.py').name}",
|
|
867
|
-
""
|
|
868
|
-
)
|
|
869
|
-
|
|
839
|
+
"",
|
|
840
|
+
])
|
|
841
|
+
|
|
870
842
|
return instructions
|
|
871
843
|
|
|
872
844
|
def _add_test_mode_support(self, workflow_content: str) -> str:
|
|
873
845
|
"""Add test mode functionality to the generated workflow script."""
|
|
874
|
-
lines = workflow_content.split(
|
|
875
|
-
|
|
846
|
+
lines = workflow_content.split("\n")
|
|
847
|
+
|
|
876
848
|
# Insert test mode code after print statements in main function
|
|
877
849
|
for i, line in enumerate(lines):
|
|
878
850
|
# Add test mode print after the masster version line
|
|
879
851
|
if 'print("masster' in line and 'Automated MS Data Analysis")' in line:
|
|
880
|
-
lines.insert(i + 1, ' if
|
|
881
|
-
lines.insert(i + 2, ' print("
|
|
852
|
+
lines.insert(i + 1, ' if TEST:')
|
|
853
|
+
lines.insert(i + 2, ' print("TEST MODE: Processing single file only")')
|
|
882
854
|
break
|
|
883
|
-
|
|
855
|
+
|
|
884
856
|
# Add mode info after num_cores print
|
|
885
857
|
for i, line in enumerate(lines):
|
|
886
|
-
if
|
|
887
|
-
lines.insert(i + 1,
|
|
888
|
-
lines.insert(i + 2,
|
|
858
|
+
if "print(f\"CPU Cores: {PARAMS['num_cores']}\")" in line:
|
|
859
|
+
lines.insert(i + 1, " if TEST:")
|
|
860
|
+
lines.insert(i + 2, " print(f\"Mode: {'Test Only' if STOP_AFTER_TEST else 'Test + Full Batch'}\")")
|
|
889
861
|
break
|
|
890
|
-
|
|
862
|
+
|
|
891
863
|
# Add file limitation logic after file listing
|
|
892
864
|
for i, line in enumerate(lines):
|
|
893
865
|
if 'print(f" ... and {len(raw_files) - 5} more")' in line:
|
|
894
866
|
lines.insert(i + 1, ' ')
|
|
895
867
|
lines.insert(i + 2, ' # Limit to first file in test mode')
|
|
896
|
-
lines.insert(i + 3, ' if
|
|
868
|
+
lines.insert(i + 3, ' if TEST:')
|
|
897
869
|
lines.insert(i + 4, ' raw_files = raw_files[:1]')
|
|
898
|
-
lines.insert(i + 5, ' print(f"\\
|
|
870
|
+
lines.insert(i + 5, ' print(f"\\nTEST MODE: Processing only first file: {raw_files[0].name}")')
|
|
899
871
|
break
|
|
900
|
-
|
|
872
|
+
|
|
901
873
|
# Modify num_cores for test mode
|
|
902
874
|
for i, line in enumerate(lines):
|
|
903
|
-
if
|
|
904
|
-
lines[i] = line.replace(
|
|
875
|
+
if "PARAMS['num_cores']" in line and "convert_raw_to_sample5(" in lines[i - 2 : i + 3]:
|
|
876
|
+
lines[i] = line.replace(
|
|
877
|
+
"PARAMS['num_cores']", "PARAMS['num_cores'] if not TEST else 1 # Use single core for test"
|
|
878
|
+
)
|
|
905
879
|
break
|
|
906
|
-
|
|
880
|
+
|
|
907
881
|
# Add test-only exit logic after successful processing
|
|
908
882
|
for i, line in enumerate(lines):
|
|
909
883
|
if 'print(f"Successfully processed {len(sample5_files)} files to sample5")' in line:
|
|
910
884
|
lines.insert(i + 1, ' ')
|
|
911
|
-
lines.insert(i + 2, ' # Stop here if test
|
|
912
|
-
lines.insert(i + 3, ' if
|
|
913
|
-
lines.insert(i + 4, ' print("\\
|
|
885
|
+
lines.insert(i + 2, ' # Stop here if stop-after-test mode')
|
|
886
|
+
lines.insert(i + 3, ' if STOP_AFTER_TEST:')
|
|
887
|
+
lines.insert(i + 4, ' print("\\nSTOP AFTER TEST mode: Stopping after successful single file processing")')
|
|
914
888
|
lines.insert(i + 5, ' print(f"Test file created: {sample5_files[0]}")')
|
|
915
889
|
lines.insert(i + 6, ' print("\\nTo run full batch, use: wizard.run()")')
|
|
916
|
-
lines.insert(i + 7,
|
|
890
|
+
lines.insert(i + 7, " total_time = time.time() - start_time")
|
|
917
891
|
lines.insert(i + 8, ' print(f"\\nTest processing time: {total_time:.1f} seconds")')
|
|
918
|
-
lines.insert(i + 9,
|
|
892
|
+
lines.insert(i + 9, " return True")
|
|
919
893
|
break
|
|
920
|
-
|
|
921
|
-
return
|
|
894
|
+
|
|
895
|
+
return "\n".join(lines)
|
|
922
896
|
|
|
923
897
|
def test_and_run(self) -> Dict[str, Any]:
|
|
924
898
|
"""
|
|
925
899
|
Test the sample processing workflow with a single file, then run full batch.
|
|
926
|
-
|
|
927
|
-
This method runs the 1_masster_workflow.py script in test mode to process
|
|
928
|
-
the first raw file for validation, then automatically continues with the
|
|
929
|
-
full batch if the test succeeds. The script must already exist - call
|
|
900
|
+
|
|
901
|
+
This method first runs the 1_masster_workflow.py script in test-only mode to process
|
|
902
|
+
the first raw file for validation, then automatically continues with the
|
|
903
|
+
full batch if the test succeeds. The script must already exist - call
|
|
930
904
|
create_scripts() first if needed.
|
|
931
|
-
|
|
905
|
+
|
|
932
906
|
Returns:
|
|
933
907
|
Dictionary containing:
|
|
934
908
|
- status: "success" or "error"
|
|
935
909
|
- message: Status message
|
|
936
910
|
- instructions: List of next steps
|
|
937
911
|
"""
|
|
938
|
-
|
|
912
|
+
# Step 1: Run test-only mode first
|
|
913
|
+
print("Step 1/2: Testing with single file...")
|
|
914
|
+
test_result = self._execute_workflow(test=True, run=False)
|
|
915
|
+
|
|
916
|
+
if test_result["status"] != "success":
|
|
917
|
+
return {
|
|
918
|
+
"status": "error",
|
|
919
|
+
"message": f"Test failed: {test_result['message']}",
|
|
920
|
+
"instructions": [
|
|
921
|
+
"Single file test failed",
|
|
922
|
+
"Review parameters in 1_masster_workflow.py",
|
|
923
|
+
"Fix issues and try again",
|
|
924
|
+
],
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
print("Test successful! Proceeding with full batch...")
|
|
928
|
+
print("Step 2/2: Processing all files...")
|
|
929
|
+
|
|
930
|
+
# Step 2: Run full batch mode
|
|
931
|
+
full_result = self._execute_workflow(test=False, run=True)
|
|
932
|
+
|
|
933
|
+
return full_result
|
|
939
934
|
|
|
940
935
|
def test_only(self) -> Dict[str, Any]:
|
|
941
936
|
"""
|
|
942
937
|
Test the sample processing workflow with a single file only.
|
|
943
|
-
|
|
938
|
+
|
|
944
939
|
This method runs the 1_masster_workflow.py script in test-only mode to process
|
|
945
940
|
only the first raw file and then stops (does not continue to full study processing).
|
|
946
941
|
The script must already exist - call create_scripts() first if needed.
|
|
947
|
-
|
|
942
|
+
|
|
943
|
+
Returns:
|
|
944
|
+
Dictionary containing:
|
|
945
|
+
- status: "success" or "error"
|
|
946
|
+
- message: Status message
|
|
947
|
+
- instructions: List of next steps
|
|
948
|
+
- test_file: Path to the processed test file (if successful)
|
|
949
|
+
"""
|
|
950
|
+
return self._execute_workflow(test=True, run=False)
|
|
951
|
+
|
|
952
|
+
def test(self) -> Dict[str, Any]:
|
|
953
|
+
"""
|
|
954
|
+
Test the sample processing workflow with a single file only.
|
|
955
|
+
|
|
956
|
+
This method runs the 1_masster_workflow.py script in test-only mode to process
|
|
957
|
+
only the first raw file and then stops (does not continue to full study processing).
|
|
958
|
+
The script must already exist - call create_scripts() first if needed.
|
|
959
|
+
|
|
948
960
|
Returns:
|
|
949
961
|
Dictionary containing:
|
|
950
962
|
- status: "success" or "error"
|
|
@@ -952,98 +964,107 @@ class Wizard:
|
|
|
952
964
|
- instructions: List of next steps
|
|
953
965
|
- test_file: Path to the processed test file (if successful)
|
|
954
966
|
"""
|
|
955
|
-
return self._execute_workflow(
|
|
967
|
+
return self._execute_workflow(test=True, run=False)
|
|
968
|
+
|
|
956
969
|
|
|
957
970
|
def run(self) -> Dict[str, Any]:
|
|
958
971
|
"""
|
|
959
972
|
Run the sample processing workflow.
|
|
960
|
-
|
|
973
|
+
|
|
961
974
|
This method runs the 1_masster_workflow.py script to process raw files.
|
|
962
975
|
The script must already exist - call create_scripts() first if needed.
|
|
963
|
-
|
|
976
|
+
|
|
964
977
|
Returns:
|
|
965
978
|
Dictionary containing:
|
|
966
979
|
- status: "success" or "error"
|
|
967
980
|
- message: Status message
|
|
968
981
|
- instructions: List of next steps
|
|
969
982
|
"""
|
|
970
|
-
return self._execute_workflow(
|
|
983
|
+
return self._execute_workflow(test=False, run=True)
|
|
971
984
|
|
|
972
|
-
def _execute_workflow(self,
|
|
985
|
+
def _execute_workflow(self, test: bool = False, run: bool = True) -> Dict[str, Any]:
|
|
973
986
|
"""
|
|
974
987
|
Execute the workflow script in either test or full mode.
|
|
975
|
-
|
|
988
|
+
|
|
976
989
|
Args:
|
|
977
|
-
|
|
978
|
-
|
|
990
|
+
test: If True, run in test mode (single file), otherwise full batch
|
|
991
|
+
run: If False, stop after test (only used with test=True), if True continue with full processing
|
|
979
992
|
"""
|
|
980
993
|
try:
|
|
981
994
|
workflow_script_path = self.folder_path / "1_masster_workflow.py"
|
|
982
|
-
|
|
995
|
+
|
|
983
996
|
# Check if workflow script exists
|
|
984
997
|
if not workflow_script_path.exists():
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
"
|
|
991
|
-
"
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
998
|
+
print("Workflow script not found. Creating scripts automatically...")
|
|
999
|
+
create_result = self.create_scripts()
|
|
1000
|
+
|
|
1001
|
+
if create_result["status"] == "error":
|
|
1002
|
+
return {
|
|
1003
|
+
"status": "error",
|
|
1004
|
+
"message": f"Failed to create workflow script: {create_result['message']}",
|
|
1005
|
+
"instructions": [
|
|
1006
|
+
"Could not create 1_masster_workflow.py",
|
|
1007
|
+
"Please check source path and permissions",
|
|
1008
|
+
],
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
print(f"Created {len(create_result.get('files_created', []))} script files")
|
|
1012
|
+
for file_path in create_result.get('files_created', []):
|
|
1013
|
+
print(f" - {file_path}")
|
|
1014
|
+
print("Proceeding with workflow execution...")
|
|
1015
|
+
|
|
995
1016
|
# Setup execution mode
|
|
996
|
-
if
|
|
1017
|
+
if test and not run:
|
|
997
1018
|
mode_label = "test-only"
|
|
998
|
-
elif
|
|
1019
|
+
elif test:
|
|
999
1020
|
mode_label = "test"
|
|
1000
1021
|
else:
|
|
1001
1022
|
mode_label = "full batch"
|
|
1002
|
-
|
|
1023
|
+
|
|
1003
1024
|
env = None
|
|
1004
|
-
if
|
|
1025
|
+
if test:
|
|
1005
1026
|
import os
|
|
1027
|
+
|
|
1006
1028
|
env = os.environ.copy()
|
|
1007
|
-
env[
|
|
1008
|
-
if
|
|
1009
|
-
env[
|
|
1010
|
-
|
|
1029
|
+
env["MASSTER_TEST"] = "1"
|
|
1030
|
+
if not run:
|
|
1031
|
+
env["MASSTER_STOP_AFTER_TEST"] = "1"
|
|
1032
|
+
|
|
1011
1033
|
# Execute the workflow script
|
|
1012
1034
|
print(f">> Executing {mode_label} processing workflow...")
|
|
1013
|
-
print(f"
|
|
1035
|
+
print(f"Running: {workflow_script_path.name}")
|
|
1014
1036
|
print("=" * 60)
|
|
1015
|
-
|
|
1037
|
+
|
|
1016
1038
|
import subprocess
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1039
|
+
|
|
1040
|
+
result = subprocess.run([sys.executable, str(workflow_script_path)], cwd=str(self.folder_path), env=env)
|
|
1041
|
+
|
|
1021
1042
|
success = result.returncode == 0
|
|
1022
|
-
|
|
1043
|
+
|
|
1023
1044
|
if success:
|
|
1024
1045
|
print("=" * 60)
|
|
1025
|
-
if
|
|
1026
|
-
print("
|
|
1027
|
-
print("
|
|
1046
|
+
if test and not run:
|
|
1047
|
+
print("Test-only processing completed successfully!")
|
|
1048
|
+
print("Single file validated - ready for full batch")
|
|
1028
1049
|
print(" wizard.run()")
|
|
1029
|
-
elif
|
|
1030
|
-
print("
|
|
1031
|
-
print("
|
|
1050
|
+
elif test:
|
|
1051
|
+
print("Test processing completed successfully!")
|
|
1052
|
+
print("Next step: Run full batch")
|
|
1032
1053
|
print(" wizard.run()")
|
|
1033
1054
|
else:
|
|
1034
|
-
|
|
1035
|
-
print("
|
|
1036
|
-
print("
|
|
1055
|
+
notebook_path = self.folder_path / "2_interactive_analysis.py"
|
|
1056
|
+
print("Sample processing completed successfully!")
|
|
1057
|
+
print("Next step: Run interactive analysis")
|
|
1058
|
+
print(f" uv run marimo edit {notebook_path}")
|
|
1037
1059
|
print("=" * 60)
|
|
1038
|
-
|
|
1039
|
-
next_step =
|
|
1040
|
-
|
|
1041
|
-
|
|
1060
|
+
|
|
1061
|
+
next_step = "Next: wizard.run()" if test else f"Next: uv run marimo edit {self.folder_path / '2_interactive_analysis.py'}"
|
|
1062
|
+
|
|
1042
1063
|
return {
|
|
1043
1064
|
"status": "success",
|
|
1044
1065
|
"message": f"{mode_label.capitalize()} processing completed successfully",
|
|
1045
1066
|
"instructions": [
|
|
1046
|
-
f"
|
|
1067
|
+
f"{mode_label.capitalize()} processing completed",
|
|
1047
1068
|
next_step
|
|
1048
1069
|
]
|
|
1049
1070
|
}
|
|
@@ -1052,30 +1073,30 @@ class Wizard:
|
|
|
1052
1073
|
"status": "error",
|
|
1053
1074
|
"message": f"Workflow execution failed with return code {result.returncode}",
|
|
1054
1075
|
"instructions": [
|
|
1055
|
-
"
|
|
1076
|
+
"Check the error messages above",
|
|
1056
1077
|
"Review parameters in 1_masster_workflow.py",
|
|
1057
|
-
f"Try running manually: python {workflow_script_path.name}"
|
|
1058
|
-
]
|
|
1078
|
+
f"Try running manually: python {workflow_script_path.name}",
|
|
1079
|
+
],
|
|
1059
1080
|
}
|
|
1060
|
-
|
|
1081
|
+
|
|
1061
1082
|
except Exception as e:
|
|
1062
1083
|
return {
|
|
1063
1084
|
"status": "error",
|
|
1064
1085
|
"message": f"Failed to execute workflow: {e}",
|
|
1065
1086
|
"instructions": [
|
|
1066
|
-
"
|
|
1087
|
+
"Execution failed",
|
|
1067
1088
|
"Check that source files exist and are accessible",
|
|
1068
|
-
"Verify folder permissions"
|
|
1069
|
-
]
|
|
1089
|
+
"Verify folder permissions",
|
|
1090
|
+
],
|
|
1070
1091
|
}
|
|
1071
1092
|
|
|
1072
1093
|
def _generate_script_content(self) -> str:
|
|
1073
1094
|
"""Generate the complete analysis script content."""
|
|
1074
|
-
|
|
1095
|
+
|
|
1075
1096
|
# Convert Path objects to strings for JSON serialization
|
|
1076
1097
|
params_dict = {}
|
|
1077
1098
|
for key, value in self.params.__dict__.items():
|
|
1078
|
-
if key ==
|
|
1099
|
+
if key == "_param_metadata": # Skip metadata in generated script
|
|
1079
1100
|
continue
|
|
1080
1101
|
if isinstance(value, Path):
|
|
1081
1102
|
params_dict[key] = str(value)
|
|
@@ -1084,251 +1105,288 @@ class Wizard:
|
|
|
1084
1105
|
|
|
1085
1106
|
# Obtain list of files in source with extension wiff, .raw, .mzML
|
|
1086
1107
|
raw_files = []
|
|
1087
|
-
for ext in params_dict.get(
|
|
1108
|
+
for ext in params_dict.get("file_extensions", []):
|
|
1088
1109
|
raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
|
|
1089
1110
|
|
|
1090
1111
|
# Create readable PARAMS dict with comments
|
|
1091
1112
|
params_lines = []
|
|
1092
|
-
params_lines.append(
|
|
1093
|
-
params_lines.append(
|
|
1094
|
-
|
|
1113
|
+
params_lines.append("# Analysis parameters")
|
|
1114
|
+
params_lines.append("PARAMS = {")
|
|
1115
|
+
|
|
1095
1116
|
# Core Configuration
|
|
1096
|
-
params_lines.append(
|
|
1117
|
+
params_lines.append(" # === Core Configuration ===")
|
|
1097
1118
|
params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
|
|
1098
1119
|
params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
|
|
1099
|
-
params_lines.append(
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1120
|
+
params_lines.append(
|
|
1121
|
+
f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode ("positive" or "negative")'
|
|
1122
|
+
)
|
|
1123
|
+
params_lines.append(
|
|
1124
|
+
f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing'
|
|
1125
|
+
)
|
|
1126
|
+
params_lines.append("")
|
|
1127
|
+
|
|
1103
1128
|
# File Discovery
|
|
1104
|
-
params_lines.append(
|
|
1105
|
-
params_lines.append(
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
params_lines.append(
|
|
1109
|
-
|
|
1129
|
+
params_lines.append(" # === File Discovery ===")
|
|
1130
|
+
params_lines.append(
|
|
1131
|
+
f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for'
|
|
1132
|
+
)
|
|
1133
|
+
params_lines.append(
|
|
1134
|
+
f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively'
|
|
1135
|
+
)
|
|
1136
|
+
params_lines.append(
|
|
1137
|
+
f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip'
|
|
1138
|
+
)
|
|
1139
|
+
params_lines.append("")
|
|
1140
|
+
|
|
1110
1141
|
# Processing Parameters
|
|
1111
|
-
params_lines.append(
|
|
1112
|
-
params_lines.append(
|
|
1113
|
-
|
|
1142
|
+
params_lines.append(" # === Processing Parameters ===")
|
|
1143
|
+
params_lines.append(
|
|
1144
|
+
f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation'
|
|
1145
|
+
)
|
|
1146
|
+
params_lines.append(
|
|
1147
|
+
f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")'
|
|
1148
|
+
)
|
|
1114
1149
|
params_lines.append(f' "noise": {params_dict.get("noise", 50.0)}, # Noise threshold for feature detection')
|
|
1115
|
-
params_lines.append(
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1150
|
+
params_lines.append(
|
|
1151
|
+
f' "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)}, # Chromatographic peak full width at half maximum (seconds)'
|
|
1152
|
+
)
|
|
1153
|
+
params_lines.append(
|
|
1154
|
+
f' "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)}, # Minimum signal-to-noise ratio for chromatographic peaks'
|
|
1155
|
+
)
|
|
1156
|
+
params_lines.append("")
|
|
1157
|
+
|
|
1119
1158
|
# Alignment & Merging
|
|
1120
|
-
params_lines.append(
|
|
1121
|
-
params_lines.append(
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
params_lines.append(
|
|
1125
|
-
|
|
1126
|
-
|
|
1159
|
+
params_lines.append(" # === Alignment & Merging ===")
|
|
1160
|
+
params_lines.append(
|
|
1161
|
+
f' "rt_tol": {params_dict.get("rt_tol", 2.0)}, # Retention time tolerance for alignment (seconds)'
|
|
1162
|
+
)
|
|
1163
|
+
params_lines.append(
|
|
1164
|
+
f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)'
|
|
1165
|
+
)
|
|
1166
|
+
params_lines.append(
|
|
1167
|
+
f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment'
|
|
1168
|
+
)
|
|
1169
|
+
params_lines.append(
|
|
1170
|
+
f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature'
|
|
1171
|
+
)
|
|
1172
|
+
params_lines.append(
|
|
1173
|
+
f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features'
|
|
1174
|
+
)
|
|
1175
|
+
params_lines.append("")
|
|
1127
1176
|
|
|
1128
1177
|
# Sample Processing
|
|
1129
|
-
params_lines.append(
|
|
1130
|
-
params_lines.append(
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1178
|
+
params_lines.append(" # === Sample Processing (used in add_samples_from_folder) ===")
|
|
1179
|
+
params_lines.append(
|
|
1180
|
+
f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch'
|
|
1181
|
+
)
|
|
1182
|
+
params_lines.append(
|
|
1183
|
+
f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)'
|
|
1184
|
+
)
|
|
1185
|
+
params_lines.append("")
|
|
1186
|
+
|
|
1134
1187
|
# Script Options
|
|
1135
|
-
params_lines.append(
|
|
1136
|
-
params_lines.append(
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1188
|
+
params_lines.append(" # === Script Options ===")
|
|
1189
|
+
params_lines.append(
|
|
1190
|
+
f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability'
|
|
1191
|
+
)
|
|
1192
|
+
params_lines.append(
|
|
1193
|
+
f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files'
|
|
1194
|
+
)
|
|
1195
|
+
params_lines.append(
|
|
1196
|
+
f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing'
|
|
1197
|
+
)
|
|
1198
|
+
|
|
1199
|
+
params_lines.append("}")
|
|
1200
|
+
|
|
1142
1201
|
# Create script lines
|
|
1143
1202
|
script_lines = [
|
|
1144
|
-
|
|
1203
|
+
"#!/usr/bin/env python3",
|
|
1145
1204
|
'"""',
|
|
1146
|
-
|
|
1147
|
-
f
|
|
1205
|
+
"Automated Mass Spectrometry Data Analysis Pipeline",
|
|
1206
|
+
f"Generated by masster wizard v{version}",
|
|
1148
1207
|
'"""',
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1208
|
+
"",
|
|
1209
|
+
"import sys",
|
|
1210
|
+
"import time",
|
|
1211
|
+
"from pathlib import Path",
|
|
1212
|
+
"",
|
|
1213
|
+
"# Import masster modules",
|
|
1214
|
+
"from masster.study import Study",
|
|
1215
|
+
"from masster import __version__",
|
|
1216
|
+
"",
|
|
1158
1217
|
]
|
|
1159
|
-
|
|
1218
|
+
|
|
1160
1219
|
# Add the formatted PARAMS
|
|
1161
1220
|
script_lines.extend(params_lines)
|
|
1162
|
-
|
|
1221
|
+
|
|
1163
1222
|
# Add the main function and pipeline
|
|
1164
1223
|
script_lines.extend([
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1224
|
+
"",
|
|
1225
|
+
"",
|
|
1226
|
+
"def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
|
|
1168
1227
|
' """Discover raw data files in the source folder."""',
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1228
|
+
" source_path = Path(source_folder)",
|
|
1229
|
+
" raw_files = []",
|
|
1230
|
+
" ",
|
|
1231
|
+
" for ext in file_extensions:",
|
|
1232
|
+
" if search_subfolders:",
|
|
1174
1233
|
' pattern = f"**/*{ext}"',
|
|
1175
|
-
|
|
1176
|
-
|
|
1234
|
+
" files = list(source_path.rglob(pattern))",
|
|
1235
|
+
" else:",
|
|
1177
1236
|
' pattern = f"*{ext}"',
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1237
|
+
" files = list(source_path.glob(pattern))",
|
|
1238
|
+
" raw_files.extend(files)",
|
|
1239
|
+
" ",
|
|
1240
|
+
" return raw_files",
|
|
1241
|
+
"",
|
|
1242
|
+
"",
|
|
1243
|
+
"def process_single_file(args):",
|
|
1185
1244
|
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1245
|
+
" raw_file, output_folder = args",
|
|
1246
|
+
" from masster.sample import Sample",
|
|
1247
|
+
" ",
|
|
1248
|
+
" try:",
|
|
1249
|
+
" # Create sample5 filename",
|
|
1250
|
+
" sample_name = raw_file.stem",
|
|
1192
1251
|
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1252
|
+
" ",
|
|
1253
|
+
" # Skip if sample5 already exists",
|
|
1254
|
+
" if sample5_path.exists():",
|
|
1196
1255
|
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
1197
|
-
|
|
1198
|
-
|
|
1256
|
+
" return str(sample5_path)",
|
|
1257
|
+
" ",
|
|
1199
1258
|
' print(f" Converting {raw_file.name}...")',
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1259
|
+
" ",
|
|
1260
|
+
" # Load and process raw file with full pipeline",
|
|
1261
|
+
" sample = Sample(log_label=sample_name)",
|
|
1262
|
+
" sample.load(filename=str(raw_file))",
|
|
1263
|
+
" sample.find_features(",
|
|
1264
|
+
" noise=PARAMS['noise'],",
|
|
1265
|
+
" chrom_fwhm=PARAMS['chrom_fwhm'],",
|
|
1266
|
+
" chrom_peak_snr=PARAMS['chrom_peak_snr']",
|
|
1267
|
+
" )",
|
|
1268
|
+
" sample.find_adducts(adducts=PARAMS['adducts'])",
|
|
1269
|
+
" sample.find_ms2()",
|
|
1270
|
+
" sample.find_iso()",
|
|
1271
|
+
" # sample.export_mgf()",
|
|
1272
|
+
" # sample.export_mztab()",
|
|
1214
1273
|
' # sample.plot_2d(filename="{sample_name}.html")',
|
|
1215
|
-
|
|
1216
|
-
|
|
1274
|
+
" sample.save(str(sample5_path))",
|
|
1275
|
+
" ",
|
|
1217
1276
|
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1277
|
+
" return str(sample5_path)",
|
|
1278
|
+
" ",
|
|
1279
|
+
" except Exception as e:",
|
|
1221
1280
|
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1281
|
+
" return None",
|
|
1282
|
+
"",
|
|
1283
|
+
"",
|
|
1284
|
+
"def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
|
|
1226
1285
|
' """Convert raw data files to sample5 format."""',
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1286
|
+
" import concurrent.futures",
|
|
1287
|
+
" import os",
|
|
1288
|
+
" ",
|
|
1289
|
+
" # Create output directory",
|
|
1290
|
+
" os.makedirs(output_folder, exist_ok=True)",
|
|
1291
|
+
" ",
|
|
1292
|
+
" # Prepare arguments for multiprocessing",
|
|
1293
|
+
" file_args = [(raw_file, output_folder) for raw_file in raw_files]",
|
|
1294
|
+
" ",
|
|
1295
|
+
" # Process files in parallel",
|
|
1296
|
+
" sample5_files = []",
|
|
1297
|
+
" with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
|
|
1298
|
+
" futures = [executor.submit(process_single_file, args) for args in file_args]",
|
|
1299
|
+
" ",
|
|
1300
|
+
" for future in concurrent.futures.as_completed(futures):",
|
|
1301
|
+
" result = future.result()",
|
|
1302
|
+
" if result:",
|
|
1303
|
+
" sample5_files.append(result)",
|
|
1304
|
+
" ",
|
|
1305
|
+
" return sample5_files",
|
|
1306
|
+
"",
|
|
1307
|
+
"",
|
|
1308
|
+
"def main():",
|
|
1250
1309
|
' """Main analysis pipeline."""',
|
|
1251
|
-
|
|
1310
|
+
" try:",
|
|
1252
1311
|
' print("=" * 70)',
|
|
1253
1312
|
f' print("masster {version} - Automated MS Data Analysis")',
|
|
1254
1313
|
' print("=" * 70)',
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1314
|
+
" print(f\"Source: {PARAMS['source']}\")",
|
|
1315
|
+
" print(f\"Output: {PARAMS['folder']}\")",
|
|
1316
|
+
" print(f\"Polarity: {PARAMS['polarity']}\")",
|
|
1317
|
+
" print(f\"CPU Cores: {PARAMS['num_cores']}\")",
|
|
1259
1318
|
' print("=" * 70)',
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1319
|
+
" ",
|
|
1320
|
+
" start_time = time.time()",
|
|
1321
|
+
" ",
|
|
1322
|
+
" # Step 1: Discover raw data files",
|
|
1264
1323
|
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1324
|
+
" raw_files = discover_raw_files(",
|
|
1325
|
+
" PARAMS['source'],",
|
|
1326
|
+
" PARAMS['file_extensions'],",
|
|
1327
|
+
" PARAMS['search_subfolders']",
|
|
1328
|
+
" )",
|
|
1329
|
+
" ",
|
|
1330
|
+
" if not raw_files:",
|
|
1272
1331
|
' print("No raw data files found!")',
|
|
1273
|
-
|
|
1274
|
-
|
|
1332
|
+
" return False",
|
|
1333
|
+
" ",
|
|
1275
1334
|
' print(f"Found {len(raw_files)} raw data files")',
|
|
1276
|
-
|
|
1335
|
+
" for f in raw_files[:5]: # Show first 5 files",
|
|
1277
1336
|
' print(f" {f.name}")',
|
|
1278
|
-
|
|
1337
|
+
" if len(raw_files) > 5:",
|
|
1279
1338
|
' print(f" ... and {len(raw_files) - 5} more")',
|
|
1280
|
-
|
|
1281
|
-
|
|
1339
|
+
" ",
|
|
1340
|
+
" # Step 2: Process raw files",
|
|
1282
1341
|
' print("\\nStep 2/7: Processing raw files...")',
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1342
|
+
" sample5_files = convert_raw_to_sample5(",
|
|
1343
|
+
" raw_files,",
|
|
1344
|
+
" PARAMS['folder'],",
|
|
1345
|
+
" PARAMS['polarity'],",
|
|
1346
|
+
" PARAMS['num_cores']",
|
|
1347
|
+
" )",
|
|
1348
|
+
" ",
|
|
1349
|
+
" if not sample5_files:",
|
|
1291
1350
|
' print("No sample5 files were created!")',
|
|
1292
|
-
|
|
1293
|
-
|
|
1351
|
+
" return False",
|
|
1352
|
+
" ",
|
|
1294
1353
|
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
1295
|
-
|
|
1296
|
-
|
|
1354
|
+
" ",
|
|
1355
|
+
" # Step 3: Create and configure study",
|
|
1297
1356
|
' print("\\nStep 3/7: Initializing study...")',
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1357
|
+
" study = Study(folder=PARAMS['folder'])",
|
|
1358
|
+
" study.polarity = PARAMS['polarity']",
|
|
1359
|
+
" study.adducts = PARAMS['adducts']",
|
|
1360
|
+
" ",
|
|
1361
|
+
" # Step 4: Add sample5 files to study",
|
|
1303
1362
|
' print("\\nStep 4/7: Adding samples to study...")',
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1363
|
+
" study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
|
|
1364
|
+
" study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
|
|
1365
|
+
" ",
|
|
1366
|
+
" # Step 5: Core processing",
|
|
1308
1367
|
' print("\\nStep 5/7: Processing...")',
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1368
|
+
" study.align(",
|
|
1369
|
+
" algorithm=PARAMS['alignment_method'],",
|
|
1370
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
1371
|
+
" )",
|
|
1372
|
+
" ",
|
|
1373
|
+
" study.merge(",
|
|
1315
1374
|
' method="qt",',
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
' # Step 6/7: Saving results',
|
|
1375
|
+
" min_samples=PARAMS['min_samples_per_feature'],",
|
|
1376
|
+
" threads=PARAMS['num_cores'],",
|
|
1377
|
+
" rt_tol=PARAMS['rt_tol'], )",
|
|
1378
|
+
" study.find_iso()",
|
|
1379
|
+
" study.fill()",
|
|
1380
|
+
" study.integrate()",
|
|
1381
|
+
" ",
|
|
1382
|
+
" # Step 6/7: Saving results",
|
|
1325
1383
|
' print("\\nStep 6/7: Saving results...")',
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1384
|
+
" study.save()",
|
|
1385
|
+
" study.export_xlsx()",
|
|
1386
|
+
" study.export_mgf()",
|
|
1387
|
+
" study.export_mztab()",
|
|
1388
|
+
" ",
|
|
1389
|
+
" # Step 7: Plots",
|
|
1332
1390
|
' print("\\nStep 7/7: Exporting plots...")',
|
|
1333
1391
|
' study.plot_consensus_2d(filename="consensus.html")',
|
|
1334
1392
|
' study.plot_consensus_2d(filename="consensus.png")',
|
|
@@ -1340,11 +1398,10 @@ class Wizard:
|
|
|
1340
1398
|
' study.plot_bpc(filename="bpc.png")',
|
|
1341
1399
|
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
1342
1400
|
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
' total_time = time.time() - start_time',
|
|
1401
|
+
" ",
|
|
1402
|
+
" # Print summary",
|
|
1403
|
+
" study.info()",
|
|
1404
|
+
" total_time = time.time() - start_time",
|
|
1348
1405
|
' print("\\n" + "=" * 70)',
|
|
1349
1406
|
' print("ANALYSIS COMPLETE")',
|
|
1350
1407
|
' print("=" * 70)',
|
|
@@ -1354,41 +1411,41 @@ class Wizard:
|
|
|
1354
1411
|
' if hasattr(study, "consensus_df"):',
|
|
1355
1412
|
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
1356
1413
|
' print("=" * 70)',
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1414
|
+
" ",
|
|
1415
|
+
" return True",
|
|
1416
|
+
" ",
|
|
1417
|
+
" except KeyboardInterrupt:",
|
|
1361
1418
|
' print("\\nAnalysis interrupted by user")',
|
|
1362
|
-
|
|
1363
|
-
|
|
1419
|
+
" return False",
|
|
1420
|
+
" except Exception as e:",
|
|
1364
1421
|
' print(f"Analysis failed with error: {e}")',
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1422
|
+
" import traceback",
|
|
1423
|
+
" traceback.print_exc()",
|
|
1424
|
+
" return False",
|
|
1425
|
+
"",
|
|
1426
|
+
"",
|
|
1370
1427
|
'if __name__ == "__main__":',
|
|
1371
|
-
|
|
1372
|
-
|
|
1428
|
+
" success = main()",
|
|
1429
|
+
" sys.exit(0 if success else 1)",
|
|
1373
1430
|
])
|
|
1374
|
-
|
|
1375
|
-
return
|
|
1431
|
+
|
|
1432
|
+
return "\n".join(script_lines)
|
|
1376
1433
|
|
|
1377
1434
|
|
|
1378
1435
|
def create_scripts(
|
|
1379
|
-
source: str = "",
|
|
1380
|
-
folder: str = "",
|
|
1436
|
+
source: str = "",
|
|
1437
|
+
folder: str = "",
|
|
1381
1438
|
polarity: Optional[str] = None,
|
|
1382
1439
|
adducts: Optional[List[str]] = None,
|
|
1383
1440
|
num_cores: int = 0,
|
|
1384
|
-
**kwargs
|
|
1441
|
+
**kwargs,
|
|
1385
1442
|
) -> Dict[str, Any]:
|
|
1386
1443
|
"""
|
|
1387
1444
|
Create analysis scripts without explicitly instantiating a Wizard.
|
|
1388
|
-
|
|
1445
|
+
|
|
1389
1446
|
This is a convenience function that creates a Wizard instance internally
|
|
1390
1447
|
and calls its create_scripts() method.
|
|
1391
|
-
|
|
1448
|
+
|
|
1392
1449
|
Parameters:
|
|
1393
1450
|
source: Directory containing raw data files
|
|
1394
1451
|
folder: Output directory for processed study
|
|
@@ -1396,7 +1453,7 @@ def create_scripts(
|
|
|
1396
1453
|
adducts: List of adduct specifications (auto-set if None)
|
|
1397
1454
|
num_cores: Number of CPU cores (0 = auto-detect)
|
|
1398
1455
|
**kwargs: Additional parameters
|
|
1399
|
-
|
|
1456
|
+
|
|
1400
1457
|
Returns:
|
|
1401
1458
|
Dictionary containing:
|
|
1402
1459
|
- status: "success" or "error"
|
|
@@ -1404,42 +1461,35 @@ def create_scripts(
|
|
|
1404
1461
|
- instructions: List of next steps
|
|
1405
1462
|
- files_created: List of created file paths
|
|
1406
1463
|
- source_info: Metadata about source files
|
|
1407
|
-
|
|
1464
|
+
|
|
1408
1465
|
Example:
|
|
1409
1466
|
>>> import masster.wizard
|
|
1410
1467
|
>>> result = masster.wizard.create_scripts(
|
|
1411
1468
|
... source=r'D:\\Data\\raw_files',
|
|
1412
|
-
... folder=r'D:\\Data\\output',
|
|
1469
|
+
... folder=r'D:\\Data\\output',
|
|
1413
1470
|
... polarity='negative'
|
|
1414
1471
|
... )
|
|
1415
1472
|
>>> print("Status:", result["status"])
|
|
1416
1473
|
"""
|
|
1417
|
-
|
|
1474
|
+
|
|
1418
1475
|
try:
|
|
1419
1476
|
# Auto-detect optimal number of cores if not specified
|
|
1420
1477
|
if num_cores <= 0:
|
|
1421
1478
|
num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
|
|
1422
|
-
|
|
1479
|
+
|
|
1423
1480
|
# Create Wizard instance
|
|
1424
|
-
wizard = Wizard(
|
|
1425
|
-
|
|
1426
|
-
folder=folder,
|
|
1427
|
-
polarity=polarity,
|
|
1428
|
-
adducts=adducts,
|
|
1429
|
-
num_cores=num_cores,
|
|
1430
|
-
**kwargs
|
|
1431
|
-
)
|
|
1432
|
-
|
|
1481
|
+
wizard = Wizard(source=source, folder=folder, polarity=polarity, adducts=adducts, num_cores=num_cores, **kwargs)
|
|
1482
|
+
|
|
1433
1483
|
# Call the instance method
|
|
1434
1484
|
return wizard.create_scripts()
|
|
1435
|
-
|
|
1485
|
+
|
|
1436
1486
|
except Exception as e:
|
|
1437
1487
|
return {
|
|
1438
1488
|
"status": "error",
|
|
1439
1489
|
"message": f"Failed to create scripts: {e}",
|
|
1440
1490
|
"instructions": [],
|
|
1441
1491
|
"files_created": [],
|
|
1442
|
-
"source_info": {}
|
|
1492
|
+
"source_info": {},
|
|
1443
1493
|
}
|
|
1444
1494
|
|
|
1445
1495
|
|