masster 0.5.21__py3-none-any.whl → 0.5.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +100 -204
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +765 -714
- {masster-0.5.21.dist-info → masster-0.5.23.dist-info}/METADATA +27 -1
- {masster-0.5.21.dist-info → masster-0.5.23.dist-info}/RECORD +37 -37
- {masster-0.5.21.dist-info → masster-0.5.23.dist-info}/WHEEL +0 -0
- {masster-0.5.21.dist-info → masster-0.5.23.dist-info}/entry_points.txt +0 -0
- {masster-0.5.21.dist-info → masster-0.5.23.dist-info}/licenses/LICENSE +0 -0
masster/wizard/wizard.py
CHANGED
|
@@ -34,6 +34,7 @@ wizard = Wizard(
|
|
|
34
34
|
|
|
35
35
|
from __future__ import annotations
|
|
36
36
|
|
|
37
|
+
from math import e
|
|
37
38
|
import os
|
|
38
39
|
import sys
|
|
39
40
|
import time
|
|
@@ -58,99 +59,99 @@ from masster._version import __version__ as version
|
|
|
58
59
|
class wizard_def:
|
|
59
60
|
"""
|
|
60
61
|
Default parameters for the Wizard automated processing system.
|
|
61
|
-
|
|
62
|
+
|
|
62
63
|
This class provides comprehensive configuration for all stages of automated
|
|
63
64
|
mass spectrometry data processing from raw files to final results.
|
|
64
|
-
|
|
65
|
+
|
|
65
66
|
Attributes:
|
|
66
67
|
# Core Configuration
|
|
67
68
|
source (str): Path to directory containing raw data files
|
|
68
69
|
folder (str): Output directory for processed study
|
|
69
70
|
polarity (Optional[str]): Ion polarity mode ("positive", "negative", or None for auto-detection)
|
|
70
71
|
num_cores (int): Number of CPU cores to use for parallel processing
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
# File Discovery
|
|
73
74
|
file_extensions (List[str]): File extensions to search for
|
|
74
75
|
search_subfolders (bool): Whether to search subdirectories
|
|
75
76
|
skip_patterns (List[str]): Filename patterns to skip
|
|
76
|
-
|
|
77
|
+
|
|
77
78
|
# Processing Parameters
|
|
78
79
|
adducts (List[str]): Adduct specifications for given polarity
|
|
79
80
|
batch_size (int): Number of files to process per batch
|
|
80
81
|
memory_limit_gb (float): Memory limit for processing (GB)
|
|
81
|
-
|
|
82
|
+
|
|
82
83
|
# Resume & Recovery
|
|
83
84
|
resume_enabled (bool): Enable automatic resume capability
|
|
84
85
|
force_reprocess (bool): Force reprocessing of existing files
|
|
85
86
|
backup_enabled (bool): Create backups of intermediate results
|
|
86
|
-
|
|
87
|
+
|
|
87
88
|
# Output & Export
|
|
88
89
|
generate_plots (bool): Generate visualization plots
|
|
89
90
|
export_formats (List[str]): Output formats to generate
|
|
90
91
|
compress_output (bool): Compress final study file
|
|
91
|
-
|
|
92
|
+
|
|
92
93
|
# Logging
|
|
93
94
|
log_level (str): Logging detail level
|
|
94
95
|
log_to_file (bool): Save logs to file
|
|
95
96
|
progress_interval (int): Progress update interval (seconds)
|
|
96
97
|
"""
|
|
97
|
-
|
|
98
|
+
|
|
98
99
|
# === Core Configuration ===
|
|
99
100
|
source: str = ""
|
|
100
|
-
folder: str = ""
|
|
101
|
+
folder: str = ""
|
|
101
102
|
polarity: Optional[str] = None
|
|
102
103
|
num_cores: int = 4
|
|
103
|
-
|
|
104
|
+
|
|
104
105
|
# === File Discovery ===
|
|
105
106
|
file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
|
|
106
107
|
search_subfolders: bool = True
|
|
107
108
|
skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
|
|
108
|
-
|
|
109
|
+
|
|
109
110
|
# === Processing Parameters ===
|
|
110
111
|
adducts: List[str] = field(default_factory=list) # Will be set based on polarity
|
|
111
112
|
batch_size: int = 8
|
|
112
113
|
memory_limit_gb: float = 16.0
|
|
113
114
|
max_file_size_gb: float = 4.0
|
|
114
|
-
|
|
115
|
+
|
|
115
116
|
# === Resume & Recovery ===
|
|
116
117
|
resume_enabled: bool = True
|
|
117
118
|
force_reprocess: bool = False
|
|
118
119
|
backup_enabled: bool = True
|
|
119
120
|
checkpoint_interval: int = 10 # Save progress every N files
|
|
120
|
-
|
|
121
|
+
|
|
121
122
|
# === Study Assembly ===
|
|
122
123
|
min_samples_for_merge: int = 2
|
|
123
124
|
rt_tolerance: float = 1.5
|
|
124
125
|
mz_max_diff: float = 0.01
|
|
125
126
|
alignment_algorithm: str = "kd"
|
|
126
127
|
merge_method: str = "qt"
|
|
127
|
-
|
|
128
|
+
|
|
128
129
|
# === Feature Detection ===
|
|
129
|
-
chrom_fwhm: float =
|
|
130
|
-
noise: float =
|
|
130
|
+
chrom_fwhm: float | None = None
|
|
131
|
+
noise: float | None = None
|
|
131
132
|
chrom_peak_snr: float = 5.0
|
|
132
133
|
tol_ppm: float = 10.0
|
|
133
134
|
detector_type: str = "unknown" # Detected detector type ("orbitrap", "quadrupole", "unknown")
|
|
134
|
-
|
|
135
|
+
|
|
135
136
|
# === Output & Export ===
|
|
136
137
|
generate_plots: bool = True
|
|
137
138
|
generate_interactive: bool = True
|
|
138
139
|
export_formats: List[str] = field(default_factory=lambda: ["csv", "mgf", "xlsx"])
|
|
139
140
|
compress_output: bool = True
|
|
140
141
|
adaptive_compression: bool = True # Adapt based on study size
|
|
141
|
-
|
|
142
|
+
|
|
142
143
|
# === Logging ===
|
|
143
144
|
log_level: str = "INFO"
|
|
144
145
|
log_to_file: bool = True
|
|
145
146
|
progress_interval: int = 30 # seconds
|
|
146
147
|
verbose_progress: bool = True
|
|
147
|
-
|
|
148
|
+
|
|
148
149
|
# === Advanced Options ===
|
|
149
150
|
use_process_pool: bool = True # vs ThreadPoolExecutor
|
|
150
151
|
optimize_memory: bool = True
|
|
151
152
|
cleanup_temp_files: bool = True
|
|
152
153
|
validate_outputs: bool = True
|
|
153
|
-
|
|
154
|
+
|
|
154
155
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
155
156
|
default_factory=lambda: {
|
|
156
157
|
"source": {
|
|
@@ -159,7 +160,7 @@ class wizard_def:
|
|
|
159
160
|
"required": True,
|
|
160
161
|
},
|
|
161
162
|
"folder": {
|
|
162
|
-
"dtype": str,
|
|
163
|
+
"dtype": str,
|
|
163
164
|
"description": "Output directory for processed study",
|
|
164
165
|
"required": True,
|
|
165
166
|
},
|
|
@@ -193,26 +194,26 @@ class wizard_def:
|
|
|
193
194
|
},
|
|
194
195
|
repr=False,
|
|
195
196
|
)
|
|
196
|
-
|
|
197
|
+
|
|
197
198
|
def __post_init__(self):
|
|
198
199
|
"""Set polarity-specific defaults after initialization."""
|
|
199
200
|
# Set default adducts based on polarity if not provided
|
|
200
201
|
if not self.adducts:
|
|
201
202
|
if self.polarity and self.polarity.lower() in ["positive", "pos"]:
|
|
202
203
|
self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
|
|
203
|
-
elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
|
|
204
|
+
elif self.polarity and self.polarity.lower() in ["negative", "neg"]:
|
|
204
205
|
self.adducts = ["H-1:-:1.0", "CH2O2:0:0.5"]
|
|
205
206
|
else:
|
|
206
207
|
# Default to positive if polarity is None or unknown
|
|
207
208
|
self.adducts = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]
|
|
208
|
-
|
|
209
|
+
|
|
209
210
|
# Validate num_cores
|
|
210
211
|
max_cores = multiprocessing.cpu_count()
|
|
211
212
|
if self.num_cores <= 0:
|
|
212
213
|
self.num_cores = max_cores
|
|
213
214
|
elif self.num_cores > max_cores:
|
|
214
215
|
self.num_cores = max_cores
|
|
215
|
-
|
|
216
|
+
|
|
216
217
|
# Ensure paths are absolute
|
|
217
218
|
if self.source:
|
|
218
219
|
self.source = os.path.abspath(self.source)
|
|
@@ -223,24 +224,24 @@ class wizard_def:
|
|
|
223
224
|
class Wizard:
|
|
224
225
|
"""
|
|
225
226
|
Simplified Wizard for automated mass spectrometry data processing.
|
|
226
|
-
|
|
227
|
+
|
|
227
228
|
The Wizard provides a clean interface for creating and executing analysis scripts
|
|
228
229
|
that process raw MS data through the complete pipeline: file discovery, feature
|
|
229
230
|
detection, sample processing, study assembly, alignment, merging, and export.
|
|
230
|
-
|
|
231
|
+
|
|
231
232
|
Core functions:
|
|
232
233
|
- create_scripts(): Generate standalone analysis scripts
|
|
233
234
|
- test_only(): Process only one file for parameter validation
|
|
234
235
|
- test_and_run(): Test with single file, then run full batch if successful
|
|
235
236
|
- run(): Execute full batch processing on all files
|
|
236
|
-
|
|
237
|
+
|
|
237
238
|
Recommended workflow:
|
|
238
239
|
1. wizard = Wizard(source="raw_data", folder="output")
|
|
239
240
|
2. wizard.create_scripts() # Generate analysis scripts
|
|
240
241
|
3. wizard.test_only() # Validate with single file
|
|
241
242
|
4. wizard.run() # Process all files
|
|
242
243
|
"""
|
|
243
|
-
|
|
244
|
+
|
|
244
245
|
def __init__(
|
|
245
246
|
self,
|
|
246
247
|
source: str = "",
|
|
@@ -248,11 +249,11 @@ class Wizard:
|
|
|
248
249
|
polarity: Optional[str] = None,
|
|
249
250
|
adducts: Optional[List[str]] = None,
|
|
250
251
|
num_cores: int = 6,
|
|
251
|
-
**kwargs
|
|
252
|
+
**kwargs,
|
|
252
253
|
):
|
|
253
254
|
"""
|
|
254
255
|
Initialize the Wizard with analysis parameters.
|
|
255
|
-
|
|
256
|
+
|
|
256
257
|
Parameters:
|
|
257
258
|
source: Directory containing raw data files
|
|
258
259
|
folder: Output directory for processed study
|
|
@@ -261,145 +262,43 @@ class Wizard:
|
|
|
261
262
|
num_cores: Number of CPU cores (0 = auto-detect 75% of available)
|
|
262
263
|
**kwargs: Additional parameters (see wizard_def for full list)
|
|
263
264
|
"""
|
|
264
|
-
|
|
265
|
+
|
|
265
266
|
# Auto-detect optimal number of cores if not specified
|
|
266
267
|
if num_cores <= 0:
|
|
267
268
|
num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
|
|
268
|
-
|
|
269
|
+
|
|
269
270
|
# Create parameters instance
|
|
270
271
|
if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
|
|
271
272
|
self.params = kwargs.pop("params")
|
|
272
273
|
else:
|
|
273
274
|
# Create default parameters
|
|
274
|
-
self.params = wizard_def(
|
|
275
|
-
|
|
276
|
-
folder=folder,
|
|
277
|
-
polarity=polarity,
|
|
278
|
-
num_cores=num_cores
|
|
279
|
-
)
|
|
280
|
-
|
|
275
|
+
self.params = wizard_def(source=source, folder=folder, polarity=polarity, num_cores=num_cores)
|
|
276
|
+
|
|
281
277
|
# Set adducts if provided
|
|
282
278
|
if adducts is not None:
|
|
283
279
|
self.params.adducts = adducts
|
|
284
|
-
|
|
280
|
+
|
|
285
281
|
# Update with any additional parameters
|
|
286
282
|
for key, value in kwargs.items():
|
|
287
283
|
if hasattr(self.params, key):
|
|
288
284
|
setattr(self.params, key, value)
|
|
289
|
-
|
|
285
|
+
|
|
290
286
|
# Validate required parameters
|
|
291
287
|
if not self.params.source:
|
|
292
288
|
raise ValueError("source is required")
|
|
293
289
|
if not self.params.folder:
|
|
294
290
|
raise ValueError("folder is required")
|
|
295
|
-
|
|
291
|
+
|
|
296
292
|
# Create and validate paths
|
|
297
293
|
self.source_path = Path(self.params.source)
|
|
298
|
-
self.folder_path = Path(self.params.folder)
|
|
294
|
+
self.folder_path = Path(self.params.folder)
|
|
299
295
|
self.folder_path.mkdir(parents=True, exist_ok=True)
|
|
300
|
-
|
|
301
|
-
# Auto-infer polarity from the first file if polarity is None
|
|
302
|
-
if self.params.polarity is None:
|
|
303
|
-
inferred_polarity = self._infer_polarity_from_first_file()
|
|
304
|
-
if inferred_polarity:
|
|
305
|
-
self.params.polarity = inferred_polarity
|
|
306
|
-
# Update adducts based on inferred polarity
|
|
307
|
-
self.params.__post_init__()
|
|
308
296
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
Inferred polarity string ("positive" or "negative") or "positive" as fallback
|
|
315
|
-
"""
|
|
316
|
-
try:
|
|
317
|
-
# Find first file
|
|
318
|
-
for extension in ['.wiff', '.raw', '.mzML']:
|
|
319
|
-
pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
|
|
320
|
-
files = list(self.source_path.rglob(pattern))
|
|
321
|
-
if files:
|
|
322
|
-
first_file = files[0]
|
|
323
|
-
break
|
|
324
|
-
else:
|
|
325
|
-
return 'positive'
|
|
326
|
-
|
|
327
|
-
# Handle different file formats
|
|
328
|
-
if first_file.suffix.lower() == '.wiff':
|
|
329
|
-
return self._infer_polarity_from_wiff(str(first_file))
|
|
330
|
-
elif first_file.suffix.lower() == '.raw':
|
|
331
|
-
return self._infer_polarity_from_raw(str(first_file))
|
|
332
|
-
elif first_file.suffix.lower() == '.mzml':
|
|
333
|
-
return self._infer_polarity_from_mzml(str(first_file))
|
|
334
|
-
|
|
335
|
-
except Exception:
|
|
336
|
-
# Silently fall back to default if inference fails
|
|
337
|
-
pass
|
|
338
|
-
|
|
339
|
-
return 'positive'
|
|
340
|
-
|
|
341
|
-
def _infer_polarity_from_wiff(self, filename: str) -> str:
|
|
342
|
-
"""Infer polarity from WIFF file."""
|
|
343
|
-
try:
|
|
344
|
-
from masster.sample.load import _wiff_to_dict
|
|
345
|
-
|
|
346
|
-
# Extract metadata from first file
|
|
347
|
-
metadata_df = _wiff_to_dict(filename)
|
|
348
|
-
|
|
349
|
-
if not metadata_df.empty and 'polarity' in metadata_df.columns:
|
|
350
|
-
# Get polarity from first experiment
|
|
351
|
-
first_polarity = metadata_df['polarity'].iloc[0]
|
|
352
|
-
|
|
353
|
-
# Convert numeric polarity codes to string
|
|
354
|
-
if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
|
|
355
|
-
return "positive"
|
|
356
|
-
elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
|
|
357
|
-
return "negative"
|
|
358
|
-
except Exception:
|
|
359
|
-
pass
|
|
360
|
-
return 'positive'
|
|
361
|
-
|
|
362
|
-
def _infer_polarity_from_raw(self, filename: str) -> str:
|
|
363
|
-
"""Infer polarity from Thermo RAW file."""
|
|
364
|
-
try:
|
|
365
|
-
from masster.sample.thermo import ThermoRawFileReader
|
|
366
|
-
|
|
367
|
-
with ThermoRawFileReader(filename) as raw_reader:
|
|
368
|
-
# Get polarity from first scan
|
|
369
|
-
first_scan = 1
|
|
370
|
-
polarity = raw_reader.get_polarity_from_scan_event(first_scan)
|
|
371
|
-
if polarity in ['positive', 'negative']:
|
|
372
|
-
return polarity
|
|
373
|
-
except Exception:
|
|
374
|
-
pass
|
|
375
|
-
return 'positive'
|
|
376
|
-
|
|
377
|
-
def _infer_polarity_from_mzml(self, filename: str) -> str:
|
|
378
|
-
"""Infer polarity from mzML file."""
|
|
379
|
-
try:
|
|
380
|
-
# Import pyopenms with warnings suppression
|
|
381
|
-
import warnings
|
|
382
|
-
with warnings.catch_warnings():
|
|
383
|
-
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
|
|
384
|
-
import pyopenms as oms
|
|
385
|
-
|
|
386
|
-
# Load the first few spectra to check polarity
|
|
387
|
-
omsexp = oms.MSExperiment()
|
|
388
|
-
oms.MzMLFile().load(filename, omsexp)
|
|
389
|
-
|
|
390
|
-
if omsexp.getNrSpectra() > 0:
|
|
391
|
-
first_spectrum = omsexp.getSpectra()[0]
|
|
392
|
-
try:
|
|
393
|
-
pol = first_spectrum.getInstrumentSettings().getPolarity()
|
|
394
|
-
if pol == 1:
|
|
395
|
-
return "positive"
|
|
396
|
-
elif pol == 2:
|
|
397
|
-
return "negative"
|
|
398
|
-
except Exception:
|
|
399
|
-
pass
|
|
400
|
-
except Exception:
|
|
401
|
-
pass
|
|
402
|
-
return 'positive'
|
|
297
|
+
# Set default polarity if not specified
|
|
298
|
+
if self.params.polarity is None:
|
|
299
|
+
self.params.polarity = "positive"
|
|
300
|
+
# Update adducts based on default polarity
|
|
301
|
+
self.params.__post_init__()
|
|
403
302
|
|
|
404
303
|
@property
|
|
405
304
|
def polarity(self) -> Optional[str]:
|
|
@@ -414,13 +313,13 @@ class Wizard:
|
|
|
414
313
|
def create_scripts(self) -> Dict[str, Any]:
|
|
415
314
|
"""
|
|
416
315
|
Generate analysis scripts based on source file analysis.
|
|
417
|
-
|
|
316
|
+
|
|
418
317
|
This method:
|
|
419
318
|
1. Analyzes the source files to extract metadata
|
|
420
319
|
2. Creates 1_masster_workflow.py with sample processing logic
|
|
421
320
|
3. Creates 2_interactive_analysis.py marimo notebook for study exploration
|
|
422
321
|
4. Returns instructions for next steps
|
|
423
|
-
|
|
322
|
+
|
|
424
323
|
Returns:
|
|
425
324
|
Dictionary containing:
|
|
426
325
|
- status: "success" or "error"
|
|
@@ -432,50 +331,79 @@ class Wizard:
|
|
|
432
331
|
try:
|
|
433
332
|
# Step 1: Analyze source files to extract metadata
|
|
434
333
|
source_info = self._analyze_source_files()
|
|
435
|
-
|
|
334
|
+
|
|
335
|
+
# Report extracted information from first file
|
|
336
|
+
print("\nInformation retrieval complete:")
|
|
337
|
+
print(f" Number of files found: {source_info.get('number_of_files', 0)}")
|
|
338
|
+
print(f" File types detected: {', '.join(source_info.get('file_types', []))}")
|
|
339
|
+
#print(f" First file: {source_info.get('first_file', 'None')}")
|
|
340
|
+
if source_info.get('first_file'):
|
|
341
|
+
print(f" Detected polarity: {source_info.get('polarity', 'unknown')}")
|
|
342
|
+
print(f" Detected detector type: {source_info.get('detector_type', 'unknown')}")
|
|
343
|
+
if source_info.get('baseline', 0) > 0:
|
|
344
|
+
print(f" Estimated baseline intensity: {source_info.get('baseline', 0):.1f}")
|
|
345
|
+
if source_info.get('length_minutes', 0) > 0:
|
|
346
|
+
print(f" Run length: {source_info.get('length_minutes', 0):.1f} minutes")
|
|
347
|
+
else:
|
|
348
|
+
print(" Run length: Could not determine")
|
|
349
|
+
if source_info.get('ms1_scans_per_second', 0) > 0:
|
|
350
|
+
print(f" MS1 scans per second: {source_info.get('ms1_scans_per_second', 0):.2f}")
|
|
351
|
+
print()
|
|
352
|
+
|
|
436
353
|
# Update wizard parameters based on detected metadata
|
|
437
|
-
if source_info.get(
|
|
438
|
-
self.params.polarity = source_info[
|
|
439
|
-
|
|
354
|
+
if source_info.get("polarity") and source_info["polarity"] != "positive":
|
|
355
|
+
self.params.polarity = source_info["polarity"]
|
|
356
|
+
|
|
440
357
|
files_created = []
|
|
441
|
-
|
|
358
|
+
|
|
442
359
|
# Step 2: Create 1_masster_workflow.py
|
|
443
360
|
workflow_script_path = self.folder_path / "1_masster_workflow.py"
|
|
361
|
+
print(f"Creating workflow script: {workflow_script_path}")
|
|
444
362
|
workflow_content = self._generate_workflow_script_content(source_info)
|
|
445
363
|
|
|
446
364
|
# Apply test mode modifications
|
|
447
365
|
workflow_content = self._add_test_mode_support(workflow_content)
|
|
448
|
-
|
|
449
|
-
with open(workflow_script_path,
|
|
366
|
+
|
|
367
|
+
with open(workflow_script_path, "w", encoding="utf-8") as f:
|
|
450
368
|
f.write(workflow_content)
|
|
451
369
|
files_created.append(str(workflow_script_path))
|
|
452
|
-
|
|
370
|
+
|
|
453
371
|
# Step 3: Create 2_interactive_analysis.py marimo notebook
|
|
454
372
|
notebook_path = self.folder_path / "2_interactive_analysis.py"
|
|
373
|
+
print(f"Creating interactive analysis notebook: {notebook_path}")
|
|
455
374
|
notebook_content = self._generate_interactive_notebook_content(source_info)
|
|
456
|
-
|
|
457
|
-
with open(notebook_path,
|
|
375
|
+
|
|
376
|
+
with open(notebook_path, "w", encoding="utf-8") as f:
|
|
458
377
|
f.write(notebook_content)
|
|
459
378
|
files_created.append(str(notebook_path))
|
|
460
|
-
|
|
379
|
+
|
|
461
380
|
# Step 4: Generate instructions
|
|
462
381
|
instructions = self._generate_instructions(source_info, files_created)
|
|
463
|
-
|
|
382
|
+
|
|
464
383
|
return {
|
|
465
384
|
"status": "success",
|
|
466
385
|
"message": f"Successfully created {len(files_created)} script files",
|
|
467
386
|
"instructions": instructions,
|
|
468
387
|
"files_created": files_created,
|
|
469
|
-
"source_info": source_info
|
|
388
|
+
"source_info": source_info,
|
|
470
389
|
}
|
|
471
|
-
|
|
390
|
+
|
|
472
391
|
except Exception as e:
|
|
392
|
+
import traceback
|
|
393
|
+
tb = traceback.extract_tb(e.__traceback__)
|
|
394
|
+
if tb:
|
|
395
|
+
line_number = tb[-1].lineno
|
|
396
|
+
function_name = tb[-1].name
|
|
397
|
+
error_location = f" (at line {line_number} in {function_name})"
|
|
398
|
+
else:
|
|
399
|
+
error_location = ""
|
|
400
|
+
|
|
473
401
|
return {
|
|
474
|
-
"status": "error",
|
|
475
|
-
"message": f"Failed to create scripts: {e}",
|
|
402
|
+
"status": "error",
|
|
403
|
+
"message": f"Failed to create scripts: {e}{error_location}",
|
|
476
404
|
"instructions": [],
|
|
477
405
|
"files_created": [],
|
|
478
|
-
"source_info": {}
|
|
406
|
+
"source_info": {},
|
|
479
407
|
}
|
|
480
408
|
|
|
481
409
|
def _analyze_source_files(self) -> Dict[str, Any]:
|
|
@@ -483,16 +411,20 @@ class Wizard:
|
|
|
483
411
|
result = {
|
|
484
412
|
"number_of_files": 0,
|
|
485
413
|
"file_types": [],
|
|
486
|
-
"
|
|
414
|
+
"detector_type": "tof",
|
|
415
|
+
"polarity": None,
|
|
416
|
+
"baseline": None,
|
|
487
417
|
"length_minutes": 0.0,
|
|
488
|
-
"
|
|
418
|
+
"ms1_scans_per_second": 0.0,
|
|
419
|
+
"first_file": None,
|
|
420
|
+
|
|
489
421
|
}
|
|
490
|
-
|
|
422
|
+
|
|
491
423
|
try:
|
|
492
424
|
# Find raw data files
|
|
493
425
|
extensions = [".wiff", ".raw", ".mzML"]
|
|
494
426
|
raw_files = []
|
|
495
|
-
|
|
427
|
+
|
|
496
428
|
for ext in extensions:
|
|
497
429
|
pattern = f"**/*{ext}"
|
|
498
430
|
files = list(self.source_path.rglob(pattern))
|
|
@@ -500,246 +432,284 @@ class Wizard:
|
|
|
500
432
|
raw_files.extend(files)
|
|
501
433
|
if ext not in result["file_types"]:
|
|
502
434
|
result["file_types"].append(ext)
|
|
503
|
-
|
|
435
|
+
|
|
504
436
|
result["number_of_files"] = len(raw_files)
|
|
505
|
-
|
|
437
|
+
|
|
506
438
|
if raw_files:
|
|
507
439
|
result["first_file"] = str(raw_files[0])
|
|
508
|
-
#
|
|
509
|
-
|
|
510
|
-
|
|
440
|
+
# load first file to infer polarity and length
|
|
441
|
+
print(f"Analyzing first file: {raw_files[0]}")
|
|
442
|
+
from masster import Sample
|
|
443
|
+
sample = Sample(filename=result["first_file"], logging_level='WARNING')
|
|
444
|
+
result['polarity'] = sample.polarity
|
|
445
|
+
# take max from polars ms1_df['rt']
|
|
446
|
+
if sample.ms1_df is not None:
|
|
447
|
+
if not sample.ms1_df.is_empty() and 'rt' in sample.ms1_df.columns:
|
|
448
|
+
max_rt = sample.ms1_df['rt'].max()
|
|
449
|
+
if max_rt is not None and isinstance(max_rt, (int, float)) and max_rt > 0:
|
|
450
|
+
result["length_minutes"] = float(max_rt) / 60.0
|
|
451
|
+
result["ms1_scans_per_second"] = len(sample.ms1_df) / float(max_rt) / 60.0
|
|
452
|
+
|
|
453
|
+
baseline = sample.ms1_df['inty'].quantile(0.001)
|
|
454
|
+
if baseline is not None and isinstance(baseline, (int, float)):
|
|
455
|
+
result["baseline"] = float(baseline)
|
|
456
|
+
if baseline > 5e3:
|
|
457
|
+
result["detector_type"] = "orbitrap"
|
|
458
|
+
else:
|
|
459
|
+
result["detector_type"] = "tof"
|
|
460
|
+
|
|
511
461
|
except Exception as e:
|
|
512
462
|
print(f"Warning: Could not analyze source files: {e}")
|
|
513
|
-
|
|
463
|
+
|
|
514
464
|
return result
|
|
515
465
|
|
|
516
466
|
def _generate_workflow_script_content(self, source_info: Dict[str, Any]) -> str:
|
|
517
467
|
"""Generate the content for 1_masster_workflow.py script."""
|
|
518
|
-
|
|
468
|
+
|
|
469
|
+
# Logic
|
|
470
|
+
noise = self.params.noise
|
|
471
|
+
if noise is None:
|
|
472
|
+
if source_info.get("detector_type") == "orbitrap":
|
|
473
|
+
noise = max(self.params.noise or 50.0, 5e4)
|
|
474
|
+
elif source_info.get("detector_type") == "tof":
|
|
475
|
+
default_noise = self.params.noise or 50.0
|
|
476
|
+
baseline = source_info.get("baseline", default_noise / 2.0)
|
|
477
|
+
noise = baseline * 2
|
|
478
|
+
|
|
479
|
+
chrom_fwhm = self.params.chrom_fwhm
|
|
480
|
+
if chrom_fwhm is None:
|
|
481
|
+
if source_info.get("length_minutes", 0) > 0:
|
|
482
|
+
if source_info["length_minutes"] < 10:
|
|
483
|
+
chrom_fwhm = 0.5
|
|
484
|
+
else:
|
|
485
|
+
chrom_fwhm = 2.0
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
# Generate script content
|
|
519
489
|
script_lines = [
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
490
|
+
"#!/usr/bin/env python3",
|
|
491
|
+
'"""',
|
|
492
|
+
"Automated Mass Spectrometry Data Analysis Pipeline",
|
|
493
|
+
"Generated by masster wizard",
|
|
524
494
|
'"""',
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
'
|
|
537
|
-
'
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
495
|
+
"",
|
|
496
|
+
"import os",
|
|
497
|
+
"import sys",
|
|
498
|
+
"import time",
|
|
499
|
+
"from pathlib import Path",
|
|
500
|
+
"",
|
|
501
|
+
"# Import masster modules",
|
|
502
|
+
"from masster.study import Study",
|
|
503
|
+
"from masster import __version__",
|
|
504
|
+
"",
|
|
505
|
+
"# Test mode configuration",
|
|
506
|
+
'TEST = os.environ.get("MASSTER_TEST", "0") == "1"',
|
|
507
|
+
'STOP_AFTER_TEST = os.environ.get("MASSTER_STOP_AFTER_TEST", "0") == "1" # Only run test, don\'t continue to full batch',
|
|
508
|
+
"",
|
|
509
|
+
"# Analysis parameters",
|
|
510
|
+
"PARAMS = {",
|
|
511
|
+
" # === Core Configuration ===",
|
|
542
512
|
f' "source": {str(self.source_path)!r}, # Directory containing raw data files',
|
|
543
513
|
f' "folder": {str(self.folder_path)!r}, # Output directory for processed study',
|
|
544
514
|
f' "polarity": {self.params.polarity!r}, # Ion polarity mode ("positive" or "negative")',
|
|
545
515
|
f' "num_cores": {self.params.num_cores}, # Number of CPU cores for parallel processing',
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
' "
|
|
549
|
-
' "
|
|
550
|
-
|
|
551
|
-
|
|
516
|
+
"",
|
|
517
|
+
" # === Test Mode ===",
|
|
518
|
+
' "test": TEST, # Process only first file for testing',
|
|
519
|
+
' "stop_after_test": STOP_AFTER_TEST, # Stop after test, don\'t run full batch',
|
|
520
|
+
"",
|
|
521
|
+
" # === File Discovery ===",
|
|
552
522
|
f' "file_extensions": {self.params.file_extensions!r}, # File extensions to search for',
|
|
553
523
|
f' "search_subfolders": {self.params.search_subfolders}, # Whether to search subdirectories recursively',
|
|
554
524
|
f' "skip_patterns": {self.params.skip_patterns!r}, # Filename patterns to skip',
|
|
555
|
-
|
|
556
|
-
|
|
525
|
+
"",
|
|
526
|
+
" # === Processing Parameters ===",
|
|
557
527
|
f' "adducts": {self.params.adducts!r}, # Adduct specifications for feature detection and annotation',
|
|
558
|
-
f' "noise": {
|
|
528
|
+
f' "noise": {noise}, # Noise threshold for feature detection',
|
|
559
529
|
f' "chrom_fwhm": {self.params.chrom_fwhm}, # Chromatographic peak full width at half maximum (seconds)',
|
|
560
530
|
f' "chrom_peak_snr": {self.params.chrom_peak_snr}, # Minimum signal-to-noise ratio for chromatographic peaks',
|
|
561
|
-
|
|
562
|
-
|
|
531
|
+
"",
|
|
532
|
+
" # === Alignment & Merging ===",
|
|
563
533
|
f' "rt_tol": {self.params.rt_tolerance}, # Retention time tolerance for alignment (seconds)',
|
|
564
534
|
f' "mz_tol": {self.params.mz_max_diff}, # Mass-to-charge ratio tolerance for alignment (Da)',
|
|
565
535
|
f' "alignment_method": {self.params.alignment_algorithm!r}, # Algorithm for sample alignment',
|
|
566
536
|
f' "min_samples_per_feature": {self.params.min_samples_for_merge}, # Minimum samples required per consensus feature',
|
|
567
537
|
f' "merge_method": {self.params.merge_method!r}, # Method for merging consensus features',
|
|
568
|
-
|
|
569
|
-
|
|
538
|
+
"",
|
|
539
|
+
" # === Sample Processing (used in add_samples_from_folder) ===",
|
|
570
540
|
f' "batch_size": {self.params.batch_size}, # Number of files to process per batch',
|
|
571
541
|
f' "memory_limit_gb": {self.params.memory_limit_gb}, # Memory limit for processing (GB)',
|
|
572
|
-
|
|
573
|
-
|
|
542
|
+
"",
|
|
543
|
+
" # === Script Options ===",
|
|
574
544
|
f' "resume_enabled": {self.params.resume_enabled}, # Enable automatic resume capability',
|
|
575
545
|
f' "force_reprocess": {self.params.force_reprocess}, # Force reprocessing of existing files',
|
|
576
546
|
f' "cleanup_temp_files": {self.params.cleanup_temp_files}, # Clean up temporary files after processing',
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
547
|
+
"}",
|
|
548
|
+
"",
|
|
549
|
+
"",
|
|
550
|
+
"def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
|
|
581
551
|
' """Discover raw data files in the source folder."""',
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
552
|
+
" source_path = Path(source_folder)",
|
|
553
|
+
" raw_files = []",
|
|
554
|
+
" ",
|
|
555
|
+
" for ext in file_extensions:",
|
|
556
|
+
" if search_subfolders:",
|
|
587
557
|
' pattern = f"**/*{ext}"',
|
|
588
|
-
|
|
589
|
-
|
|
558
|
+
" files = list(source_path.rglob(pattern))",
|
|
559
|
+
" else:",
|
|
590
560
|
' pattern = f"*{ext}"',
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
561
|
+
" files = list(source_path.glob(pattern))",
|
|
562
|
+
" raw_files.extend(files)",
|
|
563
|
+
" ",
|
|
564
|
+
" return raw_files",
|
|
565
|
+
"",
|
|
566
|
+
"",
|
|
567
|
+
"def process_single_file(args):",
|
|
598
568
|
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
569
|
+
" raw_file, output_folder = args",
|
|
570
|
+
" from masster.sample import Sample",
|
|
571
|
+
" ",
|
|
572
|
+
" try:",
|
|
573
|
+
" # Create sample5 filename",
|
|
574
|
+
" sample_name = raw_file.stem",
|
|
605
575
|
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
606
|
-
|
|
607
|
-
|
|
576
|
+
" ",
|
|
577
|
+
" # Skip if sample5 already exists",
|
|
608
578
|
' if sample5_path.exists() and not PARAMS["force_reprocess"]:',
|
|
609
579
|
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
610
|
-
|
|
611
|
-
|
|
580
|
+
" return str(sample5_path)",
|
|
581
|
+
" ",
|
|
612
582
|
' print(f" Converting {raw_file.name}...")',
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
583
|
+
" ",
|
|
584
|
+
" # Load and process raw file with full pipeline",
|
|
585
|
+
" sample = Sample(log_label=sample_name)",
|
|
586
|
+
" sample.load(filename=str(raw_file))",
|
|
587
|
+
" sample.find_features(",
|
|
618
588
|
' noise=PARAMS["noise"],',
|
|
619
589
|
' chrom_fwhm=PARAMS["chrom_fwhm"],',
|
|
620
590
|
' chrom_peak_snr=PARAMS["chrom_peak_snr"]',
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
591
|
+
" )",
|
|
592
|
+
" sample.find_ms2()",
|
|
593
|
+
" sample.find_iso()",
|
|
594
|
+
" # sample.export_mgf()",
|
|
625
595
|
' # sample.plot_2d(filename=f"{sample5_path.replace(".sample5", ".html")}")',
|
|
626
|
-
|
|
627
|
-
|
|
596
|
+
" sample.save(str(sample5_path))",
|
|
597
|
+
" ",
|
|
628
598
|
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
599
|
+
" return str(sample5_path)",
|
|
600
|
+
" ",
|
|
601
|
+
" except Exception as e:",
|
|
632
602
|
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
603
|
+
" return None",
|
|
604
|
+
"",
|
|
605
|
+
"",
|
|
606
|
+
"def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
|
|
637
607
|
' """Convert raw data files to sample5 format."""',
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
608
|
+
" import concurrent.futures",
|
|
609
|
+
" import os",
|
|
610
|
+
" ",
|
|
611
|
+
" # Create output directory",
|
|
612
|
+
" os.makedirs(output_folder, exist_ok=True)",
|
|
613
|
+
" ",
|
|
614
|
+
" # Prepare arguments for multiprocessing",
|
|
615
|
+
" file_args = [(raw_file, output_folder) for raw_file in raw_files]",
|
|
616
|
+
" ",
|
|
617
|
+
" # Process files in parallel",
|
|
618
|
+
" sample5_files = []",
|
|
619
|
+
" with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
|
|
620
|
+
" futures = [executor.submit(process_single_file, args) for args in file_args]",
|
|
621
|
+
" ",
|
|
622
|
+
" for future in concurrent.futures.as_completed(futures):",
|
|
623
|
+
" result = future.result()",
|
|
624
|
+
" if result:",
|
|
625
|
+
" sample5_files.append(result)",
|
|
626
|
+
" ",
|
|
627
|
+
" return sample5_files",
|
|
628
|
+
"",
|
|
629
|
+
"",
|
|
630
|
+
"def main():",
|
|
661
631
|
' """Main analysis pipeline."""',
|
|
662
|
-
|
|
632
|
+
" try:",
|
|
663
633
|
' print("=" * 70)',
|
|
664
634
|
f' print("masster {version} - Automated MS Data Analysis")',
|
|
665
635
|
' print("=" * 70)',
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
636
|
+
" print(f\"Source: {PARAMS['source']}\")",
|
|
637
|
+
" print(f\"Output: {PARAMS['folder']}\")",
|
|
638
|
+
" print(f\"Polarity: {PARAMS['polarity']}\")",
|
|
639
|
+
" print(f\"CPU Cores: {PARAMS['num_cores']}\")",
|
|
670
640
|
' print("=" * 70)',
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
641
|
+
" ",
|
|
642
|
+
" start_time = time.time()",
|
|
643
|
+
" ",
|
|
644
|
+
" # Step 1: Discover raw data files",
|
|
675
645
|
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
646
|
+
" raw_files = discover_raw_files(",
|
|
647
|
+
" PARAMS['source'],",
|
|
648
|
+
" PARAMS['file_extensions'],",
|
|
649
|
+
" PARAMS['search_subfolders']",
|
|
650
|
+
" )",
|
|
651
|
+
" ",
|
|
652
|
+
" if not raw_files:",
|
|
683
653
|
' print("No raw data files found!")',
|
|
684
|
-
|
|
685
|
-
|
|
654
|
+
" return False",
|
|
655
|
+
" ",
|
|
686
656
|
' print(f"Found {len(raw_files)} raw data files")',
|
|
687
|
-
|
|
657
|
+
" for f in raw_files[:5]: # Show first 5 files",
|
|
688
658
|
' print(f" {f.name}")',
|
|
689
|
-
|
|
659
|
+
" if len(raw_files) > 5:",
|
|
690
660
|
' print(f" ... and {len(raw_files) - 5} more")',
|
|
691
|
-
|
|
692
|
-
|
|
661
|
+
" ",
|
|
662
|
+
" # Step 2: Process raw files",
|
|
693
663
|
' print("\\nStep 2/7: Processing raw files...")',
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
664
|
+
" sample5_files = convert_raw_to_sample5(",
|
|
665
|
+
" raw_files,",
|
|
666
|
+
" PARAMS['folder'],",
|
|
667
|
+
" PARAMS['polarity'],",
|
|
668
|
+
" PARAMS['num_cores']",
|
|
669
|
+
" )",
|
|
670
|
+
" ",
|
|
671
|
+
" if not sample5_files:",
|
|
702
672
|
' print("No sample5 files were created!")',
|
|
703
|
-
|
|
704
|
-
|
|
673
|
+
" return False",
|
|
674
|
+
" ",
|
|
705
675
|
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
706
|
-
|
|
707
|
-
|
|
676
|
+
" ",
|
|
677
|
+
" # Step 3: Create and configure study",
|
|
708
678
|
' print("\\nStep 3/7: Initializing study...")',
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
679
|
+
" study = Study(folder=PARAMS['folder'])",
|
|
680
|
+
" study.polarity = PARAMS['polarity']",
|
|
681
|
+
" study.adducts = PARAMS['adducts']",
|
|
682
|
+
" ",
|
|
683
|
+
" # Step 4: Add sample5 files to study",
|
|
714
684
|
' print("\\nStep 4/7: Adding samples to study...")',
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
685
|
+
" study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
|
|
686
|
+
" study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
|
|
687
|
+
" ",
|
|
688
|
+
" # Step 5: Core processing",
|
|
719
689
|
' print("\\nStep 5/7: Processing...")',
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
690
|
+
" study.align(",
|
|
691
|
+
" algorithm=PARAMS['alignment_method'],",
|
|
692
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
693
|
+
" )",
|
|
694
|
+
" ",
|
|
695
|
+
" study.merge(",
|
|
726
696
|
' method="qt",',
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
697
|
+
" min_samples=PARAMS['min_samples_per_feature'],",
|
|
698
|
+
" threads=PARAMS['num_cores'],",
|
|
699
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
700
|
+
" )",
|
|
701
|
+
" study.find_iso()",
|
|
702
|
+
" study.fill()",
|
|
703
|
+
" study.integrate()",
|
|
704
|
+
" ",
|
|
705
|
+
" # Step 6/7: Saving results",
|
|
736
706
|
' print("\\nStep 6/7: Saving results...")',
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
707
|
+
" study.save()",
|
|
708
|
+
" study.export_xlsx()",
|
|
709
|
+
" study.export_mgf()",
|
|
710
|
+
" study.export_mztab()",
|
|
711
|
+
" ",
|
|
712
|
+
" # Step 7: Plots",
|
|
743
713
|
' print("\\nStep 7/7: Exporting plots...")',
|
|
744
714
|
' study.plot_consensus_2d(filename="consensus.html")',
|
|
745
715
|
' study.plot_consensus_2d(filename="consensus.png")',
|
|
@@ -751,10 +721,10 @@ class Wizard:
|
|
|
751
721
|
' study.plot_bpc(filename="bpc.png")',
|
|
752
722
|
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
753
723
|
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
724
|
+
" ",
|
|
725
|
+
" # Print summary",
|
|
726
|
+
" study.info()",
|
|
727
|
+
" total_time = time.time() - start_time",
|
|
758
728
|
' print("\\n" + "=" * 70)',
|
|
759
729
|
' print("ANALYSIS COMPLETE")',
|
|
760
730
|
' print("=" * 70)',
|
|
@@ -764,50 +734,50 @@ class Wizard:
|
|
|
764
734
|
' if hasattr(study, "consensus_df"):',
|
|
765
735
|
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
766
736
|
' print("=" * 70)',
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
737
|
+
" ",
|
|
738
|
+
" return True",
|
|
739
|
+
" ",
|
|
740
|
+
" except KeyboardInterrupt:",
|
|
771
741
|
' print("\\nAnalysis interrupted by user")',
|
|
772
|
-
|
|
773
|
-
|
|
742
|
+
" return False",
|
|
743
|
+
" except Exception as e:",
|
|
774
744
|
' print(f"Analysis failed with error: {e}")',
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
745
|
+
" import traceback",
|
|
746
|
+
" traceback.print_exc()",
|
|
747
|
+
" return False",
|
|
748
|
+
"",
|
|
749
|
+
"",
|
|
780
750
|
'if __name__ == "__main__":',
|
|
781
|
-
|
|
782
|
-
|
|
751
|
+
" success = main()",
|
|
752
|
+
" sys.exit(0 if success else 1)",
|
|
783
753
|
]
|
|
784
|
-
|
|
785
|
-
return
|
|
754
|
+
|
|
755
|
+
return "\n".join(script_lines)
|
|
786
756
|
|
|
787
757
|
def _generate_interactive_notebook_content(self, source_info: Dict[str, Any]) -> str:
|
|
788
758
|
"""Generate the content for 2_interactive_analysis.py marimo notebook."""
|
|
789
|
-
|
|
759
|
+
|
|
790
760
|
notebook_lines = [
|
|
791
|
-
|
|
792
|
-
|
|
761
|
+
"import marimo",
|
|
762
|
+
"",
|
|
793
763
|
'__generated_with = "0.9.14"',
|
|
794
764
|
'app = marimo.App(width="medium")',
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
765
|
+
"",
|
|
766
|
+
"@app.cell",
|
|
767
|
+
"def __():",
|
|
768
|
+
" import marimo as mo",
|
|
769
|
+
" return (mo,)",
|
|
770
|
+
"",
|
|
771
|
+
"@app.cell",
|
|
772
|
+
"def __(mo):",
|
|
803
773
|
' mo.md(r"""',
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
f
|
|
807
|
-
f
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
774
|
+
" # MASSter Interactive Analysis",
|
|
775
|
+
" ",
|
|
776
|
+
f" **Source:** {source_info.get('number_of_files', 0)} files detected",
|
|
777
|
+
f" **Polarity:** {source_info.get('polarity', 'unknown')}",
|
|
778
|
+
" ",
|
|
779
|
+
" This notebook provides interactive exploration of your processed study.",
|
|
780
|
+
" Make sure you have run `python 1_masster_workflow.py` first.",
|
|
811
781
|
' """)',
|
|
812
782
|
' return ()',
|
|
813
783
|
'',
|
|
@@ -818,7 +788,8 @@ class Wizard:
|
|
|
818
788
|
'',
|
|
819
789
|
'@app.cell',
|
|
820
790
|
'def __(masster):',
|
|
821
|
-
' study = masster.Study(folder=
|
|
791
|
+
f' study = masster.Study(folder={str(self.folder_path)!r})',
|
|
792
|
+
' study.load()',
|
|
822
793
|
' return (study,)',
|
|
823
794
|
'',
|
|
824
795
|
'@app.cell',
|
|
@@ -827,18 +798,20 @@ class Wizard:
|
|
|
827
798
|
' return ()',
|
|
828
799
|
'',
|
|
829
800
|
'if __name__ == "__main__":',
|
|
830
|
-
|
|
801
|
+
" app.run()",
|
|
831
802
|
]
|
|
832
|
-
|
|
833
|
-
return
|
|
803
|
+
|
|
804
|
+
return "\n".join(notebook_lines)
|
|
834
805
|
|
|
835
806
|
def _generate_instructions(self, source_info: Dict[str, Any], files_created: List[str]) -> List[str]:
|
|
836
807
|
"""Generate usage instructions for the created scripts."""
|
|
837
|
-
instructions = [
|
|
808
|
+
instructions = [
|
|
809
|
+
f"Source analysis: {source_info.get('number_of_files', 0)} files found",
|
|
838
810
|
f"Polarity detected: {source_info.get('polarity', 'unknown')}",
|
|
839
|
-
"Files created:"
|
|
811
|
+
"Files created:",
|
|
812
|
+
]
|
|
840
813
|
for file_path in files_created:
|
|
841
|
-
instructions.append(f"
|
|
814
|
+
instructions.append(f" {str(Path(file_path).resolve())}")
|
|
842
815
|
|
|
843
816
|
# Find the workflow script name from created files
|
|
844
817
|
workflow_script_name = "1_masster_workflow.py"
|
|
@@ -846,7 +819,7 @@ class Wizard:
|
|
|
846
819
|
if Path(file_path).name == "1_masster_workflow.py":
|
|
847
820
|
workflow_script_name = Path(file_path).name
|
|
848
821
|
break
|
|
849
|
-
|
|
822
|
+
|
|
850
823
|
instructions.extend([
|
|
851
824
|
"",
|
|
852
825
|
"Next steps:",
|
|
@@ -863,87 +836,127 @@ class Wizard:
|
|
|
863
836
|
"",
|
|
864
837
|
"4. INTERACTIVE ANALYSIS:",
|
|
865
838
|
f" uv run marimo edit {Path('2_interactive_analysis.py').name}",
|
|
866
|
-
""
|
|
867
|
-
)
|
|
868
|
-
|
|
839
|
+
"",
|
|
840
|
+
])
|
|
841
|
+
|
|
869
842
|
return instructions
|
|
870
843
|
|
|
871
844
|
def _add_test_mode_support(self, workflow_content: str) -> str:
|
|
872
845
|
"""Add test mode functionality to the generated workflow script."""
|
|
873
|
-
lines = workflow_content.split(
|
|
874
|
-
|
|
846
|
+
lines = workflow_content.split("\n")
|
|
847
|
+
|
|
875
848
|
# Insert test mode code after print statements in main function
|
|
876
849
|
for i, line in enumerate(lines):
|
|
877
850
|
# Add test mode print after the masster version line
|
|
878
851
|
if 'print("masster' in line and 'Automated MS Data Analysis")' in line:
|
|
879
|
-
lines.insert(i + 1, ' if
|
|
880
|
-
lines.insert(i + 2, ' print("
|
|
852
|
+
lines.insert(i + 1, ' if TEST:')
|
|
853
|
+
lines.insert(i + 2, ' print("TEST MODE: Processing single file only")')
|
|
881
854
|
break
|
|
882
|
-
|
|
855
|
+
|
|
883
856
|
# Add mode info after num_cores print
|
|
884
857
|
for i, line in enumerate(lines):
|
|
885
|
-
if
|
|
886
|
-
lines.insert(i + 1,
|
|
887
|
-
lines.insert(i + 2,
|
|
858
|
+
if "print(f\"CPU Cores: {PARAMS['num_cores']}\")" in line:
|
|
859
|
+
lines.insert(i + 1, " if TEST:")
|
|
860
|
+
lines.insert(i + 2, " print(f\"Mode: {'Test Only' if STOP_AFTER_TEST else 'Test + Full Batch'}\")")
|
|
888
861
|
break
|
|
889
|
-
|
|
862
|
+
|
|
890
863
|
# Add file limitation logic after file listing
|
|
891
864
|
for i, line in enumerate(lines):
|
|
892
865
|
if 'print(f" ... and {len(raw_files) - 5} more")' in line:
|
|
893
866
|
lines.insert(i + 1, ' ')
|
|
894
867
|
lines.insert(i + 2, ' # Limit to first file in test mode')
|
|
895
|
-
lines.insert(i + 3, ' if
|
|
868
|
+
lines.insert(i + 3, ' if TEST:')
|
|
896
869
|
lines.insert(i + 4, ' raw_files = raw_files[:1]')
|
|
897
|
-
lines.insert(i + 5, ' print(f"\\
|
|
870
|
+
lines.insert(i + 5, ' print(f"\\nTEST MODE: Processing only first file: {raw_files[0].name}")')
|
|
898
871
|
break
|
|
899
|
-
|
|
872
|
+
|
|
900
873
|
# Modify num_cores for test mode
|
|
901
874
|
for i, line in enumerate(lines):
|
|
902
|
-
if
|
|
903
|
-
lines[i] = line.replace(
|
|
875
|
+
if "PARAMS['num_cores']" in line and "convert_raw_to_sample5(" in lines[i - 2 : i + 3]:
|
|
876
|
+
lines[i] = line.replace(
|
|
877
|
+
"PARAMS['num_cores']", "PARAMS['num_cores'] if not TEST else 1 # Use single core for test"
|
|
878
|
+
)
|
|
904
879
|
break
|
|
905
|
-
|
|
880
|
+
|
|
906
881
|
# Add test-only exit logic after successful processing
|
|
907
882
|
for i, line in enumerate(lines):
|
|
908
883
|
if 'print(f"Successfully processed {len(sample5_files)} files to sample5")' in line:
|
|
909
884
|
lines.insert(i + 1, ' ')
|
|
910
|
-
lines.insert(i + 2, ' # Stop here if test
|
|
911
|
-
lines.insert(i + 3, ' if
|
|
912
|
-
lines.insert(i + 4, ' print("\\
|
|
885
|
+
lines.insert(i + 2, ' # Stop here if stop-after-test mode')
|
|
886
|
+
lines.insert(i + 3, ' if STOP_AFTER_TEST:')
|
|
887
|
+
lines.insert(i + 4, ' print("\\nSTOP AFTER TEST mode: Stopping after successful single file processing")')
|
|
913
888
|
lines.insert(i + 5, ' print(f"Test file created: {sample5_files[0]}")')
|
|
914
889
|
lines.insert(i + 6, ' print("\\nTo run full batch, use: wizard.run()")')
|
|
915
|
-
lines.insert(i + 7,
|
|
890
|
+
lines.insert(i + 7, " total_time = time.time() - start_time")
|
|
916
891
|
lines.insert(i + 8, ' print(f"\\nTest processing time: {total_time:.1f} seconds")')
|
|
917
|
-
lines.insert(i + 9,
|
|
892
|
+
lines.insert(i + 9, " return True")
|
|
918
893
|
break
|
|
919
|
-
|
|
920
|
-
return
|
|
894
|
+
|
|
895
|
+
return "\n".join(lines)
|
|
921
896
|
|
|
922
897
|
def test_and_run(self) -> Dict[str, Any]:
    """
    Validate the workflow on a single file, then process the full batch.

    The workflow is first executed in test-only mode
    (``_execute_workflow(test=True, run=False)``). If that run does not
    report ``"success"``, an error result is returned and the full batch is
    not started. Otherwise the workflow is executed again in full batch mode
    and that result is returned as-is. ``_execute_workflow`` creates the
    workflow script itself when it is missing.

    Returns:
        Dictionary containing:
        - status: "success" or "error"
        - message: Status message
        - instructions: List of next steps
    """
    # Step 1: single-file validation run
    print("Step 1/2: Testing with single file...")
    trial = self._execute_workflow(test=True, run=False)

    if trial["status"] == "success":
        print("Test successful! Proceeding with full batch...")
        print("Step 2/2: Processing all files...")

        # Step 2: full batch run; its result is handed back unchanged
        return self._execute_workflow(test=False, run=True)

    # The trial failed: report it without touching the remaining files
    recovery_steps = [
        "Single file test failed",
        "Review parameters in 1_masster_workflow.py",
        "Fix issues and try again",
    ]
    return {
        "status": "error",
        "message": "Test failed: {}".format(trial["message"]),
        "instructions": recovery_steps,
    }
|
|
938
934
|
|
|
939
935
|
def test_only(self) -> Dict[str, Any]:
    """
    Run the workflow on a single file and stop afterwards.

    Delegates to ``_execute_workflow`` with ``test=True`` and ``run=False``,
    i.e. the 1_masster_workflow.py script is executed in test-only mode and
    does not continue to full study processing. ``_execute_workflow`` creates
    the script itself when it is missing.

    Returns:
        The result dictionary produced by ``_execute_workflow``:
        - status: "success" or "error"
        - message: Status message
        - instructions: List of next steps
    """
    outcome = self._execute_workflow(test=True, run=False)
    return outcome
|
|
951
|
+
|
|
952
|
+
def test(self) -> Dict[str, Any]:
    """
    Run the workflow on a single file and stop afterwards.

    Behaves exactly like ``test_only``: ``_execute_workflow`` is called with
    ``test=True`` and ``run=False``, so the 1_masster_workflow.py script is
    executed in test-only mode and does not continue to full study
    processing. ``_execute_workflow`` creates the script itself when it is
    missing.

    Returns:
        The result dictionary produced by ``_execute_workflow``:
        - status: "success" or "error"
        - message: Status message
        - instructions: List of next steps
    """
    outcome = self._execute_workflow(test=True, run=False)
    return outcome
|
|
968
|
+
|
|
955
969
|
|
|
956
970
|
def run(self) -> Dict[str, Any]:
    """
    Run the full sample processing workflow.

    Delegates to ``_execute_workflow`` with ``test=False`` and ``run=True``,
    i.e. the 1_masster_workflow.py script is executed in full batch mode.
    ``_execute_workflow`` creates the script itself when it is missing.

    Returns:
        The result dictionary produced by ``_execute_workflow``:
        - status: "success" or "error"
        - message: Status message
        - instructions: List of next steps
    """
    outcome = self._execute_workflow(test=False, run=True)
    return outcome
|
|
970
984
|
|
|
971
|
-
def _execute_workflow(self,
|
|
985
|
+
def _execute_workflow(self, test: bool = False, run: bool = True) -> Dict[str, Any]:
|
|
972
986
|
"""
|
|
973
987
|
Execute the workflow script in either test or full mode.
|
|
974
|
-
|
|
988
|
+
|
|
975
989
|
Args:
|
|
976
|
-
|
|
977
|
-
|
|
990
|
+
test: If True, run in test mode (single file), otherwise full batch
|
|
991
|
+
run: If False, stop after test (only used with test=True), if True continue with full processing
|
|
978
992
|
"""
|
|
979
993
|
try:
|
|
980
994
|
workflow_script_path = self.folder_path / "1_masster_workflow.py"
|
|
981
|
-
|
|
995
|
+
|
|
982
996
|
# Check if workflow script exists
|
|
983
997
|
if not workflow_script_path.exists():
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
"
|
|
990
|
-
"
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
998
|
+
print("Workflow script not found. Creating scripts automatically...")
|
|
999
|
+
create_result = self.create_scripts()
|
|
1000
|
+
|
|
1001
|
+
if create_result["status"] == "error":
|
|
1002
|
+
return {
|
|
1003
|
+
"status": "error",
|
|
1004
|
+
"message": f"Failed to create workflow script: {create_result['message']}",
|
|
1005
|
+
"instructions": [
|
|
1006
|
+
"Could not create 1_masster_workflow.py",
|
|
1007
|
+
"Please check source path and permissions",
|
|
1008
|
+
],
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
print(f"Created {len(create_result.get('files_created', []))} script files")
|
|
1012
|
+
for file_path in create_result.get('files_created', []):
|
|
1013
|
+
print(f" - {file_path}")
|
|
1014
|
+
print("Proceeding with workflow execution...")
|
|
1015
|
+
|
|
994
1016
|
# Setup execution mode
|
|
995
|
-
if
|
|
1017
|
+
if test and not run:
|
|
996
1018
|
mode_label = "test-only"
|
|
997
|
-
elif
|
|
1019
|
+
elif test:
|
|
998
1020
|
mode_label = "test"
|
|
999
1021
|
else:
|
|
1000
1022
|
mode_label = "full batch"
|
|
1001
|
-
|
|
1023
|
+
|
|
1002
1024
|
env = None
|
|
1003
|
-
if
|
|
1025
|
+
if test:
|
|
1004
1026
|
import os
|
|
1027
|
+
|
|
1005
1028
|
env = os.environ.copy()
|
|
1006
|
-
env[
|
|
1007
|
-
if
|
|
1008
|
-
env[
|
|
1009
|
-
|
|
1029
|
+
env["MASSTER_TEST"] = "1"
|
|
1030
|
+
if not run:
|
|
1031
|
+
env["MASSTER_STOP_AFTER_TEST"] = "1"
|
|
1032
|
+
|
|
1010
1033
|
# Execute the workflow script
|
|
1011
1034
|
print(f">> Executing {mode_label} processing workflow...")
|
|
1012
|
-
print(f"
|
|
1035
|
+
print(f"Running: {workflow_script_path.name}")
|
|
1013
1036
|
print("=" * 60)
|
|
1014
|
-
|
|
1037
|
+
|
|
1015
1038
|
import subprocess
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1039
|
+
|
|
1040
|
+
result = subprocess.run([sys.executable, str(workflow_script_path)], cwd=str(self.folder_path), env=env)
|
|
1041
|
+
|
|
1020
1042
|
success = result.returncode == 0
|
|
1021
|
-
|
|
1043
|
+
|
|
1022
1044
|
if success:
|
|
1023
1045
|
print("=" * 60)
|
|
1024
|
-
if
|
|
1025
|
-
print("
|
|
1026
|
-
print("
|
|
1046
|
+
if test and not run:
|
|
1047
|
+
print("Test-only processing completed successfully!")
|
|
1048
|
+
print("Single file validated - ready for full batch")
|
|
1027
1049
|
print(" wizard.run()")
|
|
1028
|
-
elif
|
|
1029
|
-
print("
|
|
1030
|
-
print("
|
|
1050
|
+
elif test:
|
|
1051
|
+
print("Test processing completed successfully!")
|
|
1052
|
+
print("Next step: Run full batch")
|
|
1031
1053
|
print(" wizard.run()")
|
|
1032
1054
|
else:
|
|
1033
|
-
|
|
1034
|
-
print("
|
|
1035
|
-
print("
|
|
1055
|
+
notebook_path = self.folder_path / "2_interactive_analysis.py"
|
|
1056
|
+
print("Sample processing completed successfully!")
|
|
1057
|
+
print("Next step: Run interactive analysis")
|
|
1058
|
+
print(f" uv run marimo edit {notebook_path}")
|
|
1036
1059
|
print("=" * 60)
|
|
1037
|
-
|
|
1038
|
-
next_step =
|
|
1039
|
-
|
|
1040
|
-
|
|
1060
|
+
|
|
1061
|
+
next_step = "Next: wizard.run()" if test else f"Next: uv run marimo edit {self.folder_path / '2_interactive_analysis.py'}"
|
|
1062
|
+
|
|
1041
1063
|
return {
|
|
1042
1064
|
"status": "success",
|
|
1043
1065
|
"message": f"{mode_label.capitalize()} processing completed successfully",
|
|
1044
1066
|
"instructions": [
|
|
1045
|
-
f"
|
|
1067
|
+
f"{mode_label.capitalize()} processing completed",
|
|
1046
1068
|
next_step
|
|
1047
1069
|
]
|
|
1048
1070
|
}
|
|
@@ -1051,30 +1073,30 @@ class Wizard:
|
|
|
1051
1073
|
"status": "error",
|
|
1052
1074
|
"message": f"Workflow execution failed with return code {result.returncode}",
|
|
1053
1075
|
"instructions": [
|
|
1054
|
-
"
|
|
1076
|
+
"Check the error messages above",
|
|
1055
1077
|
"Review parameters in 1_masster_workflow.py",
|
|
1056
|
-
f"Try running manually: python {workflow_script_path.name}"
|
|
1057
|
-
]
|
|
1078
|
+
f"Try running manually: python {workflow_script_path.name}",
|
|
1079
|
+
],
|
|
1058
1080
|
}
|
|
1059
|
-
|
|
1081
|
+
|
|
1060
1082
|
except Exception as e:
|
|
1061
1083
|
return {
|
|
1062
1084
|
"status": "error",
|
|
1063
1085
|
"message": f"Failed to execute workflow: {e}",
|
|
1064
1086
|
"instructions": [
|
|
1065
|
-
"
|
|
1087
|
+
"Execution failed",
|
|
1066
1088
|
"Check that source files exist and are accessible",
|
|
1067
|
-
"Verify folder permissions"
|
|
1068
|
-
]
|
|
1089
|
+
"Verify folder permissions",
|
|
1090
|
+
],
|
|
1069
1091
|
}
|
|
1070
1092
|
|
|
1071
1093
|
def _generate_script_content(self) -> str:
|
|
1072
1094
|
"""Generate the complete analysis script content."""
|
|
1073
|
-
|
|
1095
|
+
|
|
1074
1096
|
# Convert Path objects to strings for JSON serialization
|
|
1075
1097
|
params_dict = {}
|
|
1076
1098
|
for key, value in self.params.__dict__.items():
|
|
1077
|
-
if key ==
|
|
1099
|
+
if key == "_param_metadata": # Skip metadata in generated script
|
|
1078
1100
|
continue
|
|
1079
1101
|
if isinstance(value, Path):
|
|
1080
1102
|
params_dict[key] = str(value)
|
|
@@ -1083,251 +1105,288 @@ class Wizard:
|
|
|
1083
1105
|
|
|
1084
1106
|
# Obtain list of files in source with extension wiff, .raw, .mzML
|
|
1085
1107
|
raw_files = []
|
|
1086
|
-
for ext in params_dict.get(
|
|
1108
|
+
for ext in params_dict.get("file_extensions", []):
|
|
1087
1109
|
raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
|
|
1088
1110
|
|
|
1089
1111
|
# Create readable PARAMS dict with comments
|
|
1090
1112
|
params_lines = []
|
|
1091
|
-
params_lines.append(
|
|
1092
|
-
params_lines.append(
|
|
1093
|
-
|
|
1113
|
+
params_lines.append("# Analysis parameters")
|
|
1114
|
+
params_lines.append("PARAMS = {")
|
|
1115
|
+
|
|
1094
1116
|
# Core Configuration
|
|
1095
|
-
params_lines.append(
|
|
1117
|
+
params_lines.append(" # === Core Configuration ===")
|
|
1096
1118
|
params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
|
|
1097
1119
|
params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
|
|
1098
|
-
params_lines.append(
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1120
|
+
params_lines.append(
|
|
1121
|
+
f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode ("positive" or "negative")'
|
|
1122
|
+
)
|
|
1123
|
+
params_lines.append(
|
|
1124
|
+
f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing'
|
|
1125
|
+
)
|
|
1126
|
+
params_lines.append("")
|
|
1127
|
+
|
|
1102
1128
|
# File Discovery
|
|
1103
|
-
params_lines.append(
|
|
1104
|
-
params_lines.append(
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
params_lines.append(
|
|
1108
|
-
|
|
1129
|
+
params_lines.append(" # === File Discovery ===")
|
|
1130
|
+
params_lines.append(
|
|
1131
|
+
f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for'
|
|
1132
|
+
)
|
|
1133
|
+
params_lines.append(
|
|
1134
|
+
f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively'
|
|
1135
|
+
)
|
|
1136
|
+
params_lines.append(
|
|
1137
|
+
f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip'
|
|
1138
|
+
)
|
|
1139
|
+
params_lines.append("")
|
|
1140
|
+
|
|
1109
1141
|
# Processing Parameters
|
|
1110
|
-
params_lines.append(
|
|
1111
|
-
params_lines.append(
|
|
1112
|
-
|
|
1142
|
+
params_lines.append(" # === Processing Parameters ===")
|
|
1143
|
+
params_lines.append(
|
|
1144
|
+
f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation'
|
|
1145
|
+
)
|
|
1146
|
+
params_lines.append(
|
|
1147
|
+
f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")'
|
|
1148
|
+
)
|
|
1113
1149
|
params_lines.append(f' "noise": {params_dict.get("noise", 50.0)}, # Noise threshold for feature detection')
|
|
1114
|
-
params_lines.append(
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1150
|
+
params_lines.append(
|
|
1151
|
+
f' "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)}, # Chromatographic peak full width at half maximum (seconds)'
|
|
1152
|
+
)
|
|
1153
|
+
params_lines.append(
|
|
1154
|
+
f' "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)}, # Minimum signal-to-noise ratio for chromatographic peaks'
|
|
1155
|
+
)
|
|
1156
|
+
params_lines.append("")
|
|
1157
|
+
|
|
1118
1158
|
# Alignment & Merging
|
|
1119
|
-
params_lines.append(
|
|
1120
|
-
params_lines.append(
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
params_lines.append(
|
|
1124
|
-
|
|
1125
|
-
|
|
1159
|
+
params_lines.append(" # === Alignment & Merging ===")
|
|
1160
|
+
params_lines.append(
|
|
1161
|
+
f' "rt_tol": {params_dict.get("rt_tol", 2.0)}, # Retention time tolerance for alignment (seconds)'
|
|
1162
|
+
)
|
|
1163
|
+
params_lines.append(
|
|
1164
|
+
f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)'
|
|
1165
|
+
)
|
|
1166
|
+
params_lines.append(
|
|
1167
|
+
f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment'
|
|
1168
|
+
)
|
|
1169
|
+
params_lines.append(
|
|
1170
|
+
f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature'
|
|
1171
|
+
)
|
|
1172
|
+
params_lines.append(
|
|
1173
|
+
f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features'
|
|
1174
|
+
)
|
|
1175
|
+
params_lines.append("")
|
|
1126
1176
|
|
|
1127
1177
|
# Sample Processing
|
|
1128
|
-
params_lines.append(
|
|
1129
|
-
params_lines.append(
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1178
|
+
params_lines.append(" # === Sample Processing (used in add_samples_from_folder) ===")
|
|
1179
|
+
params_lines.append(
|
|
1180
|
+
f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch'
|
|
1181
|
+
)
|
|
1182
|
+
params_lines.append(
|
|
1183
|
+
f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)'
|
|
1184
|
+
)
|
|
1185
|
+
params_lines.append("")
|
|
1186
|
+
|
|
1133
1187
|
# Script Options
|
|
1134
|
-
params_lines.append(
|
|
1135
|
-
params_lines.append(
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1188
|
+
params_lines.append(" # === Script Options ===")
|
|
1189
|
+
params_lines.append(
|
|
1190
|
+
f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability'
|
|
1191
|
+
)
|
|
1192
|
+
params_lines.append(
|
|
1193
|
+
f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files'
|
|
1194
|
+
)
|
|
1195
|
+
params_lines.append(
|
|
1196
|
+
f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing'
|
|
1197
|
+
)
|
|
1198
|
+
|
|
1199
|
+
params_lines.append("}")
|
|
1200
|
+
|
|
1141
1201
|
# Create script lines
|
|
1142
1202
|
script_lines = [
|
|
1143
|
-
|
|
1203
|
+
"#!/usr/bin/env python3",
|
|
1144
1204
|
'"""',
|
|
1145
|
-
|
|
1146
|
-
f
|
|
1205
|
+
"Automated Mass Spectrometry Data Analysis Pipeline",
|
|
1206
|
+
f"Generated by masster wizard v{version}",
|
|
1147
1207
|
'"""',
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1208
|
+
"",
|
|
1209
|
+
"import sys",
|
|
1210
|
+
"import time",
|
|
1211
|
+
"from pathlib import Path",
|
|
1212
|
+
"",
|
|
1213
|
+
"# Import masster modules",
|
|
1214
|
+
"from masster.study import Study",
|
|
1215
|
+
"from masster import __version__",
|
|
1216
|
+
"",
|
|
1157
1217
|
]
|
|
1158
|
-
|
|
1218
|
+
|
|
1159
1219
|
# Add the formatted PARAMS
|
|
1160
1220
|
script_lines.extend(params_lines)
|
|
1161
|
-
|
|
1221
|
+
|
|
1162
1222
|
# Add the main function and pipeline
|
|
1163
1223
|
script_lines.extend([
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1224
|
+
"",
|
|
1225
|
+
"",
|
|
1226
|
+
"def discover_raw_files(source_folder, file_extensions, search_subfolders=True):",
|
|
1167
1227
|
' """Discover raw data files in the source folder."""',
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1228
|
+
" source_path = Path(source_folder)",
|
|
1229
|
+
" raw_files = []",
|
|
1230
|
+
" ",
|
|
1231
|
+
" for ext in file_extensions:",
|
|
1232
|
+
" if search_subfolders:",
|
|
1173
1233
|
' pattern = f"**/*{ext}"',
|
|
1174
|
-
|
|
1175
|
-
|
|
1234
|
+
" files = list(source_path.rglob(pattern))",
|
|
1235
|
+
" else:",
|
|
1176
1236
|
' pattern = f"*{ext}"',
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1237
|
+
" files = list(source_path.glob(pattern))",
|
|
1238
|
+
" raw_files.extend(files)",
|
|
1239
|
+
" ",
|
|
1240
|
+
" return raw_files",
|
|
1241
|
+
"",
|
|
1242
|
+
"",
|
|
1243
|
+
"def process_single_file(args):",
|
|
1184
1244
|
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1245
|
+
" raw_file, output_folder = args",
|
|
1246
|
+
" from masster.sample import Sample",
|
|
1247
|
+
" ",
|
|
1248
|
+
" try:",
|
|
1249
|
+
" # Create sample5 filename",
|
|
1250
|
+
" sample_name = raw_file.stem",
|
|
1191
1251
|
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1252
|
+
" ",
|
|
1253
|
+
" # Skip if sample5 already exists",
|
|
1254
|
+
" if sample5_path.exists():",
|
|
1195
1255
|
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
1196
|
-
|
|
1197
|
-
|
|
1256
|
+
" return str(sample5_path)",
|
|
1257
|
+
" ",
|
|
1198
1258
|
' print(f" Converting {raw_file.name}...")',
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1259
|
+
" ",
|
|
1260
|
+
" # Load and process raw file with full pipeline",
|
|
1261
|
+
" sample = Sample(log_label=sample_name)",
|
|
1262
|
+
" sample.load(filename=str(raw_file))",
|
|
1263
|
+
" sample.find_features(",
|
|
1264
|
+
" noise=PARAMS['noise'],",
|
|
1265
|
+
" chrom_fwhm=PARAMS['chrom_fwhm'],",
|
|
1266
|
+
" chrom_peak_snr=PARAMS['chrom_peak_snr']",
|
|
1267
|
+
" )",
|
|
1268
|
+
" sample.find_adducts(adducts=PARAMS['adducts'])",
|
|
1269
|
+
" sample.find_ms2()",
|
|
1270
|
+
" # sample.find_iso()",
|
|
1271
|
+
" # sample.export_mgf()",
|
|
1272
|
+
" # sample.export_mztab()",
|
|
1213
1273
|
' # sample.plot_2d(filename="{sample_name}.html")',
|
|
1214
|
-
|
|
1215
|
-
|
|
1274
|
+
" sample.save(str(sample5_path))",
|
|
1275
|
+
" ",
|
|
1216
1276
|
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1277
|
+
" return str(sample5_path)",
|
|
1278
|
+
" ",
|
|
1279
|
+
" except Exception as e:",
|
|
1220
1280
|
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1281
|
+
" return None",
|
|
1282
|
+
"",
|
|
1283
|
+
"",
|
|
1284
|
+
"def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):",
|
|
1225
1285
|
' """Convert raw data files to sample5 format."""',
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1286
|
+
" import concurrent.futures",
|
|
1287
|
+
" import os",
|
|
1288
|
+
" ",
|
|
1289
|
+
" # Create output directory",
|
|
1290
|
+
" os.makedirs(output_folder, exist_ok=True)",
|
|
1291
|
+
" ",
|
|
1292
|
+
" # Prepare arguments for multiprocessing",
|
|
1293
|
+
" file_args = [(raw_file, output_folder) for raw_file in raw_files]",
|
|
1294
|
+
" ",
|
|
1295
|
+
" # Process files in parallel",
|
|
1296
|
+
" sample5_files = []",
|
|
1297
|
+
" with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:",
|
|
1298
|
+
" futures = [executor.submit(process_single_file, args) for args in file_args]",
|
|
1299
|
+
" ",
|
|
1300
|
+
" for future in concurrent.futures.as_completed(futures):",
|
|
1301
|
+
" result = future.result()",
|
|
1302
|
+
" if result:",
|
|
1303
|
+
" sample5_files.append(result)",
|
|
1304
|
+
" ",
|
|
1305
|
+
" return sample5_files",
|
|
1306
|
+
"",
|
|
1307
|
+
"",
|
|
1308
|
+
"def main():",
|
|
1249
1309
|
' """Main analysis pipeline."""',
|
|
1250
|
-
|
|
1310
|
+
" try:",
|
|
1251
1311
|
' print("=" * 70)',
|
|
1252
1312
|
f' print("masster {version} - Automated MS Data Analysis")',
|
|
1253
1313
|
' print("=" * 70)',
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1314
|
+
" print(f\"Source: {PARAMS['source']}\")",
|
|
1315
|
+
" print(f\"Output: {PARAMS['folder']}\")",
|
|
1316
|
+
" print(f\"Polarity: {PARAMS['polarity']}\")",
|
|
1317
|
+
" print(f\"CPU Cores: {PARAMS['num_cores']}\")",
|
|
1258
1318
|
' print("=" * 70)',
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1319
|
+
" ",
|
|
1320
|
+
" start_time = time.time()",
|
|
1321
|
+
" ",
|
|
1322
|
+
" # Step 1: Discover raw data files",
|
|
1263
1323
|
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1324
|
+
" raw_files = discover_raw_files(",
|
|
1325
|
+
" PARAMS['source'],",
|
|
1326
|
+
" PARAMS['file_extensions'],",
|
|
1327
|
+
" PARAMS['search_subfolders']",
|
|
1328
|
+
" )",
|
|
1329
|
+
" ",
|
|
1330
|
+
" if not raw_files:",
|
|
1271
1331
|
' print("No raw data files found!")',
|
|
1272
|
-
|
|
1273
|
-
|
|
1332
|
+
" return False",
|
|
1333
|
+
" ",
|
|
1274
1334
|
' print(f"Found {len(raw_files)} raw data files")',
|
|
1275
|
-
|
|
1335
|
+
" for f in raw_files[:5]: # Show first 5 files",
|
|
1276
1336
|
' print(f" {f.name}")',
|
|
1277
|
-
|
|
1337
|
+
" if len(raw_files) > 5:",
|
|
1278
1338
|
' print(f" ... and {len(raw_files) - 5} more")',
|
|
1279
|
-
|
|
1280
|
-
|
|
1339
|
+
" ",
|
|
1340
|
+
" # Step 2: Process raw files",
|
|
1281
1341
|
' print("\\nStep 2/7: Processing raw files...")',
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1342
|
+
" sample5_files = convert_raw_to_sample5(",
|
|
1343
|
+
" raw_files,",
|
|
1344
|
+
" PARAMS['folder'],",
|
|
1345
|
+
" PARAMS['polarity'],",
|
|
1346
|
+
" PARAMS['num_cores']",
|
|
1347
|
+
" )",
|
|
1348
|
+
" ",
|
|
1349
|
+
" if not sample5_files:",
|
|
1290
1350
|
' print("No sample5 files were created!")',
|
|
1291
|
-
|
|
1292
|
-
|
|
1351
|
+
" return False",
|
|
1352
|
+
" ",
|
|
1293
1353
|
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
1294
|
-
|
|
1295
|
-
|
|
1354
|
+
" ",
|
|
1355
|
+
" # Step 3: Create and configure study",
|
|
1296
1356
|
' print("\\nStep 3/7: Initializing study...")',
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1357
|
+
" study = Study(folder=PARAMS['folder'])",
|
|
1358
|
+
" study.polarity = PARAMS['polarity']",
|
|
1359
|
+
" study.adducts = PARAMS['adducts']",
|
|
1360
|
+
" ",
|
|
1361
|
+
" # Step 4: Add sample5 files to study",
|
|
1302
1362
|
' print("\\nStep 4/7: Adding samples to study...")',
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1363
|
+
" study.add(str(Path(PARAMS['folder']) / \"*.sample5\"))",
|
|
1364
|
+
" study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))",
|
|
1365
|
+
" ",
|
|
1366
|
+
" # Step 5: Core processing",
|
|
1307
1367
|
' print("\\nStep 5/7: Processing...")',
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1368
|
+
" study.align(",
|
|
1369
|
+
" algorithm=PARAMS['alignment_method'],",
|
|
1370
|
+
" rt_tol=PARAMS['rt_tol']",
|
|
1371
|
+
" )",
|
|
1372
|
+
" ",
|
|
1373
|
+
" study.merge(",
|
|
1314
1374
|
' method="qt",',
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
' # Step 6/7: Saving results',
|
|
1375
|
+
" min_samples=PARAMS['min_samples_per_feature'],",
|
|
1376
|
+
" threads=PARAMS['num_cores'],",
|
|
1377
|
+
" rt_tol=PARAMS['rt_tol'], )",
|
|
1378
|
+
" study.find_iso()",
|
|
1379
|
+
" study.fill()",
|
|
1380
|
+
" study.integrate()",
|
|
1381
|
+
" ",
|
|
1382
|
+
" # Step 6/7: Saving results",
|
|
1324
1383
|
' print("\\nStep 6/7: Saving results...")',
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1384
|
+
" study.save()",
|
|
1385
|
+
" study.export_xlsx()",
|
|
1386
|
+
" study.export_mgf()",
|
|
1387
|
+
" study.export_mztab()",
|
|
1388
|
+
" ",
|
|
1389
|
+
" # Step 7: Plots",
|
|
1331
1390
|
' print("\\nStep 7/7: Exporting plots...")',
|
|
1332
1391
|
' study.plot_consensus_2d(filename="consensus.html")',
|
|
1333
1392
|
' study.plot_consensus_2d(filename="consensus.png")',
|
|
@@ -1339,11 +1398,10 @@ class Wizard:
|
|
|
1339
1398
|
' study.plot_bpc(filename="bpc.png")',
|
|
1340
1399
|
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
1341
1400
|
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
' total_time = time.time() - start_time',
|
|
1401
|
+
" ",
|
|
1402
|
+
" # Print summary",
|
|
1403
|
+
" study.info()",
|
|
1404
|
+
" total_time = time.time() - start_time",
|
|
1347
1405
|
' print("\\n" + "=" * 70)',
|
|
1348
1406
|
' print("ANALYSIS COMPLETE")',
|
|
1349
1407
|
' print("=" * 70)',
|
|
@@ -1353,41 +1411,41 @@ class Wizard:
|
|
|
1353
1411
|
' if hasattr(study, "consensus_df"):',
|
|
1354
1412
|
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
1355
1413
|
' print("=" * 70)',
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1414
|
+
" ",
|
|
1415
|
+
" return True",
|
|
1416
|
+
" ",
|
|
1417
|
+
" except KeyboardInterrupt:",
|
|
1360
1418
|
' print("\\nAnalysis interrupted by user")',
|
|
1361
|
-
|
|
1362
|
-
|
|
1419
|
+
" return False",
|
|
1420
|
+
" except Exception as e:",
|
|
1363
1421
|
' print(f"Analysis failed with error: {e}")',
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1422
|
+
" import traceback",
|
|
1423
|
+
" traceback.print_exc()",
|
|
1424
|
+
" return False",
|
|
1425
|
+
"",
|
|
1426
|
+
"",
|
|
1369
1427
|
'if __name__ == "__main__":',
|
|
1370
|
-
|
|
1371
|
-
|
|
1428
|
+
" success = main()",
|
|
1429
|
+
" sys.exit(0 if success else 1)",
|
|
1372
1430
|
])
|
|
1373
|
-
|
|
1374
|
-
return
|
|
1431
|
+
|
|
1432
|
+
return "\n".join(script_lines)
|
|
1375
1433
|
|
|
1376
1434
|
|
|
1377
1435
|
def create_scripts(
    source: str = "",
    folder: str = "",
    polarity: Optional[str] = None,
    adducts: Optional[List[str]] = None,
    num_cores: int = 0,
    **kwargs,
) -> Dict[str, Any]:
    """
    Create analysis scripts without explicitly instantiating a Wizard.

    This is a convenience function that creates a Wizard instance internally
    and calls its create_scripts() method. Any exception raised while building
    the Wizard or generating the scripts is converted into an "error" result
    dictionary instead of propagating to the caller.

    Parameters:
        source: Directory containing raw data files
        folder: Output directory for processed study
        polarity: Ion polarity, forwarded unchanged to Wizard (e.g. 'negative')
        adducts: List of adduct specifications (auto-set if None)
        num_cores: Number of CPU cores (0, a negative value or None = auto-detect)
        **kwargs: Additional parameters forwarded to Wizard

    Returns:
        Dictionary containing:
        - status: "success" or "error"
        - message: Human-readable description of the outcome
        - instructions: List of next steps
        - files_created: List of created file paths
        - source_info: Metadata about source files

    Example:
        >>> import masster.wizard
        >>> result = masster.wizard.create_scripts(
        ...     source=r'D:\\Data\\raw_files',
        ...     folder=r'D:\\Data\\output',
        ...     polarity='negative'
        ... )
        >>> print("Status:", result["status"])
    """
    try:
        # Auto-detect the core count when none was requested. None is handled
        # explicitly because `None <= 0` raises TypeError, which would
        # otherwise be reported as a generic script-creation failure.
        if num_cores is None or num_cores <= 0:
            # Use roughly 75% of the available cores, but never fewer than one.
            num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))

        # Create Wizard instance
        wizard = Wizard(
            source=source,
            folder=folder,
            polarity=polarity,
            adducts=adducts,
            num_cores=num_cores,
            **kwargs,
        )

        # Call the instance method
        return wizard.create_scripts()

    except Exception as e:
        # Deliberate catch-all at the public API boundary: callers receive the
        # same result shape as on success and inspect "status" / "message".
        return {
            "status": "error",
            "message": f"Failed to create scripts: {e}",
            "instructions": [],
            "files_created": [],
            "source_info": {},
        }
|
|
1443
1494
|
|
|
1444
1495
|
|