masster 0.4.19__py3-none-any.whl → 0.4.21__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +6 -1
- masster/_version.py +1 -1
- masster/logger.py +42 -0
- masster/sample/h5.py +58 -1
- masster/sample/load.py +12 -5
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -9
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +55 -14
- masster/study/merge.py +910 -67
- masster/study/plot.py +50 -7
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +34 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +8 -2
- masster/wizard/wizard.py +612 -876
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/RECORD +27 -30
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- masster/wizard.py +0 -1175
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.19.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
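
The bulk of the change is a rewrite of masster/wizard/wizard.py: the old multi-step pipeline methods (convert_to_sample5, assemble_study, align_and_merge, generate_plots, export_results) are replaced by two entry points, create_script() and execute(), which generate and optionally run a standalone analysis script. As a rough usage sketch assembled from the docstrings visible in the diff below (paths are placeholders, keyword names as documented there):

    # Sketch based on the 0.4.21 wizard docstrings shown below; paths are placeholders.
    from masster.wizard import execute

    # Generates run_masster.py in the output folder and runs it with the current interpreter.
    execute(
        source=r"D:\Data\raw_files",   # directory containing .wiff/.raw/.mzML files
        folder=r"D:\Data\output",      # output directory for the processed study
        polarity="positive",
    )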
masster/wizard/wizard.py
CHANGED
@@ -23,28 +23,22 @@ from masster import Wizard, wizard_def
 
     # Create wizard with default parameters
     wizard = Wizard(
-
-
+        source="./raw_data",
+        folder="./processed_study",
         polarity="positive",
         num_cores=4
     )
 
-    # Run complete processing pipeline
-    wizard.run_full_pipeline()
-
-    # Or run individual steps
-    wizard.convert_to_sample5()
-    wizard.assemble_study()
-    wizard.align_and_merge()
-    wizard.generate_plots()
-    wizard.export_results()
     ```
 """
 
 from __future__ import annotations
 
 import os
+import sys
 import time
+import importlib
+import glob
 import multiprocessing
 from pathlib import Path
 from typing import Optional, Any, Dict, List
@@ -57,6 +51,7 @@ from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults
 from masster.study.defaults.align_def import align_defaults
 from masster.study.defaults.merge_def import merge_defaults
+from masster._version import __version__ as version
 
 
 @dataclass
@@ -69,8 +64,8 @@ class wizard_def:
 
     Attributes:
         # Core Configuration
-
-
+        source (str): Path to directory containing raw data files
+        folder (str): Output directory for processed study
         polarity (str): Ion polarity mode ("positive" or "negative")
         num_cores (int): Number of CPU cores to use for parallel processing
 
@@ -101,15 +96,15 @@ class wizard_def:
     """
 
     # === Core Configuration ===
-
-
+    source: str = ""
+    folder: str = ""
     polarity: str = "positive"
     num_cores: int = 4
 
     # === File Discovery ===
-    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"
+    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
     search_subfolders: bool = True
-    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "
+    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])
 
     # === Processing Parameters ===
     adducts: List[str] = field(default_factory=list)  # Will be set based on polarity
@@ -124,17 +119,18 @@ class wizard_def:
     checkpoint_interval: int = 10  # Save progress every N files
 
     # === Study Assembly ===
-    min_samples_for_merge: int =
+    min_samples_for_merge: int = 2
     rt_tolerance: float = 1.5
-
+    mz_max_diff: float = 0.01
    alignment_algorithm: str = "kd"
-    merge_method: str = "
+    merge_method: str = "qt"
 
     # === Feature Detection ===
-    chrom_fwhm: float = 0.
-    noise_threshold: float =
+    chrom_fwhm: float = 0.5
+    noise_threshold: float = 200.0
     chrom_peak_snr: float = 5.0
     tol_ppm: float = 10.0
+    detector_type: str = "unknown"  # Detected detector type ("orbitrap", "quadrupole", "unknown")
 
     # === Output & Export ===
     generate_plots: bool = True
@@ -154,15 +150,15 @@ class wizard_def:
     optimize_memory: bool = True
     cleanup_temp_files: bool = True
     validate_outputs: bool = True
-
+
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
-            "
+            "source": {
                 "dtype": str,
                 "description": "Path to directory containing raw data files",
                 "required": True,
             },
-            "
+            "folder": {
                 "dtype": str,
                 "description": "Output directory for processed study",
                 "required": True,
@@ -218,62 +214,63 @@ class wizard_def:
             self.num_cores = max_cores
 
         # Ensure paths are absolute
-        if self.
-            self.
-        if self.
-            self.
+        if self.source:
+            self.source = os.path.abspath(self.source)
+        if self.folder:
+            self.folder = os.path.abspath(self.folder)
 
 
 class Wizard:
     """
-
-
-    The Wizard class provides end-to-end automation for processing collections
-    of mass spectrometry files from raw data to final study results, including:
+    Simplified Wizard for automated mass spectrometry data processing.
 
-
-
-
-    4. Intelligent resume capability for interrupted processes
-    5. Adaptive optimization based on study size and system resources
+    The Wizard provides a clean interface for creating and executing analysis scripts
+    that process raw MS data through the complete pipeline: file discovery, feature
+    detection, sample processing, study assembly, alignment, merging, and export.
 
-
-
+    This simplified version focuses on two core functions:
+    - create_script(): Generate standalone analysis scripts
+    - execute(): Create and run analysis scripts
     """
 
     def __init__(
         self,
-
-
+        source: str = "",
+        folder: str = "",
         polarity: str = "positive",
         adducts: Optional[List[str]] = None,
-        num_cores: int =
+        num_cores: int = 0,
         **kwargs
     ):
         """
-        Initialize the Wizard
+        Initialize the Wizard with analysis parameters.
 
         Parameters:
-
-
+            source: Directory containing raw data files
+            folder: Output directory for processed study
             polarity: Ion polarity mode ("positive" or "negative")
             adducts: List of adduct specifications (auto-set if None)
-            num_cores: Number of CPU cores
+            num_cores: Number of CPU cores (0 = auto-detect 75% of available)
             **kwargs: Additional parameters (see wizard_def for full list)
         """
 
+        # Auto-detect optimal number of cores if not specified
+        if num_cores <= 0:
+            num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
+
         # Create parameters instance
         if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
             self.params = kwargs.pop("params")
         else:
-            # Create default parameters
+            # Create default parameters
             self.params = wizard_def(
-
-
+                source=source,
+                folder=folder,
                 polarity=polarity,
                 num_cores=num_cores
             )
 
+        # Set adducts if provided
         if adducts is not None:
             self.params.adducts = adducts
 
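
The rewritten constructor shown above auto-detects the worker count when num_cores is left at 0; the computation in the added lines is simply a floor of roughly 75% of the available CPUs, never less than one:

    import multiprocessing

    # Same expression as in the added lines above.
    num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))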
@@ -283,893 +280,632 @@ class Wizard:
|
|
|
283
280
|
setattr(self.params, key, value)
|
|
284
281
|
|
|
285
282
|
# Validate required parameters
|
|
286
|
-
if not self.params.
|
|
287
|
-
raise ValueError("
|
|
288
|
-
if not self.params.
|
|
289
|
-
raise ValueError("
|
|
290
|
-
|
|
291
|
-
# Create
|
|
292
|
-
self.
|
|
293
|
-
self.
|
|
294
|
-
self.
|
|
295
|
-
|
|
296
|
-
#
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
# Create checkpoint file path
|
|
307
|
-
self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"
|
|
308
|
-
|
|
309
|
-
self.logger.info(f"Wizard initialized for {self.polarity} mode")
|
|
310
|
-
self.logger.info(f"Data source: {self.data_source_path}")
|
|
311
|
-
self.logger.info(f"Study folder: {self.study_folder_path}")
|
|
312
|
-
self.logger.info(f"Using {self.params.num_cores} CPU cores")
|
|
313
|
-
|
|
314
|
-
# Load checkpoint if resuming
|
|
315
|
-
if self.params.resume_enabled:
|
|
316
|
-
self._load_checkpoint()
|
|
317
|
-
|
|
318
|
-
@property
|
|
319
|
-
def polarity(self) -> str:
|
|
320
|
-
"""Get the polarity setting."""
|
|
321
|
-
return self.params.polarity
|
|
322
|
-
|
|
323
|
-
@property
|
|
324
|
-
def adducts(self) -> List[str]:
|
|
325
|
-
"""Get the adducts list."""
|
|
326
|
-
return self.params.adducts
|
|
327
|
-
|
|
328
|
-
def _setup_logging(self):
|
|
329
|
-
"""Setup comprehensive logging system."""
|
|
330
|
-
# Create logger
|
|
331
|
-
log_label = f"Wizard-{self.polarity}"
|
|
332
|
-
|
|
333
|
-
if self.params.log_to_file:
|
|
334
|
-
log_file = self.study_folder_path / "wizard.log"
|
|
335
|
-
sink = str(log_file)
|
|
336
|
-
else:
|
|
337
|
-
sink = "sys.stdout"
|
|
338
|
-
|
|
339
|
-
self.logger = MassterLogger(
|
|
340
|
-
instance_type="wizard",
|
|
341
|
-
level=self.params.log_level.upper(),
|
|
342
|
-
label=log_label,
|
|
343
|
-
sink=sink,
|
|
344
|
-
)
|
|
345
|
-
|
|
346
|
-
# Also create a simple file logger for critical info
|
|
347
|
-
self.log_file = self.study_folder_path / "processing.log"
|
|
348
|
-
|
|
349
|
-
def _log_progress(self, message: str, level: str = "INFO"):
|
|
350
|
-
"""Log progress message with timestamp."""
|
|
351
|
-
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
352
|
-
full_message = f"[{timestamp}] {message}"
|
|
353
|
-
|
|
354
|
-
# Log to masster logger
|
|
355
|
-
getattr(self.logger, level.lower())(message)
|
|
356
|
-
|
|
357
|
-
# Also write to simple log file
|
|
358
|
-
with open(self.log_file, "a", encoding="utf-8") as f:
|
|
359
|
-
f.write(f"{full_message}\n")
|
|
360
|
-
|
|
361
|
-
if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
|
|
362
|
-
print(full_message)
|
|
363
|
-
|
|
364
|
-
def _save_checkpoint(self):
|
|
365
|
-
"""Save processing checkpoint for resume capability."""
|
|
366
|
-
if not self.params.resume_enabled:
|
|
367
|
-
return
|
|
368
|
-
|
|
369
|
-
import json
|
|
370
|
-
checkpoint_data = {
|
|
371
|
-
"timestamp": datetime.now().isoformat(),
|
|
372
|
-
"current_step": self.current_step,
|
|
373
|
-
"processed_files": self.processed_files,
|
|
374
|
-
"failed_files": self.failed_files,
|
|
375
|
-
"params": {
|
|
376
|
-
"data_source": self.params.data_source,
|
|
377
|
-
"study_folder": self.params.study_folder,
|
|
378
|
-
"polarity": self.params.polarity,
|
|
379
|
-
"adducts": self.params.adducts,
|
|
380
|
-
"num_cores": self.params.num_cores,
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
try:
|
|
385
|
-
with open(self.checkpoint_file, "w") as f:
|
|
386
|
-
json.dump(checkpoint_data, f, indent=2)
|
|
387
|
-
self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
|
|
388
|
-
except Exception as e:
|
|
389
|
-
self.logger.warning(f"Failed to save checkpoint: {e}")
|
|
390
|
-
|
|
391
|
-
def _load_checkpoint(self):
|
|
392
|
-
"""Load processing checkpoint for resume capability."""
|
|
393
|
-
if not self.checkpoint_file.exists():
|
|
394
|
-
return
|
|
395
|
-
|
|
396
|
-
import json
|
|
397
|
-
try:
|
|
398
|
-
with open(self.checkpoint_file, "r") as f:
|
|
399
|
-
checkpoint_data = json.load(f)
|
|
400
|
-
|
|
401
|
-
self.processed_files = checkpoint_data.get("processed_files", [])
|
|
402
|
-
self.failed_files = checkpoint_data.get("failed_files", [])
|
|
403
|
-
self.current_step = checkpoint_data.get("current_step", "initialized")
|
|
404
|
-
|
|
405
|
-
self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
|
|
406
|
-
self.logger.info(f"Previous step: {self.current_step}")
|
|
407
|
-
|
|
408
|
-
except Exception as e:
|
|
409
|
-
self.logger.warning(f"Failed to load checkpoint: {e}")
|
|
410
|
-
self.processed_files = []
|
|
411
|
-
self.failed_files = []
|
|
412
|
-
|
|
413
|
-
def discover_files(self) -> List[Path]:
|
|
283
|
+
if not self.params.source:
|
|
284
|
+
raise ValueError("source is required")
|
|
285
|
+
if not self.params.folder:
|
|
286
|
+
raise ValueError("folder is required")
|
|
287
|
+
|
|
288
|
+
# Create and validate paths
|
|
289
|
+
self.source_path = Path(self.params.source)
|
|
290
|
+
self.folder_path = Path(self.params.folder)
|
|
291
|
+
self.folder_path.mkdir(parents=True, exist_ok=True)
|
|
292
|
+
|
|
293
|
+
# Auto-infer polarity from the first file if not explicitly set by user
|
|
294
|
+
if polarity == "positive" and "polarity" not in kwargs:
|
|
295
|
+
inferred_polarity = self._infer_polarity_from_first_file()
|
|
296
|
+
if inferred_polarity:
|
|
297
|
+
self.params.polarity = inferred_polarity
|
|
298
|
+
# Update adducts based on inferred polarity
|
|
299
|
+
self.params.__post_init__()
|
|
300
|
+
|
|
301
|
+
def _infer_polarity_from_first_file(self) -> str:
|
|
414
302
|
"""
|
|
415
|
-
|
|
303
|
+
Infer polarity from the first available raw data file.
|
|
416
304
|
|
|
417
305
|
Returns:
|
|
418
|
-
|
|
306
|
+
Inferred polarity string ("positive" or "negative") or None if detection fails
|
|
419
307
|
"""
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
files = list(self.data_source_path.rglob(pattern))
|
|
308
|
+
try:
|
|
309
|
+
# Find first file
|
|
310
|
+
for extension in ['.wiff', '.raw', '.mzML', '.d']:
|
|
311
|
+
pattern = f"**/*{extension}" if True else f"*{extension}" # search_subfolders=True
|
|
312
|
+
files = list(self.source_path.rglob(pattern))
|
|
313
|
+
if files:
|
|
314
|
+
first_file = files[0]
|
|
315
|
+
break
|
|
429
316
|
else:
|
|
430
|
-
|
|
431
|
-
files = list(self.data_source_path.glob(pattern))
|
|
317
|
+
return None
|
|
432
318
|
|
|
433
|
-
#
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
skip_file = True
|
|
440
|
-
self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
|
|
441
|
-
break
|
|
319
|
+
# Only implement for .wiff files initially (most common format)
|
|
320
|
+
if first_file.suffix.lower() == '.wiff':
|
|
321
|
+
from masster.sample.load import _wiff_to_dict
|
|
322
|
+
|
|
323
|
+
# Extract metadata from first file
|
|
324
|
+
metadata_df = _wiff_to_dict(str(first_file))
|
|
442
325
|
|
|
443
|
-
if not
|
|
444
|
-
#
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
326
|
+
if not metadata_df.empty and 'polarity' in metadata_df.columns:
|
|
327
|
+
# Get polarity from first experiment
|
|
328
|
+
first_polarity = metadata_df['polarity'].iloc[0]
|
|
329
|
+
|
|
330
|
+
# Convert numeric polarity codes to string
|
|
331
|
+
if first_polarity == 1 or str(first_polarity).lower() in ['positive', 'pos', '+']:
|
|
332
|
+
return "positive"
|
|
333
|
+
elif first_polarity == -1 or str(first_polarity).lower() in ['negative', 'neg', '-']:
|
|
334
|
+
return "negative"
|
|
335
|
+
|
|
336
|
+
except Exception:
|
|
337
|
+
# Silently fall back to default if inference fails
|
|
338
|
+
pass
|
|
453
339
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
340
|
+
return None
|
|
341
|
+
|
|
342
|
+
@property
|
|
343
|
+
def polarity(self) -> str:
|
|
344
|
+
"""Get the ion polarity mode."""
|
|
345
|
+
return self.params.polarity
|
|
346
|
+
|
|
347
|
+
@property
|
|
348
|
+
def adducts(self) -> List[str]:
|
|
349
|
+
"""Get the adduct specifications."""
|
|
350
|
+
return self.params.adducts
|
|
351
|
+
|
|
352
|
+
def create_script(self, filename: str = "run_masster.py") -> bool:
|
|
465
353
|
"""
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
This method replicates the core processing from parallel_sample_processing.py
|
|
469
|
-
but with wizard-specific configuration and error handling.
|
|
354
|
+
Generate a standalone Python script for the analysis pipeline.
|
|
470
355
|
|
|
471
356
|
Parameters:
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
357
|
+
filename: Name for the generated script file
|
|
358
|
+
|
|
475
359
|
Returns:
|
|
476
|
-
|
|
360
|
+
True if script was generated successfully, False otherwise
|
|
477
361
|
"""
|
|
478
|
-
import gc
|
|
479
|
-
|
|
480
|
-
# Generate output filename
|
|
481
|
-
file_out = file_path.stem + '.sample5'
|
|
482
|
-
output_file = self.study_folder_path / file_out
|
|
483
|
-
|
|
484
|
-
# Initialize masster Sample with delayed import
|
|
485
|
-
import masster
|
|
486
|
-
sample = masster.Sample(
|
|
487
|
-
log_label=file_path.name,
|
|
488
|
-
log_level='ERROR' # Reduce logging overhead in parallel processing
|
|
489
|
-
)
|
|
490
|
-
|
|
491
|
-
# Check if file should be skipped
|
|
492
|
-
skip = False
|
|
493
|
-
if not reset and not self.params.force_reprocess and output_file.exists():
|
|
494
|
-
try:
|
|
495
|
-
# Attempt to load existing processed file to verify it's valid
|
|
496
|
-
sample.load(str(output_file))
|
|
497
|
-
skip = True
|
|
498
|
-
except Exception:
|
|
499
|
-
# If loading fails, file needs to be reprocessed
|
|
500
|
-
skip = False
|
|
501
|
-
|
|
502
|
-
if skip:
|
|
503
|
-
self.logger.debug(f"Skipping {file_path.name} (already processed)")
|
|
504
|
-
return output_file.stem
|
|
505
|
-
|
|
506
|
-
self.logger.info(f"Processing {file_path.name}")
|
|
507
|
-
|
|
508
362
|
try:
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
# STEP 2: Feature detection - First pass (strict parameters)
|
|
513
|
-
sample.find_features(
|
|
514
|
-
chrom_fwhm=self.params.chrom_fwhm,
|
|
515
|
-
noise=self.params.noise_threshold,
|
|
516
|
-
tol_ppm=self.params.tol_ppm,
|
|
517
|
-
chrom_peak_snr=self.params.chrom_peak_snr,
|
|
518
|
-
min_trace_length_multiplier=0.5,
|
|
519
|
-
chrom_fwhm_min=self.params.chrom_fwhm
|
|
520
|
-
)
|
|
521
|
-
|
|
522
|
-
# STEP 3: Feature detection - Second pass (relaxed parameters)
|
|
523
|
-
sample.find_features(
|
|
524
|
-
chrom_peak_snr=self.params.chrom_peak_snr,
|
|
525
|
-
noise=self.params.noise_threshold / 10, # Lower noise threshold
|
|
526
|
-
chrom_fwhm=2.0 # Wider peaks
|
|
527
|
-
)
|
|
528
|
-
|
|
529
|
-
# STEP 4: Adduct detection
|
|
530
|
-
sample.find_adducts(adducts=self.adducts)
|
|
531
|
-
|
|
532
|
-
# STEP 5: MS2 spectrum identification
|
|
533
|
-
sample.find_ms2()
|
|
534
|
-
|
|
535
|
-
# STEP 6: Save processed data
|
|
536
|
-
sample.save(filename=str(output_file))
|
|
537
|
-
|
|
538
|
-
# STEP 7: Generate additional outputs if requested
|
|
539
|
-
if "csv" in self.params.export_formats:
|
|
540
|
-
csv_file = output_file.with_suffix('.features.csv')
|
|
541
|
-
sample.export_features(filename=str(csv_file))
|
|
542
|
-
|
|
543
|
-
if "mgf" in self.params.export_formats:
|
|
544
|
-
mgf_file = output_file.with_suffix('.mgf')
|
|
545
|
-
sample.export_mgf(filename=str(mgf_file), use_cache=False)
|
|
363
|
+
script_path = self.folder_path / filename
|
|
364
|
+
script_content = self._generate_script_content()
|
|
546
365
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
sample.plot_2d(filename=str(plot_file), markersize=4)
|
|
366
|
+
with open(script_path, 'w', encoding='utf-8') as f:
|
|
367
|
+
f.write(script_content)
|
|
550
368
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
del sample
|
|
554
|
-
gc.collect()
|
|
555
|
-
|
|
556
|
-
return result
|
|
369
|
+
print(f"Analysis script created: {script_path}")
|
|
370
|
+
return True
|
|
557
371
|
|
|
558
372
|
except Exception as e:
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
def _process_batch(self, file_batch: List[Path]) -> List[str]:
|
|
565
|
-
"""Process a batch of files in a single worker."""
|
|
566
|
-
results = []
|
|
567
|
-
for file_path in file_batch:
|
|
568
|
-
result = self._process_single_file(file_path)
|
|
569
|
-
if result:
|
|
570
|
-
results.append(result)
|
|
571
|
-
else:
|
|
572
|
-
results.append(None)
|
|
573
|
-
return results
|
|
574
|
-
|
|
575
|
-
def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
|
|
373
|
+
print(f"Failed to create script: {e}")
|
|
374
|
+
return False
|
|
375
|
+
|
|
376
|
+
def execute(self, filename: str = "run_masster.py") -> bool:
|
|
576
377
|
"""
|
|
577
|
-
|
|
378
|
+
Create and execute a standalone analysis script.
|
|
578
379
|
|
|
579
380
|
Parameters:
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
Returns:
|
|
583
|
-
True if conversion completed successfully
|
|
584
|
-
"""
|
|
585
|
-
self._log_progress("=== Starting Sample5 Conversion ===")
|
|
586
|
-
self.current_step = "converting_to_sample5"
|
|
587
|
-
|
|
588
|
-
if file_list is None:
|
|
589
|
-
file_list = self.discover_files()
|
|
590
|
-
|
|
591
|
-
if not file_list:
|
|
592
|
-
self.logger.warning("No files found for conversion")
|
|
593
|
-
return False
|
|
594
|
-
|
|
595
|
-
# Filter out already processed files if resuming
|
|
596
|
-
if self.params.resume_enabled and self.processed_files:
|
|
597
|
-
remaining_files = []
|
|
598
|
-
for file_path in file_list:
|
|
599
|
-
if str(file_path) not in self.processed_files:
|
|
600
|
-
remaining_files.append(file_path)
|
|
601
|
-
file_list = remaining_files
|
|
381
|
+
filename: Name for the generated script file
|
|
602
382
|
|
|
603
|
-
if not file_list:
|
|
604
|
-
self._log_progress("All files already processed")
|
|
605
|
-
return True
|
|
606
|
-
|
|
607
|
-
self._log_progress(f"Converting {len(file_list)} files to sample5 format")
|
|
608
|
-
|
|
609
|
-
conversion_start = time.time()
|
|
610
|
-
successful_count = 0
|
|
611
|
-
failed_count = 0
|
|
612
|
-
|
|
613
|
-
if self.params.use_process_pool:
|
|
614
|
-
# ProcessPoolExecutor approach - better for CPU-intensive work
|
|
615
|
-
if len(file_list) <= self.params.batch_size:
|
|
616
|
-
# Few files: process individually
|
|
617
|
-
self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")
|
|
618
|
-
|
|
619
|
-
with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
|
|
620
|
-
futures = [
|
|
621
|
-
executor.submit(self._process_single_file, file_path)
|
|
622
|
-
for file_path in file_list
|
|
623
|
-
]
|
|
624
|
-
|
|
625
|
-
for i, future in enumerate(concurrent.futures.as_completed(futures)):
|
|
626
|
-
result = future.result()
|
|
627
|
-
if result:
|
|
628
|
-
successful_count += 1
|
|
629
|
-
self.processed_files.append(str(file_list[i]))
|
|
630
|
-
else:
|
|
631
|
-
failed_count += 1
|
|
632
|
-
self.failed_files.append(str(file_list[i]))
|
|
633
|
-
|
|
634
|
-
# Progress update and checkpoint
|
|
635
|
-
if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
|
|
636
|
-
progress = (successful_count + failed_count) / len(file_list) * 100
|
|
637
|
-
self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
|
|
638
|
-
self._save_checkpoint()
|
|
639
|
-
|
|
640
|
-
else:
|
|
641
|
-
# Many files: process in batches
|
|
642
|
-
batches = [
|
|
643
|
-
file_list[i:i + self.params.batch_size]
|
|
644
|
-
for i in range(0, len(file_list), self.params.batch_size)
|
|
645
|
-
]
|
|
646
|
-
|
|
647
|
-
self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")
|
|
648
|
-
|
|
649
|
-
with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
|
|
650
|
-
futures = [executor.submit(self._process_batch, batch) for batch in batches]
|
|
651
|
-
|
|
652
|
-
for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
|
|
653
|
-
batch_results = future.result()
|
|
654
|
-
batch = batches[batch_idx]
|
|
655
|
-
|
|
656
|
-
for i, result in enumerate(batch_results):
|
|
657
|
-
if result:
|
|
658
|
-
successful_count += 1
|
|
659
|
-
self.processed_files.append(str(batch[i]))
|
|
660
|
-
else:
|
|
661
|
-
failed_count += 1
|
|
662
|
-
self.failed_files.append(str(batch[i]))
|
|
663
|
-
|
|
664
|
-
# Progress update
|
|
665
|
-
progress = (successful_count + failed_count) / len(file_list) * 100
|
|
666
|
-
self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
|
|
667
|
-
self._save_checkpoint()
|
|
668
|
-
|
|
669
|
-
else:
|
|
670
|
-
# ThreadPoolExecutor approach
|
|
671
|
-
self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")
|
|
672
|
-
|
|
673
|
-
with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
|
|
674
|
-
futures = [
|
|
675
|
-
executor.submit(self._process_single_file, file_path)
|
|
676
|
-
for file_path in file_list
|
|
677
|
-
]
|
|
678
|
-
|
|
679
|
-
for i, future in enumerate(concurrent.futures.as_completed(futures)):
|
|
680
|
-
result = future.result()
|
|
681
|
-
if result:
|
|
682
|
-
successful_count += 1
|
|
683
|
-
self.processed_files.append(str(file_list[i]))
|
|
684
|
-
else:
|
|
685
|
-
failed_count += 1
|
|
686
|
-
self.failed_files.append(str(file_list[i]))
|
|
687
|
-
|
|
688
|
-
if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
|
|
689
|
-
progress = (successful_count + failed_count) / len(file_list) * 100
|
|
690
|
-
self._log_progress(f"Progress: {progress:.1f}%")
|
|
691
|
-
self._save_checkpoint()
|
|
692
|
-
|
|
693
|
-
conversion_time = time.time() - conversion_start
|
|
694
|
-
|
|
695
|
-
self._log_progress("=== Sample5 Conversion Complete ===")
|
|
696
|
-
self._log_progress(f"Successful: {successful_count}")
|
|
697
|
-
self._log_progress(f"Failed: {failed_count}")
|
|
698
|
-
self._log_progress(f"Total time: {conversion_time:.1f} seconds")
|
|
699
|
-
|
|
700
|
-
if failed_count > 0:
|
|
701
|
-
self.logger.warning(f"{failed_count} files failed to process")
|
|
702
|
-
for failed_file in self.failed_files[-failed_count:]:
|
|
703
|
-
self.logger.warning(f"Failed: {failed_file}")
|
|
704
|
-
|
|
705
|
-
self._save_checkpoint()
|
|
706
|
-
return successful_count > 0
|
|
707
|
-
|
|
708
|
-
def assemble_study(self) -> bool:
|
|
709
|
-
"""
|
|
710
|
-
Assemble processed sample5 files into a study.
|
|
711
|
-
|
|
712
383
|
Returns:
|
|
713
|
-
True if
|
|
384
|
+
True if execution completed successfully, False otherwise
|
|
714
385
|
"""
|
|
715
|
-
|
|
716
|
-
self.
|
|
717
|
-
|
|
718
|
-
# Find all sample5 files
|
|
719
|
-
sample5_files = list(self.study_folder_path.glob("*.sample5"))
|
|
720
|
-
|
|
721
|
-
if not sample5_files:
|
|
722
|
-
self.logger.error("No sample5 files found for study assembly")
|
|
386
|
+
# First create the script
|
|
387
|
+
if not self.create_script(filename):
|
|
723
388
|
return False
|
|
724
|
-
|
|
725
|
-
|
|
389
|
+
|
|
390
|
+
# Then execute it
|
|
391
|
+
script_path = self.folder_path / filename
|
|
726
392
|
|
|
727
393
|
try:
|
|
728
|
-
|
|
729
|
-
import masster
|
|
730
|
-
study_params = study_defaults(
|
|
731
|
-
folder=str(self.study_folder_path),
|
|
732
|
-
polarity=self.polarity,
|
|
733
|
-
log_level="INFO",
|
|
734
|
-
log_label=f"Study-{self.polarity}",
|
|
735
|
-
adducts=self.adducts
|
|
736
|
-
)
|
|
737
|
-
|
|
738
|
-
self.study = masster.Study(params=study_params)
|
|
394
|
+
print("Executing...")
|
|
739
395
|
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
396
|
+
import subprocess
|
|
397
|
+
result = subprocess.run([
|
|
398
|
+
sys.executable, str(script_path)
|
|
399
|
+
], cwd=str(self.folder_path), encoding='utf-8', errors='replace')
|
|
743
400
|
|
|
744
|
-
|
|
401
|
+
success = result.returncode == 0
|
|
745
402
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
chrom_coherence=0.3,
|
|
753
|
-
chrom_prominence_scaled=1
|
|
754
|
-
)
|
|
755
|
-
self.study.features_filter(feature_selection)
|
|
403
|
+
if success:
|
|
404
|
+
print("=" * 70)
|
|
405
|
+
print("Script execution completed successfully")
|
|
406
|
+
else:
|
|
407
|
+
print("=" * 70)
|
|
408
|
+
print(f"Script execution failed with return code: {result.returncode}")
|
|
756
409
|
|
|
757
|
-
|
|
758
|
-
self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")
|
|
759
|
-
|
|
760
|
-
self._save_checkpoint()
|
|
761
|
-
return True
|
|
410
|
+
return success
|
|
762
411
|
|
|
763
412
|
except Exception as e:
|
|
764
|
-
|
|
413
|
+
print(f"Error during script execution: {e}")
|
|
765
414
|
return False
|
|
766
|
-
|
|
767
|
-
def
|
|
768
|
-
"""
|
|
769
|
-
|
|
415
|
+
|
|
416
|
+
def _generate_script_content(self) -> str:
|
|
417
|
+
"""Generate the complete analysis script content."""
|
|
418
|
+
|
|
419
|
+
# Convert Path objects to strings for JSON serialization
|
|
420
|
+
params_dict = {}
|
|
421
|
+
for key, value in self.params.__dict__.items():
|
|
422
|
+
if key == '_param_metadata': # Skip metadata in generated script
|
|
423
|
+
continue
|
|
424
|
+
if isinstance(value, Path):
|
|
425
|
+
params_dict[key] = str(value)
|
|
426
|
+
else:
|
|
427
|
+
params_dict[key] = value
|
|
428
|
+
|
|
429
|
+
# Obtain list of files in source with extension wiff, .raw, .mzML
|
|
430
|
+
raw_files = []
|
|
431
|
+
for ext in params_dict.get('file_extensions', []):
|
|
432
|
+
raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
|
|
433
|
+
|
|
434
|
+
# Create readable PARAMS dict with comments
|
|
435
|
+
params_lines = []
|
|
436
|
+
params_lines.append('# Analysis parameters')
|
|
437
|
+
params_lines.append('PARAMS = {')
|
|
770
438
|
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
"""
|
|
774
|
-
|
|
775
|
-
|
|
439
|
+
# Core Configuration
|
|
440
|
+
params_lines.append(' # === Core Configuration ===')
|
|
441
|
+
params_lines.append(f' "source": {params_dict.get("source", "")!r}, # Directory containing raw data files')
|
|
442
|
+
params_lines.append(f' "folder": {params_dict.get("folder", "")!r}, # Output directory for processed study')
|
|
443
|
+
params_lines.append(f' "polarity": {params_dict.get("polarity", "positive")!r}, # Ion polarity mode ("positive" or "negative")')
|
|
444
|
+
params_lines.append(f' "num_cores": {params_dict.get("num_cores", 4)}, # Number of CPU cores for parallel processing')
|
|
445
|
+
params_lines.append('')
|
|
776
446
|
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
447
|
+
# File Discovery
|
|
448
|
+
params_lines.append(' # === File Discovery ===')
|
|
449
|
+
params_lines.append(f' "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r}, # File extensions to search for')
|
|
450
|
+
params_lines.append(f' "search_subfolders": {params_dict.get("search_subfolders", True)}, # Whether to search subdirectories recursively')
|
|
451
|
+
params_lines.append(f' "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r}, # Filename patterns to skip')
|
|
452
|
+
params_lines.append('')
|
|
780
453
|
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
454
|
+
# Processing Parameters
|
|
455
|
+
params_lines.append(' # === Processing Parameters ===')
|
|
456
|
+
params_lines.append(f' "adducts": {params_dict.get("adducts", [])!r}, # Adduct specifications for feature detection and annotation')
|
|
457
|
+
params_lines.append(f' "detector_type": {params_dict.get("detector_type", "unknown")!r}, # MS detector type ("orbitrap", "tof", "unknown")')
|
|
458
|
+
params_lines.append('')
|
|
459
|
+
|
|
460
|
+
# Alignment & Merging
|
|
461
|
+
params_lines.append(' # === Alignment & Merging ===')
|
|
462
|
+
params_lines.append(f' "rt_tol": {params_dict.get("rt_tol", 2.0)}, # Retention time tolerance for alignment (seconds)')
|
|
463
|
+
params_lines.append(f' "mz_tol": {params_dict.get("mz_tol", 0.01)}, # Mass-to-charge ratio tolerance for alignment (Da)')
|
|
464
|
+
params_lines.append(f' "alignment_method": {params_dict.get("alignment_method", "kd")!r}, # Algorithm for sample alignment')
|
|
465
|
+
params_lines.append(f' "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)}, # Minimum samples required per consensus feature')
|
|
466
|
+
params_lines.append(f' "merge_method": {params_dict.get("merge_method", "qt")!r}, # Method for merging consensus features')
|
|
467
|
+
params_lines.append('')
|
|
468
|
+
|
|
469
|
+
# Sample Processing
|
|
470
|
+
params_lines.append(' # === Sample Processing (used in add_samples_from_folder) ===')
|
|
471
|
+
params_lines.append(f' "batch_size": {params_dict.get("batch_size", 8)}, # Number of files to process per batch')
|
|
472
|
+
params_lines.append(f' "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)}, # Memory limit for processing (GB)')
|
|
473
|
+
params_lines.append('')
|
|
474
|
+
|
|
475
|
+
# Script Options
|
|
476
|
+
params_lines.append(' # === Script Options ===')
|
|
477
|
+
params_lines.append(f' "resume_enabled": {params_dict.get("resume_enabled", True)}, # Enable automatic resume capability')
|
|
478
|
+
params_lines.append(f' "force_reprocess": {params_dict.get("force_reprocess", False)}, # Force reprocessing of existing files')
|
|
479
|
+
params_lines.append(f' "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)}, # Clean up temporary files after processing')
|
|
480
|
+
|
|
481
|
+
params_lines.append('}')
|
|
482
|
+
|
|
483
|
+
# Create script lines
|
|
484
|
+
script_lines = [
|
|
485
|
+
'#!/usr/bin/env python3',
|
|
486
|
+
'"""',
|
|
487
|
+
'Automated Mass Spectrometry Data Analysis Pipeline',
|
|
488
|
+
f'Generated by masster wizard v{version}',
|
|
489
|
+
'"""',
|
|
490
|
+
'',
|
|
491
|
+
'import sys',
|
|
492
|
+
'import time',
|
|
493
|
+
'from pathlib import Path',
|
|
494
|
+
'',
|
|
495
|
+
'# Import masster modules',
|
|
496
|
+
'from masster.study import Study',
|
|
497
|
+
'from masster import __version__',
|
|
498
|
+
'',
|
|
499
|
+
]
|
|
500
|
+
|
|
501
|
+
# Add the formatted PARAMS
|
|
502
|
+
script_lines.extend(params_lines)
|
|
503
|
+
|
|
504
|
+
# Add the main function and pipeline
|
|
505
|
+
script_lines.extend([
|
|
506
|
+
'',
|
|
507
|
+
'',
|
|
508
|
+
'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
|
|
509
|
+
' """Discover raw data files in the source folder."""',
|
|
510
|
+
' source_path = Path(source_folder)',
|
|
511
|
+
' raw_files = []',
|
|
512
|
+
' ',
|
|
513
|
+
' for ext in file_extensions:',
|
|
514
|
+
' if search_subfolders:',
|
|
515
|
+
' pattern = f"**/*{ext}"',
|
|
516
|
+
' files = list(source_path.rglob(pattern))',
|
|
517
|
+
' else:',
|
|
518
|
+
' pattern = f"*{ext}"',
|
|
519
|
+
' files = list(source_path.glob(pattern))',
|
|
520
|
+
' raw_files.extend(files)',
|
|
521
|
+
' ',
|
|
522
|
+
' return raw_files',
|
|
523
|
+
'',
|
|
524
|
+
'',
|
|
525
|
+
'def process_single_file(args):',
|
|
526
|
+
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
527
|
+
' raw_file, output_folder = args',
|
|
528
|
+
' from masster.sample import Sample',
|
|
529
|
+
' ',
|
|
530
|
+
' try:',
|
|
531
|
+
' # Create sample5 filename',
|
|
532
|
+
' sample_name = raw_file.stem',
|
|
533
|
+
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
534
|
+
' ',
|
|
535
|
+
' # Skip if sample5 already exists',
|
|
536
|
+
' if sample5_path.exists():',
|
|
537
|
+
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
538
|
+
' return str(sample5_path)',
|
|
539
|
+
' ',
|
|
540
|
+
' print(f" Converting {raw_file.name}...")',
|
|
541
|
+
' ',
|
|
542
|
+
' # Load and process raw file with full pipeline',
|
|
543
|
+
' sample = Sample(log_label=sample_name)',
|
|
544
|
+
' sample.load(filename=str(raw_file))',
|
|
545
|
+
' sample.find_features(',
|
|
546
|
+
' noise=PARAMS[\'noise_threshold\'],',
|
|
547
|
+
' chrom_fwhm=PARAMS[\'smoothing_width\'],',
|
|
548
|
+
' chrom_peak_snr=PARAMS[\'peak_threshold\']',
|
|
549
|
+
' )',
|
|
550
|
+
' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
|
|
551
|
+
' sample.find_ms2()',
|
|
552
|
+
' # sample.find_iso()',
|
|
553
|
+
' # sample.export_mgf()',
|
|
554
|
+
' # sample.export_mztab()',
|
|
555
|
+
' # sample.plot_2d(filename="{sample_name}.html")',
|
|
556
|
+
' sample.save(str(sample5_path))',
|
|
557
|
+
' ',
|
|
558
|
+
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
559
|
+
' return str(sample5_path)',
|
|
560
|
+
' ',
|
|
561
|
+
' except Exception as e:',
|
|
562
|
+
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
563
|
+
' return None',
|
|
564
|
+
'',
|
|
565
|
+
'',
|
|
566
|
+
'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
|
|
567
|
+
' """Convert raw data files to sample5 format."""',
|
|
568
|
+
' import concurrent.futures',
|
|
569
|
+
' import os',
|
|
570
|
+
' ',
|
|
571
|
+
' # Create output directory',
|
|
572
|
+
' os.makedirs(output_folder, exist_ok=True)',
|
|
573
|
+
' ',
|
|
574
|
+
' # Prepare arguments for multiprocessing',
|
|
575
|
+
' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
|
|
576
|
+
' ',
|
|
577
|
+
' # Process files in parallel',
|
|
578
|
+
' sample5_files = []',
|
|
579
|
+
' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
|
|
580
|
+
' futures = [executor.submit(process_single_file, args) for args in file_args]',
|
|
581
|
+
' ',
|
|
582
|
+
' for future in concurrent.futures.as_completed(futures):',
|
|
583
|
+
' result = future.result()',
|
|
584
|
+
' if result:',
|
|
585
|
+
' sample5_files.append(result)',
|
|
586
|
+
' ',
|
|
587
|
+
' return sample5_files',
|
|
588
|
+
'',
|
|
589
|
+
'',
|
|
590
|
+
'def main():',
|
|
591
|
+
' """Main analysis pipeline."""',
|
|
592
|
+
' try:',
|
|
593
|
+
' print("=" * 70)',
|
|
594
|
+
f' print("masster {version} - Automated MS Data Analysis")',
|
|
595
|
+
' print("=" * 70)',
|
|
596
|
+
' print(f"Source: {PARAMS[\'source\']}")',
|
|
597
|
+
' print(f"Output: {PARAMS[\'folder\']}")',
|
|
598
|
+
' print(f"Polarity: {PARAMS[\'polarity\']}")',
|
|
599
|
+
' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
|
|
600
|
+
' print("=" * 70)',
|
|
601
|
+
' ',
|
|
602
|
+
' start_time = time.time()',
|
|
603
|
+
' ',
|
|
604
|
+
' # Step 1: Discover raw data files',
|
|
605
|
+
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
606
|
+
' raw_files = discover_raw_files(',
|
|
607
|
+
' PARAMS[\'source\'],',
|
|
608
|
+
' PARAMS[\'file_extensions\'],',
|
|
609
|
+
' PARAMS[\'search_subfolders\']',
|
|
610
|
+
' )',
|
|
611
|
+
' ',
|
|
612
|
+
' if not raw_files:',
|
|
613
|
+
' print("No raw data files found!")',
|
|
614
|
+
' return False',
|
|
615
|
+
' ',
|
|
616
|
+
' print(f"Found {len(raw_files)} raw data files")',
|
|
617
|
+
' for f in raw_files[:5]: # Show first 5 files',
|
|
618
|
+
' print(f" {f.name}")',
|
|
619
|
+
' if len(raw_files) > 5:',
|
|
620
|
+
' print(f" ... and {len(raw_files) - 5} more")',
|
|
621
|
+
' ',
|
|
622
|
+
' # Step 2: Process raw files',
|
|
623
|
+
' print("\\nStep 2/7: Processing raw files...")',
|
|
624
|
+
' sample5_files = convert_raw_to_sample5(',
|
|
625
|
+
' raw_files,',
|
|
626
|
+
' PARAMS[\'folder\'],',
|
|
627
|
+
' PARAMS[\'polarity\'],',
|
|
628
|
+
' PARAMS[\'num_cores\']',
|
|
629
|
+
' )',
|
|
630
|
+
' ',
|
|
631
|
+
' if not sample5_files:',
|
|
632
|
+
' print("No sample5 files were created!")',
|
|
633
|
+
' return False',
|
|
634
|
+
' ',
|
|
635
|
+
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
636
|
+
' ',
|
|
637
|
+
' # Step 3: Create and configure study',
|
|
638
|
+
' print("\\nStep 3/7: Initializing study...")',
|
|
639
|
+
' study = Study(folder=PARAMS[\'folder\'])',
|
|
640
|
+
' study.polarity = PARAMS[\'polarity\']',
|
|
641
|
+
' study.adducts = PARAMS[\'adducts\']',
|
|
642
|
+
' ',
|
|
643
|
+
' # Step 4: Add sample5 files to study',
|
|
644
|
+
' print("\\nStep 4/7: Adding samples to study...")',
|
|
645
|
+
' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
|
|
646
|
+
' ',
|
|
647
|
+
' # Step 5: Core processing',
|
|
648
|
+
' print("\\nStep 5/7: Processing...")',
|
|
649
|
+
' study.align(',
|
|
650
|
+
' algorithm=PARAMS[\'alignment_method\'],',
|
|
651
|
+
' rt_tol=PARAMS[\'rt_tol\']',
|
|
652
|
+
' )',
|
|
653
|
+
' ',
|
|
654
|
+
' # Merge and create consensus features',
|
|
655
|
+
' study.merge(',
|
|
656
|
+
' min_samples=PARAMS[\'min_samples_per_feature\'],',
|
|
657
|
+
' threads=PARAMS[\'num_cores\'],',
|
|
658
|
+
' rt_tol=PARAMS[\'rt_tol\'],',
|
|
659
|
+
' mz_tol=PARAMS[\'mz_tol\']',
|
|
660
|
+
' )',
|
|
661
|
+
' study.find_iso()',
|
|
662
|
+
' study.fill(min_samples_rel=0.0)',
|
|
663
|
+
' study.integrate()',
|
|
664
|
+
' ',
|
|
665
|
+
' # Step 6/7: Saving results',
|
|
666
|
+
' print("\\nStep 6/7: Saving results...")',
|
|
667
|
+
' study.save()',
|
|
668
|
+
' study.export_xlsx()',
|
|
669
|
+
' study.export_mgf()',
|
|
670
|
+
' study.export_mztab()',
|
|
671
|
+
' ',
|
|
672
|
+
' # Step 7: Plots',
|
|
673
|
+
' print("\\nStep 7/7: Exporting plots...")',
|
|
674
|
+
' study.plot_consensus_2d(filename="consensus.html")',
|
|
675
|
+
' study.plot_consensus_2d(filename="consensus.png")',
|
|
676
|
+
' study.plot_alignment(filename="alignment.html")',
|
|
677
|
+
' study.plot_alignment(filename="alignment.png")',
|
|
678
|
+
' study.plot_pca(filename="pca.html")',
|
|
679
|
+
' study.plot_pca(filename="pca.png")',
|
|
680
|
+
' study.plot_bpc(filename="bpc.html")',
|
|
681
|
+
' study.plot_bpc(filename="bpc.png")',
|
|
682
|
+
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
683
|
+
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
684
|
+
|
|
685
|
+
' ',
|
|
686
|
+
' # Print summary',
|
|
687
|
+
' study.info()',
|
|
688
|
+
' total_time = time.time() - start_time',
|
|
689
|
+
' print("\\n" + "=" * 70)',
|
|
690
|
+
' print("ANALYSIS COMPLETE")',
|
|
691
|
+
' print("=" * 70)',
|
|
692
|
+
' print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
|
|
693
|
+
' print(f"Raw files processed: {len(raw_files)}")',
|
|
694
|
+
' print(f"Sample5 files created: {len(sample5_files)}")',
|
|
695
|
+
' if hasattr(study, "consensus_df"):',
|
|
696
|
+
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
697
|
+
' print("=" * 70)',
|
|
698
|
+
' ',
|
|
699
|
+
' return True',
|
|
700
|
+
' ',
|
|
701
|
+
' except KeyboardInterrupt:',
|
|
702
|
+
' print("\\nAnalysis interrupted by user")',
|
|
703
|
+
' return False',
|
|
704
|
+
' except Exception as e:',
|
|
705
|
+
' print(f"Analysis failed with error: {e}")',
|
|
706
|
+
' import traceback',
|
|
707
|
+
' traceback.print_exc()',
|
|
708
|
+
' return False',
|
|
709
|
+
'',
|
|
710
|
+
'',
|
|
711
|
+
'if __name__ == "__main__":',
|
|
712
|
+
' success = main()',
|
|
713
|
+
' sys.exit(0 if success else 1)',
|
|
714
|
+
])
|
|
715
|
+
|
|
716
|
+
return '\n'.join(script_lines)
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def create_script(
|
|
720
|
+
source: str,
|
|
721
|
+
folder: str,
|
|
722
|
+
filename: str = 'run_masster.py',
|
|
723
|
+
polarity: str = "positive",
|
|
724
|
+
adducts: Optional[List[str]] = None,
|
|
725
|
+
params: Optional[wizard_def] = None,
|
|
726
|
+
num_cores: int = 0,
|
|
727
|
+
**kwargs
|
|
728
|
+
) -> bool:
|
|
729
|
+
"""
|
|
730
|
+
Create a standalone analysis script without initializing a Wizard instance.
|
|
731
|
+
|
|
732
|
+
This function generates a Python script that replicates automated processing
|
|
733
|
+
steps with the specified configuration. The script can be executed independently
|
|
734
|
+
to perform the same analysis.
|
|
735
|
+
|
|
736
|
+
Parameters:
|
|
737
|
+
source: Directory containing raw data files
|
|
738
|
+
folder: Output directory for processed study
|
|
739
|
+
filename: Filename for the generated script (should end with .py)
|
|
740
|
+
polarity: Ion polarity mode ("positive" or "negative")
|
|
741
|
+
adducts: List of adduct specifications (auto-set if None)
|
|
742
|
+
params: Custom wizard_def parameters (optional)
|
|
743
|
+
num_cores: Number of CPU cores (0 = auto-detect)
|
|
744
|
+
**kwargs: Additional parameters to override defaults
|
|
745
|
+
|
|
746
|
+
Returns:
|
|
747
|
+
True if script was generated successfully, False otherwise
|
|
748
|
+
|
|
749
|
+
Example:
|
|
750
|
+
>>> from masster.wizard import create_script
|
|
751
|
+
>>> create_script(
|
|
752
|
+
... source=r'D:\\Data\\raw_files',
|
|
753
|
+
... folder=r'D:\\Data\\output',
|
|
754
|
+
... filename='run_masster.py',
|
|
755
|
+
... polarity='positive'
|
|
756
|
+
... )
|
|
757
|
+
"""
|
|
758
|
+
|
|
759
|
+
try:
|
|
760
|
+
# Create parameters
|
|
761
|
+
if params is not None:
|
|
762
|
+
# Use provided params as base
|
|
763
|
+
wizard_params = params
|
|
764
|
+
# Update with provided values
|
|
765
|
+
wizard_params.source = source
|
|
766
|
+
wizard_params.folder = folder
|
|
767
|
+
if polarity != "positive": # Only override if explicitly different
|
|
768
|
+
wizard_params.polarity = polarity
|
|
769
|
+
if num_cores > 0:
|
|
770
|
+
wizard_params.num_cores = num_cores
|
|
771
|
+
if adducts is not None:
|
|
772
|
+
wizard_params.adducts = adducts
|
|
773
|
+
else:
|
|
774
|
+
# Create new params with provided values
|
|
775
|
+
wizard_params = wizard_def(
|
|
776
|
+
source=source,
|
|
777
|
+
folder=folder,
|
|
778
|
+
polarity=polarity,
|
|
779
|
+
num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
|
|
798
780
|
)
|
|
799
781
|
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
# Log results
|
|
804
|
-
num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
|
|
805
|
-
self._log_progress(f"Generated {num_consensus} consensus features")
|
|
806
|
-
|
|
807
|
-
# Get study info
|
|
808
|
-
if hasattr(self.study, 'info'):
|
|
809
|
-
self.study.info()
|
|
810
|
-
|
|
811
|
-
self._save_checkpoint()
|
|
812
|
-
return True
|
|
813
|
-
|
|
814
|
-
except Exception as e:
|
|
815
|
-
self.logger.error(f"Failed to align and merge: {e}")
|
|
816
|
-
return False
|
|
817
|
-
|
|
818
|
-
def generate_plots(self) -> bool:
|
|
819
|
-
"""
|
|
820
|
-
Generate visualization plots for the study.
|
|
821
|
-
|
|
822
|
-
Returns:
|
|
823
|
-
True if plot generation was successful
|
|
824
|
-
"""
|
|
825
|
-
if not self.params.generate_plots:
|
|
826
|
-
self._log_progress("Plot generation disabled, skipping...")
|
|
827
|
-
return True
|
|
782
|
+
if adducts is not None:
|
|
783
|
+
wizard_params.adducts = adducts
|
|
828
784
|
|
|
829
|
-
|
|
830
|
-
|
|
785
|
+
# Apply any additional kwargs
|
|
786
|
+
for key, value in kwargs.items():
|
|
787
|
+
if hasattr(wizard_params, key):
|
|
788
|
+
setattr(wizard_params, key, value)
|
|
831
789
|
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
790
|
+
# Ensure study folder exists
|
|
791
|
+
study_path = Path(folder)
|
|
792
|
+
study_path.mkdir(parents=True, exist_ok=True)
|
|
835
793
|
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
# Alignment plot
|
|
840
|
-
if hasattr(self.study, 'plot_alignment'):
|
|
841
|
-
alignment_plot = self.study_folder_path / "alignment_plot.html"
|
|
842
|
-
self.study.plot_alignment(filename=str(alignment_plot))
|
|
843
|
-
plots_generated += 1
|
|
844
|
-
self.logger.info(f"Generated alignment plot: {alignment_plot}")
|
|
845
|
-
|
|
846
|
-
# Consensus 2D plot
|
|
847
|
-
if hasattr(self.study, 'plot_consensus_2d'):
|
|
848
|
-
consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
|
|
849
|
-
self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
|
|
850
|
-
plots_generated += 1
|
|
851
|
-
self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")
|
|
852
|
-
|
|
853
|
-
# PCA plot
|
|
854
|
-
if hasattr(self.study, 'plot_pca'):
|
|
855
|
-
pca_plot = self.study_folder_path / "pca_plot.html"
|
|
856
|
-
self.study.plot_pca(filename=str(pca_plot))
|
|
857
|
-
plots_generated += 1
|
|
858
|
-
self.logger.info(f"Generated PCA plot: {pca_plot}")
|
|
859
|
-
|
|
860
|
-
# Consensus statistics
|
|
861
|
-
if hasattr(self.study, 'plot_consensus_stats'):
|
|
862
|
-
stats_plot = self.study_folder_path / "consensus_stats.html"
|
|
863
|
-
self.study.plot_consensus_stats(filename=str(stats_plot))
|
|
864
|
-
plots_generated += 1
|
|
865
|
-
self.logger.info(f"Generated statistics plot: {stats_plot}")
|
|
866
|
-
|
|
867
|
-
self._log_progress(f"Generated {plots_generated} visualization plots")
|
|
868
|
-
self._save_checkpoint()
|
|
869
|
-
return True
|
|
870
|
-
|
|
871
|
-
except Exception as e:
|
|
872
|
-
self.logger.error(f"Failed to generate plots: {e}")
|
|
873
|
-
return False
|
|
874
|
-
|
|
875
|
-
def export_results(self) -> bool:
|
|
876
|
-
"""
|
|
877
|
-
Export study results in requested formats.
|
|
794
|
+
# Create a temporary Wizard instance to generate the script
|
|
795
|
+
temp_wizard = Wizard(params=wizard_params)
|
|
878
796
|
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
"""
|
|
882
|
-
self._log_progress("=== Exporting Study Results ===")
|
|
883
|
-
self.current_step = "exporting_results"
|
|
797
|
+
# Generate the script using the instance method
|
|
798
|
+
success = temp_wizard.create_script(filename)
|
|
884
799
|
|
|
885
|
-
|
|
886
|
-
self.logger.error("Study not available. Complete previous steps first.")
|
|
887
|
-
return False
|
|
800
|
+
return success
|
|
888
801
|
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
802
|
+
except Exception as e:
|
|
803
|
+
print(f"Failed to create script: {e}")
|
|
804
|
+
import traceback
|
|
805
|
+
traceback.print_exc()
|
|
806
|
+
return False
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def execute(
|
|
810
|
+
source: str,
|
|
811
|
+
folder: str,
|
|
812
|
+
filename: str = 'run_masster.py',
|
|
813
|
+
polarity: str = "positive",
|
|
814
|
+
adducts: Optional[List[str]] = None,
|
|
815
|
+
params: Optional[wizard_def] = None,
|
|
816
|
+
num_cores: int = 0,
|
|
817
|
+
**kwargs
|
|
818
|
+
) -> bool:
|
|
819
|
+
"""
|
|
820
|
+
Create and execute a standalone analysis script for automated MS data processing.
|
|
821
|
+
|
|
822
|
+
This function generates a Python script with the same parameters as create_script(),
|
|
823
|
+
but immediately executes it after creation. Combines script generation and execution
|
|
824
|
+
in a single step.
|
|
825
|
+
|
|
826
|
+
Parameters:
|
|
827
|
+
source: Directory containing raw data files
|
|
828
|
+
folder: Output directory for processed study
|
|
829
|
+
filename: Filename for the generated script (should end with .py)
|
|
830
|
+
polarity: Ion polarity mode ("positive" or "negative")
|
|
831
|
+
adducts: List of adduct specifications (auto-set if None)
|
|
832
|
+
params: Custom wizard_def parameters (optional)
|
|
833
|
+
num_cores: Number of CPU cores (0 = auto-detect)
|
|
834
|
+
**kwargs: Additional parameters to override defaults
|
|
835
|
+
|
|
836
|
+
Returns:
|
|
837
|
+
True if script was created and executed successfully, False otherwise
|
|
838
|
+
|
|
839
|
+
Example:
|
|
840
|
+
>>> from masster.wizard import execute
|
|
841
|
+
>>> execute(
|
|
842
|
+
... source=r'D:\\Data\\raw_files',
|
|
843
|
+
... folder=r'D:\\Data\\output',
|
|
844
|
+
... polarity='positive'
|
|
845
|
+
... )
|
|
846
|
+
"""
|
|
928
847
|
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
return False
|
|
848
|
+
try:
|
|
849
|
+
# First, create the script using create_script()
|
|
850
|
+
script_created = create_script(
|
|
851
|
+
source=source,
|
|
852
|
+
folder=folder,
|
|
853
|
+
filename=filename,
|
|
854
|
+
polarity=polarity,
|
|
855
|
+
adducts=adducts,
|
|
856
|
+
params=params,
|
|
857
|
+
num_cores=num_cores,
|
|
858
|
+
**kwargs
|
|
859
|
+
)
|
|
 
-
-
-
-            # Determine optimal save format based on study size
-            num_samples = len(self.study.samples_df)
-            num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
-
-            if self.params.adaptive_compression:
-                # Use compressed format for large studies
-                if num_samples > 50 or num_features > 10000:
-                    self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
-                    self.params.compress_output = True
-                else:
-                    self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
-                    self.params.compress_output = False
-
-            # Save study
-            if self.params.compress_output and hasattr(self.study, 'save_compressed'):
-                self.study.save_compressed(filename=str(study_file))
-                self.logger.info(f"Saved compressed study: {study_file}")
-            else:
-                self.study.save(filename=str(study_file))
-                self.logger.info(f"Saved study: {study_file}")
-
-            # Save metadata summary
-            metadata_file = self.study_folder_path / "study_metadata.txt"
-            with open(metadata_file, "w") as f:
-                f.write("Study Processing Summary\n")
-                f.write("========================\n")
-                f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-                f.write(f"Polarity: {self.polarity}\n")
-                f.write(f"Adducts: {', '.join(self.adducts)}\n")
-                f.write(f"Number of Samples: {num_samples}\n")
-                f.write(f"Number of Consensus Features: {num_features}\n")
-                f.write(f"Successful Files: {len(self.processed_files)}\n")
-                f.write(f"Failed Files: {len(self.failed_files)}\n")
-                f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
-                f.write(f"m/z Tolerance: {self.params.mz_tolerance} Da\n")
-                f.write(f"Merge Method: {self.params.merge_method}\n")
-                f.write(f"Processing Time: {self._get_total_processing_time()}\n")
-
-            self._log_progress(f"Saved study metadata: {metadata_file}")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to save study: {e}")
+        if not script_created:
+            print("Failed to create analysis script")
             return False
-
-    def cleanup_temp_files(self) -> bool:
-        """
-        Clean up temporary files if requested.
-
-        Returns:
-            True if cleanup was successful
-        """
-        if not self.params.cleanup_temp_files:
-            return True
 
-
+        # Get the full path to the created script
+        study_path = Path(folder)
+        script_path = study_path / Path(filename).name
 
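Note how the script location is derived above: only the basename of `filename` is kept, so the script is always looked up inside the output folder regardless of any directory component the caller passed. A small standalone sketch of that resolution, with example values only:

```python
from pathlib import Path

folder = "processed_study"              # example output folder
filename = "scripts/masster_wizard.py"  # caller-supplied name, possibly with a directory part

# Only the basename survives; the script is expected inside the output folder.
script_path = Path(folder) / Path(filename).name
print(script_path)  # processed_study/masster_wizard.py (separator depends on OS)
```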
-
-
-
-            # Remove individual sample plots if study plots were generated
-            if self.params.generate_plots:
-                temp_plots = list(self.study_folder_path.glob("*_2d.html"))
-                for plot_file in temp_plots:
-                    if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
-                        plot_file.unlink()
-                        cleaned_count += 1
-
-            # Remove checkpoint file
-            if self.checkpoint_file.exists():
-                self.checkpoint_file.unlink()
-                cleaned_count += 1
-
-            self._log_progress(f"Cleaned up {cleaned_count} temporary files")
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to cleanup temp files: {e}")
+        if not script_path.exists():
+            print(f"Script file not found: {script_path}")
             return False
-
-
-        """
-        Run the complete automated processing pipeline.
-
-        This method executes all processing steps in sequence:
-        1. Convert raw files to sample5 format
-        2. Assemble study from sample5 files
-        3. Align and merge features
-        4. Generate visualization plots
-        5. Export results in requested formats
-        6. Save final study
-        7. Clean up temporary files
-
-        Returns:
-            True if the entire pipeline completed successfully
-        """
-        self._log_progress("=" * 60)
-        self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
-        self._log_progress("=" * 60)
+
+        print(f"Executing...")
+        #print("=" * 70)
 
-
-
+        # Execute the script using subprocess with real-time output
+        import subprocess
 
+        # Run the script with Python, letting it inherit our stdout/stderr
         try:
-            #
-
-
-
-
-            # Step 2: Assemble study
-            if not self.assemble_study():
-                self.logger.error("Study assembly failed")
-                return False
-
-            # Step 3: Align and merge
-            if not self.align_and_merge():
-                self.logger.error("Feature alignment and merging failed")
-                return False
-
-            # Step 4: Generate plots
-            if not self.generate_plots():
-                self.logger.warning("Plot generation failed, continuing...")
-                pipeline_success = False
-
-            # Step 5: Export results
-            if not self.export_results():
-                self.logger.warning("Result export failed, continuing...")
-                pipeline_success = False
+            # Use subprocess.run for direct output inheritance - no capturing/re-printing
+            result = subprocess.run([
+                sys.executable, str(script_path)
+            ], cwd=str(study_path))
 
-
-            if not self.save_study():
-                self.logger.error("Study saving failed")
-                return False
+            return_code = result.returncode
 
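The generated script is run in a child Python interpreter with `subprocess.run`, without capturing stdout/stderr, so the pipeline's progress output streams straight to the caller's console and success is judged purely by the exit code. A minimal, self-contained sketch of the same pattern; the function name and signature here are illustrative, not part of masster's API:

```python
import subprocess
import sys
from pathlib import Path

def run_generated_script(script_path: Path, workdir: Path) -> bool:
    """Run a Python script in a child process that inherits this process's
    stdout/stderr, and report success based on its exit code."""
    result = subprocess.run(
        [sys.executable, str(script_path)],  # same interpreter that is running this code
        cwd=str(workdir),                    # relative paths inside the script resolve in the study folder
    )
    return result.returncode == 0
```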
-            # Step 7: Cleanup
-            if not self.cleanup_temp_files():
-                self.logger.warning("Cleanup failed, continuing...")
-
-            # Final summary
-            total_time = time.time() - self.start_time
-            self._log_progress("=" * 60)
-            self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
-            self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
-            self._log_progress(f"Files processed: {len(self.processed_files)}")
-            self._log_progress(f"Files failed: {len(self.failed_files)}")
-            if hasattr(self.study, 'consensus_df'):
-                self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
-            self._log_progress("=" * 60)
-
-            return pipeline_success
-
-        except KeyboardInterrupt:
-            self.logger.info("Pipeline interrupted by user")
-            self._save_checkpoint()
-            return False
         except Exception as e:
-
-            self._save_checkpoint()
+            print(f"Error during script execution: {e}")
             return False
-
-    def _get_total_processing_time(self) -> str:
-        """Get formatted total processing time."""
-        if self.start_time is None:
-            return "Unknown"
 
-
-        hours = int(total_seconds // 3600)
-        minutes = int((total_seconds % 3600) // 60)
-        seconds = int(total_seconds % 60)
+        success = return_code == 0
 
-        if
-
-
-            return f"{minutes}m {seconds}s"
+        if success:
+            print("=" * 70)
+            print("Script execution completed successfully")
         else:
-
-
-
-
-        Get current processing status.
-
-        Returns:
-            Dictionary with current status information
-        """
-        return {
-            "current_step": self.current_step,
-            "processed_files": len(self.processed_files),
-            "failed_files": len(self.failed_files),
-            "study_loaded": self.study is not None,
-            "start_time": self.start_time.isoformat() if self.start_time else None,
-            "processing_time": self._get_total_processing_time(),
-            "parameters": {
-                "data_source": self.params.data_source,
-                "study_folder": self.params.study_folder,
-                "polarity": self.params.polarity,
-                "num_cores": self.params.num_cores,
-                "adducts": self.params.adducts,
-            }
-        }
-
-    def info(self):
-        """Print comprehensive wizard status information."""
-        status = self.get_status()
-
-        print("\n" + "=" * 50)
-        print("WIZARD STATUS")
-        print("=" * 50)
-        print(f"Current Step: {status['current_step']}")
-        print(f"Data Source: {self.params.data_source}")
-        print(f"Study Folder: {self.params.study_folder}")
-        print(f"Polarity: {status['parameters']['polarity']}")
-        print(f"CPU Cores: {status['parameters']['num_cores']}")
-        print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
-        print(f"Processing Time: {status['processing_time']}")
-        print(f"Files Processed: {status['processed_files']}")
-        print(f"Files Failed: {status['failed_files']}")
-        print(f"Study Loaded: {status['study_loaded']}")
-
-        if self.study is not None and hasattr(self.study, 'samples_df'):
-            print(f"Samples in Study: {len(self.study.samples_df)}")
-
-        if self.study is not None and hasattr(self.study, 'consensus_df'):
-            print(f"Consensus Features: {len(self.study.consensus_df)}")
+            print("=" * 70)
+            print(f"Script execution failed with return code: {return_code}")
+
+        return success
 
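Because `execute()` signals failure of the generated script through its boolean return value rather than by raising, callers driving it from their own scripts should check the result explicitly. A usage sketch with placeholder paths:

```python
from masster.wizard import execute

ok = execute(
    source=r"D:\Data\raw_files",  # placeholder paths
    folder=r"D:\Data\output",
    polarity="positive",
)
if not ok:
    raise SystemExit("masster wizard pipeline failed; see the output above for details")
```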
-
|
|
903
|
+
except Exception as e:
|
|
904
|
+
print(f"Failed to execute script: {e}")
|
|
905
|
+
import traceback
|
|
906
|
+
traceback.print_exc()
|
|
907
|
+
return False
|
|
1172
908
|
|
|
1173
909
|
|
|
1174
|
-
# Export the main classes
|
|
1175
|
-
__all__ = ["Wizard", "wizard_def"]
|
|
910
|
+
# Export the main classes and functions
|
|
911
|
+
__all__ = ["Wizard", "wizard_def", "create_script", "execute"]
|
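With the expanded `__all__`, the script-based entry points sit alongside the class API. Assuming the package `__init__` re-exports them, as the docstring example `from masster.wizard import execute` implies, both styles can be imported from the same place:

```python
from masster.wizard import Wizard, wizard_def, create_script, execute
```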