masster 0.4.20-py3-none-any.whl → 0.4.22-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/wizard/wizard.py
CHANGED
@@ -23,21 +23,12 @@ from masster import Wizard, wizard_def

     # Create wizard with default parameters
     wizard = Wizard(
-        data_source="./raw_data",
-        study_folder="./processed_study",
+        source="./raw_data",
+        folder="./processed_study",
         polarity="positive",
         num_cores=4
     )

-    # Run complete processing pipeline
-    wizard.run_full_pipeline()
-
-    # Or run individual steps
-    wizard.convert_to_sample5()
-    wizard.assemble_study()
-    wizard.align_and_merge()
-    wizard.generate_plots()
-    wizard.export_results()
     ```
     """

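The docstring hunk above tracks two changes visible throughout this diff: the constructor parameters were renamed (`data_source` → `source`, `study_folder` → `folder`), and the step-by-step pipeline methods were removed in favor of the script-based interface added further down (`create_script()` / `execute()`). A minimal usage sketch for 0.4.22, with placeholder paths:

```python
from masster import Wizard

# Placeholder paths; polarity and core count are auto-detected when omitted
wizard = Wizard(source="./raw_data", folder="./processed_study")

wizard.create_script("run_masster.py")  # only write the standalone analysis script
wizard.execute()                        # or: write the script and run it in a subprocess
```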
@@ -47,6 +38,7 @@ import os
 import sys
 import time
 import importlib
+import glob
 import multiprocessing
 from pathlib import Path
 from typing import Optional, Any, Dict, List
@@ -59,6 +51,7 @@ from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults
 from masster.study.defaults.align_def import align_defaults
 from masster.study.defaults.merge_def import merge_defaults
+from masster._version import __version__ as version


 @dataclass
@@ -71,8 +64,8 @@ class wizard_def:

     Attributes:
         # Core Configuration
-        data_source (str): Path to directory containing raw data files
-        study_folder (str): Output directory for processed study
+        source (str): Path to directory containing raw data files
+        folder (str): Output directory for processed study
         polarity (str): Ion polarity mode ("positive" or "negative")
         num_cores (int): Number of CPU cores to use for parallel processing

@@ -103,15 +96,15 @@ class wizard_def:
     """

     # === Core Configuration ===
-    data_source: str = ""
-    study_folder: str = ""
+    source: str = ""
+    folder: str = ""
     polarity: str = "positive"
     num_cores: int = 4

     # === File Discovery ===
-    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"
+    file_extensions: List[str] = field(default_factory=lambda: [".wiff", ".raw", ".mzML"])
     search_subfolders: bool = True
-    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "
+    skip_patterns: List[str] = field(default_factory=lambda: ["blank", "test"])

     # === Processing Parameters ===
     adducts: List[str] = field(default_factory=list)  # Will be set based on polarity
@@ -130,11 +123,11 @@ class wizard_def:
     rt_tolerance: float = 1.5
     mz_max_diff: float = 0.01
     alignment_algorithm: str = "kd"
-    merge_method: str = "
+    merge_method: str = "qt"

     # === Feature Detection ===
     chrom_fwhm: float = 0.5
-    noise_threshold: float = 200.0
+    noise: float = 50.0
     chrom_peak_snr: float = 5.0
     tol_ppm: float = 10.0
     detector_type: str = "unknown"  # Detected detector type ("orbitrap", "quadrupole", "unknown")
@@ -157,15 +150,15 @@ class wizard_def:
     optimize_memory: bool = True
     cleanup_temp_files: bool = True
     validate_outputs: bool = True
-
+
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
-            "data_source": {
+            "source": {
                 "dtype": str,
                 "description": "Path to directory containing raw data files",
                 "required": True,
             },
-            "study_folder": {
+            "folder": {
                 "dtype": str,
                 "description": "Output directory for processed study",
                 "required": True,
@@ -221,57 +214,47 @@ class wizard_def:
         self.num_cores = max_cores

         # Ensure paths are absolute
-        if self.data_source:
-            self.data_source = os.path.abspath(self.data_source)
-        if self.study_folder:
-            self.study_folder = os.path.abspath(self.study_folder)
+        if self.source:
+            self.source = os.path.abspath(self.source)
+        if self.folder:
+            self.folder = os.path.abspath(self.folder)


 class Wizard:
     """
-
-
-    The Wizard class provides end-to-end automation for processing collections
-    of mass spectrometry files from raw data to final study results, including:
+    Simplified Wizard for automated mass spectrometry data processing.

-
-
-
-    4. Automated plot generation and result export
-    5. Intelligent resume capability for interrupted processes
-    6. Adaptive optimization based on study size and system resources
+    The Wizard provides a clean interface for creating and executing analysis scripts
+    that process raw MS data through the complete pipeline: file discovery, feature
+    detection, sample processing, study assembly, alignment, merging, and export.

-
-    - .raw files: Assume Orbitrap (Thermo instruments)
-    - .wiff files: Assume Quadrupole (SCIEX instruments)
-    - .mzML files: Check metadata for Orbitrap detection
-
-    The wizard handles the complete workflow with minimal user intervention
-    while providing comprehensive logging and progress tracking.
+    This simplified version focuses on two core functions:
+    - create_script(): Generate standalone analysis scripts
+    - execute(): Create and run analysis scripts
     """

     def __init__(
         self,
-        data_source: str = "",
-        study_folder: str = "",
+        source: str = "",
+        folder: str = "",
         polarity: str = "positive",
         adducts: Optional[List[str]] = None,
-        num_cores: int = 4,
+        num_cores: int = 0,
         **kwargs
     ):
         """
-        Initialize the Wizard
+        Initialize the Wizard with analysis parameters.

         Parameters:
-            data_source: Directory containing raw data files
-            study_folder: Output directory for processed study
+            source: Directory containing raw data files
+            folder: Output directory for processed study
             polarity: Ion polarity mode ("positive" or "negative")
             adducts: List of adduct specifications (auto-set if None)
-            num_cores: Number of CPU cores
+            num_cores: Number of CPU cores (0 = auto-detect 75% of available)
             **kwargs: Additional parameters (see wizard_def for full list)
         """

-        # Auto-detect optimal number of cores
+        # Auto-detect optimal number of cores if not specified
         if num_cores <= 0:
             num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))

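The constructor's `num_cores` default changes from a fixed value to 0, which routes every default construction through the auto-detection branch kept as context above. The rule, shown standalone:

```python
import multiprocessing

# num_cores <= 0 means auto-detect: use 75% of the available cores, but at least one
num_cores = max(1, int(multiprocessing.cpu_count() * 0.75))
```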
@@ -279,14 +262,15 @@ class Wizard:
         if "params" in kwargs and isinstance(kwargs["params"], wizard_def):
             self.params = kwargs.pop("params")
         else:
-            # Create default parameters
+            # Create default parameters
             self.params = wizard_def(
-                data_source=data_source,
-                study_folder=study_folder,
+                source=source,
+                folder=folder,
                 polarity=polarity,
                 num_cores=num_cores
             )

+        # Set adducts if provided
         if adducts is not None:
             self.params.adducts = adducts

@@ -296,15 +280,15 @@
             setattr(self.params, key, value)

         # Validate required parameters
-        if not self.params.data_source:
-            raise ValueError("data_source is required")
-        if not self.params.study_folder:
-            raise ValueError("study_folder is required")
+        if not self.params.source:
+            raise ValueError("source is required")
+        if not self.params.folder:
+            raise ValueError("folder is required")

-        # Create
-        self.data_source_path = Path(self.params.data_source)
-        self.study_folder_path = Path(self.params.study_folder)
-        self.study_folder_path.mkdir(parents=True, exist_ok=True)
+        # Create and validate paths
+        self.source_path = Path(self.params.source)
+        self.folder_path = Path(self.params.folder)
+        self.folder_path.mkdir(parents=True, exist_ok=True)

         # Auto-infer polarity from the first file if not explicitly set by user
         if polarity == "positive" and "polarity" not in kwargs:
@@ -313,29 +297,7 @@
             self.params.polarity = inferred_polarity
             # Update adducts based on inferred polarity
             self.params.__post_init__()
-
-        # Setup logging
-        self._setup_logging()
-
-        # Initialize state tracking
-        self.processed_files = []
-        self.failed_files = []
-        self.study = None
-        self.start_time = None
-        self.current_step = "initialized"
-
-        # Create checkpoint file path
-        self.checkpoint_file = self.study_folder_path / "wizard_checkpoint.json"
-
-        self.logger.info(f"Wizard initialized for {self.polarity} mode")
-        self.logger.info(f"Data source: {self.data_source_path}")
-        self.logger.info(f"Study folder: {self.study_folder_path}")
-        self.logger.info(f"Using {self.params.num_cores} CPU cores")
-
-        # Load checkpoint if resuming
-        if self.params.resume_enabled:
-            self._load_checkpoint()
-
+
     def _infer_polarity_from_first_file(self) -> str:
         """
         Infer polarity from the first available raw data file.
@@ -345,15 +307,15 @@
         """
         try:
             # Find first file
-            for extension in ['.wiff', '.raw', '.mzML'
+            for extension in ['.wiff', '.raw', '.mzML']:
                 pattern = f"**/*{extension}" if True else f"*{extension}"  # search_subfolders=True
-                files = list(self.data_source_path.rglob(pattern))
+                files = list(self.source_path.rglob(pattern))
                 if files:
                     first_file = files[0]
                     break
             else:
-                return
-
+                return 'positive'
+
             # Only implement for .wiff files initially (most common format)
             if first_file.suffix.lower() == '.wiff':
                 from masster.sample.load import _wiff_to_dict
@@ -375,1827 +337,361 @@
             # Silently fall back to default if inference fails
             pass

-        return
-
+        return 'positive'
+
     @property
     def polarity(self) -> str:
-        """Get the polarity
+        """Get the ion polarity mode."""
         return self.params.polarity
-
-    @property
+
+    @property
     def adducts(self) -> List[str]:
-        """Get the
+        """Get the adduct specifications."""
         return self.params.adducts
-
-    def _reload(self):
-        """
-        Reloads all masster modules to pick up any changes to their source code,
-        and updates the instance's class reference to the newly reloaded class version.
-        This ensures that the instance uses the latest implementation without restarting the interpreter.
-        """
-        # Reset logger configuration flags to allow proper reconfiguration after reload
-        try:
-            import masster.logger as logger_module
-
-            if hasattr(logger_module, "_WIZARD_LOGGER_CONFIGURED"):
-                logger_module._WIZARD_LOGGER_CONFIGURED = False
-        except Exception:
-            pass
-
-        # Get the base module name (masster)
-        base_modname = self.__class__.__module__.split(".")[0]
-        current_module = self.__class__.__module__
-
-        # Dynamically find all wizard submodules
-        wizard_modules = []
-        wizard_module_prefix = f"{base_modname}.wizard."
-
-        # Get all currently loaded modules that are part of the wizard package
-        for module_name in sys.modules:
-            if (
-                module_name.startswith(wizard_module_prefix)
-                and module_name != current_module
-            ):
-                wizard_modules.append(module_name)
-
-        # Add core masster modules
-        core_modules = [
-            f"{base_modname}._version",
-            f"{base_modname}.chromatogram",
-            f"{base_modname}.spectrum",
-            f"{base_modname}.logger",
-        ]
-
-        # Add sample submodules
-        sample_modules = []
-        sample_module_prefix = f"{base_modname}.sample."
-        for module_name in sys.modules:
-            if (
-                module_name.startswith(sample_module_prefix)
-                and module_name != current_module
-            ):
-                sample_modules.append(module_name)

-
-        study_modules = []
-        study_module_prefix = f"{base_modname}.study."
-        for module_name in sys.modules:
-            if (
-                module_name.startswith(study_module_prefix)
-                and module_name != current_module
-            ):
-                study_modules.append(module_name)
-
-        all_modules_to_reload = (
-            core_modules + wizard_modules + sample_modules + study_modules
-        )
-
-        # Reload all discovered modules
-        for full_module_name in all_modules_to_reload:
-            try:
-                if full_module_name in sys.modules:
-                    mod = sys.modules[full_module_name]
-                    importlib.reload(mod)
-                    self.logger.debug(f"Reloaded module: {full_module_name}")
-            except Exception as e:
-                self.logger.warning(f"Failed to reload module {full_module_name}: {e}")
-
-        # Finally, reload the current module (wizard.py)
-        try:
-            mod = __import__(current_module, fromlist=[current_module.split(".")[0]])
-            importlib.reload(mod)
-
-            # Get the updated class reference from the reloaded module
-            new = getattr(mod, self.__class__.__name__)
-            # Update the class reference of the instance
-            self.__class__ = new
-
-            self.logger.debug("Module reload completed")
-        except Exception as e:
-            self.logger.error(f"Failed to reload current module {current_module}: {e}")
-
-    def _setup_logging(self):
-        """Setup comprehensive logging system."""
-        # Create logger
-        log_label = f"Wizard-{self.polarity}"
-
-        if self.params.log_to_file:
-            log_file = self.study_folder_path / "wizard.log"
-            sink = str(log_file)
-        else:
-            sink = "sys.stdout"
-
-        self.logger = MassterLogger(
-            instance_type="wizard",
-            level=self.params.log_level.upper(),
-            label=log_label,
-            sink=sink,
-        )
-
-        # Also create a simple file logger for critical info
-        self.log_file = self.study_folder_path / "processing.log"
-
-    def _log_progress(self, message: str, level: str = "INFO"):
-        """Log progress message with timestamp."""
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        full_message = f"[{timestamp}] {message}"
-
-        # Log to masster logger
-        getattr(self.logger, level.lower())(message)
-
-        # Also write to simple log file
-        with open(self.log_file, "a", encoding="utf-8") as f:
-            f.write(f"{full_message}\n")
-
-        if self.params.verbose_progress and level in ["INFO", "WARNING", "ERROR"]:
-            print(full_message)
-
-    def _save_checkpoint(self):
-        """Save processing checkpoint for resume capability."""
-        if not self.params.resume_enabled:
-            return
-
-        import json
-        checkpoint_data = {
-            "timestamp": datetime.now().isoformat(),
-            "current_step": self.current_step,
-            "processed_files": self.processed_files,
-            "failed_files": self.failed_files,
-            "params": {
-                "data_source": self.params.data_source,
-                "study_folder": self.params.study_folder,
-                "polarity": self.params.polarity,
-                "adducts": self.params.adducts,
-                "num_cores": self.params.num_cores,
-            }
-        }
-
-        try:
-            with open(self.checkpoint_file, "w") as f:
-                json.dump(checkpoint_data, f, indent=2)
-            self.logger.debug(f"Checkpoint saved: {len(self.processed_files)} files processed")
-        except Exception as e:
-            self.logger.warning(f"Failed to save checkpoint: {e}")
-
-    def _load_checkpoint(self):
-        """Load processing checkpoint for resume capability."""
-        if not self.checkpoint_file.exists():
-            return
-
-        import json
-        try:
-            with open(self.checkpoint_file, "r") as f:
-                checkpoint_data = json.load(f)
-
-            self.processed_files = checkpoint_data.get("processed_files", [])
-            self.failed_files = checkpoint_data.get("failed_files", [])
-            self.current_step = checkpoint_data.get("current_step", "initialized")
-
-            self.logger.info(f"Resuming from checkpoint: {len(self.processed_files)} files already processed")
-            self.logger.info(f"Previous step: {self.current_step}")
-
-        except Exception as e:
-            self.logger.warning(f"Failed to load checkpoint: {e}")
-            self.processed_files = []
-            self.failed_files = []
-
-    def discover_files(self) -> List[Path]:
-        """
-        Discover raw data files in the source directory.
-
-        Returns:
-            List of file paths found for processing
-        """
-        self._log_progress("Discovering raw data files...")
-        self.current_step = "discovering_files"
-
-        found_files = []
-
-        for extension in self.params.file_extensions:
-            if self.params.search_subfolders:
-                pattern = f"**/*{extension}"
-                files = list(self.data_source_path.rglob(pattern))
-            else:
-                pattern = f"*{extension}"
-                files = list(self.data_source_path.glob(pattern))
-
-            # Filter out files matching skip patterns
-            filtered_files = []
-            for file_path in files:
-                skip_file = False
-                for pattern in self.params.skip_patterns:
-                    if pattern.lower() in file_path.name.lower():
-                        skip_file = True
-                        self.logger.debug(f"Skipping file (matches pattern '{pattern}'): {file_path.name}")
-                        break
-
-                if not skip_file:
-                    # Check file size
-                    try:
-                        file_size_gb = file_path.stat().st_size / (1024**3)
-                        if file_size_gb > self.params.max_file_size_gb:
-                            self.logger.warning(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")
-                        filtered_files.append(file_path)
-                    except Exception as e:
-                        self.logger.warning(f"Could not check file size for {file_path}: {e}")
-                        filtered_files.append(file_path)
-
-            found_files.extend(filtered_files)
-            self.logger.info(f"Found {len(filtered_files)} {extension} files")
-
-        # Remove duplicates and sort
-        found_files = sorted(list(set(found_files)))
-
-        self._log_progress(f"Total files discovered: {len(found_files)}")
-
-        return found_files
-
-    def _process_single_file(self, file_path: Path, reset: bool = False) -> Optional[str]:
-        """
-        Process a single file to sample5 format.
-
-        This method replicates the core processing from parallel_sample_processing.py
-        but with wizard-specific configuration and error handling.
-
-        Parameters:
-            file_path: Path to the raw data file
-            reset: Force reprocessing even if output exists
-
-        Returns:
-            Base filename of output on success, None on failure
-        """
-        import gc
-
-        # Generate output filename
-        file_out = file_path.stem + '.sample5'
-        output_file = self.study_folder_path / file_out
-
-        # Initialize masster Sample with delayed import
-        import masster
-        sample = masster.Sample(
-            log_label=file_path.name,
-            log_level='ERROR'  # Reduce logging overhead in parallel processing
-        )
-
-        # Check if file should be skipped
-        skip = False
-        if not reset and not self.params.force_reprocess and output_file.exists():
-            try:
-                # Attempt to load existing processed file to verify it's valid
-                sample.load(str(output_file))
-                skip = True
-            except Exception:
-                # If loading fails, file needs to be reprocessed
-                skip = False
-
-        if skip:
-            self.logger.debug(f"Skipping {file_path.name} (already processed)")
-            return output_file.stem
-
-        self.logger.info(f"Processing {file_path.name}")
-
-        try:
-            # STEP 1: Load raw data
-            sample.load(str(file_path))
-
-            # STEP 2: Feature detection - First pass (strict parameters)
-            sample.find_features(
-                chrom_fwhm=self.params.chrom_fwhm,
-                noise=self.params.noise_threshold,
-                tol_ppm=self.params.tol_ppm,
-                chrom_peak_snr=self.params.chrom_peak_snr,
-                min_trace_length_multiplier=0.5,
-                chrom_fwhm_min=self.params.chrom_fwhm
-            )
-
-            # STEP 3: Feature detection - Second pass (relaxed parameters)
-            sample.find_features(
-                chrom_peak_snr=self.params.chrom_peak_snr,
-                noise=self.params.noise_threshold / 10,  # Lower noise threshold
-                chrom_fwhm=2.0  # Wider peaks
-            )
-
-            # STEP 3.5: Validate feature detection results
-            if not hasattr(sample, 'features_df') or sample.features_df is None or len(sample.features_df) == 0:
-                self.logger.warning(f"No features detected in {file_path.name} - skipping additional processing")
-                # Still save the sample5 file for record keeping
-                sample.save(filename=str(output_file))
-                return output_file.stem
-
-            self.logger.info(f"Detected {len(sample.features_df)} features in {file_path.name}")
-
-            # STEP 4: Adduct detection
-            sample.find_adducts(adducts=self.adducts)
-
-            # STEP 5: MS2 spectrum identification
-            sample.find_ms2()
-
-            # STEP 6: Save processed data
-            sample.save(filename=str(output_file))
-
-            # STEP 7: Generate additional outputs (only for samples with features)
-            # Skip CSV export and individual MGF export as requested
-
-            if self.params.generate_plots:
-                plot_file = output_file.parent / (output_file.stem + "_2d.html")
-                sample.plot_2d(filename=str(plot_file), markersize=4)
-
-            # Memory cleanup
-            result = output_file.stem
-            del sample
-            gc.collect()
-
-            return result
-
-        except Exception as e:
-            self.logger.error(f"Error processing {file_path.name}: {e}")
-            # Cleanup on error
-            gc.collect()
-            return None
-
-    def _process_batch(self, file_batch: List[Path]) -> List[str]:
-        """Process a batch of files in a single worker."""
-        results = []
-        for file_path in file_batch:
-            result = self._process_single_file(file_path)
-            if result:
-                results.append(result)
-            else:
-                results.append(None)
-        return results
-
-    def convert_to_sample5(self, file_list: Optional[List[Path]] = None) -> bool:
+    def create_script(self, filename: str = "run_masster.py") -> bool:
         """
-
+        Generate a standalone Python script for the analysis pipeline.

         Parameters:
-
-
-        Returns:
-            True if conversion completed successfully
-        """
-        self._log_progress("=== Starting Sample5 Conversion ===")
-        self.current_step = "converting_to_sample5"
-
-        if file_list is None:
-            file_list = self.discover_files()
-
-        if not file_list:
-            self.logger.warning("No files found for conversion")
-            return False
-
-        # Detect detector type and adjust parameters before processing
-        detector_type = self._detect_detector_type()
-        self._adjust_parameters_for_detector(detector_type)
-
-        # Filter out already processed files if resuming
-        if self.params.resume_enabled and self.processed_files:
-            remaining_files = []
-            for file_path in file_list:
-                if str(file_path) not in self.processed_files:
-                    remaining_files.append(file_path)
-            file_list = remaining_files
-
-            if not file_list:
-                self._log_progress("All files already processed")
-                return True
-
-        self._log_progress(f"Converting {len(file_list)} files to sample5 format")
-
-        conversion_start = time.time()
-        successful_count = 0
-        failed_count = 0
-
-        if self.params.use_process_pool:
-            # ProcessPoolExecutor approach - better for CPU-intensive work
-            if len(file_list) <= self.params.batch_size:
-                # Few files: process individually
-                self.logger.info(f"Processing {len(file_list)} files individually with {self.params.num_cores} workers")
-
-                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
-                    futures = [
-                        executor.submit(self._process_single_file, file_path)
-                        for file_path in file_list
-                    ]
-
-                    for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                        result = future.result()
-                        if result:
-                            successful_count += 1
-                            self.processed_files.append(str(file_list[i]))
-                        else:
-                            failed_count += 1
-                            self.failed_files.append(str(file_list[i]))
-
-                        # Progress update and checkpoint
-                        if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
-                            progress = (successful_count + failed_count) / len(file_list) * 100
-                            self._log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")
-                            self._save_checkpoint()
-
-            else:
-                # Many files: process in batches
-                batches = [
-                    file_list[i:i + self.params.batch_size]
-                    for i in range(0, len(file_list), self.params.batch_size)
-                ]
-
-                self.logger.info(f"Processing {len(file_list)} files in {len(batches)} batches")
-
-                with concurrent.futures.ProcessPoolExecutor(max_workers=self.params.num_cores) as executor:
-                    futures = [executor.submit(self._process_batch, batch) for batch in batches]
-
-                    for batch_idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                        batch_results = future.result()
-                        batch = batches[batch_idx]
-
-                        for i, result in enumerate(batch_results):
-                            if result:
-                                successful_count += 1
-                                self.processed_files.append(str(batch[i]))
-                            else:
-                                failed_count += 1
-                                self.failed_files.append(str(batch[i]))
-
-                        # Progress update
-                        progress = (successful_count + failed_count) / len(file_list) * 100
-                        self._log_progress(f"Batch {batch_idx + 1}/{len(batches)} complete. Progress: {progress:.1f}%")
-                        self._save_checkpoint()
-
-        else:
-            # ThreadPoolExecutor approach
-            self.logger.info(f"Processing {len(file_list)} files with {self.params.num_cores} threads")
+            filename: Name for the generated script file

-            with concurrent.futures.ThreadPoolExecutor(max_workers=self.params.num_cores) as executor:
-                futures = [
-                    executor.submit(self._process_single_file, file_path)
-                    for file_path in file_list
-                ]
-
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result = future.result()
-                    if result:
-                        successful_count += 1
-                        self.processed_files.append(str(file_list[i]))
-                    else:
-                        failed_count += 1
-                        self.failed_files.append(str(file_list[i]))
-
-                    if (successful_count + failed_count) % self.params.checkpoint_interval == 0:
-                        progress = (successful_count + failed_count) / len(file_list) * 100
-                        self._log_progress(f"Progress: {progress:.1f}%")
-                        self._save_checkpoint()
-
-        conversion_time = time.time() - conversion_start
-
-        self._log_progress("=== Sample5 Conversion Complete ===")
-        self._log_progress(f"Successful: {successful_count}")
-        self._log_progress(f"Failed: {failed_count}")
-        self._log_progress(f"Total time: {conversion_time:.1f} seconds")
-
-        if failed_count > 0:
-            self.logger.warning(f"{failed_count} files failed to process")
-            for failed_file in self.failed_files[-failed_count:]:
-                self.logger.warning(f"Failed: {failed_file}")
-
-        self._save_checkpoint()
-        return successful_count > 0
-
-    def _detect_detector_type(self) -> str:
-        """
-        Detect the type of MS detector from the first available file.
-
-        Simplified detection rules:
-        - .raw files: Assume Orbitrap (Thermo instruments)
-        - .wiff files: Assume Quadrupole (SCIEX instruments)
-        - .mzML files: Check metadata for Orbitrap detection
-
         Returns:
-
+            True if script was generated successfully, False otherwise
         """
         try:
-
-
-            if self.params.search_subfolders:
-                pattern = f"**/*{extension}"
-                files = list(self.data_source_path.rglob(pattern))
-            else:
-                pattern = f"*{extension}"
-                files = list(self.data_source_path.glob(pattern))
-            if files:
-                first_file = files[0]
-                break
-            else:
-                self.logger.warning("No raw files found for detector detection")
-                return "unknown"
+            script_path = self.folder_path / filename
+            script_content = self._generate_script_content()

-
+            with open(script_path, 'w', encoding='utf-8') as f:
+                f.write(script_content)

-
-
-                # RAW files are Thermo -> assume Orbitrap
-                detector_type = "orbitrap"
-                self.logger.info("Detected .raw file -> Thermo Orbitrap detector")
-                return detector_type
-
-            elif first_file.suffix.lower() in ['.wiff', '.wiff2']:
-                # WIFF files are SCIEX -> assume Quadrupole
-                detector_type = "quadrupole"
-                self.logger.info("Detected .wiff file -> SCIEX Quadrupole detector")
-                return detector_type
+            print(f"Analysis script created: {script_path}")
+            return True

-            elif first_file.suffix.lower() == '.mzml':
-                # For mzML files, check metadata for Orbitrap detection
-                try:
-                    import warnings
-                    with warnings.catch_warnings():
-                        warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
-                        import pyopenms as oms
-
-                    exp = oms.MSExperiment()
-                    oms.MzMLFile().load(str(first_file), exp)
-
-                    # Check instrument metadata for Orbitrap keywords
-                    instrument_info = []
-                    if hasattr(exp, 'getExperimentalSettings'):
-                        settings = exp.getExperimentalSettings()
-                        if hasattr(settings, 'getInstrument'):
-                            instrument = settings.getInstrument()
-                            if hasattr(instrument, 'getName'):
-                                name = instrument.getName().decode() if hasattr(instrument.getName(), 'decode') else str(instrument.getName())
-                                instrument_info.append(name.lower())
-                            if hasattr(instrument, 'getModel'):
-                                model = instrument.getModel().decode() if hasattr(instrument.getModel(), 'decode') else str(instrument.getModel())
-                                instrument_info.append(model.lower())
-
-                    # Check for Orbitrap keywords in instrument info
-                    orbitrap_keywords = ['orbitrap', 'exactive', 'q-exactive', 'exploris', 'fusion', 'lumos', 'velos', 'elite']
-                    instrument_text = ' '.join(instrument_info)
-
-                    if any(keyword in instrument_text for keyword in orbitrap_keywords):
-                        detector_type = "orbitrap"
-                        self.logger.info(f"Detected mzML with Orbitrap instrument: {instrument_text}")
-                    else:
-                        detector_type = "unknown"
-                        self.logger.info(f"Detected mzML with unknown instrument: {instrument_text}")
-
-                    return detector_type
-
-                except Exception as e:
-                    self.logger.warning(f"Failed to analyze mzML file for detector type: {e}")
-                    return "unknown"
-
         except Exception as e:
-
-
-
-
-    def _adjust_parameters_for_detector(self, detector_type: str):
+            print(f"Failed to create script: {e}")
+            return False
+
+    def execute(self, filename: str = "run_masster.py") -> bool:
         """
-
-
-        Simplified rules:
-        - "orbitrap": Use 1e5 noise threshold (high background noise)
-        - "quadrupole": Use 200 noise threshold (default, lower noise)
-        - "unknown": Use 200 noise threshold (default)
+        Create and execute a standalone analysis script.

         Parameters:
-
-        """
-        original_noise = self.params.noise_threshold
-        self.params.detector_type = detector_type  # Store the detected type
-
-        if detector_type == "orbitrap":
-            # Orbitraps have much higher background noise, use 1e5 threshold
-            self.params.noise_threshold = 1e5
-            self._log_progress(f"Detector: Orbitrap detected - adjusted noise threshold: {original_noise} -> {self.params.noise_threshold}")
+            filename: Name for the generated script file

-        elif detector_type == "quadrupole":
-            # Quadrupole instruments have lower noise, use default threshold
-            self.params.noise_threshold = 200.0
-            self._log_progress(f"Detector: Quadrupole detected - noise threshold: {self.params.noise_threshold}")
-
-        else:
-            # Unknown detector type, keep default
-            self.params.noise_threshold = 200.0
-            self._log_progress(f"Detector: Unknown type detected - using default noise threshold: {self.params.noise_threshold}")
-
-    def assemble_study(self) -> bool:
-        """
-        Assemble processed sample5 files into a study.
-
         Returns:
-            True if
+            True if execution completed successfully, False otherwise
         """
-
-        self.
-
-        # Find all sample5 files
-        sample5_files = list(self.study_folder_path.glob("*.sample5"))
-
-        if not sample5_files:
-            self.logger.error("No sample5 files found for study assembly")
+        # First create the script
+        if not self.create_script(filename):
             return False
-
-
+
+        # Then execute it
+        script_path = self.folder_path / filename

         try:
-
-            detector_type = self._detect_detector_type()
-            self._adjust_parameters_for_detector(detector_type)
-
-            # Create study with optimized settings
-            import masster
-            study_params = study_defaults(
-                folder=str(self.study_folder_path),
-                polarity=self.polarity,
-                log_level="INFO",
-                log_label=f"Study-{self.polarity}",
-                adducts=self.adducts
-            )
+            print("Executing...")

-
+            import subprocess
+            result = subprocess.run([
+                sys.executable, str(script_path)
+            ], cwd=str(self.folder_path), encoding='utf-8', errors='replace')

-
-            sample5_pattern = str(self.study_folder_path / "*.sample5")
-            self.study.add(sample5_pattern)
+            success = result.returncode == 0

-
-
-
-
-
-
-            # Apply feature filtering
-            feature_selection = self.study.features_select(
-                chrom_coherence=0.3,
-                chrom_prominence_scaled=1
-            )
-            self.study.features_filter(feature_selection)
+            if success:
+                print("=" * 70)
+                print("Script execution completed successfully")
+            else:
+                print("=" * 70)
+                print(f"Script execution failed with return code: {result.returncode}")

-
-            self._log_progress(f"Feature filtering: {initial_features} -> {final_features} features")
-
-            self._save_checkpoint()
-            return True
+            return success

         except Exception as e:
-
-            return False
-
-    def align_and_merge(self) -> bool:
-        """
-        Perform feature alignment and merging.
-
-        Returns:
-            True if alignment and merging were successful
-        """
-        self._log_progress("=== Starting Feature Alignment and Merging ===")
-        self.current_step = "aligning_and_merging"
-
-        if self.study is None:
-            self.logger.error("Study not assembled. Run assemble_study() first.")
+            print(f"Error during script execution: {e}")
             return False
-
-
-
-
-
-
-
-
-
-
+
+    def _generate_script_content(self) -> str:
+        """Generate the complete analysis script content."""
+
+        # Convert Path objects to strings for JSON serialization
+        params_dict = {}
+        for key, value in self.params.__dict__.items():
+            if key == '_param_metadata':  # Skip metadata in generated script
+                continue
+            if isinstance(value, Path):
+                params_dict[key] = str(value)
             else:
-
-                alignment_algorithm = "kd"
-                merge_method = "qt-chunked"
-                self.logger.info(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")
-
-            # Align features across samples
-            align_params = align_defaults(
-                rt_tol=self.params.rt_tolerance,
-                mz_max_diff=self.params.mz_max_diff,
-                algorithm=alignment_algorithm
-            )
+                params_dict[key] = value

-
-
-
-
-
-
-
-
-
-            )
-
-            self.logger.info(f"Merging features using {merge_method} method")
-            self.study.merge(params=merge_params)
-
-            # Log results
-            num_consensus = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
-            self._log_progress(f"Generated {num_consensus} consensus features")
-
-            # Get study info
-            if hasattr(self.study, 'info'):
-                self.study.info()
-
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to align and merge: {e}")
-            return False
-
-    def generate_plots(self) -> bool:
-        """
-        Generate visualization plots for the study.
-
-        Returns:
-            True if plot generation was successful
-        """
-        if not self.params.generate_plots:
-            self._log_progress("Plot generation disabled, skipping...")
-            return True
-
-        self._log_progress("=== Generating Visualization Plots ===")
-        self.current_step = "generating_plots"
+        # Obtain list of files in source with extension wiff, .raw, .mzML
+        raw_files = []
+        for ext in params_dict.get('file_extensions', []):
+            raw_files.extend(glob.glob(f"{params_dict.get('source', '')}/**/*{ext}", recursive=True))
+
+        # Create readable PARAMS dict with comments
+        params_lines = []
+        params_lines.append('# Analysis parameters')
+        params_lines.append('PARAMS = {')

-
-
-
+        # Core Configuration
+        params_lines.append('    # === Core Configuration ===')
+        params_lines.append(f'    "source": {params_dict.get("source", "")!r},  # Directory containing raw data files')
+        params_lines.append(f'    "folder": {params_dict.get("folder", "")!r},  # Output directory for processed study')
+        params_lines.append(f'    "polarity": {params_dict.get("polarity", "positive")!r},  # Ion polarity mode ("positive" or "negative")')
+        params_lines.append(f'    "num_cores": {params_dict.get("num_cores", 4)},  # Number of CPU cores for parallel processing')
+        params_lines.append('')

-
-
-
-
-
-
-            self.study.plot_alignment(filename=str(alignment_plot))
-            plots_generated += 1
-            self.logger.info(f"Generated alignment plot: {alignment_plot}")
-
-            # Consensus 2D plot
-            if hasattr(self.study, 'plot_consensus_2d'):
-                consensus_2d_plot = self.study_folder_path / "consensus_2d.html"
-                self.study.plot_consensus_2d(filename=str(consensus_2d_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated consensus 2D plot: {consensus_2d_plot}")
-
-            # PCA plot
-            if hasattr(self.study, 'plot_pca'):
-                pca_plot = self.study_folder_path / "pca_plot.html"
-                self.study.plot_pca(filename=str(pca_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated PCA plot: {pca_plot}")
-
-            # Consensus statistics
-            if hasattr(self.study, 'plot_consensus_stats'):
-                stats_plot = self.study_folder_path / "consensus_stats.html"
-                self.study.plot_consensus_stats(filename=str(stats_plot))
-                plots_generated += 1
-                self.logger.info(f"Generated statistics plot: {stats_plot}")
-
-            self._log_progress(f"Generated {plots_generated} visualization plots")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to generate plots: {e}")
-            return False
-
-    def export_results(self) -> bool:
-        """
-        Export study results in requested formats.
+        # File Discovery
+        params_lines.append('    # === File Discovery ===')
+        params_lines.append(f'    "file_extensions": {params_dict.get("file_extensions", [".wiff", ".raw", ".mzML"])!r},  # File extensions to search for')
+        params_lines.append(f'    "search_subfolders": {params_dict.get("search_subfolders", True)},  # Whether to search subdirectories recursively')
+        params_lines.append(f'    "skip_patterns": {params_dict.get("skip_patterns", ["blank", "condition"])!r},  # Filename patterns to skip')
+        params_lines.append('')

-
-
-        """
-
-
+        # Processing Parameters
+        params_lines.append('    # === Processing Parameters ===')
+        params_lines.append(f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation')
+        params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")')
+        params_lines.append('')
+
+        # Alignment & Merging
+        params_lines.append('    # === Alignment & Merging ===')
+        params_lines.append(f'    "rt_tol": {params_dict.get("rt_tol", 2.0)},  # Retention time tolerance for alignment (seconds)')
+        params_lines.append(f'    "mz_tol": {params_dict.get("mz_tol", 0.01)},  # Mass-to-charge ratio tolerance for alignment (Da)')
+        params_lines.append(f'    "alignment_method": {params_dict.get("alignment_method", "kd")!r},  # Algorithm for sample alignment')
+        params_lines.append(f'    "min_samples_per_feature": {params_dict.get("min_samples_per_feature", 1)},  # Minimum samples required per consensus feature')
+        params_lines.append(f'    "merge_method": {params_dict.get("merge_method", "qt")!r},  # Method for merging consensus features')
+        params_lines.append('')
+
+        # Sample Processing
+        params_lines.append('    # === Sample Processing (used in add_samples_from_folder) ===')
+        params_lines.append(f'    "batch_size": {params_dict.get("batch_size", 8)},  # Number of files to process per batch')
+        params_lines.append(f'    "memory_limit_gb": {params_dict.get("memory_limit_gb", 16.0)},  # Memory limit for processing (GB)')
+        params_lines.append('')
+
+        # Script Options
+        params_lines.append('    # === Script Options ===')
+        params_lines.append(f'    "resume_enabled": {params_dict.get("resume_enabled", True)},  # Enable automatic resume capability')
+        params_lines.append(f'    "force_reprocess": {params_dict.get("force_reprocess", False)},  # Force reprocessing of existing files')
+        params_lines.append(f'    "cleanup_temp_files": {params_dict.get("cleanup_temp_files", True)},  # Clean up temporary files after processing')
+
+        params_lines.append('}')
+
+        # Create script lines
+        script_lines = [
+            '#!/usr/bin/env python3',
+            '"""',
+            'Automated Mass Spectrometry Data Analysis Pipeline',
+            f'Generated by masster wizard v{version}',
+            '"""',
+            '',
+            'import sys',
+            'import time',
+            'from pathlib import Path',
+            '',
+            '# Import masster modules',
+            'from masster.study import Study',
+            'from masster import __version__',
+            '',
+        ]

-
-
-            return False
+        # Add the formatted PARAMS
+        script_lines.extend(params_lines)

-
-
-
-            # Export consensus features as CSV
-            if "csv" in self.params.export_formats:
-                csv_file = self.study_folder_path / "consensus_features.csv"
-                if hasattr(self.study.consensus_df, 'write_csv'):
-                    self.study.consensus_df.write_csv(str(csv_file))
-                    exports_completed += 1
-                    self.logger.info(f"Exported CSV: {csv_file}")
-
-            # Export as Excel
-            if "xlsx" in self.params.export_formats and hasattr(self.study, 'export_xlsx'):
-                xlsx_file = self.study_folder_path / "study_results.xlsx"
-                self.study.export_xlsx(filename=str(xlsx_file))
-                exports_completed += 1
-                self.logger.info(f"Exported Excel: {xlsx_file}")
-
-            # Export MGF for MS2 spectra
-            if "mgf" in self.params.export_formats and hasattr(self.study, 'export_mgf'):
-                mgf_file = self.study_folder_path / "consensus_ms2.mgf"
-                self.study.export_mgf(filename=str(mgf_file))
-                exports_completed += 1
-                self.logger.info(f"Exported MGF: {mgf_file}")
-
-            # Export as Parquet for efficient storage
-            if "parquet" in self.params.export_formats and hasattr(self.study, 'export_parquet'):
-                parquet_file = self.study_folder_path / "study_data.parquet"
-                self.study.export_parquet(filename=str(parquet_file))
-                exports_completed += 1
-                self.logger.info(f"Exported Parquet: {parquet_file}")
-
-            self._log_progress(f"Completed {exports_completed} exports")
-            self._save_checkpoint()
-
-            # Always perform additional export methods as requested
-            self._export_additional_formats()
-
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to export results: {e}")
-            return False
-
-    def _export_additional_formats(self):
-        """Export additional formats: xlsx, parquet, save, and mgf."""
-        self.logger.info("=== Exporting Additional Formats ===")
-
-        try:
-            # Force export xlsx (study results in Excel format)
-            xlsx_file = self.study_folder_path / "study_results.xlsx"
-            if hasattr(self.study, 'export_xlsx'):
-                self.study.export_xlsx(filename=str(xlsx_file))
-                self.logger.info(f"Exported Excel: {xlsx_file}")
-
-            # Force export parquet (efficient binary format)
-            parquet_file = self.study_folder_path / "study_data.parquet"
-            if hasattr(self.study, 'export_parquet'):
-                self.study.export_parquet(filename=str(parquet_file))
-                self.logger.info(f"Exported Parquet: {parquet_file}")
-
-            # Force save the study in study5 format
-            study_file = self.study_folder_path / "final_study.study5"
-            self.study.save(filename=str(study_file))
-            self.logger.info(f"Saved study: {study_file}")
-
-            # Force export MGF for MS2 spectra
-            mgf_file = self.study_folder_path / "consensus_ms2.mgf"
-            if hasattr(self.study, 'export_mgf'):
-                self.study.export_mgf(filename=str(mgf_file))
-                self.logger.info(f"Exported MGF: {mgf_file}")
-
-        except Exception as e:
-            self.logger.warning(f"Some additional exports failed: {e}")
-
-    def save_study(self) -> bool:
-        """
-        Save the final study in optimized format.
-
-        Returns:
-            True if study was saved successfully
-        """
-        self._log_progress("=== Saving Final Study ===")
-        self.current_step = "saving_study"
-
-        if self.study is None:
-            self.logger.error("Study not available. Complete previous steps first.")
-            return False
-
-        try:
-            study_file = self.study_folder_path / "final_study.study5"
-
-            # Determine optimal save format based on study size
-            num_samples = len(self.study.samples_df)
-            num_features = len(self.study.consensus_df) if hasattr(self.study, 'consensus_df') else 0
-
-            if self.params.adaptive_compression:
-                # Use compressed format for large studies
-                if num_samples > 50 or num_features > 10000:
-                    self.logger.info(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")
-                    self.params.compress_output = True
-                else:
-                    self.logger.info(f"Small study ({num_samples} samples, {num_features} features) - using standard format")
-                    self.params.compress_output = False
-
-            # Save study
-            if self.params.compress_output and hasattr(self.study, 'save_compressed'):
-                self.study.save_compressed(filename=str(study_file))
-                self.logger.info(f"Saved compressed study: {study_file}")
-            else:
-                self.study.save(filename=str(study_file))
-                self.logger.info(f"Saved study: {study_file}")
-
-            # Save metadata summary
-            metadata_file = self.study_folder_path / "study_metadata.txt"
-            with open(metadata_file, "w") as f:
-                f.write("Study Processing Summary\n")
-                f.write("========================\n")
-                f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-                f.write(f"Polarity: {self.polarity}\n")
-                f.write(f"Adducts: {', '.join(self.adducts)}\n")
-                f.write(f"Number of Samples: {num_samples}\n")
-                f.write(f"Number of Consensus Features: {num_features}\n")
-                f.write(f"Successful Files: {len(self.processed_files)}\n")
-                f.write(f"Failed Files: {len(self.failed_files)}\n")
-                f.write(f"RT Tolerance: {self.params.rt_tolerance}s\n")
-                f.write(f"m/z Max Diff: {self.params.mz_max_diff} Da\n")
-                f.write(f"Merge Method: {self.params.merge_method}\n")
-                f.write(f"Processing Time: {self._get_total_processing_time()}\n")
-
-            self._log_progress(f"Saved study metadata: {metadata_file}")
-            self._save_checkpoint()
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to save study: {e}")
-            return False
-
-    def cleanup_temp_files(self) -> bool:
-        """
-        Clean up temporary files if requested.
-
-        Returns:
-            True if cleanup was successful
-        """
-        if not self.params.cleanup_temp_files:
-            return True
-
-        self._log_progress("=== Cleaning Up Temporary Files ===")
-
-        try:
-            cleaned_count = 0
-
-            # Remove individual sample plots if study plots were generated
-            if self.params.generate_plots:
-                temp_plots = list(self.study_folder_path.glob("*_2d.html"))
-                for plot_file in temp_plots:
-                    if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:
-                        plot_file.unlink()
-                        cleaned_count += 1
-
-            # Remove checkpoint file
-            if self.checkpoint_file.exists():
-                self.checkpoint_file.unlink()
-                cleaned_count += 1
-
-            self._log_progress(f"Cleaned up {cleaned_count} temporary files")
-            return True
-
-        except Exception as e:
-            self.logger.error(f"Failed to cleanup temp files: {e}")
-            return False
-
-    def run_full_pipeline(self) -> bool:
-        """
-        Run the complete automated processing pipeline.
-
-        This method executes all processing steps in sequence:
-        1. Convert raw files to sample5 format
-        2. Assemble study from sample5 files
-        3. Align and merge features
-        4. Generate visualization plots
-        5. Export results in requested formats
-        6. Save final study
-        7. Clean up temporary files
-
-        Returns:
-            True if the entire pipeline completed successfully
-        """
-        self._log_progress("=" * 60)
-        self._log_progress("STARTING AUTOMATED STUDY PROCESSING PIPELINE")
-        self._log_progress("=" * 60)
-
-        self.start_time = time.time()
-        pipeline_success = True
-
-        try:
-            # Step 1: Convert to sample5
-            if not self.convert_to_sample5():
-                self.logger.error("Sample5 conversion failed")
-                return False
-
-            # Step 2: Assemble study
-            if not self.assemble_study():
-                self.logger.error("Study assembly failed")
-                return False
-
-            # Step 3: Align and merge
-            if not self.align_and_merge():
-                self.logger.error("Feature alignment and merging failed")
-                return False
-
-            # Step 4: Generate plots
-            if not self.generate_plots():
-                self.logger.warning("Plot generation failed, continuing...")
-                pipeline_success = False
-
-            # Step 5: Export results
-
if not self.export_results():
|
|
1401
|
-
self.logger.warning("Result export failed, continuing...")
|
|
1402
|
-
pipeline_success = False
|
|
1403
|
-
|
|
1404
|
-
# Step 6: Save study
|
|
1405
|
-
if not self.save_study():
|
|
1406
|
-
self.logger.error("Study saving failed")
|
|
1407
|
-
return False
|
|
1408
|
-
|
|
1409
|
-
# Step 7: Cleanup
|
|
1410
|
-
if not self.cleanup_temp_files():
|
|
1411
|
-
self.logger.warning("Cleanup failed, continuing...")
|
|
1412
|
-
|
|
1413
|
-
# Final summary
|
|
1414
|
-
total_time = time.time() - self.start_time
|
|
1415
|
-
self._log_progress("=" * 60)
|
|
1416
|
-
self._log_progress("PIPELINE COMPLETED SUCCESSFULLY")
|
|
1417
|
-
self._log_progress(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
|
|
1418
|
-
self._log_progress(f"Files processed: {len(self.processed_files)}")
|
|
1419
|
-
self._log_progress(f"Files failed: {len(self.failed_files)}")
|
|
1420
|
-
if hasattr(self.study, 'consensus_df'):
|
|
1421
|
-
self._log_progress(f"Consensus features: {len(self.study.consensus_df)}")
|
|
1422
|
-
self._log_progress("=" * 60)
|
|
1423
|
-
|
|
1424
|
-
return pipeline_success
|
|
1425
|
-
|
|
1426
|
-
except KeyboardInterrupt:
|
|
1427
|
-
self.logger.info("Pipeline interrupted by user")
|
|
1428
|
-
self._save_checkpoint()
|
|
1429
|
-
return False
|
|
1430
|
-
except Exception as e:
|
|
1431
|
-
self.logger.error(f"Pipeline failed with unexpected error: {e}")
|
|
1432
|
-
self._save_checkpoint()
|
|
1433
|
-
return False
|
|
1434
|
-
|
|
1435
|
-
def _get_total_processing_time(self) -> str:
|
|
1436
|
-
"""Get formatted total processing time."""
|
|
1437
|
-
if self.start_time is None:
|
|
1438
|
-
return "Unknown"
|
|
1439
|
-
|
|
1440
|
-
total_seconds = time.time() - self.start_time
|
|
1441
|
-
hours = int(total_seconds // 3600)
|
|
1442
|
-
minutes = int((total_seconds % 3600) // 60)
|
|
1443
|
-
seconds = int(total_seconds % 60)
|
|
1444
|
-
|
|
1445
|
-
if hours > 0:
|
|
1446
|
-
return f"{hours}h {minutes}m {seconds}s"
|
|
1447
|
-
elif minutes > 0:
|
|
1448
|
-
return f"{minutes}m {seconds}s"
|
|
1449
|
-
else:
|
|
1450
|
-
return f"{seconds}s"
|
|
1451
|
-
|
|
1452
|
-
def get_status(self) -> Dict[str, Any]:
|
|
1453
|
-
"""
|
|
1454
|
-
Get current processing status.
|
|
1455
|
-
|
|
1456
|
-
Returns:
|
|
1457
|
-
Dictionary with current status information
|
|
1458
|
-
"""
|
|
1459
|
-
return {
|
|
1460
|
-
"current_step": self.current_step,
|
|
1461
|
-
"processed_files": len(self.processed_files),
|
|
1462
|
-
"failed_files": len(self.failed_files),
|
|
1463
|
-
"study_loaded": self.study is not None,
|
|
1464
|
-
"start_time": self.start_time.isoformat() if self.start_time else None,
|
|
1465
|
-
"processing_time": self._get_total_processing_time(),
|
|
1466
|
-
"parameters": {
|
|
1467
|
-
"data_source": self.params.data_source,
|
|
1468
|
-
"study_folder": self.params.study_folder,
|
|
1469
|
-
"polarity": self.params.polarity,
|
|
1470
|
-
"num_cores": self.params.num_cores,
|
|
1471
|
-
"adducts": self.params.adducts,
|
|
1472
|
-
}
|
|
1473
|
-
}
|
|
1474
|
-
|
|
1475
|
-
def execute(self) -> bool:
|
|
1476
|
-
"""
|
|
1477
|
-
Execute the complete automated processing pipeline.
|
|
1478
|
-
|
|
1479
|
-
This is a convenience method that runs the full pipeline with the wizard's
|
|
1480
|
-
current configuration. It performs standalone analysis of the samples/studies
|
|
1481
|
-
as proposed by the Wizard.
|
|
1482
|
-
|
|
1483
|
-
Returns:
|
|
1484
|
-
True if execution completed successfully, False otherwise
|
|
1485
|
-
"""
|
|
1486
|
-
self._log_progress("Executing Wizard automated processing...")
|
|
1487
|
-
return self.run_full_pipeline()
|
|
1488
|
-
|
|
1489
|
-
def export_script(self, filename: str) -> bool:
|
|
1490
|
-
"""
|
|
1491
|
-
Generate a Python script that replicates the wizard's processing steps.
|
|
1492
|
-
|
|
1493
|
-
Creates a standalone Python script that can be executed independently
|
|
1494
|
-
to perform the same analysis as the wizard with the current configuration.
|
|
1495
|
-
The script will be saved in the study folder.
|
|
1496
|
-
|
|
1497
|
-
This is useful for:
|
|
1498
|
-
- Creating reproducible analysis scripts
|
|
1499
|
-
- Customizing processing steps
|
|
1500
|
-
- Running analysis in different environments
|
|
1501
|
-
- Batch processing automation
|
|
1502
|
-
|
|
1503
|
-
Parameters:
|
|
1504
|
-
filename: Filename for the script (should end with .py). Script will be saved in the study folder.
|
|
1505
|
-
|
|
1506
|
-
Returns:
|
|
1507
|
-
True if script was generated successfully, False otherwise
|
|
1508
|
-
"""
|
|
1509
|
-
self._log_progress("Generating analysis script...")
|
|
1510
|
-
|
|
1511
|
-
try:
|
|
1512
|
-
# Ensure the filename is just a filename, not a full path
|
|
1513
|
-
script_filename = Path(filename).name
|
|
1514
|
-
if not script_filename.endswith('.py'):
|
|
1515
|
-
script_filename = script_filename.replace(Path(script_filename).suffix, '') + '.py'
|
|
1516
|
-
|
|
1517
|
-
# Place the script in the study folder
|
|
1518
|
-
script_path = self.study_folder_path / script_filename
|
|
1519
|
-
|
|
1520
|
-
# Generate the script content
|
|
1521
|
-
script_content = self._generate_script_content()
|
|
1522
|
-
|
|
1523
|
-
# Write the script
|
|
1524
|
-
with open(script_path, 'w', encoding='utf-8') as f:
|
|
1525
|
-
f.write(script_content)
|
|
1526
|
-
|
|
1527
|
-
self._log_progress(f"Analysis script saved: {os.path.abspath(script_path)}")
|
|
1528
|
-
self.logger.info(f"Generated standalone analysis script: {os.path.abspath(script_path)}")
|
|
1529
|
-
|
|
1530
|
-
return True
|
|
1531
|
-
|
|
1532
|
-
except Exception as e:
|
|
1533
|
-
self.logger.error(f"Failed to generate script: {e}")
|
|
1534
|
-
return False
|
|
1535
|
-
|
|
1536
|
-
def to_script(self, filename: str) -> bool:
|
|
1537
|
-
"""
|
|
1538
|
-
[DEPRECATED] Use export_script() instead.
|
|
1539
|
-
|
|
1540
|
-
Backward compatibility alias for export_script().
|
|
1541
|
-
"""
|
|
1542
|
-
return self.export_script(filename)
|
|
1543
|
-
|
|
1544
|
-
def _generate_script_content(self) -> str:
|
|
1545
|
-
"""
|
|
1546
|
-
Generate the content for the standalone analysis script.
|
|
1547
|
-
|
|
1548
|
-
Returns:
|
|
1549
|
-
Complete Python script content as string
|
|
1550
|
-
"""
|
|
1551
|
-
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
1552
|
-
|
|
1553
|
-
# Create script header
|
|
1554
|
-
script_lines = [
|
|
1555
|
-
'#!/usr/bin/env python3',
|
|
1556
|
-
'"""',
|
|
1557
|
-
'Standalone Mass Spectrometry Analysis Script',
|
|
1558
|
-
f'Generated by masster.Wizard on {timestamp}',
|
|
504
|
+
# Add the main function and pipeline
|
|
505
|
+
script_lines.extend([
|
|
1559
506
|
'',
|
|
1560
|
-
'This script replicates the automated processing pipeline configured',
|
|
1561
|
-
'in the Wizard with the following settings:',
|
|
1562
|
-
f'- Data Source: {self.params.data_source.replace(chr(92), chr(92)*2)}',
|
|
1563
|
-
f'- Study Folder: {self.params.study_folder.replace(chr(92), chr(92)*2)}',
|
|
1564
|
-
f'- Polarity: {self.params.polarity}',
|
|
1565
|
-
f'- Number of Cores: {self.params.num_cores}',
|
|
1566
|
-
f'- Adducts: {", ".join(self.params.adducts)}',
|
|
1567
|
-
f'- RT Tolerance: {self.params.rt_tolerance}s',
|
|
1568
|
-
f'- m/z Max Diff: {self.params.mz_max_diff} Da',
|
|
1569
|
-
f'- Merge Method: {self.params.merge_method}',
|
|
1570
|
-
'"""',
|
|
1571
|
-
'',
|
|
1572
|
-
'import os',
|
|
1573
|
-
'import sys',
|
|
1574
|
-
'import time',
|
|
1575
|
-
'import multiprocessing',
|
|
1576
|
-
'from pathlib import Path',
|
|
1577
|
-
'from typing import List, Optional',
|
|
1578
|
-
'import concurrent.futures',
|
|
1579
|
-
'from datetime import datetime',
|
|
1580
507
|
'',
|
|
1581
|
-
'
|
|
1582
|
-
'
|
|
1583
|
-
'
|
|
1584
|
-
'
|
|
1585
|
-
' possible_paths = [',
|
|
1586
|
-
' Path(__file__).parent.parent, # Script is in masster subfolder',
|
|
1587
|
-
' Path(__file__).parent.parent.parent, # Script is in study folder',
|
|
1588
|
-
' Path(os.getcwd()), # Current working directory',
|
|
1589
|
-
' Path(os.getcwd()).parent, # Parent of current directory',
|
|
1590
|
-
' Path(r"D:\\SW\\massistant"), # Specific development path',
|
|
1591
|
-
' Path.home() / "massistant", # Home directory',
|
|
1592
|
-
' Path.home() / "SW" / "massistant", # Common dev location',
|
|
1593
|
-
' ]',
|
|
508
|
+
'def discover_raw_files(source_folder, file_extensions, search_subfolders=True):',
|
|
509
|
+
' """Discover raw data files in the source folder."""',
|
|
510
|
+
' source_path = Path(source_folder)',
|
|
511
|
+
' raw_files = []',
|
|
1594
512
|
' ',
|
|
1595
|
-
'
|
|
1596
|
-
'
|
|
1597
|
-
'
|
|
1598
|
-
'
|
|
1599
|
-
' if str(possible_path) not in sys.path:',
|
|
1600
|
-
' sys.path.insert(0, str(possible_path))',
|
|
1601
|
-
' masster_found = True',
|
|
1602
|
-
' print(f"Found masster at: {possible_path}")',
|
|
1603
|
-
' break',
|
|
1604
|
-
' ',
|
|
1605
|
-
' if not masster_found:',
|
|
1606
|
-
' # Try adding current directory to path as fallback',
|
|
1607
|
-
' current_dir = Path(os.getcwd())',
|
|
1608
|
-
' if str(current_dir) not in sys.path:',
|
|
1609
|
-
' sys.path.insert(0, str(current_dir))',
|
|
1610
|
-
' ',
|
|
1611
|
-
' import masster',
|
|
1612
|
-
'except ImportError as e:',
|
|
1613
|
-
' print(f"Error: masster library not found. {e}")',
|
|
1614
|
-
' print("Please ensure masster is installed or run this script from the masster directory.")',
|
|
1615
|
-
' print("You can install masster with: pip install -e .")',
|
|
1616
|
-
' sys.exit(1)',
|
|
1617
|
-
'',
|
|
1618
|
-
'',
|
|
1619
|
-
'def infer_polarity_from_first_file():',
|
|
1620
|
-
' """Infer polarity from the first available raw data file."""',
|
|
1621
|
-
' try:',
|
|
1622
|
-
' data_source_path = Path(DATA_SOURCE)',
|
|
1623
|
-
' # Find first file',
|
|
1624
|
-
' for extension in [\'.wiff\', \'.raw\', \'.mzML\', \'.d\']:',
|
|
1625
|
-
' pattern = f"**/*{extension}"',
|
|
1626
|
-
' files = list(data_source_path.rglob(pattern))',
|
|
1627
|
-
' if files:',
|
|
1628
|
-
' first_file = files[0]',
|
|
1629
|
-
' break',
|
|
513
|
+
' for ext in file_extensions:',
|
|
514
|
+
' if search_subfolders:',
|
|
515
|
+
' pattern = f"**/*{ext}"',
|
|
516
|
+
' files = list(source_path.rglob(pattern))',
|
|
1630
517
|
' else:',
|
|
1631
|
-
'
|
|
1632
|
-
'
|
|
1633
|
-
'
|
|
1634
|
-
' if first_file.suffix.lower() == \'.wiff\':',
|
|
1635
|
-
' from masster.sample.load import _wiff_to_dict',
|
|
1636
|
-
' ',
|
|
1637
|
-
' # Extract metadata from first file',
|
|
1638
|
-
' metadata_df = _wiff_to_dict(str(first_file))',
|
|
1639
|
-
' ',
|
|
1640
|
-
' if not metadata_df.empty and \'polarity\' in metadata_df.columns:',
|
|
1641
|
-
' # Get polarity from first experiment',
|
|
1642
|
-
' first_polarity = metadata_df[\'polarity\'].iloc[0]',
|
|
1643
|
-
' ',
|
|
1644
|
-
' # Convert numeric polarity codes to string',
|
|
1645
|
-
' if first_polarity == 1 or str(first_polarity).lower() in [\'positive\', \'pos\', \'+\']:',
|
|
1646
|
-
' return "positive"',
|
|
1647
|
-
' elif first_polarity == -1 or str(first_polarity).lower() in [\'negative\', \'neg\', \'-\']:',
|
|
1648
|
-
' return "negative"',
|
|
1649
|
-
' except Exception:',
|
|
1650
|
-
' pass',
|
|
1651
|
-
' return None',
|
|
1652
|
-
'',
|
|
1653
|
-
'',
|
|
1654
|
-
'# Configuration Parameters',
|
|
1655
|
-
f'DATA_SOURCE = r"{self.params.data_source}"',
|
|
1656
|
-
f'STUDY_FOLDER = r"{self.params.study_folder}"',
|
|
1657
|
-
'',
|
|
1658
|
-
'# Auto-infer polarity from first file, fall back to default',
|
|
1659
|
-
'detected_polarity = infer_polarity_from_first_file()',
|
|
1660
|
-
f'POLARITY = detected_polarity or "{self.params.polarity}"',
|
|
1661
|
-
'NUM_CORES = max(1, int(multiprocessing.cpu_count() * 0.75)) # Auto-detect 75% of cores',
|
|
1662
|
-
'',
|
|
1663
|
-
'# Set adducts based on detected polarity',
|
|
1664
|
-
'if POLARITY.lower() in ["positive", "pos"]:',
|
|
1665
|
-
' ADDUCTS = ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"]',
|
|
1666
|
-
'elif POLARITY.lower() in ["negative", "neg"]:',
|
|
1667
|
-
' ADDUCTS = ["H-1:-:1.0", "CH2O2:0:0.5"]',
|
|
1668
|
-
'else:',
|
|
1669
|
-
f' ADDUCTS = {self.params.adducts!r} # Fall back to original',
|
|
1670
|
-
f'RT_TOLERANCE = {self.params.rt_tolerance}',
|
|
1671
|
-
f'MZ_TOLERANCE = {self.params.mz_max_diff}',
|
|
1672
|
-
f'MERGE_METHOD = "{self.params.merge_method}"',
|
|
1673
|
-
f'BATCH_SIZE = {self.params.batch_size}',
|
|
1674
|
-
f'CHROM_FWHM = {self.params.chrom_fwhm}',
|
|
1675
|
-
f'NOISE_THRESHOLD = {self.params.noise_threshold}',
|
|
1676
|
-
f'CHROM_PEAK_SNR = {self.params.chrom_peak_snr}',
|
|
1677
|
-
f'TOL_PPM = {self.params.tol_ppm}',
|
|
1678
|
-
f'MIN_SAMPLES_FOR_MERGE = {self.params.min_samples_for_merge}',
|
|
1679
|
-
'',
|
|
1680
|
-
'# File discovery settings',
|
|
1681
|
-
"FILE_EXTENSIONS = ['.wiff', '.raw', '.mzML']",
|
|
1682
|
-
f'SEARCH_SUBFOLDERS = {self.params.search_subfolders}',
|
|
1683
|
-
"SKIP_PATTERNS = []",
|
|
1684
|
-
f'MAX_FILE_SIZE_GB = {self.params.max_file_size_gb}',
|
|
1685
|
-
'',
|
|
1686
|
-
'# Output settings',
|
|
1687
|
-
f'GENERATE_PLOTS = {self.params.generate_plots}',
|
|
1688
|
-
f'EXPORT_FORMATS = {self.params.export_formats!r}',
|
|
1689
|
-
f'COMPRESS_OUTPUT = {self.params.compress_output}',
|
|
1690
|
-
f'CLEANUP_TEMP_FILES = {self.params.cleanup_temp_files}',
|
|
1691
|
-
'',
|
|
1692
|
-
'',
|
|
1693
|
-
'def log_progress(message: str):',
|
|
1694
|
-
' """Log progress message with timestamp."""',
|
|
1695
|
-
' timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")',
|
|
1696
|
-
' print(f"[{timestamp}] {message}")',
|
|
1697
|
-
'',
|
|
1698
|
-
'',
|
|
1699
|
-
'def discover_files() -> List[Path]:',
|
|
1700
|
-
' """Discover raw data files in the source directory."""',
|
|
1701
|
-
' log_progress("Discovering raw data files...")',
|
|
1702
|
-
' data_source_path = Path(DATA_SOURCE)',
|
|
1703
|
-
' found_files = []',
|
|
1704
|
-
' ',
|
|
1705
|
-
' for extension in FILE_EXTENSIONS:',
|
|
1706
|
-
' if SEARCH_SUBFOLDERS:',
|
|
1707
|
-
' pattern = f"**/*{extension}"',
|
|
1708
|
-
' files = list(data_source_path.rglob(pattern))',
|
|
1709
|
-
' else:',
|
|
1710
|
-
' pattern = f"*{extension}"',
|
|
1711
|
-
' files = list(data_source_path.glob(pattern))',
|
|
1712
|
-
' ',
|
|
1713
|
-
' # Filter out files matching skip patterns',
|
|
1714
|
-
' filtered_files = []',
|
|
1715
|
-
' for file_path in files:',
|
|
1716
|
-
' skip_file = False',
|
|
1717
|
-
' for pattern in SKIP_PATTERNS:',
|
|
1718
|
-
' if pattern.lower() in file_path.name.lower():',
|
|
1719
|
-
' skip_file = True',
|
|
1720
|
-
' print(f"Skipping file (matches pattern \'{pattern}\'): {file_path.name}")',
|
|
1721
|
-
' break',
|
|
1722
|
-
' ',
|
|
1723
|
-
' if not skip_file:',
|
|
1724
|
-
' # Check file size',
|
|
1725
|
-
' try:',
|
|
1726
|
-
' file_size_gb = file_path.stat().st_size / (1024**3)',
|
|
1727
|
-
' if file_size_gb > MAX_FILE_SIZE_GB:',
|
|
1728
|
-
' print(f"Large file ({file_size_gb:.1f}GB): {file_path.name}")',
|
|
1729
|
-
' filtered_files.append(file_path)',
|
|
1730
|
-
' except Exception as e:',
|
|
1731
|
-
' print(f"Could not check file size for {file_path}: {e}")',
|
|
1732
|
-
' filtered_files.append(file_path)',
|
|
1733
|
-
' ',
|
|
1734
|
-
' found_files.extend(filtered_files)',
|
|
1735
|
-
' log_progress(f"Found {len(filtered_files)} {extension} files")',
|
|
518
|
+
' pattern = f"*{ext}"',
|
|
519
|
+
' files = list(source_path.glob(pattern))',
|
|
520
|
+
' raw_files.extend(files)',
|
|
1736
521
|
' ',
|
|
1737
|
-
'
|
|
1738
|
-
' found_files = sorted(list(set(found_files)))',
|
|
1739
|
-
' log_progress(f"Total files discovered: {len(found_files)}")',
|
|
1740
|
-
' return found_files',
|
|
522
|
+
' return raw_files',
|
|
1741
523
|
'',
|
|
1742
524
|
'',
|
|
1743
|
-
'def process_single_file(
|
|
1744
|
-
' """Process a single file to sample5 format."""',
|
|
1745
|
-
'
|
|
1746
|
-
'
|
|
1747
|
-
' ',
|
|
1748
|
-
' # Generate output filename',
|
|
1749
|
-
' file_out = file_path.stem + ".sample5"',
|
|
1750
|
-
' output_file = study_folder_path / file_out',
|
|
1751
|
-
' ',
|
|
1752
|
-
' # Check if file already exists',
|
|
1753
|
-
' if output_file.exists():',
|
|
1754
|
-
' try:',
|
|
1755
|
-
' # Try to load existing file to verify it\'s valid',
|
|
1756
|
-
' sample = masster.Sample(log_level="ERROR")',
|
|
1757
|
-
' sample.load(str(output_file))',
|
|
1758
|
-
' print(f"Skipping {file_path.name} (already processed)")',
|
|
1759
|
-
' return output_file.stem',
|
|
1760
|
-
' except Exception:',
|
|
1761
|
-
' # If loading fails, file needs to be reprocessed',
|
|
1762
|
-
' pass',
|
|
1763
|
-
' ',
|
|
1764
|
-
' print(f"Processing {file_path.name}")',
|
|
525
|
+
'def process_single_file(args):',
|
|
526
|
+
' """Process a single raw file to sample5 format - module level for multiprocessing."""',
|
|
527
|
+
' raw_file, output_folder = args',
|
|
528
|
+
' from masster.sample import Sample',
|
|
1765
529
|
' ',
|
|
1766
530
|
' try:',
|
|
1767
|
-
' #
|
|
1768
|
-
'
|
|
1769
|
-
'
|
|
1770
|
-
' log_level="ERROR" # Reduce logging overhead',
|
|
1771
|
-
' )',
|
|
531
|
+
' # Create sample5 filename',
|
|
532
|
+
' sample_name = raw_file.stem',
|
|
533
|
+
' sample5_path = Path(output_folder) / f"{sample_name}.sample5"',
|
|
1772
534
|
' ',
|
|
1773
|
-
' #
|
|
1774
|
-
'
|
|
535
|
+
' # Skip if sample5 already exists',
|
|
536
|
+
' if sample5_path.exists():',
|
|
537
|
+
' print(f" Skipping {raw_file.name} (sample5 already exists)")',
|
|
538
|
+
' return str(sample5_path)',
|
|
1775
539
|
' ',
|
|
1776
|
-
'
|
|
1777
|
-
' sample.find_features(',
|
|
1778
|
-
' chrom_fwhm=CHROM_FWHM,',
|
|
1779
|
-
' noise=NOISE_THRESHOLD,',
|
|
1780
|
-
' tol_ppm=TOL_PPM,',
|
|
1781
|
-
' chrom_peak_snr=CHROM_PEAK_SNR,',
|
|
1782
|
-
' min_trace_length_multiplier=0.5,',
|
|
1783
|
-
' chrom_fwhm_min=CHROM_FWHM',
|
|
1784
|
-
' )',
|
|
540
|
+
' print(f" Converting {raw_file.name}...")',
|
|
1785
541
|
' ',
|
|
1786
|
-
' #
|
|
542
|
+
' # Load and process raw file with full pipeline',
|
|
543
|
+
' sample = Sample(log_label=sample_name)',
|
|
544
|
+
' sample.load(filename=str(raw_file))',
|
|
1787
545
|
' sample.find_features(',
|
|
1788
|
-
'
|
|
1789
|
-
'
|
|
1790
|
-
'
|
|
546
|
+
' noise=PARAMS[\'noise\'],',
|
|
547
|
+
' chrom_fwhm=PARAMS[\'chrom_fwhm\'],',
|
|
548
|
+
' chrom_peak_snr=PARAMS[\'chrom_peak_snr\']',
|
|
1791
549
|
' )',
|
|
1792
|
-
' ',
|
|
1793
|
-
' # STEP 3.5: Validate feature detection results',
|
|
1794
|
-
' if not hasattr(sample, "features_df") or sample.features_df is None or len(sample.features_df) == 0:',
|
|
1795
|
-
' print(f"WARNING: No features detected in {file_path.name} - skipping additional processing")',
|
|
1796
|
-
' # Still save the sample5 file for record keeping',
|
|
1797
|
-
' sample.save(filename=str(output_file))',
|
|
1798
|
-
' return output_file.stem',
|
|
1799
|
-
' ',
|
|
1800
|
-
' print(f"Detected {len(sample.features_df)} features in {file_path.name}")',
|
|
1801
|
-
' ',
|
|
1802
|
-
' # STEP 4: Adduct detection',
|
|
1803
|
-
' sample.find_adducts(adducts=ADDUCTS)',
|
|
1804
|
-
' ',
|
|
1805
|
-
' # STEP 5: MS2 spectrum identification',
|
|
550
|
+
' sample.find_adducts(adducts=PARAMS[\'adducts\'])',
|
|
1806
551
|
' sample.find_ms2()',
|
|
552
|
+
' # sample.find_iso()',
|
|
553
|
+
' # sample.export_mgf()',
|
|
554
|
+
' # sample.export_mztab()',
|
|
555
|
+
' # sample.plot_2d(filename="{sample_name}.html")',
|
|
556
|
+
' sample.save(str(sample5_path))',
|
|
1807
557
|
' ',
|
|
1808
|
-
' #
|
|
1809
|
-
'
|
|
1810
|
-
' ',
|
|
1811
|
-
' # STEP 7: Generate additional outputs (only for samples with features)',
|
|
1812
|
-
' # Skip CSV export and individual MGF export as requested',
|
|
1813
|
-
' ',
|
|
1814
|
-
' if GENERATE_PLOTS:',
|
|
1815
|
-
' plot_file = output_file.parent / (output_file.stem + "_2d.html")',
|
|
1816
|
-
' sample.plot_2d(filename=str(plot_file), markersize=4)',
|
|
1817
|
-
' ',
|
|
1818
|
-
' # Memory cleanup',
|
|
1819
|
-
' result = output_file.stem',
|
|
1820
|
-
' del sample',
|
|
1821
|
-
' gc.collect()',
|
|
1822
|
-
' return result',
|
|
558
|
+
' # print(f" Completed {raw_file.name} -> {sample5_path.name}")',
|
|
559
|
+
' return str(sample5_path)',
|
|
1823
560
|
' ',
|
|
1824
561
|
' except Exception as e:',
|
|
1825
|
-
' print(f"
|
|
1826
|
-
' gc.collect()',
|
|
562
|
+
' print(f" ERROR processing {raw_file.name}: {e}")',
|
|
1827
563
|
' return None',
|
|
1828
564
|
'',
|
|
1829
565
|
'',
|
|
1830
|
-
'def
|
|
1831
|
-
' """Convert raw data files to sample5 format
|
|
1832
|
-
'
|
|
1833
|
-
'
|
|
566
|
+
'def convert_raw_to_sample5(raw_files, output_folder, polarity, num_cores):',
|
|
567
|
+
' """Convert raw data files to sample5 format."""',
|
|
568
|
+
' import concurrent.futures',
|
|
569
|
+
' import os',
|
|
570
|
+
' ',
|
|
571
|
+
' # Create output directory',
|
|
572
|
+
' os.makedirs(output_folder, exist_ok=True)',
|
|
1834
573
|
' ',
|
|
1835
|
-
'
|
|
1836
|
-
'
|
|
1837
|
-
' failed_count = 0',
|
|
574
|
+
' # Prepare arguments for multiprocessing',
|
|
575
|
+
' file_args = [(raw_file, output_folder) for raw_file in raw_files]',
|
|
1838
576
|
' ',
|
|
1839
|
-
'
|
|
1840
|
-
'
|
|
577
|
+
' # Process files in parallel',
|
|
578
|
+
' sample5_files = []',
|
|
579
|
+
' with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:',
|
|
580
|
+
' futures = [executor.submit(process_single_file, args) for args in file_args]',
|
|
1841
581
|
' ',
|
|
1842
|
-
' for
|
|
582
|
+
' for future in concurrent.futures.as_completed(futures):',
|
|
1843
583
|
' result = future.result()',
|
|
1844
584
|
' if result:',
|
|
1845
|
-
'
|
|
1846
|
-
' else:',
|
|
1847
|
-
' failed_count += 1',
|
|
1848
|
-
' ',
|
|
1849
|
-
' # Progress update',
|
|
1850
|
-
' if (successful_count + failed_count) % 10 == 0:',
|
|
1851
|
-
' progress = (successful_count + failed_count) / len(file_list) * 100',
|
|
1852
|
-
' log_progress(f"Progress: {progress:.1f}% ({successful_count} successful, {failed_count} failed)")',
|
|
585
|
+
' sample5_files.append(result)',
|
|
1853
586
|
' ',
|
|
1854
|
-
'
|
|
1855
|
-
' log_progress("=== Sample5 Conversion Complete ===")',
|
|
1856
|
-
' log_progress(f"Successful: {successful_count}")',
|
|
1857
|
-
' log_progress(f"Failed: {failed_count}")',
|
|
1858
|
-
' log_progress(f"Total time: {conversion_time:.1f} seconds")',
|
|
1859
|
-
' ',
|
|
1860
|
-
' return successful_count > 0',
|
|
1861
|
-
'',
|
|
1862
|
-
'',
|
|
1863
|
-
'def assemble_study() -> masster.Study:',
|
|
1864
|
-
' """Assemble processed sample5 files into a study."""',
|
|
1865
|
-
' log_progress("=== Starting Study Assembly ===")',
|
|
1866
|
-
' study_folder_path = Path(STUDY_FOLDER)',
|
|
1867
|
-
' ',
|
|
1868
|
-
' # Find all sample5 files',
|
|
1869
|
-
' sample5_files = list(study_folder_path.glob("*.sample5"))',
|
|
1870
|
-
' if not sample5_files:',
|
|
1871
|
-
' raise RuntimeError("No sample5 files found for study assembly")',
|
|
1872
|
-
' ',
|
|
1873
|
-
' log_progress(f"Assembling study from {len(sample5_files)} sample5 files")',
|
|
1874
|
-
' ',
|
|
1875
|
-
' # Create study with optimized settings',
|
|
1876
|
-
' from masster.study.defaults.study_def import study_defaults',
|
|
1877
|
-
' study_params = study_defaults(',
|
|
1878
|
-
' folder=str(study_folder_path),',
|
|
1879
|
-
' polarity=POLARITY,',
|
|
1880
|
-
' log_level="INFO",',
|
|
1881
|
-
f' log_label="Study-{self.params.polarity}",',
|
|
1882
|
-
' adducts=ADDUCTS',
|
|
1883
|
-
' )',
|
|
1884
|
-
' ',
|
|
1885
|
-
' study = masster.Study(params=study_params)',
|
|
1886
|
-
' ',
|
|
1887
|
-
' # Add all sample5 files',
|
|
1888
|
-
' sample5_pattern = str(study_folder_path / "*.sample5")',
|
|
1889
|
-
' study.add(sample5_pattern)',
|
|
1890
|
-
' log_progress(f"Added {len(study.samples_df)} samples to study")',
|
|
1891
|
-
' ',
|
|
1892
|
-
' # Filter features based on quality criteria',
|
|
1893
|
-
' if hasattr(study, "features_filter"):',
|
|
1894
|
-
' initial_features = len(study.features_df) if hasattr(study, "features_df") else 0',
|
|
1895
|
-
' feature_selection = study.features_select(',
|
|
1896
|
-
' chrom_coherence=0.3,',
|
|
1897
|
-
' chrom_prominence_scaled=1',
|
|
1898
|
-
' )',
|
|
1899
|
-
' study.features_filter(feature_selection)',
|
|
1900
|
-
' final_features = len(study.features_df) if hasattr(study, "features_df") else 0',
|
|
1901
|
-
' log_progress(f"Feature filtering: {initial_features} -> {final_features} features")',
|
|
1902
|
-
' ',
|
|
1903
|
-
' return study',
|
|
1904
|
-
'',
|
|
1905
|
-
'',
|
|
1906
|
-
'def align_and_merge(study: masster.Study) -> masster.Study:',
|
|
1907
|
-
' """Perform feature alignment and merging."""',
|
|
1908
|
-
' log_progress("=== Starting Feature Alignment and Merging ===")',
|
|
1909
|
-
' ',
|
|
1910
|
-
' # Import alignment and merge defaults',
|
|
1911
|
-
' from masster.study.defaults.align_def import align_defaults',
|
|
1912
|
-
' from masster.study.defaults.merge_def import merge_defaults',
|
|
1913
|
-
' ',
|
|
1914
|
-
' # Determine optimal algorithms based on study size',
|
|
1915
|
-
' num_samples = len(study.samples_df)',
|
|
1916
|
-
' ',
|
|
1917
|
-
' if num_samples < 500:',
|
|
1918
|
-
' # For smaller studies: use qt for both alignment and merge',
|
|
1919
|
-
' alignment_algorithm = "qt"',
|
|
1920
|
-
' merge_method = "qt"',
|
|
1921
|
-
' log_progress(f"Small study ({num_samples} samples) - using qt algorithms")',
|
|
1922
|
-
' else:',
|
|
1923
|
-
' # For larger studies: use kd for alignment and qt-chunked for merge',
|
|
1924
|
-
' alignment_algorithm = "kd"',
|
|
1925
|
-
' merge_method = "qt-chunked"',
|
|
1926
|
-
' log_progress(f"Large study ({num_samples} samples) - using kd alignment and qt-chunked merge")',
|
|
1927
|
-
' ',
|
|
1928
|
-
' # Align features across samples',
|
|
1929
|
-
' align_params = align_defaults(',
|
|
1930
|
-
' rt_tol=RT_TOLERANCE,',
|
|
1931
|
-
' mz_max_diff=MZ_TOLERANCE,',
|
|
1932
|
-
' algorithm=alignment_algorithm',
|
|
1933
|
-
' )',
|
|
1934
|
-
' ',
|
|
1935
|
-
' log_progress(f"Aligning features with RT tolerance {RT_TOLERANCE}s, m/z tolerance {MZ_TOLERANCE} Da, algorithm: {alignment_algorithm}")',
|
|
1936
|
-
' study.align(params=align_params)',
|
|
1937
|
-
' ',
|
|
1938
|
-
' # Merge aligned features',
|
|
1939
|
-
' merge_params = merge_defaults(',
|
|
1940
|
-
' method=merge_method,',
|
|
1941
|
-
' rt_tol=RT_TOLERANCE,',
|
|
1942
|
-
' mz_tol=MZ_TOLERANCE,',
|
|
1943
|
-
' min_samples=MIN_SAMPLES_FOR_MERGE',
|
|
1944
|
-
' )',
|
|
1945
|
-
' ',
|
|
1946
|
-
' log_progress(f"Merging features using {merge_method} method")',
|
|
1947
|
-
' study.merge(params=merge_params)',
|
|
1948
|
-
' ',
|
|
1949
|
-
' # Log results',
|
|
1950
|
-
' num_consensus = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
|
|
1951
|
-
' log_progress(f"Generated {num_consensus} consensus features")',
|
|
1952
|
-
' ',
|
|
1953
|
-
' # Get study info',
|
|
1954
|
-
' if hasattr(study, "info"):',
|
|
1955
|
-
' study.info()',
|
|
1956
|
-
' ',
|
|
1957
|
-
' return study',
|
|
1958
|
-
'',
|
|
1959
|
-
'',
|
|
1960
|
-
'def generate_plots(study: masster.Study) -> bool:',
|
|
1961
|
-
' """Generate visualization plots for the study."""',
|
|
1962
|
-
' if not GENERATE_PLOTS:',
|
|
1963
|
-
' log_progress("Plot generation disabled, skipping...")',
|
|
1964
|
-
' return True',
|
|
1965
|
-
' ',
|
|
1966
|
-
' log_progress("=== Generating Visualization Plots ===")',
|
|
1967
|
-
' study_folder_path = Path(STUDY_FOLDER)',
|
|
1968
|
-
' plots_generated = 0',
|
|
1969
|
-
' ',
|
|
1970
|
-
' try:',
|
|
1971
|
-
' # Alignment plot',
|
|
1972
|
-
' if hasattr(study, "plot_alignment"):',
|
|
1973
|
-
' alignment_plot = study_folder_path / "alignment_plot.html"',
|
|
1974
|
-
' study.plot_alignment(filename=str(alignment_plot))',
|
|
1975
|
-
' plots_generated += 1',
|
|
1976
|
-
' log_progress(f"Generated alignment plot: {alignment_plot}")',
|
|
1977
|
-
' ',
|
|
1978
|
-
' # Consensus 2D plot',
|
|
1979
|
-
' if hasattr(study, "plot_consensus_2d"):',
|
|
1980
|
-
' consensus_2d_plot = study_folder_path / "consensus_2d.html"',
|
|
1981
|
-
' study.plot_consensus_2d(filename=str(consensus_2d_plot))',
|
|
1982
|
-
' plots_generated += 1',
|
|
1983
|
-
' log_progress(f"Generated consensus 2D plot: {consensus_2d_plot}")',
|
|
1984
|
-
' ',
|
|
1985
|
-
' # PCA plot',
|
|
1986
|
-
' if hasattr(study, "plot_pca"):',
|
|
1987
|
-
' pca_plot = study_folder_path / "pca_plot.html"',
|
|
1988
|
-
' study.plot_pca(filename=str(pca_plot))',
|
|
1989
|
-
' plots_generated += 1',
|
|
1990
|
-
' log_progress(f"Generated PCA plot: {pca_plot}")',
|
|
1991
|
-
' ',
|
|
1992
|
-
' # Consensus statistics',
|
|
1993
|
-
' if hasattr(study, "plot_consensus_stats"):',
|
|
1994
|
-
' stats_plot = study_folder_path / "consensus_stats.html"',
|
|
1995
|
-
' study.plot_consensus_stats(filename=str(stats_plot))',
|
|
1996
|
-
' plots_generated += 1',
|
|
1997
|
-
' log_progress(f"Generated statistics plot: {stats_plot}")',
|
|
1998
|
-
' ',
|
|
1999
|
-
' log_progress(f"Generated {plots_generated} visualization plots")',
|
|
2000
|
-
' return True',
|
|
2001
|
-
' ',
|
|
2002
|
-
' except Exception as e:',
|
|
2003
|
-
' print(f"Failed to generate plots: {e}")',
|
|
2004
|
-
' return False',
|
|
2005
|
-
'',
|
|
2006
|
-
'',
|
|
2007
|
-
'def export_results(study: masster.Study) -> bool:',
|
|
2008
|
-
' """Export study results in requested formats."""',
|
|
2009
|
-
' log_progress("=== Exporting Study Results ===")',
|
|
2010
|
-
' study_folder_path = Path(STUDY_FOLDER)',
|
|
2011
|
-
' exports_completed = 0',
|
|
2012
|
-
' ',
|
|
2013
|
-
' try:',
|
|
2014
|
-
' # Skip CSV export as requested',
|
|
2015
|
-
' ',
|
|
2016
|
-
' # Export as Excel',
|
|
2017
|
-
' if "xlsx" in EXPORT_FORMATS and hasattr(study, "export_xlsx"):',
|
|
2018
|
-
' xlsx_file = study_folder_path / "study_results.xlsx"',
|
|
2019
|
-
' study.export_xlsx(filename=str(xlsx_file))',
|
|
2020
|
-
' exports_completed += 1',
|
|
2021
|
-
' log_progress(f"Exported Excel: {xlsx_file}")',
|
|
2022
|
-
' ',
|
|
2023
|
-
' # Export MGF for MS2 spectra',
|
|
2024
|
-
' if "mgf" in EXPORT_FORMATS and hasattr(study, "export_mgf"):',
|
|
2025
|
-
' mgf_file = study_folder_path / "consensus_ms2.mgf"',
|
|
2026
|
-
' study.export_mgf(filename=str(mgf_file))',
|
|
2027
|
-
' exports_completed += 1',
|
|
2028
|
-
' log_progress(f"Exported MGF: {mgf_file}")',
|
|
2029
|
-
' ',
|
|
2030
|
-
' # Export as Parquet for efficient storage',
|
|
2031
|
-
' if "parquet" in EXPORT_FORMATS and hasattr(study, "export_parquet"):',
|
|
2032
|
-
' parquet_file = study_folder_path / "study_data.parquet"',
|
|
2033
|
-
' study.export_parquet(filename=str(parquet_file))',
|
|
2034
|
-
' exports_completed += 1',
|
|
2035
|
-
' log_progress(f"Exported Parquet: {parquet_file}")',
|
|
2036
|
-
' ',
|
|
2037
|
-
' log_progress(f"Completed {exports_completed} exports")',
|
|
2038
|
-
' ',
|
|
2039
|
-
' # Always perform additional exports as requested',
|
|
2040
|
-
' log_progress("=== Exporting Additional Formats ===")',
|
|
2041
|
-
' ',
|
|
2042
|
-
' try:',
|
|
2043
|
-
' # Force export xlsx (study results in Excel format)',
|
|
2044
|
-
' xlsx_file = study_folder_path / "study_results.xlsx"',
|
|
2045
|
-
' if hasattr(study, "export_xlsx"):',
|
|
2046
|
-
' study.export_xlsx(filename=str(xlsx_file))',
|
|
2047
|
-
' log_progress(f"Exported Excel: {xlsx_file}")',
|
|
2048
|
-
' ',
|
|
2049
|
-
' # Force export parquet (efficient binary format)',
|
|
2050
|
-
' parquet_file = study_folder_path / "study_data.parquet"',
|
|
2051
|
-
' if hasattr(study, "export_parquet"):',
|
|
2052
|
-
' study.export_parquet(filename=str(parquet_file))',
|
|
2053
|
-
' log_progress(f"Exported Parquet: {parquet_file}")',
|
|
2054
|
-
' ',
|
|
2055
|
-
' # Force save the study in study5 format',
|
|
2056
|
-
' study_file = study_folder_path / "final_study.study5"',
|
|
2057
|
-
' study.save(filename=str(study_file))',
|
|
2058
|
-
' log_progress(f"Saved study: {study_file}")',
|
|
2059
|
-
' ',
|
|
2060
|
-
' # Force export MGF for MS2 spectra',
|
|
2061
|
-
' mgf_file = study_folder_path / "consensus_ms2.mgf"',
|
|
2062
|
-
' if hasattr(study, "export_mgf"):',
|
|
2063
|
-
' study.export_mgf(filename=str(mgf_file))',
|
|
2064
|
-
' log_progress(f"Exported MGF: {mgf_file}")',
|
|
2065
|
-
' ',
|
|
2066
|
-
' except Exception as e:',
|
|
2067
|
-
' print(f"Some additional exports failed: {e}")',
|
|
2068
|
-
' ',
|
|
2069
|
-
' return True',
|
|
2070
|
-
' ',
|
|
2071
|
-
' except Exception as e:',
|
|
2072
|
-
' print(f"Failed to export results: {e}")',
|
|
2073
|
-
' return False',
|
|
2074
|
-
'',
|
|
2075
|
-
'',
|
|
2076
|
-
'def save_study(study: masster.Study) -> bool:',
|
|
2077
|
-
' """Save the final study in optimized format."""',
|
|
2078
|
-
' log_progress("=== Saving Final Study ===")',
|
|
2079
|
-
' study_folder_path = Path(STUDY_FOLDER)',
|
|
2080
|
-
' ',
|
|
2081
|
-
' try:',
|
|
2082
|
-
' study_file = study_folder_path / "final_study.study5"',
|
|
2083
|
-
' ',
|
|
2084
|
-
' # Determine optimal save format based on study size',
|
|
2085
|
-
' num_samples = len(study.samples_df)',
|
|
2086
|
-
' num_features = len(study.consensus_df) if hasattr(study, "consensus_df") else 0',
|
|
2087
|
-
' ',
|
|
2088
|
-
' if num_samples > 50 or num_features > 10000:',
|
|
2089
|
-
' log_progress(f"Large study detected ({num_samples} samples, {num_features} features) - using compressed format")',
|
|
2090
|
-
' compress_output = True',
|
|
2091
|
-
' else:',
|
|
2092
|
-
' log_progress(f"Small study ({num_samples} samples, {num_features} features) - using standard format")',
|
|
2093
|
-
' compress_output = False',
|
|
2094
|
-
' ',
|
|
2095
|
-
' # Save study',
|
|
2096
|
-
' if compress_output and hasattr(study, "save_compressed"):',
|
|
2097
|
-
' study.save_compressed(filename=str(study_file))',
|
|
2098
|
-
' log_progress(f"Saved compressed study: {study_file}")',
|
|
2099
|
-
' else:',
|
|
2100
|
-
' study.save(filename=str(study_file))',
|
|
2101
|
-
' log_progress(f"Saved study: {study_file}")',
|
|
2102
|
-
' ',
|
|
2103
|
-
' # Save metadata summary',
|
|
2104
|
-
' metadata_file = study_folder_path / "study_metadata.txt"',
|
|
2105
|
-
' with open(metadata_file, "w") as f:',
|
|
2106
|
-
' f.write("Study Processing Summary\\n")',
|
|
2107
|
-
' f.write("========================\\n")',
|
|
2108
|
-
' f.write(f"Processing Date: {datetime.now().strftime(\'%Y-%m-%d %H:%M:%S\')}\\n")',
|
|
2109
|
-
' f.write(f"Polarity: {POLARITY}\\n")',
|
|
2110
|
-
' f.write(f"Adducts: {\', \'.join(ADDUCTS)}\\n")',
|
|
2111
|
-
' f.write(f"Number of Samples: {num_samples}\\n")',
|
|
2112
|
-
' f.write(f"Number of Consensus Features: {num_features}\\n")',
|
|
2113
|
-
' f.write(f"RT Tolerance: {RT_TOLERANCE}s\\n")',
|
|
2114
|
-
' f.write(f"m/z Tolerance: {MZ_TOLERANCE} Da\\n")',
|
|
2115
|
-
' f.write(f"Merge Method: {MERGE_METHOD}\\n")',
|
|
2116
|
-
' ',
|
|
2117
|
-
' log_progress(f"Saved study metadata: {metadata_file}")',
|
|
2118
|
-
' return True',
|
|
2119
|
-
' ',
|
|
2120
|
-
' except Exception as e:',
|
|
2121
|
-
' print(f"Failed to save study: {e}")',
|
|
2122
|
-
' return False',
|
|
2123
|
-
'',
|
|
2124
|
-
'',
|
|
2125
|
-
'def cleanup_temp_files() -> bool:',
|
|
2126
|
-
' """Clean up temporary files if requested."""',
|
|
2127
|
-
' if not CLEANUP_TEMP_FILES:',
|
|
2128
|
-
' return True',
|
|
2129
|
-
' ',
|
|
2130
|
-
' log_progress("=== Cleaning Up Temporary Files ===")',
|
|
2131
|
-
' study_folder_path = Path(STUDY_FOLDER)',
|
|
2132
|
-
' ',
|
|
2133
|
-
' try:',
|
|
2134
|
-
' cleaned_count = 0',
|
|
2135
|
-
' ',
|
|
2136
|
-
' # Remove individual sample plots if study plots were generated',
|
|
2137
|
-
' if GENERATE_PLOTS:',
|
|
2138
|
-
' temp_plots = list(study_folder_path.glob("*_2d.html"))',
|
|
2139
|
-
' for plot_file in temp_plots:',
|
|
2140
|
-
' if plot_file.name not in ["alignment_plot.html", "consensus_2d.html", "pca_plot.html"]:',
|
|
2141
|
-
' plot_file.unlink()',
|
|
2142
|
-
' cleaned_count += 1',
|
|
2143
|
-
' ',
|
|
2144
|
-
' log_progress(f"Cleaned up {cleaned_count} temporary files")',
|
|
2145
|
-
' return True',
|
|
2146
|
-
' ',
|
|
2147
|
-
' except Exception as e:',
|
|
2148
|
-
' print(f"Failed to cleanup temp files: {e}")',
|
|
2149
|
-
' return False',
|
|
587
|
+
' return sample5_files',
|
|
2150
588
|
'',
|
|
2151
589
|
'',
|
|
2152
590
|
'def main():',
|
|
2153
|
-
' """Main
|
|
2154
|
-
' print("=" * 70)',
|
|
2155
|
-
' print("AUTOMATED MASS SPECTROMETRY ANALYSIS SCRIPT")',
|
|
2156
|
-
f' print("Generated by masster.Wizard on {timestamp}")',
|
|
2157
|
-
' print("=" * 70)',
|
|
2158
|
-
' ',
|
|
2159
|
-
' start_time = time.time()',
|
|
2160
|
-
' ',
|
|
591
|
+
' """Main analysis pipeline."""',
|
|
2161
592
|
' try:',
|
|
2162
|
-
'
|
|
2163
|
-
'
|
|
593
|
+
' print("=" * 70)',
|
|
594
|
+
f' print("masster {version} - Automated MS Data Analysis")',
|
|
595
|
+
' print("=" * 70)',
|
|
596
|
+
' print(f"Source: {PARAMS[\'source\']}")',
|
|
597
|
+
' print(f"Output: {PARAMS[\'folder\']}")',
|
|
598
|
+
' print(f"Polarity: {PARAMS[\'polarity\']}")',
|
|
599
|
+
' print(f"CPU Cores: {PARAMS[\'num_cores\']}")',
|
|
600
|
+
' print("=" * 70)',
|
|
2164
601
|
' ',
|
|
2165
|
-
'
|
|
2166
|
-
'
|
|
2167
|
-
'
|
|
2168
|
-
'
|
|
2169
|
-
'
|
|
602
|
+
' start_time = time.time()',
|
|
603
|
+
' ',
|
|
604
|
+
' # Step 1: Discover raw data files',
|
|
605
|
+
' print("\\nStep 1/7: Discovering raw data files...")',
|
|
606
|
+
' raw_files = discover_raw_files(',
|
|
607
|
+
' PARAMS[\'source\'],',
|
|
608
|
+
' PARAMS[\'file_extensions\'],',
|
|
609
|
+
' PARAMS[\'search_subfolders\']',
|
|
610
|
+
' )',
|
|
2170
611
|
' ',
|
|
2171
|
-
'
|
|
2172
|
-
'
|
|
2173
|
-
' print("Sample5 conversion failed")',
|
|
612
|
+
' if not raw_files:',
|
|
613
|
+
' print("No raw data files found!")',
|
|
2174
614
|
' return False',
|
|
2175
615
|
' ',
|
|
2176
|
-
'
|
|
2177
|
-
'
|
|
616
|
+
' print(f"Found {len(raw_files)} raw data files")',
|
|
617
|
+
' for f in raw_files[:5]: # Show first 5 files',
|
|
618
|
+
' print(f" {f.name}")',
|
|
619
|
+
' if len(raw_files) > 5:',
|
|
620
|
+
' print(f" ... and {len(raw_files) - 5} more")',
|
|
621
|
+
' ',
|
|
622
|
+
' # Step 2: Process raw files',
|
|
623
|
+
' print("\\nStep 2/7: Processing raw files...")',
|
|
624
|
+
' sample5_files = convert_raw_to_sample5(',
|
|
625
|
+
' raw_files,',
|
|
626
|
+
' PARAMS[\'folder\'],',
|
|
627
|
+
' PARAMS[\'polarity\'],',
|
|
628
|
+
' PARAMS[\'num_cores\']',
|
|
629
|
+
' )',
|
|
2178
630
|
' ',
|
|
2179
|
-
'
|
|
2180
|
-
'
|
|
631
|
+
' if not sample5_files:',
|
|
632
|
+
' print("No sample5 files were created!")',
|
|
633
|
+
' return False',
|
|
2181
634
|
' ',
|
|
2182
|
-
'
|
|
2183
|
-
' generate_plots(study)',
|
|
635
|
+
' print(f"Successfully processed {len(sample5_files)} files to sample5")',
|
|
2184
636
|
' ',
|
|
2185
|
-
' # Step
|
|
2186
|
-
'
|
|
637
|
+
' # Step 3: Create and configure study',
|
|
638
|
+
' print("\\nStep 3/7: Initializing study...")',
|
|
639
|
+
' study = Study(folder=PARAMS[\'folder\'])',
|
|
640
|
+
' study.polarity = PARAMS[\'polarity\']',
|
|
641
|
+
' study.adducts = PARAMS[\'adducts\']',
|
|
2187
642
|
' ',
|
|
2188
|
-
' # Step
|
|
2189
|
-
'
|
|
643
|
+
' # Step 4: Add sample5 files to study',
|
|
644
|
+
' print("\\nStep 4/7: Adding samples to study...")',
|
|
645
|
+
' study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
|
|
2190
646
|
' ',
|
|
2191
|
-
' # Step
|
|
2192
|
-
'
|
|
647
|
+
' # Step 5: Core processing',
|
|
648
|
+
' print("\\nStep 5/7: Processing...")',
|
|
649
|
+
' study.align(',
|
|
650
|
+
' algorithm=PARAMS[\'alignment_method\'],',
|
|
651
|
+
' rt_tol=PARAMS[\'rt_tol\']',
|
|
652
|
+
' )',
|
|
2193
653
|
' ',
|
|
2194
|
-
' #
|
|
654
|
+
' # Merge and create consensus features',
|
|
655
|
+
' study.merge(',
|
|
656
|
+
' min_samples=PARAMS[\'min_samples_per_feature\'],',
|
|
657
|
+
' threads=PARAMS[\'num_cores\'],',
|
|
658
|
+
' rt_tol=PARAMS[\'rt_tol\'],',
|
|
659
|
+
' mz_tol=PARAMS[\'mz_tol\']',
|
|
660
|
+
' )',
|
|
661
|
+
' study.find_iso()',
|
|
662
|
+
' study.fill(min_samples_rel=0.0)',
|
|
663
|
+
' study.integrate()',
|
|
664
|
+
' ',
|
|
665
|
+
' # Step 6/7: Saving results',
|
|
666
|
+
' print("\\nStep 6/7: Saving results...")',
|
|
667
|
+
' study.save()',
|
|
668
|
+
' study.export_xlsx()',
|
|
669
|
+
' study.export_mgf()',
|
|
670
|
+
' study.export_mztab()',
|
|
671
|
+
' ',
|
|
672
|
+
' # Step 7: Plots',
|
|
673
|
+
' print("\\nStep 7/7: Exporting plots...")',
|
|
674
|
+
' study.plot_consensus_2d(filename="consensus.html")',
|
|
675
|
+
' study.plot_consensus_2d(filename="consensus.png")',
|
|
676
|
+
' study.plot_alignment(filename="alignment.html")',
|
|
677
|
+
' study.plot_alignment(filename="alignment.png")',
|
|
678
|
+
' study.plot_pca(filename="pca.html")',
|
|
679
|
+
' study.plot_pca(filename="pca.png")',
|
|
680
|
+
' study.plot_bpc(filename="bpc.html")',
|
|
681
|
+
' study.plot_bpc(filename="bpc.png")',
|
|
682
|
+
' study.plot_rt_correction(filename="rt_correction.html")',
|
|
683
|
+
' study.plot_rt_correction(filename="rt_correction.png")',
|
|
684
|
+
|
|
685
|
+
' ',
|
|
686
|
+
' # Print summary',
|
|
687
|
+
' study.info()',
|
|
2195
688
|
' total_time = time.time() - start_time',
|
|
689
|
+
' print("\\n" + "=" * 70)',
|
|
690
|
+
' print("ANALYSIS COMPLETE")',
|
|
2196
691
|
' print("=" * 70)',
|
|
2197
|
-
' print("ANALYSIS COMPLETED SUCCESSFULLY")',
|
|
2198
692
|
' print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")',
|
|
693
|
+
' print(f"Raw files processed: {len(raw_files)}")',
|
|
694
|
+
' print(f"Sample5 files created: {len(sample5_files)}")',
|
|
2199
695
|
' if hasattr(study, "consensus_df"):',
|
|
2200
696
|
' print(f"Consensus features generated: {len(study.consensus_df)}")',
|
|
2201
697
|
' print("=" * 70)',
|
|
@@ -2215,43 +711,15 @@ class Wizard:
|
|
|
2215
711
|
'if __name__ == "__main__":',
|
|
2216
712
|
' success = main()',
|
|
2217
713
|
' sys.exit(0 if success else 1)',
|
|
2218
|
-
]
|
|
714
|
+
])
|
|
2219
715
|
|
|
2220
716
|
return '\n'.join(script_lines)
|
|
2221
|
-
|
|
2222
|
-
def info(self):
|
|
2223
|
-
"""Print comprehensive wizard status information."""
|
|
2224
|
-
status = self.get_status()
|
|
2225
|
-
|
|
2226
|
-
print("\n" + "=" * 50)
|
|
2227
|
-
print("WIZARD STATUS")
|
|
2228
|
-
print("=" * 50)
|
|
2229
|
-
print(f"Current Step: {status['current_step']}")
|
|
2230
|
-
print(f"Data Source: {self.params.data_source}")
|
|
2231
|
-
print(f"Study Folder: {self.params.study_folder}")
|
|
2232
|
-
print(f"Polarity: {status['parameters']['polarity']}")
|
|
2233
|
-
print(f"CPU Cores: {status['parameters']['num_cores']}")
|
|
2234
|
-
print(f"Adducts: {', '.join(status['parameters']['adducts'])}")
|
|
2235
|
-
print(f"Detector Type: {self.params.detector_type}")
|
|
2236
|
-
print(f"Noise Threshold: {self.params.noise_threshold}")
|
|
2237
|
-
print(f"Processing Time: {status['processing_time']}")
|
|
2238
|
-
print(f"Files Processed: {status['processed_files']}")
|
|
2239
|
-
print(f"Files Failed: {status['failed_files']}")
|
|
2240
|
-
print(f"Study Loaded: {status['study_loaded']}")
|
|
2241
|
-
|
|
2242
|
-
if self.study is not None and hasattr(self.study, 'samples_df'):
|
|
2243
|
-
print(f"Samples in Study: {len(self.study.samples_df)}")
|
|
2244
|
-
|
|
2245
|
-
if self.study is not None and hasattr(self.study, 'consensus_df'):
|
|
2246
|
-
print(f"Consensus Features: {len(self.study.consensus_df)}")
|
|
2247
|
-
|
|
2248
|
-
print("=" * 50)
|
|
2249
717
|
|
|
2250
718
|
|
|
2251
719
|
def create_script(
|
|
2252
720
|
source: str,
|
|
2253
|
-
|
|
2254
|
-
filename: str,
|
|
721
|
+
folder: str,
|
|
722
|
+
filename: str = 'run_masster.py',
|
|
2255
723
|
polarity: str = "positive",
|
|
2256
724
|
adducts: Optional[List[str]] = None,
|
|
2257
725
|
params: Optional[wizard_def] = None,
|
|
@@ -2267,7 +735,7 @@ def create_script(
|
|
|
2267
735
|
|
|
2268
736
|
Parameters:
|
|
2269
737
|
source: Directory containing raw data files
|
|
2270
|
-
|
|
738
|
+
folder: Output directory for processed study
|
|
2271
739
|
filename: Filename for the generated script (should end with .py)
|
|
2272
740
|
polarity: Ion polarity mode ("positive" or "negative")
|
|
2273
741
|
adducts: List of adduct specifications (auto-set if None)
|
|
@@ -2282,7 +750,7 @@ def create_script(
|
|
|
2282
750
|
>>> from masster.wizard import create_script
|
|
2283
751
|
>>> create_script(
|
|
2284
752
|
... source=r'D:\\Data\\raw_files',
|
|
2285
|
-
...
|
|
753
|
+
... folder=r'D:\\Data\\output',
|
|
2286
754
|
... filename='run_masster.py',
|
|
2287
755
|
... polarity='positive'
|
|
2288
756
|
... )
|
|
@@ -2294,8 +762,8 @@ def create_script(
|
|
|
2294
762
|
# Use provided params as base
|
|
2295
763
|
wizard_params = params
|
|
2296
764
|
# Update with provided values
|
|
2297
|
-
wizard_params.
|
|
2298
|
-
wizard_params.
|
|
765
|
+
wizard_params.source = source
|
|
766
|
+
wizard_params.folder = folder
|
|
2299
767
|
if polarity != "positive": # Only override if explicitly different
|
|
2300
768
|
wizard_params.polarity = polarity
|
|
2301
769
|
if num_cores > 0:
|
|
@@ -2305,8 +773,8 @@ def create_script(
|
|
|
2305
773
|
else:
|
|
2306
774
|
# Create new params with provided values
|
|
2307
775
|
wizard_params = wizard_def(
|
|
2308
|
-
|
|
2309
|
-
|
|
776
|
+
source=source,
|
|
777
|
+
folder=folder,
|
|
2310
778
|
polarity=polarity,
|
|
2311
779
|
num_cores=max(1, int(multiprocessing.cpu_count() * 0.75)) if num_cores <= 0 else num_cores
|
|
2312
780
|
)
|
|
@@ -2320,28 +788,124 @@ def create_script(
|
|
|
2320
788
|
setattr(wizard_params, key, value)
|
|
2321
789
|
|
|
2322
790
|
# Ensure study folder exists
|
|
2323
|
-
study_path = Path(
|
|
791
|
+
study_path = Path(folder)
|
|
2324
792
|
study_path.mkdir(parents=True, exist_ok=True)
|
|
2325
793
|
|
|
2326
794
|
# Create a temporary Wizard instance to generate the script
|
|
2327
795
|
temp_wizard = Wizard(params=wizard_params)
|
|
2328
796
|
|
|
2329
|
-
# Generate the script using the
|
|
2330
|
-
success = temp_wizard.
|
|
797
|
+
# Generate the script using the instance method
|
|
798
|
+
success = temp_wizard.create_script(filename)
|
|
799
|
+
|
|
800
|
+
return success
|
|
801
|
+
|
|
802
|
+
except Exception as e:
|
|
803
|
+
print(f"Failed to create script: {e}")
|
|
804
|
+
import traceback
|
|
805
|
+
traceback.print_exc()
|
|
806
|
+
return False
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def execute(
|
|
810
|
+
source: str,
|
|
811
|
+
folder: str,
|
|
812
|
+
filename: str = 'run_masster.py',
|
|
813
|
+
polarity: str = "positive",
|
|
814
|
+
adducts: Optional[List[str]] = None,
|
|
815
|
+
params: Optional[wizard_def] = None,
|
|
816
|
+
num_cores: int = 0,
|
|
817
|
+
**kwargs
|
|
818
|
+
) -> bool:
|
|
819
|
+
"""
|
|
820
|
+
Create and execute a standalone analysis script for automated MS data processing.
|
|
821
|
+
|
|
822
|
+
This function generates a Python script with the same parameters as create_script(),
|
|
823
|
+
but immediately executes it after creation. Combines script generation and execution
|
|
824
|
+
in a single step.
|
|
825
|
+
|
|
826
|
+
Parameters:
|
|
827
|
+
source: Directory containing raw data files
|
|
828
|
+
folder: Output directory for processed study
|
|
829
|
+
filename: Filename for the generated script (should end with .py)
|
|
830
|
+
polarity: Ion polarity mode ("positive" or "negative")
|
|
831
|
+
adducts: List of adduct specifications (auto-set if None)
|
|
832
|
+
params: Custom wizard_def parameters (optional)
|
|
833
|
+
num_cores: Number of CPU cores (0 = auto-detect)
|
|
834
|
+
**kwargs: Additional parameters to override defaults
|
|
835
|
+
|
|
836
|
+
Returns:
|
|
837
|
+
True if script was created and executed successfully, False otherwise
|
|
838
|
+
|
|
839
|
+
Example:
|
|
840
|
+
>>> from masster.wizard import execute
|
|
841
|
+
>>> execute(
|
|
842
|
+
... source=r'D:\\Data\\raw_files',
|
|
843
|
+
... folder=r'D:\\Data\\output',
|
|
844
|
+
... polarity='positive'
|
|
845
|
+
... )
|
|
846
|
+
"""
|
|
847
|
+
|
|
848
|
+
try:
|
|
849
|
+
# First, create the script using create_script()
|
|
850
|
+
script_created = create_script(
|
|
851
|
+
source=source,
|
|
852
|
+
folder=folder,
|
|
853
|
+
filename=filename,
|
|
854
|
+
polarity=polarity,
|
|
855
|
+
adducts=adducts,
|
|
856
|
+
params=params,
|
|
857
|
+
num_cores=num_cores,
|
|
858
|
+
**kwargs
|
|
859
|
+
)
|
|
860
|
+
|
|
861
|
+
if not script_created:
|
|
862
|
+
print("Failed to create analysis script")
|
|
863
|
+
return False
|
|
864
|
+
|
|
865
|
+
# Get the full path to the created script
|
|
866
|
+
study_path = Path(folder)
|
|
867
|
+
script_path = study_path / Path(filename).name
|
|
868
|
+
|
|
869
|
+
if not script_path.exists():
|
|
870
|
+
print(f"Script file not found: {script_path}")
|
|
871
|
+
return False
|
|
872
|
+
|
|
873
|
+
print(f"Executing...")
|
|
874
|
+
#print("=" * 70)
|
|
875
|
+
|
|
876
|
+
# Execute the script using subprocess with real-time output
|
|
877
|
+
import subprocess
|
|
878
|
+
|
|
879
|
+
# Run the script with Python, letting it inherit our stdout/stderr
|
|
880
|
+
try:
|
|
881
|
+
# Use subprocess.run for direct output inheritance - no capturing/re-printing
|
|
882
|
+
result = subprocess.run([
|
|
883
|
+
sys.executable, str(script_path)
|
|
884
|
+
], cwd=str(study_path))
|
|
885
|
+
|
|
886
|
+
return_code = result.returncode
|
|
887
|
+
|
|
888
|
+
except Exception as e:
|
|
889
|
+
print(f"Error during script execution: {e}")
|
|
890
|
+
return False
|
|
891
|
+
|
|
892
|
+
success = return_code == 0
|
|
2331
893
|
|
|
2332
894
|
if success:
|
|
2333
|
-
|
|
2334
|
-
print(
|
|
2335
|
-
|
|
895
|
+
print("=" * 70)
|
|
896
|
+
print("Script execution completed successfully")
|
|
897
|
+
else:
|
|
898
|
+
print("=" * 70)
|
|
899
|
+
print(f"Script execution failed with return code: {return_code}")
|
|
2336
900
|
|
|
2337
901
|
return success
|
|
2338
902
|
|
|
2339
903
|
except Exception as e:
|
|
2340
|
-
print(f"Failed to
|
|
904
|
+
print(f"Failed to execute script: {e}")
|
|
2341
905
|
import traceback
|
|
2342
906
|
traceback.print_exc()
|
|
2343
907
|
return False
|
|
2344
908
|
|
|
2345
909
|
|
|
2346
910
|
# Export the main classes and functions
|
|
2347
|
-
__all__ = ["Wizard", "wizard_def", "create_script"]
|
|
911
|
+
__all__ = ["Wizard", "wizard_def", "create_script", "execute"]
|