masster 0.4.22__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/data/libs/aa.csv +22 -0
- masster/lib/lib.py +6 -0
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/study/analysis.py +1762 -0
- masster/study/defaults/fill_def.py +1 -1
- masster/study/export.py +5 -3
- masster/study/h5.py +3 -0
- masster/study/helpers.py +153 -80
- masster/study/id.py +545 -4
- masster/study/load.py +33 -59
- masster/study/merge.py +413 -315
- masster/study/parameters.py +3 -3
- masster/study/plot.py +398 -43
- masster/study/processing.py +6 -14
- masster/study/save.py +8 -4
- masster/study/study.py +179 -139
- masster/study/study5_schema.json +9 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/METADATA +54 -14
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/RECORD +27 -25
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/WHEEL +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/licenses/LICENSE +0 -0
masster/study/study.py
CHANGED
|
@@ -52,36 +52,26 @@ import sys
|
|
|
52
52
|
import polars as pl
|
|
53
53
|
|
|
54
54
|
# Study-specific imports
|
|
55
|
-
from masster.study.
|
|
56
|
-
from masster.study.h5 import _save_study5
|
|
57
|
-
from masster.study.h5 import _save_study5_compressed
|
|
58
|
-
from masster.study.h5 import _load_ms1
|
|
55
|
+
from masster.study.analysis import analyze_umap
|
|
59
56
|
from masster.study.helpers import _get_consensus_uids
|
|
60
|
-
from masster.study.helpers import
|
|
61
|
-
from masster.study.helpers import
|
|
62
|
-
from masster.study.helpers import _ensure_features_df_schema_order
|
|
57
|
+
from masster.study.helpers import _get_features_uids
|
|
58
|
+
from masster.study.helpers import _get_samples_uids
|
|
63
59
|
from masster.study.helpers import compress
|
|
64
|
-
from masster.study.helpers import compress_features
|
|
65
|
-
from masster.study.helpers import compress_ms2
|
|
66
|
-
from masster.study.helpers import compress_chrom
|
|
67
|
-
from masster.study.helpers import restore_features
|
|
68
|
-
from masster.study.helpers import restore_chrom
|
|
69
|
-
from masster.study.helpers import restore_ms2
|
|
70
60
|
from masster.study.helpers import decompress
|
|
71
61
|
from masster.study.helpers import fill_reset
|
|
72
62
|
from masster.study.helpers import get_chrom
|
|
73
|
-
from masster.study.helpers import
|
|
63
|
+
from masster.study.helpers import get_samples
|
|
74
64
|
from masster.study.helpers import get_consensus
|
|
75
65
|
from masster.study.helpers import get_consensus_matches
|
|
76
66
|
from masster.study.helpers import get_consensus_matrix
|
|
77
67
|
from masster.study.helpers import get_orphans
|
|
68
|
+
from masster.study.helpers import get_sample_stats
|
|
78
69
|
from masster.study.helpers import get_gaps_matrix
|
|
79
70
|
from masster.study.helpers import get_gaps_stats
|
|
80
71
|
from masster.study.helpers import align_reset
|
|
81
|
-
from masster.study.helpers import
|
|
82
|
-
from masster.study.helpers import
|
|
83
|
-
from masster.study.helpers import
|
|
84
|
-
from masster.study.helpers import sample_color_reset
|
|
72
|
+
from masster.study.helpers import set_study_folder
|
|
73
|
+
from masster.study.helpers import set_samples_source
|
|
74
|
+
from masster.study.helpers import set_samples_color
|
|
85
75
|
from masster.study.helpers import sample_name_replace
|
|
86
76
|
from masster.study.helpers import sample_name_reset
|
|
87
77
|
from masster.study.helpers import samples_select
|
|
@@ -94,61 +84,34 @@ from masster.study.helpers import consensus_filter
|
|
|
94
84
|
from masster.study.helpers import consensus_delete
|
|
95
85
|
from masster.study.load import add
|
|
96
86
|
from masster.study.load import add_sample
|
|
97
|
-
from masster.study.load import _add_samples_batch
|
|
98
|
-
from masster.study.load import _add_sample_optimized
|
|
99
|
-
from masster.study.load import _add_sample_standard
|
|
100
|
-
from masster.study.load import _sample_color_reset_optimized
|
|
101
|
-
from masster.study.load import fill_single
|
|
102
87
|
from masster.study.load import fill
|
|
103
|
-
from masster.study.load import _process_sample_for_parallel_fill
|
|
104
|
-
from masster.study.load import _get_missing_consensus_sample_combinations
|
|
105
88
|
from masster.study.load import load
|
|
106
|
-
from masster.study.load import
|
|
107
|
-
from masster.study.load import load_features
|
|
108
|
-
from masster.study.load import sanitize
|
|
89
|
+
from masster.study.load import _load_features
|
|
109
90
|
from masster.study.plot import plot_alignment
|
|
110
91
|
from masster.study.plot import plot_consensus_2d
|
|
111
92
|
from masster.study.plot import plot_samples_2d
|
|
112
93
|
from masster.study.plot import plot_consensus_stats
|
|
113
94
|
from masster.study.plot import plot_chrom
|
|
114
|
-
from masster.study.plot import
|
|
95
|
+
from masster.study.plot import plot_samples_pca
|
|
96
|
+
from masster.study.plot import plot_samples_umap
|
|
115
97
|
from masster.study.plot import plot_bpc
|
|
116
98
|
from masster.study.plot import plot_tic
|
|
117
99
|
from masster.study.plot import plot_eic
|
|
118
100
|
from masster.study.plot import plot_rt_correction
|
|
119
101
|
from masster.study.processing import align
|
|
120
102
|
from masster.study.merge import merge
|
|
121
|
-
from masster.study.merge import _reset_consensus_data
|
|
122
|
-
from masster.study.merge import _extract_consensus_features
|
|
123
|
-
from masster.study.merge import _perform_adduct_grouping
|
|
124
|
-
from masster.study.merge import _consensus_cleanup
|
|
125
|
-
from masster.study.merge import _identify_adduct_by_mass_shift
|
|
126
|
-
from masster.study.merge import _finalize_merge
|
|
127
|
-
from masster.study.merge import _count_tight_clusters
|
|
128
103
|
from masster.study.processing import integrate
|
|
129
104
|
from masster.study.processing import find_ms2
|
|
130
105
|
from masster.study.processing import find_iso
|
|
131
106
|
from masster.study.processing import reset_iso
|
|
132
|
-
from masster.study.parameters import
|
|
107
|
+
from masster.study.parameters import update_history
|
|
133
108
|
from masster.study.parameters import get_parameters
|
|
134
109
|
from masster.study.parameters import update_parameters
|
|
135
110
|
from masster.study.parameters import get_parameters_property
|
|
136
111
|
from masster.study.parameters import set_parameters_property
|
|
137
|
-
from masster.study.save import save, save_consensus,
|
|
138
|
-
from masster.study.export import
|
|
139
|
-
export_mgf,
|
|
140
|
-
export_mztab,
|
|
141
|
-
export_xlsx,
|
|
142
|
-
export_parquet,
|
|
143
|
-
_get_mgf_df,
|
|
144
|
-
)
|
|
112
|
+
from masster.study.save import save, save_consensus, save_samples
|
|
113
|
+
from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet
|
|
145
114
|
from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset
|
|
146
|
-
from masster.study.id import (
|
|
147
|
-
_get_adducts,
|
|
148
|
-
_calculate_formula_mass_shift,
|
|
149
|
-
_format_adduct_name,
|
|
150
|
-
_parse_element_counts,
|
|
151
|
-
)
|
|
152
115
|
|
|
153
116
|
from masster.logger import MassterLogger
|
|
154
117
|
from masster.study.defaults.study_def import study_defaults
|
|
@@ -253,8 +216,24 @@ class Study:
|
|
|
253
216
|
For backward compatibility, original signature is supported:
|
|
254
217
|
Study(folder=..., label=..., log_level=..., log_label=..., log_sink=...)
|
|
255
218
|
"""
|
|
256
|
-
#
|
|
219
|
+
# ===== PARAMETER INITIALIZATION =====
|
|
220
|
+
auto_load_filename = self._init_parameters(filename, kwargs)
|
|
221
|
+
|
|
222
|
+
# ===== DATA STRUCTURES INITIALIZATION =====
|
|
223
|
+
self._init_data_structures()
|
|
224
|
+
|
|
225
|
+
# ===== LOGGER INITIALIZATION =====
|
|
226
|
+
self._init_logger()
|
|
227
|
+
|
|
228
|
+
# ===== AUTO-LOAD FILE IF PROVIDED =====
|
|
229
|
+
if auto_load_filename is not None:
|
|
230
|
+
self.load(filename=auto_load_filename)
|
|
257
231
|
|
|
232
|
+
# ===== SAMPLE CACHE =====
|
|
233
|
+
self._samples_cache = {}
|
|
234
|
+
|
|
235
|
+
def _init_parameters(self, filename, kwargs):
|
|
236
|
+
"""Initialize parameters and handle filename for auto-loading."""
|
|
258
237
|
# Handle filename parameter for automatic loading
|
|
259
238
|
auto_load_filename = None
|
|
260
239
|
if filename is not None:
|
|
@@ -281,13 +260,11 @@ class Study:
|
|
|
281
260
|
if hasattr(params, key):
|
|
282
261
|
params.set(key, value, validate=True)
|
|
283
262
|
|
|
284
|
-
#
|
|
285
|
-
self.filename = None
|
|
286
|
-
|
|
287
|
-
# Store parameter instance for method access
|
|
263
|
+
# Store parameter instance and initialize history
|
|
264
|
+
self.filename = None # Keeps a pointer to study5 whenever it's saved or loaded
|
|
288
265
|
self.parameters = params
|
|
289
266
|
self.history = {}
|
|
290
|
-
self.
|
|
267
|
+
self.update_history(["study"], params.to_dict())
|
|
291
268
|
|
|
292
269
|
# Set instance attributes (ensure proper string values for logger)
|
|
293
270
|
self.folder = params.folder
|
|
@@ -301,10 +278,15 @@ class Study:
|
|
|
301
278
|
self.log_label = params.log_label + " | " if params.log_label else ""
|
|
302
279
|
self.log_sink = params.log_sink
|
|
303
280
|
|
|
281
|
+
# Create folder if it doesn't exist
|
|
304
282
|
if self.folder is not None and not os.path.exists(self.folder):
|
|
305
|
-
# create the folder if it does not exist
|
|
306
283
|
os.makedirs(self.folder)
|
|
284
|
+
|
|
285
|
+
return auto_load_filename
|
|
307
286
|
|
|
287
|
+
def _init_data_structures(self):
|
|
288
|
+
"""Initialize all data structures used by the Study."""
|
|
289
|
+
# Sample information DataFrame
|
|
308
290
|
self.samples_df = pl.DataFrame(
|
|
309
291
|
{
|
|
310
292
|
"sample_uid": [],
|
|
@@ -337,21 +319,24 @@ class Study:
|
|
|
337
319
|
"num_ms2": pl.Int64,
|
|
338
320
|
},
|
|
339
321
|
)
|
|
322
|
+
|
|
323
|
+
# Feature-related data structures
|
|
340
324
|
self.features_maps = []
|
|
341
325
|
self.features_df = pl.DataFrame()
|
|
326
|
+
|
|
327
|
+
# Consensus-related data structures
|
|
342
328
|
self.consensus_ms2 = pl.DataFrame()
|
|
343
329
|
self.consensus_df = pl.DataFrame()
|
|
344
330
|
self.consensus_map = None
|
|
345
331
|
self.consensus_mapping_df = pl.DataFrame()
|
|
346
332
|
self.alignment_ref_index = None
|
|
347
333
|
|
|
348
|
-
# Library
|
|
349
|
-
self.lib_df = pl.DataFrame()
|
|
334
|
+
# Library and identification data structures
|
|
335
|
+
self.lib_df = pl.DataFrame() # populated by lib_load
|
|
336
|
+
self.id_df = pl.DataFrame() # populated by identify
|
|
350
337
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
# Initialize independent logger
|
|
338
|
+
def _init_logger(self):
|
|
339
|
+
"""Initialize the logger for this Study instance."""
|
|
355
340
|
self.logger = MassterLogger(
|
|
356
341
|
instance_type="study",
|
|
357
342
|
level=self.log_level.upper(),
|
|
@@ -361,85 +346,68 @@ class Study:
|
|
|
361
346
|
self.logger.debug(f"Study folder: {self.folder}")
|
|
362
347
|
self.logger.debug(f"Polarity: {self.polarity}")
|
|
363
348
|
|
|
364
|
-
# Auto-load study file if filename was provided
|
|
365
|
-
if auto_load_filename is not None:
|
|
366
|
-
self.load(filename=auto_load_filename)
|
|
367
|
-
|
|
368
|
-
# cache for Sample instances created/loaded by this Study
|
|
369
|
-
self._samples_cache = {}
|
|
370
|
-
|
|
371
|
-
# ===== ATTACH MODULE FUNCTIONS AS CLASS METHODS =====
|
|
372
|
-
|
|
373
349
|
# === File I/O Operations ===
|
|
374
350
|
load = load
|
|
375
351
|
save = save
|
|
376
352
|
save_consensus = save_consensus
|
|
377
353
|
save_samples = save_samples
|
|
378
|
-
|
|
379
|
-
set_folder = set_folder
|
|
354
|
+
set_study_folder = set_study_folder
|
|
380
355
|
|
|
381
356
|
# === Sample Management ===
|
|
382
357
|
add = add
|
|
383
|
-
add_folder = add # backward compatibility alias
|
|
384
358
|
add_sample = add_sample
|
|
385
359
|
|
|
386
360
|
# === Core Processing Operations ===
|
|
387
361
|
align = align
|
|
388
|
-
merge = merge
|
|
389
|
-
|
|
362
|
+
merge = merge
|
|
363
|
+
|
|
390
364
|
find_ms2 = find_ms2
|
|
391
365
|
find_iso = find_iso
|
|
392
366
|
reset_iso = reset_iso
|
|
393
367
|
iso_reset = reset_iso
|
|
394
368
|
integrate = integrate
|
|
395
|
-
|
|
369
|
+
|
|
396
370
|
fill = fill
|
|
397
|
-
|
|
398
|
-
fill_single = fill_single
|
|
399
|
-
fill_chrom_single = fill_single # Backward compatibility alias
|
|
371
|
+
|
|
400
372
|
# === Data Retrieval and Access ===
|
|
401
373
|
get_consensus = get_consensus
|
|
402
374
|
get_chrom = get_chrom
|
|
403
|
-
|
|
375
|
+
get_samples = get_samples
|
|
404
376
|
get_consensus_matches = get_consensus_matches
|
|
405
377
|
get_consensus_matrix = get_consensus_matrix
|
|
406
378
|
get_gaps_matrix = get_gaps_matrix
|
|
407
379
|
get_gaps_stats = get_gaps_stats
|
|
408
380
|
get_orphans = get_orphans
|
|
381
|
+
get_sample_stats = get_sample_stats
|
|
409
382
|
|
|
410
383
|
# === Data Selection and Filtering ===
|
|
411
384
|
samples_select = samples_select
|
|
412
385
|
samples_delete = samples_delete
|
|
386
|
+
|
|
413
387
|
features_select = features_select
|
|
414
388
|
features_filter = features_filter
|
|
415
389
|
features_delete = features_delete
|
|
416
390
|
consensus_select = consensus_select
|
|
417
391
|
consensus_filter = consensus_filter
|
|
418
392
|
consensus_delete = consensus_delete
|
|
419
|
-
# Backward compatibility aliases
|
|
420
|
-
filter_consensus = consensus_filter
|
|
421
|
-
select_consensus = consensus_select
|
|
422
|
-
filter_features = features_filter
|
|
423
|
-
select_features = features_select
|
|
424
|
-
consensus_find = merge
|
|
425
|
-
|
|
426
|
-
# === Sample Metadata and Styling ===
|
|
427
|
-
set_source = set_source
|
|
428
|
-
sample_color = sample_color
|
|
429
|
-
sample_color_reset = sample_color_reset
|
|
430
|
-
reset_sample_color = sample_color_reset
|
|
431
|
-
name_replace = sample_name_replace
|
|
432
|
-
name_reset = sample_name_reset
|
|
433
|
-
reset_name = sample_name_reset
|
|
434
393
|
|
|
394
|
+
# === Sample Metadata and Styling ===
|
|
395
|
+
set_samples_source = set_samples_source
|
|
396
|
+
set_samples_color = set_samples_color
|
|
397
|
+
|
|
398
|
+
samples_name_replace = sample_name_replace
|
|
399
|
+
samples_name_reset = sample_name_reset
|
|
400
|
+
|
|
401
|
+
# Backward compatibility aliases for renamed methods
|
|
402
|
+
set_folder = set_study_folder
|
|
403
|
+
set_source = set_samples_source
|
|
404
|
+
#sample_color = set_samples_color
|
|
405
|
+
#get_sample = get_samples
|
|
406
|
+
#load_features = _load_features
|
|
407
|
+
store_history = update_history
|
|
408
|
+
|
|
435
409
|
# === Data Compression and Storage ===
|
|
436
410
|
compress = compress
|
|
437
|
-
compress_features = compress_features
|
|
438
|
-
compress_ms2 = compress_ms2
|
|
439
|
-
compress_chrom = compress_chrom
|
|
440
|
-
restore_features = restore_features
|
|
441
|
-
restore_chrom = restore_chrom
|
|
442
|
-
restore_ms2 = restore_ms2
|
|
443
411
|
decompress = decompress
|
|
444
412
|
|
|
445
413
|
# === Reset Operations ===
|
|
@@ -453,12 +421,16 @@ class Study:
|
|
|
453
421
|
plot_chrom = plot_chrom
|
|
454
422
|
plot_consensus_2d = plot_consensus_2d
|
|
455
423
|
plot_consensus_stats = plot_consensus_stats
|
|
456
|
-
|
|
424
|
+
plot_samples_pca = plot_samples_pca
|
|
425
|
+
plot_samples_umap = plot_samples_umap
|
|
457
426
|
plot_samples_2d = plot_samples_2d
|
|
458
427
|
plot_bpc = plot_bpc
|
|
459
428
|
plot_rt_correction = plot_rt_correction
|
|
460
429
|
plot_tic = plot_tic
|
|
461
430
|
plot_eic = plot_eic
|
|
431
|
+
|
|
432
|
+
# === Analysis Operations ===
|
|
433
|
+
analyze_umap = analyze_umap
|
|
462
434
|
|
|
463
435
|
# === Export Operations ===
|
|
464
436
|
export_mgf = export_mgf
|
|
@@ -468,6 +440,11 @@ class Study:
|
|
|
468
440
|
|
|
469
441
|
# === Identification and Library Matching ===
|
|
470
442
|
lib_load = lib_load
|
|
443
|
+
|
|
444
|
+
def lib_to_consensus(self, **kwargs):
|
|
445
|
+
"""Create consensus features from library entries."""
|
|
446
|
+
from masster.study.id import lib_to_consensus as _lib_to_consensus
|
|
447
|
+
return _lib_to_consensus(self, **kwargs)
|
|
471
448
|
identify = identify
|
|
472
449
|
get_id = get_id
|
|
473
450
|
id_reset = id_reset
|
|
@@ -476,45 +453,23 @@ class Study:
|
|
|
476
453
|
reset_lib = lib_reset
|
|
477
454
|
|
|
478
455
|
# === Parameter Management ===
|
|
479
|
-
|
|
456
|
+
update_history = update_history
|
|
480
457
|
get_parameters = get_parameters
|
|
481
458
|
update_parameters = update_parameters
|
|
482
459
|
get_parameters_property = get_parameters_property
|
|
483
460
|
set_parameters_property = set_parameters_property
|
|
484
461
|
|
|
485
462
|
# === Private/Internal Methods ===
|
|
486
|
-
_add_samples_batch = _add_samples_batch
|
|
487
|
-
_add_sample_optimized = _add_sample_optimized
|
|
488
|
-
_add_sample_standard = _add_sample_standard
|
|
489
|
-
_sample_color_reset_optimized = _sample_color_reset_optimized
|
|
490
|
-
_load_study5 = _load_study5
|
|
491
|
-
_save_study5 = _save_study5
|
|
492
|
-
_save_study5_compressed = _save_study5_compressed
|
|
493
|
-
_load_ms1 = _load_ms1
|
|
494
463
|
_get_consensus_uids = _get_consensus_uids
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
)
|
|
502
|
-
_load_consensusXML = _load_consensusXML
|
|
503
|
-
load_features = load_features
|
|
504
|
-
_save_consensusXML = _save_consensusXML
|
|
505
|
-
_get_mgf_df = _get_mgf_df
|
|
506
|
-
_get_adducts = _get_adducts
|
|
507
|
-
_calculate_formula_mass_shift = _calculate_formula_mass_shift
|
|
508
|
-
_format_adduct_name = _format_adduct_name
|
|
509
|
-
_parse_element_counts = _parse_element_counts
|
|
464
|
+
_get_features_uids = _get_features_uids
|
|
465
|
+
_get_samples_uids = _get_samples_uids
|
|
466
|
+
_load_features = _load_features
|
|
467
|
+
|
|
468
|
+
# Note: _load_study5 and _save_study5 are not exposed as class methods
|
|
469
|
+
# They are used internally by load() and save() methods only
|
|
510
470
|
|
|
511
471
|
# === Merge Helper Methods ===
|
|
512
|
-
|
|
513
|
-
_extract_consensus_features = _extract_consensus_features
|
|
514
|
-
_perform_adduct_grouping = _perform_adduct_grouping
|
|
515
|
-
_consensus_cleanup = _consensus_cleanup
|
|
516
|
-
_identify_adduct_by_mass_shift = _identify_adduct_by_mass_shift
|
|
517
|
-
_finalize_merge = _finalize_merge
|
|
472
|
+
# (All merge helper methods are now internal to the merge module)
|
|
518
473
|
|
|
519
474
|
# === Default Parameters ===
|
|
520
475
|
study_defaults = study_defaults
|
|
@@ -612,6 +567,83 @@ class Study:
|
|
|
612
567
|
except Exception as e:
|
|
613
568
|
self.logger.error(f"Failed to reload current module {current_module}: {e}")
|
|
614
569
|
|
|
570
|
+
def _sanitize_null_ids(self):
|
|
571
|
+
"""
|
|
572
|
+
Sanitize null feature_id and consensus_id values by replacing them with newly generated IDs.
|
|
573
|
+
For feature_id: generates large sequential integers that can be converted by merge/align functions.
|
|
574
|
+
For consensus_id: uses 16-character UUID strings (as expected by merge function).
|
|
575
|
+
"""
|
|
576
|
+
import uuid
|
|
577
|
+
import polars as pl
|
|
578
|
+
import time
|
|
579
|
+
|
|
580
|
+
# Sanitize features_df feature_id column
|
|
581
|
+
if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
|
|
582
|
+
# Check for null feature_ids
|
|
583
|
+
null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
|
|
584
|
+
if null_feature_ids > 0:
|
|
585
|
+
self.logger.info(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
|
|
586
|
+
|
|
587
|
+
# Find the maximum existing feature_id (convert strings to int if possible)
|
|
588
|
+
max_existing_id = 0
|
|
589
|
+
existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
|
|
590
|
+
for fid in existing_ids:
|
|
591
|
+
try:
|
|
592
|
+
int_id = int(fid)
|
|
593
|
+
max_existing_id = max(max_existing_id, int_id)
|
|
594
|
+
except (ValueError, TypeError):
|
|
595
|
+
# Skip non-integer IDs
|
|
596
|
+
pass
|
|
597
|
+
|
|
598
|
+
# Generate new sequential integer IDs starting from max + timestamp offset
|
|
599
|
+
# Use timestamp to ensure uniqueness across different sanitization runs
|
|
600
|
+
base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
|
|
601
|
+
new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
|
|
602
|
+
uid_index = 0
|
|
603
|
+
|
|
604
|
+
# Create a list to store all feature_ids
|
|
605
|
+
feature_ids = []
|
|
606
|
+
for feature_id in self.features_df["feature_id"].to_list():
|
|
607
|
+
if feature_id is None:
|
|
608
|
+
feature_ids.append(new_int_ids[uid_index])
|
|
609
|
+
uid_index += 1
|
|
610
|
+
else:
|
|
611
|
+
feature_ids.append(feature_id)
|
|
612
|
+
|
|
613
|
+
# Update the DataFrame with sanitized feature_ids
|
|
614
|
+
self.features_df = self.features_df.with_columns(
|
|
615
|
+
pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
|
|
616
|
+
)
|
|
617
|
+
|
|
618
|
+
self.logger.info(f"Successfully sanitized {null_feature_ids} feature_id values")
|
|
619
|
+
|
|
620
|
+
# Sanitize consensus_df consensus_id column
|
|
621
|
+
if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
|
|
622
|
+
if "consensus_id" in self.consensus_df.columns:
|
|
623
|
+
null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
|
|
624
|
+
if null_consensus_ids > 0:
|
|
625
|
+
self.logger.info(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
|
|
626
|
+
|
|
627
|
+
# Generate new UIDs for null values using the same method as merge()
|
|
628
|
+
new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
|
|
629
|
+
uid_index = 0
|
|
630
|
+
|
|
631
|
+
# Create a list to store all consensus_ids
|
|
632
|
+
consensus_ids = []
|
|
633
|
+
for consensus_id in self.consensus_df["consensus_id"].to_list():
|
|
634
|
+
if consensus_id is None:
|
|
635
|
+
consensus_ids.append(new_uids[uid_index])
|
|
636
|
+
uid_index += 1
|
|
637
|
+
else:
|
|
638
|
+
consensus_ids.append(consensus_id)
|
|
639
|
+
|
|
640
|
+
# Update the DataFrame with sanitized consensus_ids
|
|
641
|
+
self.consensus_df = self.consensus_df.with_columns(
|
|
642
|
+
pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
self.logger.info(f"Successfully sanitized {null_consensus_ids} consensus_id values")
|
|
646
|
+
|
|
615
647
|
def __dir__(self):
|
|
616
648
|
"""
|
|
617
649
|
Custom __dir__ implementation to hide internal methods starting with '_'
|
|
@@ -627,16 +659,24 @@ class Study:
|
|
|
627
659
|
'find_consensus', # alias for merge
|
|
628
660
|
'integrate_chrom', # alias for integrate
|
|
629
661
|
'fill_chrom', # alias for fill
|
|
630
|
-
'fill_chrom_single', # alias for fill_single
|
|
631
662
|
'filter_consensus', # alias for consensus_filter
|
|
632
663
|
'select_consensus', # alias for consensus_select
|
|
633
664
|
'filter_features', # alias for features_filter
|
|
634
665
|
'select_features', # alias for features_select
|
|
635
666
|
'consensus_find', # alias for merge
|
|
667
|
+
# Backward compatibility for renamed methods
|
|
668
|
+
'set_folder', # alias for set_study_folder
|
|
669
|
+
'set_source', # alias for set_samples_source
|
|
670
|
+
'sample_color', # alias for set_samples_color
|
|
671
|
+
'get_sample', # alias for get_samples
|
|
672
|
+
'load_features', # alias for _load_features
|
|
673
|
+
'store_history', # alias for update_history
|
|
674
|
+
'sample_color_reset', # alias for set_samples_color(by=None)
|
|
675
|
+
'reset_sample_color', # alias for sample_color_reset
|
|
636
676
|
}
|
|
637
677
|
|
|
638
678
|
# Get all attributes from the class
|
|
639
|
-
all_attrs = set()
|
|
679
|
+
all_attrs: set[str] = set()
|
|
640
680
|
|
|
641
681
|
# Add attributes from the class and all its bases
|
|
642
682
|
for cls in self.__class__.__mro__:
|
|
@@ -895,8 +935,9 @@ class Study:
|
|
|
895
935
|
tight_clusters_count = 0
|
|
896
936
|
if consensus_df_len > 0:
|
|
897
937
|
try:
|
|
938
|
+
from masster.study.merge import _count_tight_clusters
|
|
898
939
|
tight_clusters_count = _count_tight_clusters(self, mz_tol=0.04, rt_tol=0.3)
|
|
899
|
-
except Exception
|
|
940
|
+
except Exception:
|
|
900
941
|
# If tight clusters calculation fails, just use 0
|
|
901
942
|
tight_clusters_count = 0
|
|
902
943
|
|
|
@@ -952,7 +993,6 @@ class Study:
|
|
|
952
993
|
|
|
953
994
|
print(summary)
|
|
954
995
|
|
|
955
|
-
|
|
956
996
|
if __name__ == "__main__":
|
|
957
997
|
# This block is executed when the script is run directly
|
|
958
998
|
pass
|
masster/study/study5_schema.json
CHANGED
|
@@ -327,6 +327,9 @@
|
|
|
327
327
|
"formula": {
|
|
328
328
|
"dtype": "pl.String"
|
|
329
329
|
},
|
|
330
|
+
"iso": {
|
|
331
|
+
"dtype": "pl.Int64"
|
|
332
|
+
},
|
|
330
333
|
"adduct": {
|
|
331
334
|
"dtype": "pl.String"
|
|
332
335
|
},
|
|
@@ -342,6 +345,9 @@
|
|
|
342
345
|
"rt": {
|
|
343
346
|
"dtype": "pl.Null"
|
|
344
347
|
},
|
|
348
|
+
"quant_group": {
|
|
349
|
+
"dtype": "pl.Int64"
|
|
350
|
+
},
|
|
345
351
|
"db_id": {
|
|
346
352
|
"dtype": "pl.String"
|
|
347
353
|
},
|
|
@@ -369,6 +375,9 @@
|
|
|
369
375
|
},
|
|
370
376
|
"score": {
|
|
371
377
|
"dtype": "pl.Float64"
|
|
378
|
+
},
|
|
379
|
+
"iso": {
|
|
380
|
+
"dtype": "pl.Int64"
|
|
372
381
|
}
|
|
373
382
|
}
|
|
374
383
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: masster
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Mass spectrometry data analysis package
|
|
5
5
|
Project-URL: homepage, https://github.com/zamboni-lab/masster
|
|
6
6
|
Project-URL: repository, https://github.com/zamboni-lab/masster
|
|
@@ -686,6 +686,7 @@ Requires-Dist: bokeh>=3.7.3
|
|
|
686
686
|
Requires-Dist: cmap>=0.6.2
|
|
687
687
|
Requires-Dist: datashader>=0.18.1
|
|
688
688
|
Requires-Dist: h5py>=3.14.0
|
|
689
|
+
Requires-Dist: hdbscan>=0.8.40
|
|
689
690
|
Requires-Dist: holoviews>=1.21.0
|
|
690
691
|
Requires-Dist: hvplot>=0.11.3
|
|
691
692
|
Requires-Dist: loguru>=0.7.3
|
|
@@ -701,6 +702,7 @@ Requires-Dist: pythonnet>=3.0.0
|
|
|
701
702
|
Requires-Dist: scikit-learn>=1.7.1
|
|
702
703
|
Requires-Dist: scipy>=1.12.0
|
|
703
704
|
Requires-Dist: tqdm>=4.65.0
|
|
705
|
+
Requires-Dist: umap-learn>=0.5.9.post2
|
|
704
706
|
Provides-Extra: dev
|
|
705
707
|
Requires-Dist: bandit>=1.7.0; extra == 'dev'
|
|
706
708
|
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
@@ -726,12 +728,13 @@ Description-Content-Type: text/markdown
|
|
|
726
728
|
|
|
727
729
|
# MASSter
|
|
728
730
|
|
|
729
|
-
**MASSter** is a
|
|
731
|
+
**MASSter** is a Python package for the analysis of mass spectrometry data, tailored to metabolomics and LC-MS data processing. It is designed to deal with DDA data, and includes hidden functionality for DIA and ZTScan DIA data. The sample-centric feature detection uses OpenMS. All other functionality (e.g., centroiding, RT alignment, adduct and isotopomer detection, merging of multiple samples, gap-filling, and quantification) was redesigned and engineered to maximize scalability (tested with 3000 LC-MS runs), speed, and the quality of results.
|
|
730
732
|
|
|
731
733
|
This is a poorly documented, stable branch of the development codebase in use in the Zamboni lab.
|
|
732
734
|
|
|
733
|
-
|
|
735
|
+
## Prerequisites
|
|
734
736
|
|
|
737
|
+
**MASSter** reads raw (Thermo), wiff (SCIEX), or mzML data. It's recommended to provide raw, profile data.
|
|
735
738
|
|
|
736
739
|
## Installation
|
|
737
740
|
|
|
@@ -739,20 +742,60 @@ Some of the core processing functions are derived from OpenMS. We use the same n
|
|
|
739
742
|
pip install masster
|
|
740
743
|
```
|
|
741
744
|
|
|
742
|
-
|
|
745
|
+
## Basic usage
|
|
746
|
+
### Quick start: use the wizard
|
|
747
|
+
|
|
748
|
+
```python
|
|
749
|
+
import masster
|
|
750
|
+
masster.wizard.execute(
|
|
751
|
+
source=r'..\..\folder_with_raw_data',
|
|
752
|
+
folder=r'..\..\folder_to_store_results'
|
|
753
|
+
)
|
|
754
|
+
```
|
|
743
755
|
|
|
756
|
+
This will run a wizard that should perform all key steps and save the results to the `folder`.
|
|
757
|
+
|
|
758
|
+
### Basic workflow for analyzing a single sample
|
|
744
759
|
```python
|
|
745
760
|
import masster
|
|
761
|
+
sample = masster.Sample(filename='...') # full path to a *.raw, *.wiff, or *.mzML file
|
|
762
|
+
# process
|
|
763
|
+
sample.find_features(chrom_fwhm=0.5, noise=50) # for Orbitrap data, set noise to 1e5
|
|
764
|
+
sample.find_adducts()
|
|
765
|
+
sample.find_ms2()
|
|
766
|
+
|
|
767
|
+
# access data
|
|
768
|
+
sample.features_df
|
|
769
|
+
|
|
770
|
+
# save results
|
|
771
|
+
sample.save() # stores to *.sample5, our custom hdf5 format
|
|
772
|
+
sample.export_mgf()
|
|
773
|
+
|
|
774
|
+
# some plots
|
|
775
|
+
sample.plot_bpc()
|
|
776
|
+
sample.plot_tic()
|
|
777
|
+
sample.plot_2d()
|
|
778
|
+
sample.plot_features_stats()
|
|
779
|
+
|
|
780
|
+
# explore methods
|
|
781
|
+
dir(sample)
|
|
782
|
+
```
|
|
783
|
+
|
|
784
|
+
### Basic workflow for analyzing an LC-MS study with two or more samples
|
|
746
785
|
|
|
786
|
+
```python
|
|
787
|
+
import masster
|
|
747
788
|
# Initialize the Study object with the default folder
|
|
748
|
-
study = masster.Study(
|
|
789
|
+
study = masster.Study(folder=r'D:\...\mylcms')
|
|
749
790
|
|
|
750
791
|
# Load data from folder with raw data, here: WIFF
|
|
751
792
|
study.add(r'D:\...\...\...\*.wiff')
|
|
752
793
|
|
|
753
794
|
# Perform retention time correction
|
|
754
|
-
study.align(
|
|
795
|
+
study.align(rt_tol=2.0)
|
|
755
796
|
study.plot_alignment()
|
|
797
|
+
study.plot_bpc()
|
|
798
|
+
study.plot_rt_correction()
|
|
756
799
|
|
|
757
800
|
# Find consensus features
|
|
758
801
|
study.merge(min_samples=3)
|
|
@@ -772,18 +815,15 @@ study.export_parquet()
|
|
|
772
815
|
|
|
773
816
|
# Save the study to .study5
|
|
774
817
|
study.save()
|
|
775
|
-
```
|
|
776
|
-
|
|
777
|
-
## Requirements
|
|
778
818
|
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
819
|
+
# Some of the plots...
|
|
820
|
+
study.plot_samples_pca()
|
|
821
|
+
study.plot_samples_umap()
|
|
822
|
+
study.plot_samples_2d()
|
|
823
|
+
```
|
|
782
824
|
|
|
783
825
|
## License
|
|
784
|
-
|
|
785
826
|
GNU Affero General Public License v3
|
|
786
827
|
|
|
787
828
|
## Citation
|
|
788
|
-
|
|
789
829
|
If you use Masster in your research, please cite this repository.
|