masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/study/save.py
CHANGED
|
@@ -48,8 +48,14 @@ def save(self, filename=None, add_timestamp=True, compress=False):
|
|
|
48
48
|
# Log file size information for performance monitoring
|
|
49
49
|
if hasattr(self, "features_df") and not self.features_df.is_empty():
|
|
50
50
|
feature_count = len(self.features_df)
|
|
51
|
-
sample_count =
|
|
52
|
-
|
|
51
|
+
sample_count = (
|
|
52
|
+
len(self.samples_df)
|
|
53
|
+
if hasattr(self, "samples_df") and not self.samples_df.is_empty()
|
|
54
|
+
else 0
|
|
55
|
+
)
|
|
56
|
+
self.logger.info(
|
|
57
|
+
f"Saving study with {sample_count} samples and {feature_count} features to {filename}",
|
|
58
|
+
)
|
|
53
59
|
|
|
54
60
|
# Use compressed mode for large datasets
|
|
55
61
|
if compress:
|
|
@@ -121,7 +127,9 @@ def save_samples(self, samples=None):
|
|
|
121
127
|
if sample_path.endswith(".sample5"):
|
|
122
128
|
# If sample_path is a .sample5 file, save featureXML in the same directory
|
|
123
129
|
featurexml_filename = sample_path.replace(".sample5", ".featureXML")
|
|
124
|
-
self.logger.debug(
|
|
130
|
+
self.logger.debug(
|
|
131
|
+
f"Saving featureXML alongside .sample5 file: {featurexml_filename}",
|
|
132
|
+
)
|
|
125
133
|
else:
|
|
126
134
|
# Fallback to study folder or current directory (original behavior)
|
|
127
135
|
if self.folder is not None:
|
|
@@ -134,7 +142,9 @@ def save_samples(self, samples=None):
|
|
|
134
142
|
os.getcwd(),
|
|
135
143
|
sample_name + ".featureXML",
|
|
136
144
|
)
|
|
137
|
-
self.logger.debug(
|
|
145
|
+
self.logger.debug(
|
|
146
|
+
f"Saving featureXML to default location: {featurexml_filename}",
|
|
147
|
+
)
|
|
138
148
|
|
|
139
149
|
fh = oms.FeatureXMLFile()
|
|
140
150
|
if sample_index is not None and sample_index < len(self.features_maps):
|
masster/study/study.py
CHANGED
|
@@ -125,11 +125,22 @@ from masster.study.parameters import update_parameters
|
|
|
125
125
|
from masster.study.parameters import get_parameters_property
|
|
126
126
|
from masster.study.parameters import set_parameters_property
|
|
127
127
|
from masster.study.save import save, save_consensus, _save_consensusXML, save_samples
|
|
128
|
-
from masster.study.export import
|
|
128
|
+
from masster.study.export import (
|
|
129
|
+
export_mgf,
|
|
130
|
+
export_mztab,
|
|
131
|
+
export_xlsx,
|
|
132
|
+
export_parquet,
|
|
133
|
+
_get_mgf_df,
|
|
134
|
+
)
|
|
129
135
|
from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset
|
|
130
|
-
from masster.study.id import
|
|
131
|
-
|
|
132
|
-
|
|
136
|
+
from masster.study.id import (
|
|
137
|
+
_get_adducts,
|
|
138
|
+
_calculate_formula_mass_shift,
|
|
139
|
+
_format_adduct_name,
|
|
140
|
+
_parse_element_counts,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
from masster.logger import MasterLogger
|
|
133
144
|
from masster.study.defaults.study_def import study_defaults
|
|
134
145
|
from masster.study.defaults.align_def import align_defaults
|
|
135
146
|
from masster.study.defaults.export_def import export_mgf_defaults
|
|
@@ -177,8 +188,8 @@ class Study:
|
|
|
177
188
|
- `export_consensus()`: Export consensus features for downstream analysis.
|
|
178
189
|
|
|
179
190
|
Example Usage:
|
|
180
|
-
>>> from masster import
|
|
181
|
-
>>> study_obj =
|
|
191
|
+
>>> from masster import Study
|
|
192
|
+
>>> study_obj = Study(folder="./data")
|
|
182
193
|
>>> study_obj.load_folder("./mzml_files")
|
|
183
194
|
>>> study_obj.process_all()
|
|
184
195
|
>>> study_obj.align()
|
|
@@ -272,7 +283,11 @@ class Study:
|
|
|
272
283
|
# Set instance attributes (ensure proper string values for logger)
|
|
273
284
|
self.folder = params.folder
|
|
274
285
|
self.label = params.label
|
|
275
|
-
self.polarity =
|
|
286
|
+
self.polarity = (
|
|
287
|
+
params.polarity
|
|
288
|
+
if params.polarity in ["positive", "negative", "pos", "neg"]
|
|
289
|
+
else "positive"
|
|
290
|
+
)
|
|
276
291
|
self.log_level = params.log_level.upper() if params.log_level else "INFO"
|
|
277
292
|
self.log_label = params.log_label + " | " if params.log_label else ""
|
|
278
293
|
self.log_sink = params.log_sink
|
|
@@ -327,7 +342,7 @@ class Study:
|
|
|
327
342
|
self.id_df = pl.DataFrame()
|
|
328
343
|
|
|
329
344
|
# Initialize independent logger
|
|
330
|
-
self.logger =
|
|
345
|
+
self.logger = MasterLogger(
|
|
331
346
|
instance_type="study",
|
|
332
347
|
level=self.log_level.upper(),
|
|
333
348
|
label=self.log_label,
|
|
@@ -427,7 +442,9 @@ class Study:
|
|
|
427
442
|
fill = fill
|
|
428
443
|
fill_chrom = fill # Backward compatibility alias
|
|
429
444
|
_process_sample_for_parallel_fill = _process_sample_for_parallel_fill
|
|
430
|
-
_get_missing_consensus_sample_combinations =
|
|
445
|
+
_get_missing_consensus_sample_combinations = (
|
|
446
|
+
_get_missing_consensus_sample_combinations
|
|
447
|
+
)
|
|
431
448
|
_load_consensusXML = _load_consensusXML
|
|
432
449
|
load_features = load_features
|
|
433
450
|
sanitize = sanitize
|
|
@@ -485,7 +502,10 @@ class Study:
|
|
|
485
502
|
|
|
486
503
|
# Get all currently loaded modules that are part of the study package
|
|
487
504
|
for module_name in sys.modules:
|
|
488
|
-
if
|
|
505
|
+
if (
|
|
506
|
+
module_name.startswith(study_module_prefix)
|
|
507
|
+
and module_name != current_module
|
|
508
|
+
):
|
|
489
509
|
study_modules.append(module_name)
|
|
490
510
|
|
|
491
511
|
# Add core masster modules
|
|
@@ -500,7 +520,10 @@ class Study:
|
|
|
500
520
|
sample_modules = []
|
|
501
521
|
sample_module_prefix = f"{base_modname}.sample."
|
|
502
522
|
for module_name in sys.modules:
|
|
503
|
-
if
|
|
523
|
+
if (
|
|
524
|
+
module_name.startswith(sample_module_prefix)
|
|
525
|
+
and module_name != current_module
|
|
526
|
+
):
|
|
504
527
|
sample_modules.append(module_name)
|
|
505
528
|
|
|
506
529
|
all_modules_to_reload = core_modules + sample_modules + study_modules
|
|
@@ -538,7 +561,12 @@ class Study:
|
|
|
538
561
|
"""
|
|
539
562
|
return ""
|
|
540
563
|
|
|
541
|
-
def logger_update(
|
|
564
|
+
def logger_update(
|
|
565
|
+
self,
|
|
566
|
+
level: str | None = None,
|
|
567
|
+
label: str | None = None,
|
|
568
|
+
sink: str | None = None,
|
|
569
|
+
):
|
|
542
570
|
"""Update the logging configuration for this Study instance.
|
|
543
571
|
|
|
544
572
|
Args:
|
|
@@ -570,17 +598,21 @@ class Study:
|
|
|
570
598
|
that are out of normal range.
|
|
571
599
|
"""
|
|
572
600
|
# Cache DataFrame lengths and existence checks
|
|
573
|
-
consensus_df_len =
|
|
601
|
+
consensus_df_len = (
|
|
602
|
+
len(self.consensus_df) if not self.consensus_df.is_empty() else 0
|
|
603
|
+
)
|
|
574
604
|
samples_df_len = len(self.samples_df) if not self.samples_df.is_empty() else 0
|
|
575
605
|
|
|
576
606
|
# Calculate consensus statistics only if consensus_df exists and has data
|
|
577
607
|
if consensus_df_len > 0:
|
|
578
608
|
# Execute the aggregation once
|
|
579
|
-
stats_result = self.consensus_df.select(
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
609
|
+
stats_result = self.consensus_df.select(
|
|
610
|
+
[
|
|
611
|
+
pl.col("number_samples").min().alias("min_samples"),
|
|
612
|
+
pl.col("number_samples").mean().alias("mean_samples"),
|
|
613
|
+
pl.col("number_samples").max().alias("max_samples"),
|
|
614
|
+
],
|
|
615
|
+
).row(0)
|
|
584
616
|
|
|
585
617
|
min_samples = stats_result[0] if stats_result[0] is not None else 0
|
|
586
618
|
mean_samples = stats_result[1] if stats_result[1] is not None else 0
|
|
@@ -592,7 +624,9 @@ class Study:
|
|
|
592
624
|
|
|
593
625
|
# Count only features where 'filled' == False
|
|
594
626
|
if not self.features_df.is_empty() and "filled" in self.features_df.columns:
|
|
595
|
-
unfilled_features_count = self.features_df.filter(
|
|
627
|
+
unfilled_features_count = self.features_df.filter(
|
|
628
|
+
~self.features_df["filled"],
|
|
629
|
+
).height
|
|
596
630
|
else:
|
|
597
631
|
unfilled_features_count = 0
|
|
598
632
|
|
|
@@ -615,12 +649,20 @@ class Study:
|
|
|
615
649
|
if unfilled_dtype != consensus_dtype:
|
|
616
650
|
# Cast both to Int64 if possible, otherwise keep as string
|
|
617
651
|
try:
|
|
618
|
-
unfilled_features = unfilled_features.with_columns(
|
|
619
|
-
|
|
652
|
+
unfilled_features = unfilled_features.with_columns(
|
|
653
|
+
pl.col("feature_uid").cast(pl.Int64),
|
|
654
|
+
)
|
|
655
|
+
consensus_feature_uids = [
|
|
656
|
+
int(uid) for uid in consensus_feature_uids
|
|
657
|
+
]
|
|
620
658
|
except Exception:
|
|
621
659
|
# If casting fails, ensure both are strings
|
|
622
|
-
unfilled_features = unfilled_features.with_columns(
|
|
623
|
-
|
|
660
|
+
unfilled_features = unfilled_features.with_columns(
|
|
661
|
+
pl.col("feature_uid").cast(pl.Utf8),
|
|
662
|
+
)
|
|
663
|
+
consensus_feature_uids = [
|
|
664
|
+
str(uid) for uid in consensus_feature_uids
|
|
665
|
+
]
|
|
624
666
|
|
|
625
667
|
# Count unfilled features that are in consensus
|
|
626
668
|
in_consensus_count = unfilled_features.filter(
|
|
@@ -629,14 +671,22 @@ class Study:
|
|
|
629
671
|
|
|
630
672
|
# Calculate ratios that sum to 100%
|
|
631
673
|
total_unfilled = unfilled_features.height
|
|
632
|
-
ratio_in_consensus_to_total = (
|
|
633
|
-
|
|
674
|
+
ratio_in_consensus_to_total = (
|
|
675
|
+
(in_consensus_count / total_unfilled * 100) if total_unfilled > 0 else 0
|
|
676
|
+
)
|
|
677
|
+
ratio_not_in_consensus_to_total = (
|
|
678
|
+
100 - ratio_in_consensus_to_total if total_unfilled > 0 else 0
|
|
679
|
+
)
|
|
634
680
|
else:
|
|
635
681
|
ratio_in_consensus_to_total = 0
|
|
636
682
|
ratio_not_in_consensus_to_total = 0
|
|
637
683
|
|
|
638
684
|
# Optimize chrom completeness calculation
|
|
639
|
-
if
|
|
685
|
+
if (
|
|
686
|
+
consensus_df_len > 0
|
|
687
|
+
and samples_df_len > 0
|
|
688
|
+
and not self.features_df.is_empty()
|
|
689
|
+
):
|
|
640
690
|
# Ensure matching data types for join keys
|
|
641
691
|
features_dtype = self.features_df["feature_uid"].dtype
|
|
642
692
|
consensus_dtype = self.consensus_mapping_df["feature_uid"].dtype
|
|
@@ -644,13 +694,17 @@ class Study:
|
|
|
644
694
|
if features_dtype != consensus_dtype:
|
|
645
695
|
# Try to cast both to Int64, fallback to string if needed
|
|
646
696
|
try:
|
|
647
|
-
self.features_df = self.features_df.with_columns(
|
|
697
|
+
self.features_df = self.features_df.with_columns(
|
|
698
|
+
pl.col("feature_uid").cast(pl.Int64),
|
|
699
|
+
)
|
|
648
700
|
self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
|
|
649
701
|
pl.col("feature_uid").cast(pl.Int64),
|
|
650
702
|
)
|
|
651
703
|
except Exception:
|
|
652
704
|
# If casting to Int64 fails, cast both to string
|
|
653
|
-
self.features_df = self.features_df.with_columns(
|
|
705
|
+
self.features_df = self.features_df.with_columns(
|
|
706
|
+
pl.col("feature_uid").cast(pl.Utf8),
|
|
707
|
+
)
|
|
654
708
|
self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
|
|
655
709
|
pl.col("feature_uid").cast(pl.Utf8),
|
|
656
710
|
)
|
|
@@ -671,7 +725,9 @@ class Study:
|
|
|
671
725
|
else:
|
|
672
726
|
non_null_chroms = 0
|
|
673
727
|
total_possible = samples_df_len * consensus_df_len
|
|
674
|
-
chrom_completeness =
|
|
728
|
+
chrom_completeness = (
|
|
729
|
+
non_null_chroms / total_possible if total_possible > 0 else 0
|
|
730
|
+
)
|
|
675
731
|
else:
|
|
676
732
|
chrom_completeness = 0
|
|
677
733
|
|
|
@@ -683,23 +739,37 @@ class Study:
|
|
|
683
739
|
|
|
684
740
|
if not self.consensus_df.is_empty():
|
|
685
741
|
# Compute RT spread using only consensus rows with number_samples >= half the number of samples
|
|
686
|
-
threshold =
|
|
742
|
+
threshold = (
|
|
743
|
+
self.consensus_df.select(pl.col("number_samples").max()).item() / 2
|
|
744
|
+
if not self.samples_df.is_empty()
|
|
745
|
+
else 0
|
|
746
|
+
)
|
|
687
747
|
filtered = self.consensus_df.filter(pl.col("number_samples") >= threshold)
|
|
688
748
|
if filtered.is_empty():
|
|
689
749
|
rt_spread = -1.0
|
|
690
750
|
else:
|
|
691
|
-
rt_spread_row = filtered.select(
|
|
692
|
-
|
|
751
|
+
rt_spread_row = filtered.select(
|
|
752
|
+
(pl.col("rt_max") - pl.col("rt_min")).mean(),
|
|
753
|
+
).row(0)
|
|
754
|
+
rt_spread = (
|
|
755
|
+
float(rt_spread_row[0])
|
|
756
|
+
if rt_spread_row and rt_spread_row[0] is not None
|
|
757
|
+
else 0.0
|
|
758
|
+
)
|
|
693
759
|
else:
|
|
694
760
|
rt_spread = -1.0
|
|
695
761
|
|
|
696
762
|
# Calculate percentage of consensus features with MS2
|
|
697
763
|
consensus_with_ms2_percentage = (
|
|
698
|
-
(consensus_with_ms2_count / consensus_df_len * 100)
|
|
764
|
+
(consensus_with_ms2_count / consensus_df_len * 100)
|
|
765
|
+
if consensus_df_len > 0
|
|
766
|
+
else 0
|
|
699
767
|
)
|
|
700
768
|
|
|
701
769
|
# Total MS2 spectra count
|
|
702
|
-
total_ms2_count =
|
|
770
|
+
total_ms2_count = (
|
|
771
|
+
len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
|
|
772
|
+
)
|
|
703
773
|
|
|
704
774
|
# Estimate memory usage
|
|
705
775
|
memory_usage = (
|
|
@@ -712,15 +782,27 @@ class Study:
|
|
|
712
782
|
|
|
713
783
|
# Add warning symbols for out-of-range values
|
|
714
784
|
consensus_warning = f" {_WARNING_SYMBOL}" if consensus_df_len < 50 else ""
|
|
715
|
-
|
|
785
|
+
|
|
716
786
|
rt_spread_text = "N/A" if rt_spread < 0 else f"{rt_spread:.3f}s"
|
|
717
|
-
rt_spread_warning =
|
|
718
|
-
|
|
787
|
+
rt_spread_warning = (
|
|
788
|
+
f" {_WARNING_SYMBOL}"
|
|
789
|
+
if rt_spread >= 0 and (rt_spread > 5 or rt_spread < 0.1)
|
|
790
|
+
else ""
|
|
791
|
+
)
|
|
792
|
+
|
|
719
793
|
chrom_completeness_pct = chrom_completeness * 100
|
|
720
|
-
chrom_warning =
|
|
721
|
-
|
|
794
|
+
chrom_warning = (
|
|
795
|
+
f" {_WARNING_SYMBOL}"
|
|
796
|
+
if chrom_completeness_pct < 10 and chrom_completeness_pct >= 0
|
|
797
|
+
else ""
|
|
798
|
+
)
|
|
799
|
+
|
|
722
800
|
max_samples_warning = ""
|
|
723
|
-
if
|
|
801
|
+
if (
|
|
802
|
+
isinstance(max_samples, (int, float))
|
|
803
|
+
and samples_df_len > 0
|
|
804
|
+
and max_samples > 0
|
|
805
|
+
):
|
|
724
806
|
if max_samples < samples_df_len / 3.0:
|
|
725
807
|
max_samples_warning = f" {_WARNING_SYMBOL}"
|
|
726
808
|
elif max_samples < samples_df_len * 0.8:
|