masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/study.py CHANGED
@@ -52,104 +52,104 @@ import sys
52
52
  import polars as pl
53
53
 
54
54
  # Study-specific imports
55
- from master.study.h5 import _load_study5
56
- from master.study.h5 import _save_study5
57
- from master.study.h5 import _save_study5_compressed
58
- from master.study.helpers import _get_consensus_uids
59
- from master.study.helpers import _get_feature_uids
60
- from master.study.helpers import _get_sample_uids
61
- from master.study.helpers import _ensure_features_df_schema_order
62
- from master.study.helpers import compress
63
- from master.study.helpers import compress_features
64
- from master.study.helpers import compress_ms2
65
- from master.study.helpers import compress_chrom
66
- from master.study.helpers import restore_features
67
- from master.study.helpers import restore_chrom
68
- from master.study.helpers import restore_ms2
69
- from master.study.helpers import decompress
70
- from master.study.helpers import fill_reset
71
- from master.study.helpers import get_chrom
72
- from master.study.helpers import get_sample
73
- from master.study.helpers import get_consensus
74
- from master.study.helpers import get_consensus_matches
75
- from master.study.helpers import get_consensus_matrix
76
- from master.study.helpers import get_orphans
77
- from master.study.helpers import get_gaps_matrix
78
- from master.study.helpers import get_gaps_stats
79
- from master.study.helpers import align_reset
80
- from master.study.helpers import set_folder
81
- from master.study.helpers import set_source
82
- from master.study.helpers import sample_color
83
- from master.study.helpers import sample_color_reset
84
- from master.study.helpers import sample_name_replace
85
- from master.study.helpers import sample_name_reset
86
- from master.study.helpers import samples_select
87
- from master.study.helpers import samples_delete
88
- from master.study.helpers import features_select
89
- from master.study.helpers import features_filter
90
- from master.study.helpers import features_delete
91
- from master.study.helpers import consensus_select
92
- from master.study.helpers import consensus_filter
93
- from master.study.helpers import consensus_delete
94
- from master.study.load import add
95
- from master.study.load import add_sample
96
- from master.study.load import _add_samples_batch
97
- from master.study.load import _add_sample_optimized
98
- from master.study.load import _add_sample_standard
99
- from master.study.load import _sample_color_reset_optimized
100
- from master.study.load import fill_single
101
- from master.study.load import fill
102
- from master.study.load import _process_sample_for_parallel_fill
103
- from master.study.load import _get_missing_consensus_sample_combinations
104
- from master.study.load import load
105
- from master.study.load import _load_consensusXML
106
- from master.study.load import load_features
107
- from master.study.load import sanitize
108
- from master.study.plot import plot_alignment
109
- from master.study.plot import plot_consensus_2d
110
- from master.study.plot import plot_samples_2d
111
- from master.study.plot import plot_consensus_stats
112
- from master.study.plot import plot_chrom
113
- from master.study.plot import plot_pca
114
- from master.study.plot import plot_bpc
115
- from master.study.plot import plot_tic
116
- from master.study.plot import plot_eic
117
- from master.study.plot import plot_rt_correction
118
- from master.study.processing import align
119
- from master.study.processing import merge
120
- from master.study.processing import integrate
121
- from master.study.processing import find_ms2
122
- from master.study.parameters import store_history
123
- from master.study.parameters import get_parameters
124
- from master.study.parameters import update_parameters
125
- from master.study.parameters import get_parameters_property
126
- from master.study.parameters import set_parameters_property
127
- from master.study.save import save
128
- from master.study.save import save_consensus
129
- from master.study.save import _save_consensusXML
130
- from master.study.save import save_samples
131
- from master.study.export import export_mgf
132
- from master.study.export import export_mztab
133
- from master.study.export import _get_mgf_df
134
- from master.study.id import lib_load, identify, get_id
135
-
136
- from master.logger import MasterLogger
137
- from master.study.defaults.study_def import study_defaults
138
- from master.study.defaults.align_def import align_defaults
139
- from master.study.defaults.export_def import export_mgf_defaults
140
- from master.study.defaults.fill_chrom_def import fill_chrom_defaults
141
- from master.study.defaults.fill_def import fill_defaults
142
- from master.study.defaults.find_consensus_def import find_consensus_defaults
143
- from master.study.defaults.find_ms2_def import find_ms2_defaults
144
- from master.study.defaults.integrate_chrom_def import integrate_chrom_defaults
145
- from master.study.defaults.integrate_def import integrate_defaults
146
- from master.study.defaults.merge_def import merge_defaults
55
+ from masster.study.h5 import _load_study5
56
+ from masster.study.h5 import _save_study5
57
+ from masster.study.h5 import _save_study5_compressed
58
+ from masster.study.helpers import _get_consensus_uids
59
+ from masster.study.helpers import _get_feature_uids
60
+ from masster.study.helpers import _get_sample_uids
61
+ from masster.study.helpers import _ensure_features_df_schema_order
62
+ from masster.study.helpers import compress
63
+ from masster.study.helpers import compress_features
64
+ from masster.study.helpers import compress_ms2
65
+ from masster.study.helpers import compress_chrom
66
+ from masster.study.helpers import restore_features
67
+ from masster.study.helpers import restore_chrom
68
+ from masster.study.helpers import restore_ms2
69
+ from masster.study.helpers import decompress
70
+ from masster.study.helpers import fill_reset
71
+ from masster.study.helpers import get_chrom
72
+ from masster.study.helpers import get_sample
73
+ from masster.study.helpers import get_consensus
74
+ from masster.study.helpers import get_consensus_matches
75
+ from masster.study.helpers import get_consensus_matrix
76
+ from masster.study.helpers import get_orphans
77
+ from masster.study.helpers import get_gaps_matrix
78
+ from masster.study.helpers import get_gaps_stats
79
+ from masster.study.helpers import align_reset
80
+ from masster.study.helpers import set_folder
81
+ from masster.study.helpers import set_source
82
+ from masster.study.helpers import sample_color
83
+ from masster.study.helpers import sample_color_reset
84
+ from masster.study.helpers import sample_name_replace
85
+ from masster.study.helpers import sample_name_reset
86
+ from masster.study.helpers import samples_select
87
+ from masster.study.helpers import samples_delete
88
+ from masster.study.helpers import features_select
89
+ from masster.study.helpers import features_filter
90
+ from masster.study.helpers import features_delete
91
+ from masster.study.helpers import consensus_select
92
+ from masster.study.helpers import consensus_filter
93
+ from masster.study.helpers import consensus_delete
94
+ from masster.study.load import add
95
+ from masster.study.load import add_sample
96
+ from masster.study.load import _add_samples_batch
97
+ from masster.study.load import _add_sample_optimized
98
+ from masster.study.load import _add_sample_standard
99
+ from masster.study.load import _sample_color_reset_optimized
100
+ from masster.study.load import fill_single
101
+ from masster.study.load import fill
102
+ from masster.study.load import _process_sample_for_parallel_fill
103
+ from masster.study.load import _get_missing_consensus_sample_combinations
104
+ from masster.study.load import load
105
+ from masster.study.load import _load_consensusXML
106
+ from masster.study.load import load_features
107
+ from masster.study.load import sanitize
108
+ from masster.study.plot import plot_alignment
109
+ from masster.study.plot import plot_consensus_2d
110
+ from masster.study.plot import plot_samples_2d
111
+ from masster.study.plot import plot_consensus_stats
112
+ from masster.study.plot import plot_chrom
113
+ from masster.study.plot import plot_pca
114
+ from masster.study.plot import plot_bpc
115
+ from masster.study.plot import plot_tic
116
+ from masster.study.plot import plot_eic
117
+ from masster.study.plot import plot_rt_correction
118
+ from masster.study.processing import align
119
+ from masster.study.processing import merge
120
+ from masster.study.processing import integrate
121
+ from masster.study.processing import find_ms2
122
+ from masster.study.parameters import store_history
123
+ from masster.study.parameters import get_parameters
124
+ from masster.study.parameters import update_parameters
125
+ from masster.study.parameters import get_parameters_property
126
+ from masster.study.parameters import set_parameters_property
127
+ from masster.study.save import save
128
+ from masster.study.save import save_consensus
129
+ from masster.study.save import _save_consensusXML
130
+ from masster.study.save import save_samples
131
+ from masster.study.export import export_mgf
132
+ from masster.study.export import export_mztab
133
+ from masster.study.export import _get_mgf_df
134
+ from masster.study.id import lib_load, identify, get_id
135
+
136
+ from masster.logger import MassterLogger
137
+ from masster.study.defaults.study_def import study_defaults
138
+ from masster.study.defaults.align_def import align_defaults
139
+ from masster.study.defaults.export_def import export_mgf_defaults
140
+ from masster.study.defaults.fill_chrom_def import fill_chrom_defaults
141
+ from masster.study.defaults.fill_def import fill_defaults
142
+ from masster.study.defaults.find_consensus_def import find_consensus_defaults
143
+ from masster.study.defaults.find_ms2_def import find_ms2_defaults
144
+ from masster.study.defaults.integrate_chrom_def import integrate_chrom_defaults
145
+ from masster.study.defaults.integrate_def import integrate_defaults
146
+ from masster.study.defaults.merge_def import merge_defaults
147
147
 
148
148
  # Import sample defaults
149
- from master.sample.defaults.sample_def import sample_defaults
150
- from master.sample.defaults.find_features_def import find_features_defaults
151
- from master.sample.defaults.find_adducts_def import find_adducts_defaults
152
- from master.sample.defaults.get_spectrum_def import get_spectrum_defaults
149
+ from masster.sample.defaults.sample_def import sample_defaults
150
+ from masster.sample.defaults.find_features_def import find_features_defaults
151
+ from masster.sample.defaults.find_adducts_def import find_adducts_defaults
152
+ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
153
153
 
154
154
  # Warning symbols for info display
155
155
  _WARNING_SYMBOL = "⚠️" # Yellow warning triangle
@@ -181,7 +181,7 @@ class Study:
181
181
  - `export_consensus()`: Export consensus features for downstream analysis.
182
182
 
183
183
  Example Usage:
184
- >>> from master import study
184
+ >>> from masster import study
185
185
  >>> study_obj = study(folder="./data")
186
186
  >>> study_obj.load_folder("./mzml_files")
187
187
  >>> study_obj.process_all()
@@ -276,11 +276,7 @@ class Study:
276
276
  # Set instance attributes (ensure proper string values for logger)
277
277
  self.folder = params.folder
278
278
  self.label = params.label
279
- self.polarity = (
280
- params.polarity
281
- if params.polarity in ["positive", "negative", "pos", "neg"]
282
- else "positive"
283
- )
279
+ self.polarity = params.polarity if params.polarity in ["positive", "negative", "pos", "neg"] else "positive"
284
280
  self.log_level = params.log_level.upper() if params.log_level else "INFO"
285
281
  self.log_label = params.log_label + " | " if params.log_label else ""
286
282
  self.log_sink = params.log_sink
@@ -335,7 +331,7 @@ class Study:
335
331
  self.id_df = pl.DataFrame()
336
332
 
337
333
  # Initialize independent logger
338
- self.logger = MasterLogger(
334
+ self.logger = MassterLogger(
339
335
  instance_type="study",
340
336
  level=self.log_level.upper(),
341
337
  label=self.log_label,
@@ -436,9 +432,7 @@ class Study:
436
432
  fill = fill
437
433
  fill_chrom = fill # Backward compatibility alias
438
434
  _process_sample_for_parallel_fill = _process_sample_for_parallel_fill
439
- _get_missing_consensus_sample_combinations = (
440
- _get_missing_consensus_sample_combinations
441
- )
435
+ _get_missing_consensus_sample_combinations = _get_missing_consensus_sample_combinations
442
436
  _load_consensusXML = _load_consensusXML
443
437
  load_features = load_features
444
438
  sanitize = sanitize
@@ -465,20 +459,20 @@ class Study:
465
459
 
466
460
  def _reload(self):
467
461
  """
468
- Reloads all master modules to pick up any changes to their source code,
462
+ Reloads all masster modules to pick up any changes to their source code,
469
463
  and updates the instance's class reference to the newly reloaded class version.
470
464
  This ensures that the instance uses the latest implementation without restarting the interpreter.
471
465
  """
472
466
  # Reset logger configuration flags to allow proper reconfiguration after reload
473
467
  """ try:
474
- import master.sample.logger as logger_module
468
+ import masster.sample.logger as logger_module
475
469
 
476
470
  if hasattr(logger_module, "_STUDY_LOGGER_CONFIGURED"):
477
471
  logger_module._STUDY_LOGGER_CONFIGURED = False
478
472
  except Exception:
479
473
  pass"""
480
474
 
481
- # Get the base module name (master)
475
+ # Get the base module name (masster)
482
476
  base_modname = self.__class__.__module__.split(".")[0]
483
477
  current_module = self.__class__.__module__
484
478
 
@@ -488,13 +482,10 @@ class Study:
488
482
 
489
483
  # Get all currently loaded modules that are part of the study package
490
484
  for module_name in sys.modules:
491
- if (
492
- module_name.startswith(study_module_prefix)
493
- and module_name != current_module
494
- ):
485
+ if module_name.startswith(study_module_prefix) and module_name != current_module:
495
486
  study_modules.append(module_name)
496
487
 
497
- # Add core master modules
488
+ # Add core masster modules
498
489
  core_modules = [
499
490
  f"{base_modname}._version",
500
491
  f"{base_modname}.chromatogram",
@@ -506,10 +497,7 @@ class Study:
506
497
  sample_modules = []
507
498
  sample_module_prefix = f"{base_modname}.sample."
508
499
  for module_name in sys.modules:
509
- if (
510
- module_name.startswith(sample_module_prefix)
511
- and module_name != current_module
512
- ):
500
+ if module_name.startswith(sample_module_prefix) and module_name != current_module:
513
501
  sample_modules.append(module_name)
514
502
 
515
503
  all_modules_to_reload = core_modules + sample_modules + study_modules
@@ -541,11 +529,11 @@ class Study:
541
529
  def _get_adducts(self, adducts_list: list = None, **kwargs):
542
530
  """
543
531
  Generate comprehensive adduct specifications for study-level adduct filtering.
544
-
532
+
545
533
  This method creates a DataFrame of adduct combinations that will be used to filter
546
534
  and score adducts at the study level. Similar to sample._get_adducts() but uses
547
535
  study-level parameters and constraints.
548
-
536
+
549
537
  Parameters
550
538
  ----------
551
539
  adducts_list : List[str], optional
@@ -554,10 +542,10 @@ class Study:
554
542
  **kwargs : dict
555
543
  Override parameters, including:
556
544
  - charge_min: Minimum charge to consider (default 1)
557
- - charge_max: Maximum charge to consider (default 3)
545
+ - charge_max: Maximum charge to consider (default 3)
558
546
  - max_combinations: Maximum number of adduct components to combine (default 3)
559
547
  - min_probability: Minimum probability threshold (default from study parameters)
560
-
548
+
561
549
  Returns
562
550
  -------
563
551
  pl.DataFrame
@@ -569,304 +557,272 @@ class Study:
569
557
  - complexity: Number of adduct components (1-3)
570
558
  """
571
559
  # Import required modules
572
-
560
+ from collections import Counter
561
+ from itertools import combinations
562
+ import numpy as np
563
+
573
564
  # Use provided adducts list or get from study parameters
574
565
  if adducts_list is None:
575
- adducts_list = (
576
- self.parameters.adducts
577
- if hasattr(self.parameters, "adducts") and self.parameters.adducts
578
- else []
579
- )
580
-
566
+ adducts_list = self.parameters.adducts if hasattr(self.parameters, 'adducts') and self.parameters.adducts else []
567
+
581
568
  # Get parameters with study-specific defaults
582
- charge_min = kwargs.get("charge_min", -3) # Allow negative charges
583
- charge_max = kwargs.get("charge_max", 3) # Study uses up to charge ±3
584
- max_combinations = kwargs.get("max_combinations", 3) # Up to 3 combinations
585
- min_probability = kwargs.get(
586
- "min_probability",
587
- getattr(self.parameters, "adduct_min_probability", 0.04),
588
- )
589
-
569
+ charge_min = kwargs.get('charge_min', -3) # Allow negative charges
570
+ charge_max = kwargs.get('charge_max', 3) # Study uses up to charge ±3
571
+ max_combinations = kwargs.get('max_combinations', 3) # Up to 3 combinations
572
+ min_probability = kwargs.get('min_probability', getattr(self.parameters, 'adduct_min_probability', 0.04))
573
+
590
574
  # Parse base adduct specifications
591
575
  base_specs = []
592
-
576
+
593
577
  for adduct_str in adducts_list:
594
- if not isinstance(adduct_str, str) or ":" not in adduct_str:
578
+ if not isinstance(adduct_str, str) or ':' not in adduct_str:
595
579
  continue
596
-
580
+
597
581
  try:
598
- parts = adduct_str.split(":")
582
+ parts = adduct_str.split(':')
599
583
  if len(parts) != 3:
600
584
  continue
601
-
585
+
602
586
  formula_part = parts[0]
603
- charge = int(parts[1])
587
+ charge = int(parts[1])
604
588
  probability = float(parts[2])
605
-
589
+
606
590
  # Calculate mass shift from formula
607
591
  mass_shift = self._calculate_formula_mass_shift(formula_part)
608
-
609
- base_specs.append(
610
- {
611
- "formula": formula_part,
612
- "charge": charge,
613
- "mass_shift": mass_shift,
614
- "probability": probability,
615
- "raw_string": adduct_str,
616
- },
617
- )
618
-
592
+
593
+ base_specs.append({
594
+ 'formula': formula_part,
595
+ 'charge': charge,
596
+ 'mass_shift': mass_shift,
597
+ 'probability': probability,
598
+ 'raw_string': adduct_str
599
+ })
600
+
619
601
  except (ValueError, IndexError):
620
602
  continue
621
-
603
+
622
604
  if not base_specs:
623
605
  # Return empty DataFrame with correct schema
624
- return pl.DataFrame(
625
- {
626
- "name": [],
627
- "charge": [],
628
- "mass_shift": [],
629
- "probability": [],
630
- "complexity": [],
631
- },
632
- )
633
-
606
+ return pl.DataFrame({
607
+ 'name': [],
608
+ 'charge': [],
609
+ 'mass_shift': [],
610
+ 'probability': [],
611
+ 'complexity': []
612
+ })
613
+
634
614
  # Generate all valid combinations
635
615
  combinations_list = []
636
-
616
+
637
617
  # Separate specs by charge type
638
- positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
639
- negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
640
- neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
641
-
618
+ positive_specs = [spec for spec in base_specs if spec['charge'] > 0]
619
+ negative_specs = [spec for spec in base_specs if spec['charge'] < 0]
620
+ neutral_specs = [spec for spec in base_specs if spec['charge'] == 0]
621
+
642
622
  # 1. Single adducts (filter out neutral adducts with charge == 0)
643
623
  for spec in base_specs:
644
- if charge_min <= spec["charge"] <= charge_max and spec["charge"] != 0:
624
+ if charge_min <= spec['charge'] <= charge_max and spec['charge'] != 0:
645
625
  formatted_name = self._format_adduct_name([spec])
646
- combinations_list.append(
647
- {
648
- "components": [spec],
649
- "formatted_name": formatted_name,
650
- "total_mass_shift": spec["mass_shift"],
651
- "total_charge": spec["charge"],
652
- "combined_probability": spec["probability"],
653
- "complexity": 1,
654
- },
655
- )
656
-
626
+ combinations_list.append({
627
+ 'components': [spec],
628
+ 'formatted_name': formatted_name,
629
+ 'total_mass_shift': spec['mass_shift'],
630
+ 'total_charge': spec['charge'],
631
+ 'combined_probability': spec['probability'],
632
+ 'complexity': 1
633
+ })
634
+
657
635
  # 2. Generate multiply charged versions (2H+, 3H+, etc.) - already excludes charge==0
658
636
  for spec in positive_specs + negative_specs:
659
- base_charge = spec["charge"]
660
- for multiplier in range(
661
- 2,
662
- min(max_combinations + 1, 4),
663
- ): # Up to 3x multiplier
637
+ base_charge = spec['charge']
638
+ for multiplier in range(2, min(max_combinations + 1, 4)): # Up to 3x multiplier
664
639
  total_charge = base_charge * multiplier
665
640
  if charge_min <= total_charge <= charge_max and total_charge != 0:
666
641
  components = [spec] * multiplier
667
642
  formatted_name = self._format_adduct_name(components)
668
-
669
- combinations_list.append(
670
- {
671
- "components": components,
672
- "formatted_name": formatted_name,
673
- "total_mass_shift": spec["mass_shift"] * multiplier,
674
- "total_charge": total_charge,
675
- "combined_probability": spec["probability"] ** multiplier,
676
- "complexity": multiplier,
677
- },
678
- )
679
-
643
+
644
+ combinations_list.append({
645
+ 'components': components,
646
+ 'formatted_name': formatted_name,
647
+ 'total_mass_shift': spec['mass_shift'] * multiplier,
648
+ 'total_charge': total_charge,
649
+ 'combined_probability': spec['probability'] ** multiplier,
650
+ 'complexity': multiplier
651
+ })
652
+
680
653
  # 3. Mixed combinations (2-component) - limited for study level, filter out charge==0
681
654
  if max_combinations >= 2:
682
655
  # Positive + Neutral (1 neutral loss only) - but exclude if total charge == 0
683
656
  for pos_spec in positive_specs[:2]: # Limit to first 2 positive specs
684
657
  for neut_spec in neutral_specs[:1]: # Only 1 neutral loss
685
- total_charge = pos_spec["charge"] + neut_spec["charge"]
658
+ total_charge = pos_spec['charge'] + neut_spec['charge']
686
659
  if charge_min <= total_charge <= charge_max and total_charge != 0:
687
660
  components = [pos_spec, neut_spec]
688
661
  formatted_name = self._format_adduct_name(components)
689
- combinations_list.append(
690
- {
691
- "components": components,
692
- "formatted_name": formatted_name,
693
- "total_mass_shift": pos_spec["mass_shift"]
694
- + neut_spec["mass_shift"],
695
- "total_charge": total_charge,
696
- "combined_probability": pos_spec["probability"]
697
- * neut_spec["probability"],
698
- "complexity": 2,
699
- },
700
- )
701
-
662
+ combinations_list.append({
663
+ 'components': components,
664
+ 'formatted_name': formatted_name,
665
+ 'total_mass_shift': pos_spec['mass_shift'] + neut_spec['mass_shift'],
666
+ 'total_charge': total_charge,
667
+ 'combined_probability': pos_spec['probability'] * neut_spec['probability'],
668
+ 'complexity': 2
669
+ })
670
+
702
671
  # Convert to polars DataFrame
703
672
  if combinations_list:
704
- combinations_list.sort(
705
- key=lambda x: (-x["combined_probability"], x["complexity"]),
706
- )
707
-
708
- adducts_df = pl.DataFrame(
709
- [
710
- {
711
- "name": combo["formatted_name"],
712
- "charge": combo["total_charge"],
713
- "mass_shift": combo["total_mass_shift"],
714
- "probability": combo["combined_probability"],
715
- "complexity": combo["complexity"],
716
- }
717
- for combo in combinations_list
718
- ],
719
- )
720
-
673
+ combinations_list.sort(key=lambda x: (-x['combined_probability'], x['complexity']))
674
+
675
+ adducts_df = pl.DataFrame([
676
+ {
677
+ 'name': combo['formatted_name'],
678
+ 'charge': combo['total_charge'],
679
+ 'mass_shift': combo['total_mass_shift'],
680
+ 'probability': combo['combined_probability'],
681
+ 'complexity': combo['complexity']
682
+ }
683
+ for combo in combinations_list
684
+ ])
685
+
721
686
  # Filter by minimum probability threshold
722
687
  if min_probability > 0.0:
723
688
  adducts_before_filter = len(adducts_df)
724
689
  adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
725
690
  adducts_after_filter = len(adducts_df)
726
-
727
- self.logger.debug(
728
- f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
729
- )
730
-
691
+
692
+ self.logger.debug(f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})")
693
+
731
694
  else:
732
695
  # Return empty DataFrame with correct schema
733
- adducts_df = pl.DataFrame(
734
- {
735
- "name": [],
736
- "charge": [],
737
- "mass_shift": [],
738
- "probability": [],
739
- "complexity": [],
740
- },
741
- )
742
-
696
+ adducts_df = pl.DataFrame({
697
+ 'name': [],
698
+ 'charge': [],
699
+ 'mass_shift': [],
700
+ 'probability': [],
701
+ 'complexity': []
702
+ })
703
+
743
704
  return adducts_df
744
705
 
745
706
  def _calculate_formula_mass_shift(self, formula: str) -> float:
746
707
  """Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc."""
747
708
  # Standard atomic masses
748
709
  atomic_masses = {
749
- "H": 1.007825,
750
- "C": 12.0,
751
- "N": 14.003074,
752
- "O": 15.994915,
753
- "Na": 22.989769,
754
- "K": 38.963707,
755
- "Li": 7.016003,
756
- "Ca": 39.962591,
757
- "Mg": 23.985042,
758
- "Fe": 55.934938,
759
- "Cl": 34.968853,
760
- "Br": 78.918336,
761
- "I": 126.904473,
762
- "P": 30.973762,
763
- "S": 31.972071,
710
+ 'H': 1.007825,
711
+ 'C': 12.0,
712
+ 'N': 14.003074,
713
+ 'O': 15.994915,
714
+ 'Na': 22.989769,
715
+ 'K': 38.963707,
716
+ 'Li': 7.016003,
717
+ 'Ca': 39.962591,
718
+ 'Mg': 23.985042,
719
+ 'Fe': 55.934938,
720
+ 'Cl': 34.968853,
721
+ 'Br': 78.918336,
722
+ 'I': 126.904473,
723
+ 'P': 30.973762,
724
+ 'S': 31.972071
764
725
  }
765
-
726
+
766
727
  total_mass = 0.0
767
-
728
+
768
729
  # Parse formula by splitting on + and - while preserving the operators
769
730
  parts = []
770
731
  current_part = ""
771
732
  current_sign = 1
772
-
733
+
773
734
  for char in formula:
774
- if char == "+":
735
+ if char == '+':
775
736
  if current_part:
776
737
  parts.append((current_sign, current_part))
777
738
  current_part = ""
778
739
  current_sign = 1
779
- elif char == "-":
740
+ elif char == '-':
780
741
  if current_part:
781
742
  parts.append((current_sign, current_part))
782
743
  current_part = ""
783
744
  current_sign = -1
784
745
  else:
785
746
  current_part += char
786
-
747
+
787
748
  if current_part:
788
749
  parts.append((current_sign, current_part))
789
-
750
+
790
751
  # Process each part
791
752
  for sign, part in parts:
792
753
  if not part:
793
754
  continue
794
-
755
+
795
756
  # Parse element and count (e.g., "H2O" -> H:2, O:1)
796
757
  elements = self._parse_element_counts(part)
797
-
758
+
798
759
  for element, count in elements.items():
799
760
  if element in atomic_masses:
800
761
  total_mass += sign * atomic_masses[element] * count
801
-
762
+
802
763
  return total_mass
803
764
 
804
765
  def _parse_element_counts(self, formula_part: str) -> dict[str, int]:
805
766
  """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
806
767
  elements = {}
807
768
  i = 0
808
-
769
+
809
770
  while i < len(formula_part):
810
771
  # Get element (uppercase letter, possibly followed by lowercase)
811
772
  element = formula_part[i]
812
773
  i += 1
813
-
774
+
814
775
  while i < len(formula_part) and formula_part[i].islower():
815
776
  element += formula_part[i]
816
777
  i += 1
817
-
778
+
818
779
  # Get count (digits following element)
819
780
  count_str = ""
820
781
  while i < len(formula_part) and formula_part[i].isdigit():
821
782
  count_str += formula_part[i]
822
783
  i += 1
823
-
784
+
824
785
  count = int(count_str) if count_str else 1
825
786
  elements[element] = elements.get(element, 0) + count
826
-
787
+
827
788
  return elements
828
789
 
829
790
  def _format_adduct_name(self, components: list[dict]) -> str:
830
791
  """Format adduct name from components like [M+H]1+ or [M+2H]2+"""
831
792
  if not components:
832
793
  return "[M]"
833
-
794
+
834
795
  # Count occurrences of each formula
835
796
  from collections import Counter
836
-
837
- formula_counts = Counter(comp["formula"] for comp in components)
838
- total_charge = sum(comp["charge"] for comp in components)
839
-
797
+ formula_counts = Counter(comp['formula'] for comp in components)
798
+ total_charge = sum(comp['charge'] for comp in components)
799
+
840
800
  # Build formula part with proper multipliers
841
801
  formula_parts = []
842
- for formula, count in sorted(
843
- formula_counts.items(),
844
- ): # Sort for consistent ordering
802
+ for formula, count in sorted(formula_counts.items()): # Sort for consistent ordering
845
803
  if count == 1:
846
804
  formula_parts.append(formula)
847
805
  else:
848
806
  # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
849
807
  # Handle special case where formula might already start with + or -
850
- if formula.startswith(("+", "-")):
808
+ if formula.startswith(('+', '-')):
851
809
  sign = formula[0]
852
810
  base_formula = formula[1:]
853
811
  formula_parts.append(f"{sign}{count}{base_formula}")
854
812
  else:
855
813
  formula_parts.append(f"{count}{formula}")
856
-
814
+
857
815
  # Combine formula parts
858
816
  formula = "".join(formula_parts)
859
-
817
+
860
818
  # Format charge
861
819
  if total_charge == 0:
862
820
  charge_str = ""
863
821
  elif abs(total_charge) == 1:
864
822
  charge_str = "1+" if total_charge > 0 else "1-"
865
823
  else:
866
- charge_str = (
867
- f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
868
- )
869
-
824
+ charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
825
+
870
826
  return f"[M{formula}]{charge_str}"
871
827
 
872
828
  def __str__(self):
@@ -878,12 +834,7 @@ class Study:
878
834
  """
879
835
  return ""
880
836
 
881
- def logger_update(
882
- self,
883
- level: str | None = None,
884
- label: str | None = None,
885
- sink: str | None = None,
886
- ):
837
+ def logger_update(self, level: str | None = None, label: str | None = None, sink: str | None = None):
887
838
  """Update the logging configuration for this Study instance.
888
839
 
889
840
  Args:
@@ -915,21 +866,17 @@ class Study:
915
866
  that are out of normal range.
916
867
  """
917
868
  # Cache DataFrame lengths and existence checks
918
- consensus_df_len = (
919
- len(self.consensus_df) if not self.consensus_df.is_empty() else 0
920
- )
869
+ consensus_df_len = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
921
870
  samples_df_len = len(self.samples_df) if not self.samples_df.is_empty() else 0
922
871
 
923
872
  # Calculate consensus statistics only if consensus_df exists and has data
924
873
  if consensus_df_len > 0:
925
874
  # Execute the aggregation once
926
- stats_result = self.consensus_df.select(
927
- [
928
- pl.col("number_samples").min().alias("min_samples"),
929
- pl.col("number_samples").mean().alias("mean_samples"),
930
- pl.col("number_samples").max().alias("max_samples"),
931
- ],
932
- ).row(0)
875
+ stats_result = self.consensus_df.select([
876
+ pl.col("number_samples").min().alias("min_samples"),
877
+ pl.col("number_samples").mean().alias("mean_samples"),
878
+ pl.col("number_samples").max().alias("max_samples"),
879
+ ]).row(0)
933
880
 
934
881
  min_samples = stats_result[0] if stats_result[0] is not None else 0
935
882
  mean_samples = stats_result[1] if stats_result[1] is not None else 0
@@ -941,9 +888,7 @@ class Study:
941
888
 
942
889
  # Count only features where 'filled' == False
943
890
  if not self.features_df.is_empty() and "filled" in self.features_df.columns:
944
- unfilled_features_count = self.features_df.filter(
945
- ~self.features_df["filled"],
946
- ).height
891
+ unfilled_features_count = self.features_df.filter(~self.features_df["filled"]).height
947
892
  else:
948
893
  unfilled_features_count = 0
949
894
 
@@ -966,20 +911,12 @@ class Study:
966
911
  if unfilled_dtype != consensus_dtype:
967
912
  # Cast both to Int64 if possible, otherwise keep as string
968
913
  try:
969
- unfilled_features = unfilled_features.with_columns(
970
- pl.col("feature_uid").cast(pl.Int64),
971
- )
972
- consensus_feature_uids = [
973
- int(uid) for uid in consensus_feature_uids
974
- ]
914
+ unfilled_features = unfilled_features.with_columns(pl.col("feature_uid").cast(pl.Int64))
915
+ consensus_feature_uids = [int(uid) for uid in consensus_feature_uids]
975
916
  except Exception:
976
917
  # If casting fails, ensure both are strings
977
- unfilled_features = unfilled_features.with_columns(
978
- pl.col("feature_uid").cast(pl.Utf8),
979
- )
980
- consensus_feature_uids = [
981
- str(uid) for uid in consensus_feature_uids
982
- ]
918
+ unfilled_features = unfilled_features.with_columns(pl.col("feature_uid").cast(pl.Utf8))
919
+ consensus_feature_uids = [str(uid) for uid in consensus_feature_uids]
983
920
 
984
921
  # Count unfilled features that are in consensus
985
922
  in_consensus_count = unfilled_features.filter(
@@ -988,22 +925,14 @@ class Study:
988
925
 
989
926
  # Calculate ratios that sum to 100%
990
927
  total_unfilled = unfilled_features.height
991
- ratio_in_consensus_to_total = (
992
- (in_consensus_count / total_unfilled * 100) if total_unfilled > 0 else 0
993
- )
994
- ratio_not_in_consensus_to_total = (
995
- 100 - ratio_in_consensus_to_total if total_unfilled > 0 else 0
996
- )
928
+ ratio_in_consensus_to_total = (in_consensus_count / total_unfilled * 100) if total_unfilled > 0 else 0
929
+ ratio_not_in_consensus_to_total = 100 - ratio_in_consensus_to_total if total_unfilled > 0 else 0
997
930
  else:
998
931
  ratio_in_consensus_to_total = 0
999
932
  ratio_not_in_consensus_to_total = 0
1000
933
 
1001
934
  # Optimize chrom completeness calculation
1002
- if (
1003
- consensus_df_len > 0
1004
- and samples_df_len > 0
1005
- and not self.features_df.is_empty()
1006
- ):
935
+ if consensus_df_len > 0 and samples_df_len > 0 and not self.features_df.is_empty():
1007
936
  # Ensure matching data types for join keys
1008
937
  features_dtype = self.features_df["feature_uid"].dtype
1009
938
  consensus_dtype = self.consensus_mapping_df["feature_uid"].dtype
@@ -1011,17 +940,13 @@ class Study:
1011
940
  if features_dtype != consensus_dtype:
1012
941
  # Try to cast both to Int64, fallback to string if needed
1013
942
  try:
1014
- self.features_df = self.features_df.with_columns(
1015
- pl.col("feature_uid").cast(pl.Int64),
1016
- )
943
+ self.features_df = self.features_df.with_columns(pl.col("feature_uid").cast(pl.Int64))
1017
944
  self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
1018
945
  pl.col("feature_uid").cast(pl.Int64),
1019
946
  )
1020
947
  except Exception:
1021
948
  # If casting to Int64 fails, cast both to string
1022
- self.features_df = self.features_df.with_columns(
1023
- pl.col("feature_uid").cast(pl.Utf8),
1024
- )
949
+ self.features_df = self.features_df.with_columns(pl.col("feature_uid").cast(pl.Utf8))
1025
950
  self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
1026
951
  pl.col("feature_uid").cast(pl.Utf8),
1027
952
  )
@@ -1042,9 +967,7 @@ class Study:
1042
967
  else:
1043
968
  non_null_chroms = 0
1044
969
  total_possible = samples_df_len * consensus_df_len
1045
- chrom_completeness = (
1046
- non_null_chroms / total_possible if total_possible > 0 else 0
1047
- )
970
+ chrom_completeness = non_null_chroms / total_possible if total_possible > 0 else 0
1048
971
  else:
1049
972
  chrom_completeness = 0
1050
973
 
@@ -1056,37 +979,23 @@ class Study:
1056
979
 
1057
980
  if not self.consensus_df.is_empty():
1058
981
  # Compute RT spread using only consensus rows with number_samples >= half the number of samples
1059
- threshold = (
1060
- self.consensus_df.select(pl.col("number_samples").max()).item() / 2
1061
- if not self.samples_df.is_empty()
1062
- else 0
1063
- )
982
+ threshold = self.consensus_df.select(pl.col("number_samples").max()).item() / 2 if not self.samples_df.is_empty() else 0
1064
983
  filtered = self.consensus_df.filter(pl.col("number_samples") >= threshold)
1065
984
  if filtered.is_empty():
1066
985
  rt_spread = -1.0
1067
986
  else:
1068
- rt_spread_row = filtered.select(
1069
- (pl.col("rt_max") - pl.col("rt_min")).mean(),
1070
- ).row(0)
1071
- rt_spread = (
1072
- float(rt_spread_row[0])
1073
- if rt_spread_row and rt_spread_row[0] is not None
1074
- else 0.0
1075
- )
987
+ rt_spread_row = filtered.select((pl.col("rt_max") - pl.col("rt_min")).mean()).row(0)
988
+ rt_spread = float(rt_spread_row[0]) if rt_spread_row and rt_spread_row[0] is not None else 0.0
1076
989
  else:
1077
990
  rt_spread = -1.0
1078
991
 
1079
992
  # Calculate percentage of consensus features with MS2
1080
993
  consensus_with_ms2_percentage = (
1081
- (consensus_with_ms2_count / consensus_df_len * 100)
1082
- if consensus_df_len > 0
1083
- else 0
994
+ (consensus_with_ms2_count / consensus_df_len * 100) if consensus_df_len > 0 else 0
1084
995
  )
1085
996
 
1086
997
  # Total MS2 spectra count
1087
- total_ms2_count = (
1088
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
1089
- )
998
+ total_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
1090
999
 
1091
1000
  # Estimate memory usage
1092
1001
  memory_usage = (
@@ -1099,27 +1008,15 @@ class Study:
1099
1008
 
1100
1009
  # Add warning symbols for out-of-range values
1101
1010
  consensus_warning = f" {_WARNING_SYMBOL}" if consensus_df_len < 50 else ""
1102
-
1011
+
1103
1012
  rt_spread_text = "N/A" if rt_spread < 0 else f"{rt_spread:.3f}s"
1104
- rt_spread_warning = (
1105
- f" {_WARNING_SYMBOL}"
1106
- if rt_spread >= 0 and (rt_spread > 5 or rt_spread < 0.1)
1107
- else ""
1108
- )
1109
-
1013
+ rt_spread_warning = f" {_WARNING_SYMBOL}" if rt_spread >= 0 and (rt_spread > 5 or rt_spread < 0.1) else ""
1014
+
1110
1015
  chrom_completeness_pct = chrom_completeness * 100
1111
- chrom_warning = (
1112
- f" {_WARNING_SYMBOL}"
1113
- if chrom_completeness_pct < 10 and chrom_completeness_pct >= 0
1114
- else ""
1115
- )
1116
-
1016
+ chrom_warning = f" {_WARNING_SYMBOL}" if chrom_completeness_pct < 10 and chrom_completeness_pct >= 0 else ""
1017
+
1117
1018
  max_samples_warning = ""
1118
- if (
1119
- isinstance(max_samples, (int, float))
1120
- and samples_df_len > 0
1121
- and max_samples > 0
1122
- ):
1019
+ if isinstance(max_samples, (int, float)) and samples_df_len > 0 and max_samples > 0:
1123
1020
  if max_samples < samples_df_len / 3.0:
1124
1021
  max_samples_warning = f" {_WARNING_SYMBOL}"
1125
1022
  elif max_samples < samples_df_len * 0.8: