masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/save.py CHANGED
@@ -48,8 +48,14 @@ def save(self, filename=None, add_timestamp=True, compress=False):
48
48
  # Log file size information for performance monitoring
49
49
  if hasattr(self, "features_df") and not self.features_df.is_empty():
50
50
  feature_count = len(self.features_df)
51
- sample_count = len(self.samples_df) if hasattr(self, "samples_df") and not self.samples_df.is_empty() else 0
52
- self.logger.info(f"Saving study with {sample_count} samples and {feature_count} features to {filename}")
51
+ sample_count = (
52
+ len(self.samples_df)
53
+ if hasattr(self, "samples_df") and not self.samples_df.is_empty()
54
+ else 0
55
+ )
56
+ self.logger.info(
57
+ f"Saving study with {sample_count} samples and {feature_count} features to {filename}",
58
+ )
53
59
 
54
60
  # Use compressed mode for large datasets
55
61
  if compress:
@@ -121,7 +127,9 @@ def save_samples(self, samples=None):
121
127
  if sample_path.endswith(".sample5"):
122
128
  # If sample_path is a .sample5 file, save featureXML in the same directory
123
129
  featurexml_filename = sample_path.replace(".sample5", ".featureXML")
124
- self.logger.debug(f"Saving featureXML alongside .sample5 file: {featurexml_filename}")
130
+ self.logger.debug(
131
+ f"Saving featureXML alongside .sample5 file: {featurexml_filename}",
132
+ )
125
133
  else:
126
134
  # Fallback to study folder or current directory (original behavior)
127
135
  if self.folder is not None:
@@ -134,7 +142,9 @@ def save_samples(self, samples=None):
134
142
  os.getcwd(),
135
143
  sample_name + ".featureXML",
136
144
  )
137
- self.logger.debug(f"Saving featureXML to default location: {featurexml_filename}")
145
+ self.logger.debug(
146
+ f"Saving featureXML to default location: {featurexml_filename}",
147
+ )
138
148
 
139
149
  fh = oms.FeatureXMLFile()
140
150
  if sample_index is not None and sample_index < len(self.features_maps):
masster/study/study.py CHANGED
@@ -125,11 +125,22 @@ from masster.study.parameters import update_parameters
125
125
  from masster.study.parameters import get_parameters_property
126
126
  from masster.study.parameters import set_parameters_property
127
127
  from masster.study.save import save, save_consensus, _save_consensusXML, save_samples
128
- from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet, _get_mgf_df
128
+ from masster.study.export import (
129
+ export_mgf,
130
+ export_mztab,
131
+ export_xlsx,
132
+ export_parquet,
133
+ _get_mgf_df,
134
+ )
129
135
  from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset
130
- from masster.study.id import _get_adducts, _calculate_formula_mass_shift, _format_adduct_name, _parse_element_counts
131
-
132
- from masster.logger import MassterLogger
136
+ from masster.study.id import (
137
+ _get_adducts,
138
+ _calculate_formula_mass_shift,
139
+ _format_adduct_name,
140
+ _parse_element_counts,
141
+ )
142
+
143
+ from masster.logger import MasterLogger
133
144
  from masster.study.defaults.study_def import study_defaults
134
145
  from masster.study.defaults.align_def import align_defaults
135
146
  from masster.study.defaults.export_def import export_mgf_defaults
@@ -177,8 +188,8 @@ class Study:
177
188
  - `export_consensus()`: Export consensus features for downstream analysis.
178
189
 
179
190
  Example Usage:
180
- >>> from masster import study
181
- >>> study_obj = study(folder="./data")
191
+ >>> from masster import Study
192
+ >>> study_obj = Study(folder="./data")
182
193
  >>> study_obj.load_folder("./mzml_files")
183
194
  >>> study_obj.process_all()
184
195
  >>> study_obj.align()
@@ -272,7 +283,11 @@ class Study:
272
283
  # Set instance attributes (ensure proper string values for logger)
273
284
  self.folder = params.folder
274
285
  self.label = params.label
275
- self.polarity = params.polarity if params.polarity in ["positive", "negative", "pos", "neg"] else "positive"
286
+ self.polarity = (
287
+ params.polarity
288
+ if params.polarity in ["positive", "negative", "pos", "neg"]
289
+ else "positive"
290
+ )
276
291
  self.log_level = params.log_level.upper() if params.log_level else "INFO"
277
292
  self.log_label = params.log_label + " | " if params.log_label else ""
278
293
  self.log_sink = params.log_sink
@@ -327,7 +342,7 @@ class Study:
327
342
  self.id_df = pl.DataFrame()
328
343
 
329
344
  # Initialize independent logger
330
- self.logger = MassterLogger(
345
+ self.logger = MasterLogger(
331
346
  instance_type="study",
332
347
  level=self.log_level.upper(),
333
348
  label=self.log_label,
@@ -427,7 +442,9 @@ class Study:
427
442
  fill = fill
428
443
  fill_chrom = fill # Backward compatibility alias
429
444
  _process_sample_for_parallel_fill = _process_sample_for_parallel_fill
430
- _get_missing_consensus_sample_combinations = _get_missing_consensus_sample_combinations
445
+ _get_missing_consensus_sample_combinations = (
446
+ _get_missing_consensus_sample_combinations
447
+ )
431
448
  _load_consensusXML = _load_consensusXML
432
449
  load_features = load_features
433
450
  sanitize = sanitize
@@ -485,7 +502,10 @@ class Study:
485
502
 
486
503
  # Get all currently loaded modules that are part of the study package
487
504
  for module_name in sys.modules:
488
- if module_name.startswith(study_module_prefix) and module_name != current_module:
505
+ if (
506
+ module_name.startswith(study_module_prefix)
507
+ and module_name != current_module
508
+ ):
489
509
  study_modules.append(module_name)
490
510
 
491
511
  # Add core masster modules
@@ -500,7 +520,10 @@ class Study:
500
520
  sample_modules = []
501
521
  sample_module_prefix = f"{base_modname}.sample."
502
522
  for module_name in sys.modules:
503
- if module_name.startswith(sample_module_prefix) and module_name != current_module:
523
+ if (
524
+ module_name.startswith(sample_module_prefix)
525
+ and module_name != current_module
526
+ ):
504
527
  sample_modules.append(module_name)
505
528
 
506
529
  all_modules_to_reload = core_modules + sample_modules + study_modules
@@ -538,7 +561,12 @@ class Study:
538
561
  """
539
562
  return ""
540
563
 
541
- def logger_update(self, level: str | None = None, label: str | None = None, sink: str | None = None):
564
+ def logger_update(
565
+ self,
566
+ level: str | None = None,
567
+ label: str | None = None,
568
+ sink: str | None = None,
569
+ ):
542
570
  """Update the logging configuration for this Study instance.
543
571
 
544
572
  Args:
@@ -570,17 +598,21 @@ class Study:
570
598
  that are out of normal range.
571
599
  """
572
600
  # Cache DataFrame lengths and existence checks
573
- consensus_df_len = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
601
+ consensus_df_len = (
602
+ len(self.consensus_df) if not self.consensus_df.is_empty() else 0
603
+ )
574
604
  samples_df_len = len(self.samples_df) if not self.samples_df.is_empty() else 0
575
605
 
576
606
  # Calculate consensus statistics only if consensus_df exists and has data
577
607
  if consensus_df_len > 0:
578
608
  # Execute the aggregation once
579
- stats_result = self.consensus_df.select([
580
- pl.col("number_samples").min().alias("min_samples"),
581
- pl.col("number_samples").mean().alias("mean_samples"),
582
- pl.col("number_samples").max().alias("max_samples"),
583
- ]).row(0)
609
+ stats_result = self.consensus_df.select(
610
+ [
611
+ pl.col("number_samples").min().alias("min_samples"),
612
+ pl.col("number_samples").mean().alias("mean_samples"),
613
+ pl.col("number_samples").max().alias("max_samples"),
614
+ ],
615
+ ).row(0)
584
616
 
585
617
  min_samples = stats_result[0] if stats_result[0] is not None else 0
586
618
  mean_samples = stats_result[1] if stats_result[1] is not None else 0
@@ -592,7 +624,9 @@ class Study:
592
624
 
593
625
  # Count only features where 'filled' == False
594
626
  if not self.features_df.is_empty() and "filled" in self.features_df.columns:
595
- unfilled_features_count = self.features_df.filter(~self.features_df["filled"]).height
627
+ unfilled_features_count = self.features_df.filter(
628
+ ~self.features_df["filled"],
629
+ ).height
596
630
  else:
597
631
  unfilled_features_count = 0
598
632
 
@@ -615,12 +649,20 @@ class Study:
615
649
  if unfilled_dtype != consensus_dtype:
616
650
  # Cast both to Int64 if possible, otherwise keep as string
617
651
  try:
618
- unfilled_features = unfilled_features.with_columns(pl.col("feature_uid").cast(pl.Int64))
619
- consensus_feature_uids = [int(uid) for uid in consensus_feature_uids]
652
+ unfilled_features = unfilled_features.with_columns(
653
+ pl.col("feature_uid").cast(pl.Int64),
654
+ )
655
+ consensus_feature_uids = [
656
+ int(uid) for uid in consensus_feature_uids
657
+ ]
620
658
  except Exception:
621
659
  # If casting fails, ensure both are strings
622
- unfilled_features = unfilled_features.with_columns(pl.col("feature_uid").cast(pl.Utf8))
623
- consensus_feature_uids = [str(uid) for uid in consensus_feature_uids]
660
+ unfilled_features = unfilled_features.with_columns(
661
+ pl.col("feature_uid").cast(pl.Utf8),
662
+ )
663
+ consensus_feature_uids = [
664
+ str(uid) for uid in consensus_feature_uids
665
+ ]
624
666
 
625
667
  # Count unfilled features that are in consensus
626
668
  in_consensus_count = unfilled_features.filter(
@@ -629,14 +671,22 @@ class Study:
629
671
 
630
672
  # Calculate ratios that sum to 100%
631
673
  total_unfilled = unfilled_features.height
632
- ratio_in_consensus_to_total = (in_consensus_count / total_unfilled * 100) if total_unfilled > 0 else 0
633
- ratio_not_in_consensus_to_total = 100 - ratio_in_consensus_to_total if total_unfilled > 0 else 0
674
+ ratio_in_consensus_to_total = (
675
+ (in_consensus_count / total_unfilled * 100) if total_unfilled > 0 else 0
676
+ )
677
+ ratio_not_in_consensus_to_total = (
678
+ 100 - ratio_in_consensus_to_total if total_unfilled > 0 else 0
679
+ )
634
680
  else:
635
681
  ratio_in_consensus_to_total = 0
636
682
  ratio_not_in_consensus_to_total = 0
637
683
 
638
684
  # Optimize chrom completeness calculation
639
- if consensus_df_len > 0 and samples_df_len > 0 and not self.features_df.is_empty():
685
+ if (
686
+ consensus_df_len > 0
687
+ and samples_df_len > 0
688
+ and not self.features_df.is_empty()
689
+ ):
640
690
  # Ensure matching data types for join keys
641
691
  features_dtype = self.features_df["feature_uid"].dtype
642
692
  consensus_dtype = self.consensus_mapping_df["feature_uid"].dtype
@@ -644,13 +694,17 @@ class Study:
644
694
  if features_dtype != consensus_dtype:
645
695
  # Try to cast both to Int64, fallback to string if needed
646
696
  try:
647
- self.features_df = self.features_df.with_columns(pl.col("feature_uid").cast(pl.Int64))
697
+ self.features_df = self.features_df.with_columns(
698
+ pl.col("feature_uid").cast(pl.Int64),
699
+ )
648
700
  self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
649
701
  pl.col("feature_uid").cast(pl.Int64),
650
702
  )
651
703
  except Exception:
652
704
  # If casting to Int64 fails, cast both to string
653
- self.features_df = self.features_df.with_columns(pl.col("feature_uid").cast(pl.Utf8))
705
+ self.features_df = self.features_df.with_columns(
706
+ pl.col("feature_uid").cast(pl.Utf8),
707
+ )
654
708
  self.consensus_mapping_df = self.consensus_mapping_df.with_columns(
655
709
  pl.col("feature_uid").cast(pl.Utf8),
656
710
  )
@@ -671,7 +725,9 @@ class Study:
671
725
  else:
672
726
  non_null_chroms = 0
673
727
  total_possible = samples_df_len * consensus_df_len
674
- chrom_completeness = non_null_chroms / total_possible if total_possible > 0 else 0
728
+ chrom_completeness = (
729
+ non_null_chroms / total_possible if total_possible > 0 else 0
730
+ )
675
731
  else:
676
732
  chrom_completeness = 0
677
733
 
@@ -683,23 +739,37 @@ class Study:
683
739
 
684
740
  if not self.consensus_df.is_empty():
685
741
  # Compute RT spread using only consensus rows with number_samples >= half the number of samples
686
- threshold = self.consensus_df.select(pl.col("number_samples").max()).item() / 2 if not self.samples_df.is_empty() else 0
742
+ threshold = (
743
+ self.consensus_df.select(pl.col("number_samples").max()).item() / 2
744
+ if not self.samples_df.is_empty()
745
+ else 0
746
+ )
687
747
  filtered = self.consensus_df.filter(pl.col("number_samples") >= threshold)
688
748
  if filtered.is_empty():
689
749
  rt_spread = -1.0
690
750
  else:
691
- rt_spread_row = filtered.select((pl.col("rt_max") - pl.col("rt_min")).mean()).row(0)
692
- rt_spread = float(rt_spread_row[0]) if rt_spread_row and rt_spread_row[0] is not None else 0.0
751
+ rt_spread_row = filtered.select(
752
+ (pl.col("rt_max") - pl.col("rt_min")).mean(),
753
+ ).row(0)
754
+ rt_spread = (
755
+ float(rt_spread_row[0])
756
+ if rt_spread_row and rt_spread_row[0] is not None
757
+ else 0.0
758
+ )
693
759
  else:
694
760
  rt_spread = -1.0
695
761
 
696
762
  # Calculate percentage of consensus features with MS2
697
763
  consensus_with_ms2_percentage = (
698
- (consensus_with_ms2_count / consensus_df_len * 100) if consensus_df_len > 0 else 0
764
+ (consensus_with_ms2_count / consensus_df_len * 100)
765
+ if consensus_df_len > 0
766
+ else 0
699
767
  )
700
768
 
701
769
  # Total MS2 spectra count
702
- total_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
770
+ total_ms2_count = (
771
+ len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
772
+ )
703
773
 
704
774
  # Estimate memory usage
705
775
  memory_usage = (
@@ -712,15 +782,27 @@ class Study:
712
782
 
713
783
  # Add warning symbols for out-of-range values
714
784
  consensus_warning = f" {_WARNING_SYMBOL}" if consensus_df_len < 50 else ""
715
-
785
+
716
786
  rt_spread_text = "N/A" if rt_spread < 0 else f"{rt_spread:.3f}s"
717
- rt_spread_warning = f" {_WARNING_SYMBOL}" if rt_spread >= 0 and (rt_spread > 5 or rt_spread < 0.1) else ""
718
-
787
+ rt_spread_warning = (
788
+ f" {_WARNING_SYMBOL}"
789
+ if rt_spread >= 0 and (rt_spread > 5 or rt_spread < 0.1)
790
+ else ""
791
+ )
792
+
719
793
  chrom_completeness_pct = chrom_completeness * 100
720
- chrom_warning = f" {_WARNING_SYMBOL}" if chrom_completeness_pct < 10 and chrom_completeness_pct >= 0 else ""
721
-
794
+ chrom_warning = (
795
+ f" {_WARNING_SYMBOL}"
796
+ if chrom_completeness_pct < 10 and chrom_completeness_pct >= 0
797
+ else ""
798
+ )
799
+
722
800
  max_samples_warning = ""
723
- if isinstance(max_samples, (int, float)) and samples_df_len > 0 and max_samples > 0:
801
+ if (
802
+ isinstance(max_samples, (int, float))
803
+ and samples_df_len > 0
804
+ and max_samples > 0
805
+ ):
724
806
  if max_samples < samples_df_len / 3.0:
725
807
  max_samples_warning = f" {_WARNING_SYMBOL}"
726
808
  elif max_samples < samples_df_len * 0.8: