masster 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.2.1"
4
+ __version__ = "0.2.3"
5
5
 
6
6
 
7
7
  def get_version():
masster/study/h5.py CHANGED
@@ -2,25 +2,7 @@
2
2
  _study_h5.py
3
3
 
4
4
  This module provides HDF5-based save/load functionality for the Study class.
5
- It handles seria elif col == "chrom":
6
- # elif col == "spectrum":
7
- # Handle single Spectrum objects
8
- data_as_str = []
9
- for item in data:
10
- if item is not None:
11
- data_as_str.append(item.to_json())
12
- else:
13
- data_as_str.append("None")
14
- group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
15
- data_as_str = []
16
- for item in data:
17
- if item is not None:
18
- data_as_str.append(item.to_json())
19
- else:
20
- data_as_str.append("None")
21
- group.create_dataset(col, data=data_as_str, **optimal_compression) else:
22
- data_as_str.append("null")
23
- group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
5
+ It handles serialization and deserialization of Polars DataFrames with complex objects
24
6
  like Chromatogram and Spectrum instances.
25
7
 
26
8
  Key Features:
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
449
431
  if not filename.endswith(".study5"):
450
432
  filename += ".study5"
451
433
 
452
- self.logger.debug(f"Saving study to {filename}")
434
+ self.logger.info(f"Saving study to {filename}")
453
435
 
454
436
  # delete existing file if it exists
455
437
  if os.path.exists(filename):
@@ -529,7 +511,7 @@ def _save_study5(self, filename=None):
529
511
  data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
530
512
  _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
531
513
 
532
- self.logger.info(f"Study saved to {filename}")
514
+ self.logger.debug(f"Save completed for {filename}")
533
515
 
534
516
 
535
517
  def _load_study5(self, filename=None):
@@ -551,6 +533,11 @@ def _load_study5(self, filename=None):
551
533
  - Properly handles MS2 scan lists and spectrum lists
552
534
  - Restores parameters dictionary from JSON serialization
553
535
  """
536
+ from datetime import datetime
537
+ from tqdm import tqdm
538
+
539
+ self.logger.info(f"Loading study from {filename}")
540
+
554
541
  # Handle default filename
555
542
  if filename is None:
556
543
  if self.default_folder is not None:
@@ -573,134 +560,327 @@ def _load_study5(self, filename=None):
573
560
  if not schema:
574
561
  self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
575
562
 
576
- with h5py.File(filename, "r") as f:
577
- # Load metadata
578
- if "metadata" in f:
579
- metadata = f["metadata"]
580
- self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
581
- if hasattr(self, "label"):
582
- self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
583
-
584
- # Load parameters from JSON
585
- if "parameters" in metadata:
586
- try:
587
- parameters_data = metadata["parameters"][()]
588
- if isinstance(parameters_data, bytes):
589
- parameters_data = parameters_data.decode("utf-8")
563
+ # Define loading steps for progress tracking
564
+ loading_steps = [
565
+ "metadata",
566
+ "samples_df",
567
+ "features_df",
568
+ "consensus_df",
569
+ "consensus_mapping_df",
570
+ "consensus_ms2"
571
+ ]
572
+
573
+ # Check if progress bar should be disabled based on log level
574
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
590
575
 
591
- if parameters_data and parameters_data != "":
592
- self.history = json.loads(parameters_data)
593
- else:
576
+ with h5py.File(filename, "r") as f:
577
+ # Use progress bar to show loading progress
578
+ with tqdm(
579
+ total=len(loading_steps),
580
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
581
+ disable=tdqm_disable,
582
+ ) as pbar:
583
+
584
+ # Load metadata
585
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
586
+ if "metadata" in f:
587
+ metadata = f["metadata"]
588
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
589
+ if hasattr(self, "label"):
590
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
591
+
592
+ # Load parameters from JSON
593
+ if "parameters" in metadata:
594
+ try:
595
+ parameters_data = metadata["parameters"][()]
596
+ if isinstance(parameters_data, bytes):
597
+ parameters_data = parameters_data.decode("utf-8")
598
+
599
+ if parameters_data and parameters_data != "":
600
+ self.history = json.loads(parameters_data)
601
+ else:
602
+ self.history = {}
603
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
604
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
594
605
  self.history = {}
595
- except (json.JSONDecodeError, ValueError, TypeError) as e:
596
- self.logger.warning(f"Failed to deserialize parameters: {e}")
606
+ else:
597
607
  self.history = {}
598
- else:
599
- self.history = {}
600
608
 
601
- # Reconstruct self.parameters from loaded history
602
- from masster.study.defaults.study_def import study_defaults
603
-
604
- # Always create a fresh study_defaults object to ensure we have all defaults
605
- self.parameters = study_defaults()
606
-
607
- # Update parameters from loaded history if available
608
- if self.history and "study" in self.history:
609
- study_params = self.history["study"]
610
- if isinstance(study_params, dict):
611
- failed_params = self.parameters.set_from_dict(study_params, validate=False)
612
- if failed_params:
613
- self.logger.debug(f"Could not set study parameters: {failed_params}")
609
+ # Reconstruct self.parameters from loaded history
610
+ from masster.study.defaults.study_def import study_defaults
611
+
612
+ # Always create a fresh study_defaults object to ensure we have all defaults
613
+ self.parameters = study_defaults()
614
+
615
+ # Update parameters from loaded history if available
616
+ if self.history and "study" in self.history:
617
+ study_params = self.history["study"]
618
+ if isinstance(study_params, dict):
619
+ failed_params = self.parameters.set_from_dict(study_params, validate=False)
620
+ if failed_params:
621
+ self.logger.debug(f"Could not set study parameters: {failed_params}")
622
+ else:
623
+ self.logger.debug("Successfully updated parameters from loaded history")
614
624
  else:
615
- self.logger.debug("Successfully updated parameters from loaded history")
625
+ self.logger.debug("Study parameters in history are not a valid dictionary")
616
626
  else:
617
- self.logger.debug("Study parameters in history are not a valid dictionary")
627
+ self.logger.debug("No study parameters found in history, using defaults")
628
+
629
+ # Synchronize instance attributes with parameters (similar to __init__)
630
+ # Note: default_folder and label are already loaded from metadata attributes above
631
+ # but we ensure they match the parameters for consistency
632
+ if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
633
+ self.default_folder = self.parameters.default_folder
634
+ if hasattr(self.parameters, 'label') and self.parameters.label is not None:
635
+ self.label = self.parameters.label
636
+ if hasattr(self.parameters, 'log_level'):
637
+ self.log_level = self.parameters.log_level
638
+ if hasattr(self.parameters, 'log_label'):
639
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
640
+ if hasattr(self.parameters, 'log_sink'):
641
+ self.log_sink = self.parameters.log_sink
642
+ pbar.update(1)
643
+
644
+ # Load samples_df
645
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
646
+ if "samples" in f and len(f["samples"].keys()) > 0:
647
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
618
648
  else:
619
- self.logger.debug("No study parameters found in history, using defaults")
620
-
621
- # Synchronize instance attributes with parameters (similar to __init__)
622
- # Note: default_folder and label are already loaded from metadata attributes above
623
- # but we ensure they match the parameters for consistency
624
- if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
625
- self.default_folder = self.parameters.default_folder
626
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
627
- self.label = self.parameters.label
628
- if hasattr(self.parameters, 'log_level'):
629
- self.log_level = self.parameters.log_level
630
- if hasattr(self.parameters, 'log_label'):
631
- self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
632
- if hasattr(self.parameters, 'log_sink'):
633
- self.log_sink = self.parameters.log_sink
634
-
635
- # Load samples_df
636
- if "samples" in f and len(f["samples"].keys()) > 0:
637
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
649
+ # Initialize empty samples_df with the correct schema if no data exists
650
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
651
+ self.samples_df = pl.DataFrame(
652
+ {
653
+ "sample_uid": [],
654
+ "sample_name": [],
655
+ "sample_path": [],
656
+ "sample_type": [],
657
+ "size": [],
658
+ "map_id": [],
659
+ },
660
+ schema={
661
+ "sample_uid": pl.Int64,
662
+ "sample_name": pl.Utf8,
663
+ "sample_path": pl.Utf8,
664
+ "sample_type": pl.Utf8,
665
+ "size": pl.Int64,
666
+ "map_id": pl.Utf8,
667
+ },
668
+ )
669
+ pbar.update(1)
670
+
671
+ # Load features_df
672
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
673
+ if "features" in f and len(f["features"].keys()) > 0:
674
+ object_columns = ["chrom", "ms2_scans", "ms2_specs"]
675
+ self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
676
+ else:
677
+ self.features_df = None
678
+ pbar.update(1)
679
+
680
+ # Load consensus_df
681
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
682
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
683
+ self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
684
+ else:
685
+ self.consensus_df = None
686
+ pbar.update(1)
687
+
688
+ # Load consensus_mapping_df
689
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
690
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
691
+ self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
692
+ else:
693
+ self.consensus_mapping_df = None
694
+ pbar.update(1)
695
+
696
+ # Load consensus_ms2
697
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
698
+ if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
699
+ object_columns = ["spec"]
700
+ self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
701
+ else:
702
+ self.consensus_ms2 = None
703
+ pbar.update(1)
704
+
705
+ self.logger.info(f"Study loaded from {filename}")
706
+
707
+
708
+ def _load_h5(self, filename=None):
709
+ """
710
+ Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
711
+
712
+ This is a legacy method for loading older HDF5 format files. For new files,
713
+ use _load_study5() which has improved schema handling and performance.
714
+
715
+ Args:
716
+ filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
717
+
718
+ Returns:
719
+ None (modifies self in place)
720
+
721
+ Notes:
722
+ - Legacy format loader with basic DataFrame reconstruction
723
+ - Includes progress bar for loading steps
724
+ - For new projects, prefer _load_study5() method
725
+ """
726
+ from datetime import datetime
727
+ from tqdm import tqdm
728
+
729
+ # Handle default filename
730
+ if filename is None:
731
+ if self.default_folder is not None:
732
+ filename = os.path.join(self.default_folder, "study.h5")
638
733
  else:
639
- # Initialize empty samples_df with the correct schema if no data exists
640
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
641
- self.samples_df = pl.DataFrame(
642
- {
643
- "sample_uid": [],
644
- "sample_name": [],
645
- "sample_path": [],
646
- "sample_type": [],
647
- "size": [],
648
- "map_id": [],
649
- },
650
- schema={
651
- "sample_uid": pl.Int64,
652
- "sample_name": pl.Utf8,
653
- "sample_path": pl.Utf8,
654
- "sample_type": pl.Utf8,
655
- "size": pl.Int64,
656
- "map_id": pl.Utf8,
657
- },
658
- )
659
- # Initialize empty samples_df with the correct schema if no data exists
660
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
661
- self.samples_df = pl.DataFrame(
662
- {
734
+ self.logger.error("Either filename or default_folder must be provided")
735
+ return
736
+
737
+ # Add .h5 extension if not provided
738
+ if not filename.endswith(".h5"):
739
+ filename += ".h5"
740
+
741
+ if not os.path.exists(filename):
742
+ self.logger.error(f"File {filename} does not exist")
743
+ return
744
+
745
+ # Define loading steps for progress tracking
746
+ loading_steps = [
747
+ "metadata",
748
+ "samples_df",
749
+ "features_df",
750
+ "consensus_df",
751
+ "consensus_mapping_df"
752
+ ]
753
+
754
+ # Check if progress bar should be disabled based on log level
755
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
756
+
757
+ with h5py.File(filename, "r") as f:
758
+ # Use progress bar to show loading progress
759
+ with tqdm(
760
+ total=len(loading_steps),
761
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
762
+ disable=tdqm_disable,
763
+ ) as pbar:
764
+
765
+ # Load metadata
766
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
767
+ if "metadata" in f:
768
+ metadata = f["metadata"]
769
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
770
+ if hasattr(self, "label"):
771
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
772
+
773
+ # Load parameters from JSON if available
774
+ if "parameters" in metadata:
775
+ try:
776
+ parameters_data = metadata["parameters"][()]
777
+ if isinstance(parameters_data, bytes):
778
+ parameters_data = parameters_data.decode("utf-8")
779
+
780
+ if parameters_data and parameters_data != "":
781
+ self.history = json.loads(parameters_data)
782
+ else:
783
+ self.history = {}
784
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
785
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
786
+ self.history = {}
787
+ else:
788
+ self.history = {}
789
+ pbar.update(1)
790
+
791
+ # Load samples_df (legacy format)
792
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
793
+ if "samples" in f and len(f["samples"].keys()) > 0:
794
+ samples_data = {}
795
+ for col in f["samples"].keys():
796
+ column_data = f["samples"][col][:]
797
+ # Handle byte strings
798
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
799
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
800
+ samples_data[col] = column_data
801
+
802
+ if samples_data:
803
+ self.samples_df = pl.DataFrame(samples_data)
804
+ else:
805
+ # Initialize empty samples_df
806
+ self.samples_df = pl.DataFrame({
807
+ "sample_uid": [],
808
+ "sample_name": [],
809
+ "sample_path": [],
810
+ "sample_type": [],
811
+ "size": [],
812
+ "map_id": [],
813
+ })
814
+ else:
815
+ self.samples_df = pl.DataFrame({
663
816
  "sample_uid": [],
664
817
  "sample_name": [],
665
818
  "sample_path": [],
666
819
  "sample_type": [],
667
820
  "size": [],
668
821
  "map_id": [],
669
- },
670
- schema={
671
- "sample_uid": pl.Int64,
672
- "sample_name": pl.Utf8,
673
- "sample_path": pl.Utf8,
674
- "sample_type": pl.Utf8,
675
- "size": pl.Int64,
676
- "map_id": pl.Utf8,
677
- },
678
- )
679
-
680
- # Load features_df
681
- if "features" in f and len(f["features"].keys()) > 0:
682
- object_columns = ["chrom", "ms2_scans", "ms2_specs"]
683
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
684
- else:
685
- self.features_df = None
686
-
687
- # Load consensus_df
688
- if "consensus" in f and len(f["consensus"].keys()) > 0:
689
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
690
- else:
691
- self.consensus_df = None
692
-
693
- # Load consensus_mapping_df
694
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
695
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
696
- else:
697
- self.consensus_mapping_df = None
698
-
699
- # Load consensus_ms2
700
- if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
701
- object_columns = ["spec"]
702
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
703
- else:
704
- self.consensus_ms2 = None
822
+ })
823
+ pbar.update(1)
824
+
825
+ # Load features_df (legacy format)
826
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
827
+ if "features" in f and len(f["features"].keys()) > 0:
828
+ features_data = {}
829
+ for col in f["features"].keys():
830
+ column_data = f["features"][col][:]
831
+ # Handle special object columns
832
+ if col in ["chrom", "ms2_specs"]:
833
+ reconstructed_data = _reconstruct_object_column(column_data, col)
834
+ features_data[col] = reconstructed_data
835
+ else:
836
+ # Handle byte strings
837
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
838
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
839
+ features_data[col] = column_data
840
+
841
+ if features_data:
842
+ # Create DataFrame with Object columns handled properly
843
+ object_columns = ["chrom", "ms2_specs"]
844
+ self.features_df = _create_dataframe_with_objects(features_data, object_columns)
845
+ else:
846
+ self.features_df = None
847
+ else:
848
+ self.features_df = None
849
+ pbar.update(1)
850
+
851
+ # Load consensus_df (legacy format)
852
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
853
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
854
+ consensus_data = {}
855
+ for col in f["consensus"].keys():
856
+ column_data = f["consensus"][col][:]
857
+ # Handle byte strings
858
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
859
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
860
+ consensus_data[col] = column_data
861
+
862
+ if consensus_data:
863
+ self.consensus_df = pl.DataFrame(consensus_data)
864
+ else:
865
+ self.consensus_df = None
866
+ else:
867
+ self.consensus_df = None
868
+ pbar.update(1)
869
+
870
+ # Load consensus_mapping_df (legacy format)
871
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
872
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
873
+ mapping_data = {}
874
+ for col in f["consensus_mapping"].keys():
875
+ column_data = f["consensus_mapping"][col][:]
876
+ mapping_data[col] = column_data
877
+
878
+ if mapping_data:
879
+ self.consensus_mapping_df = pl.DataFrame(mapping_data)
880
+ else:
881
+ self.consensus_mapping_df = None
882
+ else:
883
+ self.consensus_mapping_df = None
884
+ pbar.update(1)
705
885
 
706
- self.logger.info(f"Study loaded from {filename}")
886
+ self.logger.info(f"Legacy study loaded from {filename}")
masster/study/load.py CHANGED
@@ -256,7 +256,8 @@ def load(self, filename=None):
256
256
  else:
257
257
  self.logger.error("Either filename or default_folder must be provided")
258
258
  return
259
-
259
+
260
+ self.logger.info(f"Loading study from {filename}")
260
261
  self._load_study5(filename)
261
262
  # After loading the study, check if consensus XML exists and load it
262
263
  consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -267,13 +268,13 @@ def load(self, filename=None):
267
268
  self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
268
269
 
269
270
 
270
- def fill_chrom(
271
+ def fill_chrom_single(
271
272
  self,
272
- uids=fill_chrom_defaults().uids,
273
- mz_tol=fill_chrom_defaults().mz_tol,
274
- rt_tol=fill_chrom_defaults().rt_tol,
275
- min_samples_rel=fill_chrom_defaults().min_samples_rel,
276
- min_samples_abs=fill_chrom_defaults().min_samples_abs,
273
+ uids=None,
274
+ mz_tol: float = 0.010,
275
+ rt_tol: float = 10.0,
276
+ min_samples_rel: float = 0.0,
277
+ min_samples_abs: int = 2,
277
278
  ):
278
279
  """Fill missing chromatograms by extracting from raw data.
279
280
 
@@ -281,10 +282,10 @@ def fill_chrom(
281
282
 
282
283
  Args:
283
284
  uids: Consensus UIDs to process (default: all)
284
- mz_tol: m/z tolerance for extraction
285
- rt_tol: RT tolerance for extraction
286
- min_samples_rel: Relative minimum sample threshold
287
- min_samples_abs: Absolute minimum sample threshold
285
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
286
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
287
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
288
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
288
289
  """
289
290
  uids = self._get_consensus_uids(uids)
290
291
 
@@ -685,28 +686,28 @@ def _process_sample_for_parallel_fill(
685
686
  return new_features, new_mapping, counter
686
687
 
687
688
 
688
- def fill_chrom_parallel(
689
+ def fill_chrom(
689
690
  self,
690
- uids=fill_chrom_defaults().uids,
691
- mz_tol=fill_chrom_defaults().mz_tol,
692
- rt_tol=fill_chrom_defaults().rt_tol,
693
- min_samples_rel=fill_chrom_defaults().min_samples_rel,
694
- min_samples_abs=fill_chrom_defaults().min_samples_abs,
691
+ uids=None,
692
+ mz_tol: float = 0.010,
693
+ rt_tol: float = 10.0,
694
+ min_samples_rel: float = 0.0,
695
+ min_samples_abs: int = 2,
695
696
  num_workers=4,
696
697
  ):
697
698
  """Fill missing chromatograms by extracting from raw data using parallel processing.
698
699
 
699
700
  Args:
700
701
  uids: Consensus UIDs to process (default: all)
701
- mz_tol: m/z tolerance for extraction
702
- rt_tol: RT tolerance for extraction
703
- min_samples_rel: Relative minimum sample threshold
704
- min_samples_abs: Absolute minimum sample threshold
702
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
703
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
704
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
705
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
705
706
  num_workers: Number of parallel workers (default: 4)
706
707
  """
707
708
  uids = self._get_consensus_uids(uids)
708
709
 
709
- self.logger.info("Gap filling...")
710
+ self.logger.info(f"Gap filling with {num_workers} workers...")
710
711
  self.logger.debug(
711
712
  f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
712
713
  )
@@ -1075,115 +1076,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
1075
1076
  fh.load(filename, self.consensus_map)
1076
1077
  self.logger.debug(f"Loaded consensus map from {filename}.")
1077
1078
 
1078
-
1079
- """def find_features(
1080
- self,
1081
- reset=None,
1082
- chrom_peak_snr=None,
1083
- noise=None,
1084
- chrom_fwhm=None,
1085
- chrom_coherence=None,
1086
- prominence_scaled=None,
1087
- link_ms2=None,
1088
- save_mgf=None,
1089
- save_stats=None,
1090
- ):
1091
- self.logger.debug("Finding features for all samples in the study.")
1092
- # Initialize default parameters inside the function
1093
- if reset is None:
1094
- reset = False
1095
- if chrom_peak_snr is None:
1096
- chrom_peak_snr = 10.0
1097
- if noise is None:
1098
- noise = 200
1099
-
1100
- # Create parameter object and update with provided values
1101
- params = fill_chrom_defaults()
1102
-
1103
- # Set explicit parameters
1104
- params.set('uids', uids, validate=True)
1105
- params.set('mz_tol', mz_tol, validate=True)
1106
- params.set('rt_tol', rt_tol, validate=True)
1107
- params.set('min_samples_rel', min_samples_rel, validate=True)
1108
- params.set('min_samples_abs', min_samples_abs, validate=True)
1109
-
1110
- # Store parameters in the Study object
1111
- self.store_history(["fill_chrom"], params.to_dict())
1112
- self.logger.debug("Parameters stored to fill_chrom")
1113
-
1114
- if chrom_fwhm is None:
1115
- chrom_fwhm = 1.0
1116
- if chrom_coherence is None:
1117
- chrom_coherence = 0.3
1118
- if prominence_scaled is None:
1119
- prominence_scaled = 1.0
1120
- if link_ms2 is None:
1121
- link_ms2 = True
1122
- if save_mgf is None:
1123
- save_mgf = False
1124
- if save_stats is None:
1125
- save_stats = False
1126
-
1127
- # iterate over all samples in samples_df - using Polars iteration
1128
- for index, row_dict in enumerate(self.samples_df.iter_rows(named=True)):
1129
- # check if features_maps is None
1130
- if self.features_maps[index] is not None and not reset:
1131
- # skip this sample
1132
- continue
1133
- if self.features_maps[index] is not None and not reset:
1134
- # skip this sample
1135
- continue
1136
- # load the sample
1137
- ddaobj = Sample(row_dict["sample_path"])
1138
- # find features
1139
- ddaobj.find_features(
1140
- chrom_peak_snr=chrom_peak_snr,
1141
- noise=noise,
1142
- chrom_fwhm=chrom_fwhm,
1143
- )
1144
- ddaobj.filter_features(
1145
- prominence_scaled=prominence_scaled,
1146
- coherence=chrom_coherence,
1147
- )
1148
- # link MS2
1149
- if link_ms2:
1150
- ddaobj.find_ms2()
1151
-
1152
- # add to features_maps at the index of the sample
1153
- self.features_maps[index] = ddaobj.features
1154
- # add to features_df
1155
- f_df = ddaobj.features_df.clone()
1156
- # add column 'feature_uid' with the uid as uint64
1157
-
1158
- f_df = f_df.with_columns(pl.lit(row_dict["sample_uid"]).alias("sample_uid"))
1159
- # move sample_uid to the first column
1160
- other_cols = [col for col in f_df.columns if col != "sample_uid"]
1161
- f_df = f_df.select(["sample_uid"] + other_cols)
1162
-
1163
- offset = (
1164
- self.features_df.get_column("feature_uid").max() + 1
1165
- if not self.features_df.is_empty()
1166
- else 1
1167
- )
1168
- f_df = f_df.with_columns(
1169
- pl.int_range(offset, offset + len(f_df)).alias("feature_uid"),
1170
- )
1171
- # remove all rows with sample_uid=row_dict['sample_uid']
1172
- self.features_df = self.features_df.filter(
1173
- pl.col("sample_uid") != row_dict["sample_uid"],
1174
- )
1175
- self.features_df = pl.concat([self.features_df, f_df])
1176
-
1177
- if self.default_folder is not None:
1178
- bname = os.path.join(self.default_folder, row_dict["sample_name"])
1179
- ddaobj.save(filename=bname + ".mzpkl")
1180
- ddaobj.save_features(filename=bname + ".featureXML")
1181
- else:
1182
- bname = row_dict["sample_path"].replace(".mzpkl", "").replace(".wiff", "")
1183
- ddaobj.save(filename=bname + ".mzpkl")
1184
- ddaobj.save_features(filename=bname + ".featureXML")
1185
- if save_stats:
1186
- ddaobj.save_stats(filename=bname + "_stats.csv")
1187
- if save_mgf:
1188
- ddaobj.save_mgf(filename=bname + ".mgf", include_all_ms1=True)
1189
- """
masster/study/save.py CHANGED
@@ -122,12 +122,6 @@ def _save_consensusXML(self, filename:str):
122
122
  return
123
123
 
124
124
  fh = oms.ConsensusXMLFile()
125
- # check if filename includes any path
126
- if not os.path.isabs(filename):
127
- if self.default_folder is not None:
128
- filename = os.path.join(self.default_folder, filename)
129
- else:
130
- filename = os.path.join(os.getcwd(), filename)
131
125
  fh.store(filename, self.consensus_map)
132
126
  self.logger.info(f"Saved consensus map to {filename}")
133
127
 
masster/study/study.py CHANGED
@@ -71,8 +71,8 @@ from masster.study.helpers import set_default_folder
71
71
  from masster.study.load import add_folder
72
72
  from masster.study.load import add_sample
73
73
  from masster.study.load import (
74
+ fill_chrom_single,
74
75
  fill_chrom,
75
- fill_chrom_parallel,
76
76
  _process_sample_for_parallel_fill,
77
77
  )
78
78
  from masster.study.load import _get_missing_consensus_sample_combinations
@@ -147,6 +147,7 @@ class Study:
147
147
 
148
148
  def __init__(
149
149
  self,
150
+ filename=None,
150
151
  **kwargs,
151
152
  ):
152
153
  """
@@ -156,6 +157,10 @@ class Study:
156
157
  data storage, and processing parameters used for study-level analysis.
157
158
 
158
159
  Parameters:
160
+ filename (str, optional): Path to a .study5 file to load automatically.
161
+ If provided, the default_folder will be set to the
162
+ directory containing this file, and the study will
163
+ be loaded automatically.
159
164
  **kwargs: Keyword arguments for setting study parameters. Can include:
160
165
  - A study_defaults instance to set all parameters at once (pass as params=study_defaults(...))
161
166
  - Individual parameter names and values (see study_defaults for available parameters)
@@ -172,6 +177,20 @@ class Study:
172
177
  """
173
178
  # Initialize default parameters
174
179
 
180
+ # Handle filename parameter for automatic loading
181
+ auto_load_filename = None
182
+ if filename is not None:
183
+ if not filename.endswith('.study5'):
184
+ raise ValueError("filename must be a .study5 file")
185
+ if not os.path.exists(filename):
186
+ raise FileNotFoundError(f"Study file not found: {filename}")
187
+
188
+ # Set default_folder to the directory containing the file if not already specified
189
+ if 'default_folder' not in kwargs:
190
+ kwargs['default_folder'] = os.path.dirname(os.path.abspath(filename))
191
+
192
+ auto_load_filename = filename
193
+
175
194
  # Check if a study_defaults instance was passed
176
195
  if "params" in kwargs and isinstance(kwargs["params"], study_defaults):
177
196
  params = kwargs.pop("params")
@@ -234,6 +253,10 @@ class Study:
234
253
  sink=self.log_sink
235
254
  )
236
255
 
256
+ # Auto-load study file if filename was provided
257
+ if auto_load_filename is not None:
258
+ self.load(filename=auto_load_filename)
259
+
237
260
 
238
261
 
239
262
  # Attach module functions as class methods
@@ -242,7 +265,7 @@ class Study:
242
265
  save_consensus = save_consensus
243
266
  save_samples = save_samples
244
267
  align = align
245
- fill_chrom = fill_chrom
268
+ fill_chrom_single = fill_chrom_single
246
269
  find_consensus = find_consensus
247
270
  find_ms2 = find_ms2
248
271
  integrate_chrom = integrate_chrom
@@ -276,7 +299,7 @@ class Study:
276
299
  get_gaps_stats = get_gaps_stats
277
300
  get_orphans = get_orphans
278
301
  set_default_folder = set_default_folder
279
- fill_chrom_parallel = fill_chrom_parallel
302
+ fill_chrom = fill_chrom
280
303
  _process_sample_for_parallel_fill = _process_sample_for_parallel_fill
281
304
  _get_missing_consensus_sample_combinations = _get_missing_consensus_sample_combinations
282
305
  _load_consensusXML = _load_consensusXML
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: masster
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Mass spectrometry data analysis package
5
5
  Project-URL: homepage, https://github.com/zamboni-lab/masster
6
6
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -730,18 +730,11 @@ Description-Content-Type: text/markdown
730
730
 
731
731
  # MASSter
732
732
 
733
- **MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets.
733
+ **MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets. It is designed primarily for DDA data; functionality for DIA and ZTScan DIA data exists but is currently hidden.
734
734
 
735
735
  Most core processing functions are derived from OpenMS. We use the same nomenclature and refer to their documentation for an explanation of the parameters. To a large extent, however, you should be able to use the defaults (=no parameters) when calling processing steps.
736
736
 
737
- ## Features
738
-
739
- - **Mass spectrometry data processing**: Support for multiple file formats (.wiff, .mzML, .raw, .mzpkl)
740
- - **Feature detection and alignment**: Automated chromatographic peak detection and retention time alignment
741
- - **Consensus feature building**: Identification of features across multiple samples
742
- - **Interactive visualizations**: 2D plots, chromatograms, and statistical dashboards
743
- - **Batch processing**: Process entire studies with multiple samples
744
- - **Export capabilities**: MGF export for spectral library searches
737
+ This is a stable but sparsely documented branch of the development codebase used in the Zamboni lab. New functionality will be added based on need and requests.
745
738
 
746
739
  ## Installation
747
740
 
@@ -749,9 +742,7 @@ Most core processing functions are derived from OpenMS. We use the same nomencla
749
742
  pip install masster
750
743
  ```
751
744
 
752
- ## Quick Start
753
-
754
- ### Basic Workflow
745
+ ### Basic workflow for analyzing an LC-MS study with two or more samples
755
746
 
756
747
  ```python
757
748
  import masster
@@ -769,146 +760,19 @@ study.align(rt_max_diff=2.0)
769
760
  study.find_consensus(min_samples=3)
770
761
 
771
762
  # Retrieve missing data for quantification
772
- study.fill_chrom_parallel()
763
+ study.fill_chrom()
773
764
 
774
765
  # Integrate according to consensus metadata
775
766
  study.integrate_chrom()
776
767
 
777
- # link MS2 across the whole study
768
+ # link MS2 across the whole study and export them
778
769
  study.find_ms2()
779
-
780
- # Export MGF file
781
770
  study.export_mgf()
782
771
 
783
- # Save the study
772
+ # Save the study to .study5
784
773
  study.save()
785
774
  ```
786
775
 
787
- ### Single Sample Processing
788
-
789
- ```python
790
- from masster.sample import Sample
791
-
792
- # Load a single sample (mzML, RAW, WIFF)
793
- sample = Sample("path/to/your/file.mzML")
794
-
795
- # Detect features
796
- sample.find_features(chrom_peak_snr=10, noise=500, chrom_fwhm=1.0)
797
-
798
- # Detect adducts
799
- sample.find_adducts()
800
-
801
- # Find MS2 spectra
802
- sample.find_ms2()
803
-
804
- # Save results
805
- sample.save()
806
- ```
807
-
808
- ## Visualization Examples
809
-
810
- Masster provides extensive plotting capabilities for data exploration and quality control:
811
-
812
- ### 2D Data Visualization
813
-
814
- ```python
815
- # Plot 2D overview of MS data with detected features
816
- sample.plot_2d(
817
- filename="overview_2d.html",
818
- show_features=True,
819
- show_ms2=True,
820
- title="MS Data Overview"
821
- )
822
-
823
- # Plot with feature filtering
824
- sample.plot_2d(
825
- filename="features_ms2_only.html",
826
- show_only_features_with_ms2=True,
827
- markersize=8
828
- )
829
- ```
830
-
831
- ### Study-Level Plots
832
-
833
- ```python
834
- # Plot features from multiple samples
835
- study.plot_samples_2d(
836
- samples=None, # Use all samples
837
- filename="multi_sample_overview.html",
838
- markersize=3,
839
- alpha_max=0.8
840
- )
841
-
842
- # Plot consensus features
843
- study.plot_consensus_2d(
844
- filename="consensus_features.html",
845
- colorby="number_samples",
846
- sizeby="inty_mean"
847
- )
848
-
849
- # Plot chromatograms for specific features
850
- study.plot_chrom(
851
- uids=[1, 2, 3], # Feature UIDs
852
- filename="chromatograms.html",
853
- aligned=True
854
- )
855
- ```
856
-
857
- ### Quality Control Plots
858
-
859
- ```python
860
- # Plot DDA acquisition statistics
861
- sample.plot_dda_stats(filename="dda_stats.html")
862
-
863
- # Plot feature statistics
864
- sample.plot_feature_stats(filename="feature_stats.html")
865
-
866
- # Plot total ion chromatogram
867
- sample.plot_tic(filename="tic.html")
868
- ```
869
-
870
- ### Advanced Plotting Options
871
-
872
- ```python
873
- # Plot with Oracle annotation data
874
- sample.plot_2d_oracle(
875
- oracle_folder="path/to/oracle/results",
876
- colorby="hg", # Color by chemical class
877
- filename="annotated_features.html"
878
- )
879
-
880
- # Plot MS2 cycle view
881
- sample.plot_ms2_cycle(
882
- cycle=100,
883
- filename="ms2_cycle.html",
884
- centroid=True
885
- )
886
-
887
- # Plot extracted ion chromatogram
888
- sample.plot_eic(
889
- feature_uid=123,
890
- rt_tol=10,
891
- mz_tol=0.005,
892
- filename="eic.html"
893
- )
894
- ```
895
-
896
- ## File Format Support
897
-
898
- - **Input formats**: .wiff, .mzML, .raw files
899
- - **Intermediate formats**: .sample5 and .study5 (HDF5) for fast loading
900
- - **Export formats**: .mgf, .csv
901
- - **Visualization**: .html (interactive), .png, .svg
902
-
903
- ## Advanced Features
904
-
905
- ### Batch Processing
906
- Use the command-line interface for processing multiple files:
907
-
908
- ```bash
909
- python -m masster.demo.example_batch_process input_directory --recursive --dest output_directory
910
- ```
911
-
912
776
  ## Requirements
913
777
 
914
778
  - Python ≥ 3.11
@@ -919,13 +783,6 @@ python -m masster.demo.example_batch_process input_directory --recursive --dest
919
783
 
920
784
  GNU Affero General Public License v3
921
785
 
922
- ## Contributing
923
-
924
- Contributions are welcome! Please see our contributing guidelines and code of conduct.
925
-
926
786
  ## Citation
927
787
 
928
- If you use Masster in your research, please cite:
929
- ```
930
- [Citation details to be added]
931
- ```
788
+ If you use Masster in your research, please cite this repository.
@@ -1,5 +1,5 @@
1
1
  masster/__init__.py,sha256=xeh-hwR_2umE0CpRXn8t22wbkt4IT-FBEzeJknL8J6c,670
2
- masster/_version.py,sha256=yivSeSaLoFmSzFJ3xhHAIjpI_6_SVEIqEZxVZ-NVYPU,239
2
+ masster/_version.py,sha256=-QmvlpTZa_4FtjijQydS9z8bCyNLc0Gv3QiTHg5Ncro,239
3
3
  masster/chromatogram.py,sha256=f25rMrNvCQN0A93wp9QPdG3H4FiOlYPbRY3H4yd7Q5Y,18910
4
4
  masster/logger.py,sha256=9uzuVEPwQkVlnsqT_eVvh33FZY_FIm3Wn2TaJcGhZP8,10674
5
5
  masster/spectrum.py,sha256=XiClDcN1uiG-_2TIr7Bqp7x8gWvHPbC5oh3zUu3fr6Y,46789
@@ -26,14 +26,14 @@ masster/sample/defaults/get_spectrum_def.py,sha256=hy3t3zbIVvKRQmVQl8xAXrmQ4LSDb
26
26
  masster/sample/defaults/sample_def.py,sha256=WHjw-jsYinPKCC02J2Fn5SGB2OW12ntEQn-sHmqESqs,13758
27
27
  masster/study/__init__.py,sha256=bTbxmTgBAL_1iB73JE8fKdo9wik9m4dcmMppElU0V18,157
28
28
  masster/study/export.py,sha256=xmT2WhAuSGGcqHw8Wa44r6g5ud1mzzywOc3TnNqNh8E,12624
29
- masster/study/h5.py,sha256=IwNvqgFw9aRMH6tgfxotE5gb0i_ug0siIal0im_v3mk,30762
29
+ masster/study/h5.py,sha256=BPpcEV_fZ3dJCEkzEga_V1zUkKQEj_kxAeMSF56sSts,39260
30
30
  masster/study/helpers.py,sha256=ePh5hPgSAgfu7-crsm4th0QYGeQbHk9kNj7OyHMclpQ,15860
31
- masster/study/load.py,sha256=SptaAH3L1jAk_tbSY6WpuLeekrcqjIL5HuF2NH5cfQc,42626
31
+ masster/study/load.py,sha256=rTmm5E-UsTg0SJqwa4i4II5ca82m8OEn05yWW2G_YPc,38718
32
32
  masster/study/parameters.py,sha256=iKCIf7_bivi0Jkz4hreKmCyusXpQX5IIuuhnmS52-Q4,3177
33
33
  masster/study/plot.py,sha256=nY6zWKUOhlyDHra4BI0c8dx7PX5fHFW8v2Ma9YpscvU,21437
34
34
  masster/study/processing.py,sha256=PjfpsVASaR0uSE4vqKzBppq4jM3HexzbGw_bn5kDwdA,42552
35
- masster/study/save.py,sha256=hfbYoGMaBwKPvoTm5eV3OJoSw7o3Rbed68S4RaEz1I8,5053
36
- masster/study/study.py,sha256=9n-u_7mNynDOTAjwN_sm6AixpApKLVoImeNF56ryIQ4,20382
35
+ masster/study/save.py,sha256=_DmnAwhlZQRNeVDLNER63pXVhinV-poKMvJlIz6Bt-Y,4791
36
+ masster/study/study.py,sha256=gXc1j4wljbw-Zx-JPsyYO86EoXPaR0N7D2GepJZOPhA,21530
37
37
  masster/study/study5_schema.json,sha256=7LfsgI-dZGpoaPiAy0kh6gDJL4yKuA7-7PHbo9j4A6E,4630
38
38
  masster/study/defaults/__init__.py,sha256=wkul1Qq83nPHI5XebWvu3yKjp5tF8OdZDJJho8r2_qA,569
39
39
  masster/study/defaults/align_def.py,sha256=8Itwit6gaqVhF9A3w9V-uqgKlcQE6uCXyC3ul_gPWFo,8872
@@ -43,8 +43,8 @@ masster/study/defaults/find_consensus_def.py,sha256=artvErq4w07SfHB0WHi68ZjxGg0X
43
43
  masster/study/defaults/find_ms2_def.py,sha256=k-GmnCKgQuVO6M-EAjzGOqgdFrqZviRaNAdiFmwVujY,4907
44
44
  masster/study/defaults/integrate_chrom_def.py,sha256=FY9QdJpdWe18sYucrwNKoZYY0eoOo0a_hcdkZHm_W00,7107
45
45
  masster/study/defaults/study_def.py,sha256=SzUzd2YTGDGCHNMR-Dw57j5PprEnPhpITonv7wx6HQA,9035
46
- masster-0.2.1.dist-info/METADATA,sha256=b2SrmjarfUyfV7Vh5FHIZEgZRH_R9rKa2VleZbs7EoQ,47257
47
- masster-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
- masster-0.2.1.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
49
- masster-0.2.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
50
- masster-0.2.1.dist-info/RECORD,,
46
+ masster-0.2.3.dist-info/METADATA,sha256=hYc0JozT_r5KPMj4znX9ee0omRbd1p8sK9SU9OaIEm8,44324
47
+ masster-0.2.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
+ masster-0.2.3.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
49
+ masster-0.2.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
50
+ masster-0.2.3.dist-info/RECORD,,