masster 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations


- __version__ = "0.2.1"
+ __version__ = "0.2.2"


  def get_version():
masster/study/h5.py CHANGED
@@ -2,25 +2,7 @@
  _study_h5.py

  This module provides HDF5-based save/load functionality for the Study class.
- It handles seria elif col == "chrom":
- # elif col == "spectrum":
- # Handle single Spectrum objects
- data_as_str = []
- for item in data:
- if item is not None:
- data_as_str.append(item.to_json())
- else:
- data_as_str.append("None")
- group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
- data_as_str = []
- for item in data:
- if item is not None:
- data_as_str.append(item.to_json())
- else:
- data_as_str.append("None")
- group.create_dataset(col, data=data_as_str, **optimal_compression) else:
- data_as_str.append("null")
- group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
+ It handles serialization and deserialization of Polars DataFrames with complex objects
  like Chromatogram and Spectrum instances.

  Key Features:
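
The corrupted text removed above is a stray fragment of the module's object-column serializer that had been pasted into the docstring. For orientation, a minimal sketch of that serialization pattern — storing Chromatogram/Spectrum objects as JSON strings in an HDF5 dataset — might look as follows; `optimal_compression` and the objects' `to_json()` method are taken from the removed fragment and are assumptions about the surrounding code, not verified API:

```python
import h5py

def _save_object_column(group: h5py.Group, col: str, data: list, optimal_compression: dict) -> None:
    """Sketch: serialize a column of objects (Chromatogram/Spectrum) to JSON strings.

    Assumes each non-None item exposes a to_json() method, as in the removed fragment.
    """
    data_as_str = []
    for item in data:
        if item is not None:
            data_as_str.append(item.to_json())  # object -> JSON string
        else:
            data_as_str.append("None")          # placeholder for missing values
    group.create_dataset(col, data=data_as_str, **optimal_compression)
```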
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
  if not filename.endswith(".study5"):
  filename += ".study5"

- self.logger.debug(f"Saving study to {filename}")
+ self.logger.info(f"Saving study to {filename}")

  # delete existing file if it exists
  if os.path.exists(filename):
@@ -529,7 +511,7 @@ def _save_study5(self, filename=None):
  data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
  _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)

- self.logger.info(f"Study saved to {filename}")
+ self.logger.debug(f"Save completed for {filename}")


  def _load_study5(self, filename=None):
@@ -551,6 +533,11 @@ def _load_study5(self, filename=None):
  - Properly handles MS2 scan lists and spectrum lists
  - Restores parameters dictionary from JSON serialization
  """
+ from datetime import datetime
+ from tqdm import tqdm
+
+ self.logger.info(f"Loading study from {filename}")
+
  # Handle default filename
  if filename is None:
  if self.default_folder is not None:
@@ -573,134 +560,327 @@ def _load_study5(self, filename=None):
  if not schema:
  self.logger.warning(f"Schema file {schema_path} not found. Using default types.")

- with h5py.File(filename, "r") as f:
- # Load metadata
- if "metadata" in f:
- metadata = f["metadata"]
- self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
- if hasattr(self, "label"):
- self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
-
- # Load parameters from JSON
- if "parameters" in metadata:
- try:
- parameters_data = metadata["parameters"][()]
- if isinstance(parameters_data, bytes):
- parameters_data = parameters_data.decode("utf-8")
+ # Define loading steps for progress tracking
+ loading_steps = [
+ "metadata",
+ "samples_df",
+ "features_df",
+ "consensus_df",
+ "consensus_mapping_df",
+ "consensus_ms2"
+ ]
+
+ # Check if progress bar should be disabled based on log level
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

- if parameters_data and parameters_data != "":
- self.history = json.loads(parameters_data)
- else:
+ with h5py.File(filename, "r") as f:
+ # Use progress bar to show loading progress
+ with tqdm(
+ total=len(loading_steps),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
+ disable=tdqm_disable,
+ ) as pbar:
+
+ # Load metadata
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ if "metadata" in f:
+ metadata = f["metadata"]
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+ if hasattr(self, "label"):
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+ # Load parameters from JSON
+ if "parameters" in metadata:
+ try:
+ parameters_data = metadata["parameters"][()]
+ if isinstance(parameters_data, bytes):
+ parameters_data = parameters_data.decode("utf-8")
+
+ if parameters_data and parameters_data != "":
+ self.history = json.loads(parameters_data)
+ else:
+ self.history = {}
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
  self.history = {}
- except (json.JSONDecodeError, ValueError, TypeError) as e:
- self.logger.warning(f"Failed to deserialize parameters: {e}")
+ else:
  self.history = {}
- else:
- self.history = {}

- # Reconstruct self.parameters from loaded history
- from masster.study.defaults.study_def import study_defaults
-
- # Always create a fresh study_defaults object to ensure we have all defaults
- self.parameters = study_defaults()
-
- # Update parameters from loaded history if available
- if self.history and "study" in self.history:
- study_params = self.history["study"]
- if isinstance(study_params, dict):
- failed_params = self.parameters.set_from_dict(study_params, validate=False)
- if failed_params:
- self.logger.debug(f"Could not set study parameters: {failed_params}")
+ # Reconstruct self.parameters from loaded history
+ from masster.study.defaults.study_def import study_defaults
+
+ # Always create a fresh study_defaults object to ensure we have all defaults
+ self.parameters = study_defaults()
+
+ # Update parameters from loaded history if available
+ if self.history and "study" in self.history:
+ study_params = self.history["study"]
+ if isinstance(study_params, dict):
+ failed_params = self.parameters.set_from_dict(study_params, validate=False)
+ if failed_params:
+ self.logger.debug(f"Could not set study parameters: {failed_params}")
+ else:
+ self.logger.debug("Successfully updated parameters from loaded history")
  else:
- self.logger.debug("Successfully updated parameters from loaded history")
+ self.logger.debug("Study parameters in history are not a valid dictionary")
  else:
- self.logger.debug("Study parameters in history are not a valid dictionary")
+ self.logger.debug("No study parameters found in history, using defaults")
+
+ # Synchronize instance attributes with parameters (similar to __init__)
+ # Note: default_folder and label are already loaded from metadata attributes above
+ # but we ensure they match the parameters for consistency
+ if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
+ self.default_folder = self.parameters.default_folder
+ if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+ self.label = self.parameters.label
+ if hasattr(self.parameters, 'log_level'):
+ self.log_level = self.parameters.log_level
+ if hasattr(self.parameters, 'log_label'):
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+ if hasattr(self.parameters, 'log_sink'):
+ self.log_sink = self.parameters.log_sink
+ pbar.update(1)
+
+ # Load samples_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ if "samples" in f and len(f["samples"].keys()) > 0:
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
- self.logger.debug("No study parameters found in history, using defaults")
-
- # Synchronize instance attributes with parameters (similar to __init__)
- # Note: default_folder and label are already loaded from metadata attributes above
- # but we ensure they match the parameters for consistency
- if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
- self.default_folder = self.parameters.default_folder
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
- self.label = self.parameters.label
- if hasattr(self.parameters, 'log_level'):
- self.log_level = self.parameters.log_level
- if hasattr(self.parameters, 'log_label'):
- self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
- if hasattr(self.parameters, 'log_sink'):
- self.log_sink = self.parameters.log_sink
-
- # Load samples_df
- if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
+ # Initialize empty samples_df with the correct schema if no data exists
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+ self.samples_df = pl.DataFrame(
+ {
+ "sample_uid": [],
+ "sample_name": [],
+ "sample_path": [],
+ "sample_type": [],
+ "size": [],
+ "map_id": [],
+ },
+ schema={
+ "sample_uid": pl.Int64,
+ "sample_name": pl.Utf8,
+ "sample_path": pl.Utf8,
+ "sample_type": pl.Utf8,
+ "size": pl.Int64,
+ "map_id": pl.Utf8,
+ },
+ )
+ pbar.update(1)
+
+ # Load features_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ if "features" in f and len(f["features"].keys()) > 0:
+ object_columns = ["chrom", "ms2_scans", "ms2_specs"]
+ self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+ else:
+ self.features_df = None
+ pbar.update(1)
+
+ # Load consensus_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
+ self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
+ else:
+ self.consensus_df = None
+ pbar.update(1)
+
+ # Load consensus_mapping_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+ self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ else:
+ self.consensus_mapping_df = None
+ pbar.update(1)
+
+ # Load consensus_ms2
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+ if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
+ object_columns = ["spec"]
+ self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+ else:
+ self.consensus_ms2 = None
+ pbar.update(1)
+
+ self.logger.info(f"Study loaded from {filename}")
+
+
+ def _load_h5(self, filename=None):
+ """
+ Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
+
+ This is a legacy method for loading older HDF5 format files. For new files,
+ use _load_study5() which has improved schema handling and performance.
+
+ Args:
+ filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
+
+ Returns:
+ None (modifies self in place)
+
+ Notes:
+ - Legacy format loader with basic DataFrame reconstruction
+ - Includes progress bar for loading steps
+ - For new projects, prefer _load_study5() method
+ """
+ from datetime import datetime
+ from tqdm import tqdm
+
+ # Handle default filename
+ if filename is None:
+ if self.default_folder is not None:
+ filename = os.path.join(self.default_folder, "study.h5")
  else:
- # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
- self.samples_df = pl.DataFrame(
- {
- "sample_uid": [],
- "sample_name": [],
- "sample_path": [],
- "sample_type": [],
- "size": [],
- "map_id": [],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "size": pl.Int64,
- "map_id": pl.Utf8,
- },
- )
- # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
- self.samples_df = pl.DataFrame(
- {
+ self.logger.error("Either filename or default_folder must be provided")
+ return
+
+ # Add .h5 extension if not provided
+ if not filename.endswith(".h5"):
+ filename += ".h5"
+
+ if not os.path.exists(filename):
+ self.logger.error(f"File {filename} does not exist")
+ return
+
+ # Define loading steps for progress tracking
+ loading_steps = [
+ "metadata",
+ "samples_df",
+ "features_df",
+ "consensus_df",
+ "consensus_mapping_df"
+ ]
+
+ # Check if progress bar should be disabled based on log level
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+ with h5py.File(filename, "r") as f:
+ # Use progress bar to show loading progress
+ with tqdm(
+ total=len(loading_steps),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
+ disable=tdqm_disable,
+ ) as pbar:
+
+ # Load metadata
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ if "metadata" in f:
+ metadata = f["metadata"]
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+ if hasattr(self, "label"):
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+ # Load parameters from JSON if available
+ if "parameters" in metadata:
+ try:
+ parameters_data = metadata["parameters"][()]
+ if isinstance(parameters_data, bytes):
+ parameters_data = parameters_data.decode("utf-8")
+
+ if parameters_data and parameters_data != "":
+ self.history = json.loads(parameters_data)
+ else:
+ self.history = {}
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
+ self.history = {}
+ else:
+ self.history = {}
+ pbar.update(1)
+
+ # Load samples_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ if "samples" in f and len(f["samples"].keys()) > 0:
+ samples_data = {}
+ for col in f["samples"].keys():
+ column_data = f["samples"][col][:]
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ samples_data[col] = column_data
+
+ if samples_data:
+ self.samples_df = pl.DataFrame(samples_data)
+ else:
+ # Initialize empty samples_df
+ self.samples_df = pl.DataFrame({
+ "sample_uid": [],
+ "sample_name": [],
+ "sample_path": [],
+ "sample_type": [],
+ "size": [],
+ "map_id": [],
+ })
+ else:
+ self.samples_df = pl.DataFrame({
  "sample_uid": [],
  "sample_name": [],
  "sample_path": [],
  "sample_type": [],
  "size": [],
  "map_id": [],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "size": pl.Int64,
- "map_id": pl.Utf8,
- },
- )
-
- # Load features_df
- if "features" in f and len(f["features"].keys()) > 0:
- object_columns = ["chrom", "ms2_scans", "ms2_specs"]
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
- else:
- self.features_df = None
-
- # Load consensus_df
- if "consensus" in f and len(f["consensus"].keys()) > 0:
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
- else:
- self.consensus_df = None
-
- # Load consensus_mapping_df
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
- else:
- self.consensus_mapping_df = None
-
- # Load consensus_ms2
- if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
- object_columns = ["spec"]
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
- else:
- self.consensus_ms2 = None
+ })
+ pbar.update(1)
+
+ # Load features_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ if "features" in f and len(f["features"].keys()) > 0:
+ features_data = {}
+ for col in f["features"].keys():
+ column_data = f["features"][col][:]
+ # Handle special object columns
+ if col in ["chrom", "ms2_specs"]:
+ reconstructed_data = _reconstruct_object_column(column_data, col)
+ features_data[col] = reconstructed_data
+ else:
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ features_data[col] = column_data
+
+ if features_data:
+ # Create DataFrame with Object columns handled properly
+ object_columns = ["chrom", "ms2_specs"]
+ self.features_df = _create_dataframe_with_objects(features_data, object_columns)
+ else:
+ self.features_df = None
+ else:
+ self.features_df = None
+ pbar.update(1)
+
+ # Load consensus_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
+ consensus_data = {}
+ for col in f["consensus"].keys():
+ column_data = f["consensus"][col][:]
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ consensus_data[col] = column_data
+
+ if consensus_data:
+ self.consensus_df = pl.DataFrame(consensus_data)
+ else:
+ self.consensus_df = None
+ else:
+ self.consensus_df = None
+ pbar.update(1)
+
+ # Load consensus_mapping_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+ mapping_data = {}
+ for col in f["consensus_mapping"].keys():
+ column_data = f["consensus_mapping"][col][:]
+ mapping_data[col] = column_data
+
+ if mapping_data:
+ self.consensus_mapping_df = pl.DataFrame(mapping_data)
+ else:
+ self.consensus_mapping_df = None
+ else:
+ self.consensus_mapping_df = None
+ pbar.update(1)

- self.logger.info(f"Study loaded from {filename}")
+ self.logger.info(f"Legacy study loaded from {filename}")
masster/study/load.py CHANGED
@@ -256,7 +256,8 @@ def load(self, filename=None):
  else:
  self.logger.error("Either filename or default_folder must be provided")
  return
-
+
+ self.logger.info(f"Loading study from {filename}")
  self._load_study5(filename)
  # After loading the study, check if consensus XML exists and load it
  consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -267,13 +268,13 @@ def load(self, filename=None):
  self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")


- def fill_chrom(
+ def fill_chrom_single(
  self,
- uids=fill_chrom_defaults().uids,
- mz_tol=fill_chrom_defaults().mz_tol,
- rt_tol=fill_chrom_defaults().rt_tol,
- min_samples_rel=fill_chrom_defaults().min_samples_rel,
- min_samples_abs=fill_chrom_defaults().min_samples_abs,
+ uids=None,
+ mz_tol: float = 0.010,
+ rt_tol: float = 10.0,
+ min_samples_rel: float = 0.0,
+ min_samples_abs: int = 2,
  ):
  """Fill missing chromatograms by extracting from raw data.

@@ -281,10 +282,10 @@ def fill_chrom(

  Args:
  uids: Consensus UIDs to process (default: all)
- mz_tol: m/z tolerance for extraction
- rt_tol: RT tolerance for extraction
- min_samples_rel: Relative minimum sample threshold
- min_samples_abs: Absolute minimum sample threshold
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
  """
  uids = self._get_consensus_uids(uids)

@@ -685,28 +686,28 @@ def _process_sample_for_parallel_fill(
  return new_features, new_mapping, counter


- def fill_chrom_parallel(
+ def fill_chrom(
  self,
- uids=fill_chrom_defaults().uids,
- mz_tol=fill_chrom_defaults().mz_tol,
- rt_tol=fill_chrom_defaults().rt_tol,
- min_samples_rel=fill_chrom_defaults().min_samples_rel,
- min_samples_abs=fill_chrom_defaults().min_samples_abs,
+ uids=None,
+ mz_tol: float = 0.010,
+ rt_tol: float = 10.0,
+ min_samples_rel: float = 0.0,
+ min_samples_abs: int = 2,
  num_workers=4,
  ):
  """Fill missing chromatograms by extracting from raw data using parallel processing.

  Args:
  uids: Consensus UIDs to process (default: all)
- mz_tol: m/z tolerance for extraction
- rt_tol: RT tolerance for extraction
- min_samples_rel: Relative minimum sample threshold
- min_samples_abs: Absolute minimum sample threshold
+ mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+ rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+ min_samples_rel: Relative minimum sample threshold (default: 0.0)
+ min_samples_abs: Absolute minimum sample threshold (default: 2)
  num_workers: Number of parallel workers (default: 4)
  """
  uids = self._get_consensus_uids(uids)

- self.logger.info("Gap filling...")
+ self.logger.info(f"Gap filling with {num_workers} workers...")
  self.logger.debug(
  f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
  )
@@ -1075,115 +1076,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
  fh.load(filename, self.consensus_map)
  self.logger.debug(f"Loaded consensus map from {filename}.")

-
- """def find_features(
- self,
- reset=None,
- chrom_peak_snr=None,
- noise=None,
- chrom_fwhm=None,
- chrom_coherence=None,
- prominence_scaled=None,
- link_ms2=None,
- save_mgf=None,
- save_stats=None,
- ):
- self.logger.debug("Finding features for all samples in the study.")
- # Initialize default parameters inside the function
- if reset is None:
- reset = False
- if chrom_peak_snr is None:
- chrom_peak_snr = 10.0
- if noise is None:
- noise = 200
-
- # Create parameter object and update with provided values
- params = fill_chrom_defaults()
-
- # Set explicit parameters
- params.set('uids', uids, validate=True)
- params.set('mz_tol', mz_tol, validate=True)
- params.set('rt_tol', rt_tol, validate=True)
- params.set('min_samples_rel', min_samples_rel, validate=True)
- params.set('min_samples_abs', min_samples_abs, validate=True)
-
- # Store parameters in the Study object
- self.store_history(["fill_chrom"], params.to_dict())
- self.logger.debug("Parameters stored to fill_chrom")
-
- if chrom_fwhm is None:
- chrom_fwhm = 1.0
- if chrom_coherence is None:
- chrom_coherence = 0.3
- if prominence_scaled is None:
- prominence_scaled = 1.0
- if link_ms2 is None:
- link_ms2 = True
- if save_mgf is None:
- save_mgf = False
- if save_stats is None:
- save_stats = False
-
- # iterate over all samples in samples_df - using Polars iteration
- for index, row_dict in enumerate(self.samples_df.iter_rows(named=True)):
- # check if features_maps is None
- if self.features_maps[index] is not None and not reset:
- # skip this sample
- continue
- if self.features_maps[index] is not None and not reset:
- # skip this sample
- continue
- # load the sample
- ddaobj = Sample(row_dict["sample_path"])
- # find features
- ddaobj.find_features(
- chrom_peak_snr=chrom_peak_snr,
- noise=noise,
- chrom_fwhm=chrom_fwhm,
- )
- ddaobj.filter_features(
- prominence_scaled=prominence_scaled,
- coherence=chrom_coherence,
- )
- # link MS2
- if link_ms2:
- ddaobj.find_ms2()
-
- # add to features_maps at the index of the sample
- self.features_maps[index] = ddaobj.features
- # add to features_df
- f_df = ddaobj.features_df.clone()
- # add column 'feature_uid' with the uid as uint64
-
- f_df = f_df.with_columns(pl.lit(row_dict["sample_uid"]).alias("sample_uid"))
- # move sample_uid to the first column
- other_cols = [col for col in f_df.columns if col != "sample_uid"]
- f_df = f_df.select(["sample_uid"] + other_cols)
-
- offset = (
- self.features_df.get_column("feature_uid").max() + 1
- if not self.features_df.is_empty()
- else 1
- )
- f_df = f_df.with_columns(
- pl.int_range(offset, offset + len(f_df)).alias("feature_uid"),
- )
- # remove all rows with sample_uid=row_dict['sample_uid']
- self.features_df = self.features_df.filter(
- pl.col("sample_uid") != row_dict["sample_uid"],
- )
- self.features_df = pl.concat([self.features_df, f_df])
-
- if self.default_folder is not None:
- bname = os.path.join(self.default_folder, row_dict["sample_name"])
- ddaobj.save(filename=bname + ".mzpkl")
- ddaobj.save_features(filename=bname + ".featureXML")
- else:
- bname = row_dict["sample_path"].replace(".mzpkl", "").replace(".wiff", "")
- ddaobj.save(filename=bname + ".mzpkl")
- ddaobj.save_features(filename=bname + ".featureXML")
- if save_stats:
- ddaobj.save_stats(filename=bname + "_stats.csv")
- if save_mgf:
- ddaobj.save_mgf(filename=bname + ".mgf", include_all_ms1=True)
- """
masster/study/study.py CHANGED
@@ -71,8 +71,8 @@ from masster.study.helpers import set_default_folder
  from masster.study.load import add_folder
  from masster.study.load import add_sample
  from masster.study.load import (
+ fill_chrom_single,
  fill_chrom,
- fill_chrom_parallel,
  _process_sample_for_parallel_fill,
  )
  from masster.study.load import _get_missing_consensus_sample_combinations
@@ -242,7 +242,7 @@ class Study:
  save_consensus = save_consensus
  save_samples = save_samples
  align = align
- fill_chrom = fill_chrom
+ fill_chrom_single = fill_chrom_single
  find_consensus = find_consensus
  find_ms2 = find_ms2
  integrate_chrom = integrate_chrom
@@ -276,7 +276,7 @@ class Study:
  get_gaps_stats = get_gaps_stats
  get_orphans = get_orphans
  set_default_folder = set_default_folder
- fill_chrom_parallel = fill_chrom_parallel
+ fill_chrom = fill_chrom
  _process_sample_for_parallel_fill = _process_sample_for_parallel_fill
  _get_missing_consensus_sample_combinations = _get_missing_consensus_sample_combinations
  _load_consensusXML = _load_consensusXML
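
These hunks show why the rename is cheap: `Study` is composed by importing free functions from sibling modules and assigning them as class attributes, so changing the public name only touches the binding. A minimal, hypothetical reproduction of that composition pattern:

```python
# A free function with a `self` parameter becomes a method once bound to a class.
def fill_chrom(self, num_workers: int = 4) -> str:
    return f"gap filling {self.name} with {num_workers} workers"

class Study:
    def __init__(self, name: str):
        self.name = name

    # Bind the module-level function as a method; renaming the public API
    # only requires changing the attribute name on the left-hand side.
    fill_chrom = fill_chrom

print(Study("demo").fill_chrom(num_workers=2))
```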
masster-0.2.1.dist-info/METADATA → masster-0.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: masster
- Version: 0.2.1
+ Version: 0.2.2
  Summary: Mass spectrometry data analysis package
  Project-URL: homepage, https://github.com/zamboni-lab/masster
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -730,10 +730,12 @@ Description-Content-Type: text/markdown

  # MASSter

- **MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets.
+ **MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets. It is designed to deal with DDA, and hides functionalities for DIA and ZTScan DIA data.

  Most core processing functions are derived from OpenMS. We use the same nomenclature and refer to their documentation for an explanation of the parameters. To a large extent, however, you should be able to use the defaults (=no parameters) when calling processing steps.

+ This is a poorly documented, stable branch of the development codebase in use in the Zamboni lab. Novel functionalities will be added based on need and requests.
+
  ## Features

  - **Mass spectrometry data processing**: Support for multiple file formats (.wiff, .mzML, .raw, .mzpkl)
@@ -751,7 +753,7 @@ pip install masster

  ## Quick Start

- ### Basic Workflow
+ ### Basic Workflow for analyzing LC-MS study with 2-... samples

  ```python
  import masster
@@ -769,21 +771,33 @@ study.align(rt_max_diff=2.0)
  study.find_consensus(min_samples=3)

  # Retrieve missing data for quantification
- study.fill_chrom_parallel()
+ study.fill_chrom()

  # Integrate according to consensus metadata
  study.integrate_chrom()

- # link MS2 across the whole study
+ # link MS2 across the whole study and export them
  study.find_ms2()
-
- # Export MGF file
  study.export_mgf()

- # Save the study
+ # Save the study to .study5
  study.save()
  ```

+ ### Study-Level Plots
+
+ ```python
+ # Plot features from multiple samples
+ study.plot_samples_2d()
+
+ # Plot consensus features
+ study.plot_consensus_2d()
+
+ # Plot overlaid chromatograms for specific consensus features (use their uid)
+ study.plot_chrom(uids=[1, 2, 3])
+ ```
+
+
  ### Single Sample Processing

  ```python
@@ -801,15 +815,13 @@ sample.find_adducts()
  # Find MS2 spectra
  sample.find_ms2()

- # Save results
+ # Save results to .sample5
  sample.save()
  ```

- ## Visualization Examples
-
  Masster provides extensive plotting capabilities for data exploration and quality control:

- ### 2D Data Visualization
+ ### Single sample visualization

  ```python
  # Plot 2D overview of MS data with detected features
@@ -822,35 +834,14 @@ sample.plot_2d(

  # Plot with feature filtering
  sample.plot_2d(
- filename="features_ms2_only.html",
- show_only_features_with_ms2=True,
- markersize=8
+ filename="features_ms2_only.html"
  )
- ```

- ### Study-Level Plots
-
- ```python
- # Plot features from multiple samples
- study.plot_samples_2d(
- samples=None, # Use all samples
- filename="multi_sample_overview.html",
- markersize=3,
- alpha_max=0.8
- )
-
- # Plot consensus features
- study.plot_consensus_2d(
- filename="consensus_features.html",
- colorby="number_samples",
- sizeby="inty_mean"
- )
-
- # Plot chromatograms for specific features
- study.plot_chrom(
- uids=[1, 2, 3], # Feature UIDs
- filename="chromatograms.html",
- aligned=True
+ # Plot extracted ion chromatogram
+ sample.plot_eic(
+ feature_uid=123,
+ rt_tol=10,
+ mz_tol=0.005
  )
  ```
@@ -883,14 +874,6 @@ sample.plot_ms2_cycle(
  filename="ms2_cycle.html",
  centroid=True
  )
-
- # Plot extracted ion chromatogram
- sample.plot_eic(
- feature_uid=123,
- rt_tol=10,
- mz_tol=0.005,
- filename="eic.html"
- )
  ```

  ## File Format Support
@@ -919,13 +902,6 @@ python -m masster.demo.example_batch_process input_directory --recursive --dest

  GNU Affero General Public License v3

- ## Contributing
-
- Contributions are welcome! Please see our contributing guidelines and code of conduct.
-
  ## Citation

- If you use Masster in your research, please cite:
- ```
- [Citation details to be added]
- ```
+ If you use Masster in your research, please cite this repository.
masster-0.2.1.dist-info/RECORD → masster-0.2.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  masster/__init__.py,sha256=xeh-hwR_2umE0CpRXn8t22wbkt4IT-FBEzeJknL8J6c,670
- masster/_version.py,sha256=yivSeSaLoFmSzFJ3xhHAIjpI_6_SVEIqEZxVZ-NVYPU,239
+ masster/_version.py,sha256=CE1l1ajIZe8NnlO_z6wnQkbMvTvA7ky9zk-mlENBeo4,239
  masster/chromatogram.py,sha256=f25rMrNvCQN0A93wp9QPdG3H4FiOlYPbRY3H4yd7Q5Y,18910
  masster/logger.py,sha256=9uzuVEPwQkVlnsqT_eVvh33FZY_FIm3Wn2TaJcGhZP8,10674
  masster/spectrum.py,sha256=XiClDcN1uiG-_2TIr7Bqp7x8gWvHPbC5oh3zUu3fr6Y,46789
@@ -26,14 +26,14 @@ masster/sample/defaults/get_spectrum_def.py,sha256=hy3t3zbIVvKRQmVQl8xAXrmQ4LSDb
  masster/sample/defaults/sample_def.py,sha256=WHjw-jsYinPKCC02J2Fn5SGB2OW12ntEQn-sHmqESqs,13758
  masster/study/__init__.py,sha256=bTbxmTgBAL_1iB73JE8fKdo9wik9m4dcmMppElU0V18,157
  masster/study/export.py,sha256=xmT2WhAuSGGcqHw8Wa44r6g5ud1mzzywOc3TnNqNh8E,12624
- masster/study/h5.py,sha256=IwNvqgFw9aRMH6tgfxotE5gb0i_ug0siIal0im_v3mk,30762
+ masster/study/h5.py,sha256=BPpcEV_fZ3dJCEkzEga_V1zUkKQEj_kxAeMSF56sSts,39260
  masster/study/helpers.py,sha256=ePh5hPgSAgfu7-crsm4th0QYGeQbHk9kNj7OyHMclpQ,15860
- masster/study/load.py,sha256=SptaAH3L1jAk_tbSY6WpuLeekrcqjIL5HuF2NH5cfQc,42626
+ masster/study/load.py,sha256=rTmm5E-UsTg0SJqwa4i4II5ca82m8OEn05yWW2G_YPc,38718
  masster/study/parameters.py,sha256=iKCIf7_bivi0Jkz4hreKmCyusXpQX5IIuuhnmS52-Q4,3177
  masster/study/plot.py,sha256=nY6zWKUOhlyDHra4BI0c8dx7PX5fHFW8v2Ma9YpscvU,21437
  masster/study/processing.py,sha256=PjfpsVASaR0uSE4vqKzBppq4jM3HexzbGw_bn5kDwdA,42552
  masster/study/save.py,sha256=hfbYoGMaBwKPvoTm5eV3OJoSw7o3Rbed68S4RaEz1I8,5053
- masster/study/study.py,sha256=9n-u_7mNynDOTAjwN_sm6AixpApKLVoImeNF56ryIQ4,20382
+ masster/study/study.py,sha256=jScjivP6UyBvF_BoNpPeJq2Q_ON0wtPxCVldqAEwCOU,20376
  masster/study/study5_schema.json,sha256=7LfsgI-dZGpoaPiAy0kh6gDJL4yKuA7-7PHbo9j4A6E,4630
  masster/study/defaults/__init__.py,sha256=wkul1Qq83nPHI5XebWvu3yKjp5tF8OdZDJJho8r2_qA,569
  masster/study/defaults/align_def.py,sha256=8Itwit6gaqVhF9A3w9V-uqgKlcQE6uCXyC3ul_gPWFo,8872
@@ -43,8 +43,8 @@ masster/study/defaults/find_consensus_def.py,sha256=artvErq4w07SfHB0WHi68ZjxGg0X
  masster/study/defaults/find_ms2_def.py,sha256=k-GmnCKgQuVO6M-EAjzGOqgdFrqZviRaNAdiFmwVujY,4907
  masster/study/defaults/integrate_chrom_def.py,sha256=FY9QdJpdWe18sYucrwNKoZYY0eoOo0a_hcdkZHm_W00,7107
  masster/study/defaults/study_def.py,sha256=SzUzd2YTGDGCHNMR-Dw57j5PprEnPhpITonv7wx6HQA,9035
- masster-0.2.1.dist-info/METADATA,sha256=b2SrmjarfUyfV7Vh5FHIZEgZRH_R9rKa2VleZbs7EoQ,47257
- masster-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- masster-0.2.1.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
- masster-0.2.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
- masster-0.2.1.dist-info/RECORD,,
+ masster-0.2.2.dist-info/METADATA,sha256=YDUoFk7NrZYKgoPNkStSPn7G9jCbtKrLxxa9Q-fBKF8,47089
+ masster-0.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ masster-0.2.2.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+ masster-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ masster-0.2.2.dist-info/RECORD,,