masster 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -2,25 +2,7 @@
  _study_h5.py
 
  This module provides HDF5-based save/load functionality for the Study class.
- It handles seria elif col == "chrom":
- # elif col == "spectrum":
- # Handle single Spectrum objects
- data_as_str = []
- for item in data:
- if item is not None:
- data_as_str.append(item.to_json())
- else:
- data_as_str.append("None")
- group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
- data_as_str = []
- for item in data:
- if item is not None:
- data_as_str.append(item.to_json())
- else:
- data_as_str.append("None")
- group.create_dataset(col, data=data_as_str, **optimal_compression) else:
- data_as_str.append("null")
- group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
+ It handles serialization and deserialization of Polars DataFrames with complex objects
  like Chromatogram and Spectrum instances.
 
  Key Features:
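
The lines removed above are fragments of column-serializer code that had been spliced into the 0.2.0 module docstring (note the mid-word joins such as "**optimal_compression)hromatogram objects"); 0.2.2 restores the intended sentence. Pieced together, the fragments describe a serialize-objects-to-JSON-strings pattern for HDF5. A minimal sketch of that pattern, reconstructed only from the fragments: the helper name is hypothetical, and to_json() plus the optimal_compression kwargs are assumptions taken from the removed lines.

import h5py  # `group` below is assumed to be an h5py.Group

def _save_object_column(group, col, data, optimal_compression):
    # Sketch only: serialize each object (e.g. Chromatogram/Spectrum) to a
    # JSON string; None entries become the literal "None" to keep row order.
    data_as_str = []
    for item in data:
        if item is not None:
            data_as_str.append(item.to_json())  # to_json() per the fragments above
        else:
            data_as_str.append("None")
    # h5py stores a list of Python str as a variable-length string dataset
    group.create_dataset(col, data=data_as_str, **optimal_compression)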
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
  if not filename.endswith(".study5"):
  filename += ".study5"
 
- self.logger.debug(f"Saving study to {filename}")
+ self.logger.info(f"Saving study to {filename}")
 
  # delete existing file if it exists
  if os.path.exists(filename):
@@ -529,8 +511,7 @@ def _save_study5(self, filename=None):
  data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
  _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
 
- self.logger.info(f"Study saved to {filename}")
- self.logger.info(f"Study saved to {filename}")
+ self.logger.debug(f"Save completed for {filename}")
 
 
  def _load_study5(self, filename=None):
@@ -552,6 +533,11 @@ def _load_study5(self, filename=None):
  - Properly handles MS2 scan lists and spectrum lists
  - Restores parameters dictionary from JSON serialization
  """
+ from datetime import datetime
+ from tqdm import tqdm
+
+ self.logger.info(f"Loading study from {filename}")
+
  # Handle default filename
  if filename is None:
  if self.default_folder is not None:
@@ -574,134 +560,327 @@ def _load_study5(self, filename=None):
  if not schema:
  self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
 
- with h5py.File(filename, "r") as f:
- # Load metadata
- if "metadata" in f:
- metadata = f["metadata"]
- self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
- if hasattr(self, "label"):
- self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
-
- # Load parameters from JSON
- if "parameters" in metadata:
- try:
- parameters_data = metadata["parameters"][()]
- if isinstance(parameters_data, bytes):
- parameters_data = parameters_data.decode("utf-8")
+ # Define loading steps for progress tracking
+ loading_steps = [
+ "metadata",
+ "samples_df",
+ "features_df",
+ "consensus_df",
+ "consensus_mapping_df",
+ "consensus_ms2"
+ ]
+
+ # Check if progress bar should be disabled based on log level
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
- if parameters_data and parameters_data != "":
- self.history = json.loads(parameters_data)
- else:
+ with h5py.File(filename, "r") as f:
+ # Use progress bar to show loading progress
+ with tqdm(
+ total=len(loading_steps),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
+ disable=tdqm_disable,
+ ) as pbar:
+
+ # Load metadata
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ if "metadata" in f:
+ metadata = f["metadata"]
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+ if hasattr(self, "label"):
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+ # Load parameters from JSON
+ if "parameters" in metadata:
+ try:
+ parameters_data = metadata["parameters"][()]
+ if isinstance(parameters_data, bytes):
+ parameters_data = parameters_data.decode("utf-8")
+
+ if parameters_data and parameters_data != "":
+ self.history = json.loads(parameters_data)
+ else:
+ self.history = {}
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
  self.history = {}
- except (json.JSONDecodeError, ValueError, TypeError) as e:
- self.logger.warning(f"Failed to deserialize parameters: {e}")
+ else:
  self.history = {}
- else:
- self.history = {}
 
- # Reconstruct self.parameters from loaded history
- from masster.study.defaults.study_def import study_defaults
-
- # Always create a fresh study_defaults object to ensure we have all defaults
- self.parameters = study_defaults()
-
- # Update parameters from loaded history if available
- if self.history and "study" in self.history:
- study_params = self.history["study"]
- if isinstance(study_params, dict):
- failed_params = self.parameters.set_from_dict(study_params, validate=False)
- if failed_params:
- self.logger.debug(f"Could not set study parameters: {failed_params}")
+ # Reconstruct self.parameters from loaded history
+ from masster.study.defaults.study_def import study_defaults
+
+ # Always create a fresh study_defaults object to ensure we have all defaults
+ self.parameters = study_defaults()
+
+ # Update parameters from loaded history if available
+ if self.history and "study" in self.history:
+ study_params = self.history["study"]
+ if isinstance(study_params, dict):
+ failed_params = self.parameters.set_from_dict(study_params, validate=False)
+ if failed_params:
+ self.logger.debug(f"Could not set study parameters: {failed_params}")
+ else:
+ self.logger.debug("Successfully updated parameters from loaded history")
  else:
- self.logger.debug("Successfully updated parameters from loaded history")
+ self.logger.debug("Study parameters in history are not a valid dictionary")
  else:
- self.logger.debug("Study parameters in history are not a valid dictionary")
+ self.logger.debug("No study parameters found in history, using defaults")
+
+ # Synchronize instance attributes with parameters (similar to __init__)
+ # Note: default_folder and label are already loaded from metadata attributes above
+ # but we ensure they match the parameters for consistency
+ if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
+ self.default_folder = self.parameters.default_folder
+ if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+ self.label = self.parameters.label
+ if hasattr(self.parameters, 'log_level'):
+ self.log_level = self.parameters.log_level
+ if hasattr(self.parameters, 'log_label'):
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+ if hasattr(self.parameters, 'log_sink'):
+ self.log_sink = self.parameters.log_sink
+ pbar.update(1)
+
+ # Load samples_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ if "samples" in f and len(f["samples"].keys()) > 0:
+ self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
- self.logger.debug("No study parameters found in history, using defaults")
-
- # Synchronize instance attributes with parameters (similar to __init__)
- # Note: default_folder and label are already loaded from metadata attributes above
- # but we ensure they match the parameters for consistency
- if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
- self.default_folder = self.parameters.default_folder
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
- self.label = self.parameters.label
- if hasattr(self.parameters, 'log_level'):
- self.log_level = self.parameters.log_level
- if hasattr(self.parameters, 'log_label'):
- self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
- if hasattr(self.parameters, 'log_sink'):
- self.log_sink = self.parameters.log_sink
-
- # Load samples_df
- if "samples" in f and len(f["samples"].keys()) > 0:
- self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
+ # Initialize empty samples_df with the correct schema if no data exists
+ self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+ self.samples_df = pl.DataFrame(
+ {
+ "sample_uid": [],
+ "sample_name": [],
+ "sample_path": [],
+ "sample_type": [],
+ "size": [],
+ "map_id": [],
+ },
+ schema={
+ "sample_uid": pl.Int64,
+ "sample_name": pl.Utf8,
+ "sample_path": pl.Utf8,
+ "sample_type": pl.Utf8,
+ "size": pl.Int64,
+ "map_id": pl.Utf8,
+ },
+ )
+ pbar.update(1)
+
+ # Load features_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ if "features" in f and len(f["features"].keys()) > 0:
+ object_columns = ["chrom", "ms2_scans", "ms2_specs"]
+ self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+ else:
+ self.features_df = None
+ pbar.update(1)
+
+ # Load consensus_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
+ self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
+ else:
+ self.consensus_df = None
+ pbar.update(1)
+
+ # Load consensus_mapping_df
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+ self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ else:
+ self.consensus_mapping_df = None
+ pbar.update(1)
+
+ # Load consensus_ms2
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+ if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
+ object_columns = ["spec"]
+ self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+ else:
+ self.consensus_ms2 = None
+ pbar.update(1)
+
+ self.logger.info(f"Study loaded from {filename}")
+
+
+ def _load_h5(self, filename=None):
+ """
+ Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
+
+ This is a legacy method for loading older HDF5 format files. For new files,
+ use _load_study5() which has improved schema handling and performance.
+
+ Args:
+ filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
+
+ Returns:
+ None (modifies self in place)
+
+ Notes:
+ - Legacy format loader with basic DataFrame reconstruction
+ - Includes progress bar for loading steps
+ - For new projects, prefer _load_study5() method
+ """
+ from datetime import datetime
+ from tqdm import tqdm
+
+ # Handle default filename
+ if filename is None:
+ if self.default_folder is not None:
+ filename = os.path.join(self.default_folder, "study.h5")
  else:
- # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
- self.samples_df = pl.DataFrame(
- {
- "sample_uid": [],
- "sample_name": [],
- "sample_path": [],
- "sample_type": [],
- "size": [],
- "map_id": [],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "size": pl.Int64,
- "map_id": pl.Utf8,
- },
- )
- # Initialize empty samples_df with the correct schema if no data exists
- self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
- self.samples_df = pl.DataFrame(
- {
+ self.logger.error("Either filename or default_folder must be provided")
+ return
+
+ # Add .h5 extension if not provided
+ if not filename.endswith(".h5"):
+ filename += ".h5"
+
+ if not os.path.exists(filename):
+ self.logger.error(f"File {filename} does not exist")
+ return
+
+ # Define loading steps for progress tracking
+ loading_steps = [
+ "metadata",
+ "samples_df",
+ "features_df",
+ "consensus_df",
+ "consensus_mapping_df"
+ ]
+
+ # Check if progress bar should be disabled based on log level
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+ with h5py.File(filename, "r") as f:
+ # Use progress bar to show loading progress
+ with tqdm(
+ total=len(loading_steps),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
+ disable=tdqm_disable,
+ ) as pbar:
+
+ # Load metadata
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ if "metadata" in f:
+ metadata = f["metadata"]
+ self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+ if hasattr(self, "label"):
+ self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+ # Load parameters from JSON if available
+ if "parameters" in metadata:
+ try:
+ parameters_data = metadata["parameters"][()]
+ if isinstance(parameters_data, bytes):
+ parameters_data = parameters_data.decode("utf-8")
+
+ if parameters_data and parameters_data != "":
+ self.history = json.loads(parameters_data)
+ else:
+ self.history = {}
+ except (json.JSONDecodeError, ValueError, TypeError) as e:
+ self.logger.warning(f"Failed to deserialize parameters: {e}")
+ self.history = {}
+ else:
+ self.history = {}
+ pbar.update(1)
+
+ # Load samples_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ if "samples" in f and len(f["samples"].keys()) > 0:
+ samples_data = {}
+ for col in f["samples"].keys():
+ column_data = f["samples"][col][:]
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ samples_data[col] = column_data
+
+ if samples_data:
+ self.samples_df = pl.DataFrame(samples_data)
+ else:
+ # Initialize empty samples_df
+ self.samples_df = pl.DataFrame({
+ "sample_uid": [],
+ "sample_name": [],
+ "sample_path": [],
+ "sample_type": [],
+ "size": [],
+ "map_id": [],
+ })
+ else:
+ self.samples_df = pl.DataFrame({
  "sample_uid": [],
  "sample_name": [],
  "sample_path": [],
  "sample_type": [],
  "size": [],
  "map_id": [],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "size": pl.Int64,
- "map_id": pl.Utf8,
- },
- )
-
- # Load features_df
- if "features" in f and len(f["features"].keys()) > 0:
- object_columns = ["chrom", "ms2_scans", "ms2_specs"]
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
- else:
- self.features_df = None
-
- # Load consensus_df
- if "consensus" in f and len(f["consensus"].keys()) > 0:
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
- else:
- self.consensus_df = None
-
- # Load consensus_mapping_df
- if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
- else:
- self.consensus_mapping_df = None
-
- # Load consensus_ms2
- if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
- object_columns = ["spec"]
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
- else:
- self.consensus_ms2 = None
+ })
+ pbar.update(1)
+
+ # Load features_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ if "features" in f and len(f["features"].keys()) > 0:
+ features_data = {}
+ for col in f["features"].keys():
+ column_data = f["features"][col][:]
+ # Handle special object columns
+ if col in ["chrom", "ms2_specs"]:
+ reconstructed_data = _reconstruct_object_column(column_data, col)
+ features_data[col] = reconstructed_data
+ else:
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ features_data[col] = column_data
+
+ if features_data:
+ # Create DataFrame with Object columns handled properly
+ object_columns = ["chrom", "ms2_specs"]
+ self.features_df = _create_dataframe_with_objects(features_data, object_columns)
+ else:
+ self.features_df = None
+ else:
+ self.features_df = None
+ pbar.update(1)
+
+ # Load consensus_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ if "consensus" in f and len(f["consensus"].keys()) > 0:
+ consensus_data = {}
+ for col in f["consensus"].keys():
+ column_data = f["consensus"][col][:]
+ # Handle byte strings
+ if len(column_data) > 0 and isinstance(column_data[0], bytes):
+ column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+ consensus_data[col] = column_data
+
+ if consensus_data:
+ self.consensus_df = pl.DataFrame(consensus_data)
+ else:
+ self.consensus_df = None
+ else:
+ self.consensus_df = None
+ pbar.update(1)
+
+ # Load consensus_mapping_df (legacy format)
+ pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+ mapping_data = {}
+ for col in f["consensus_mapping"].keys():
+ column_data = f["consensus_mapping"][col][:]
+ mapping_data[col] = column_data
+
+ if mapping_data:
+ self.consensus_mapping_df = pl.DataFrame(mapping_data)
+ else:
+ self.consensus_mapping_df = None
+ else:
+ self.consensus_mapping_df = None
+ pbar.update(1)
 
- self.logger.info(f"Study loaded from {filename}")
+ self.logger.info(f"Legacy study loaded from {filename}")
masster/study/helpers.py CHANGED
@@ -114,45 +114,6 @@ def get_chrom(self, uids=None, samples=None):
  # Return as Polars DataFrame (can handle complex objects like Chromatogram)
  return df2_pivoted
 
- '''
- def migrate_adduct_columns(self):
- """
- Migrate adduct_right and adduct_mass_right columns to adduct and adduct_mass.
- This fixes an issue where join operations created _right suffixed columns.
- """
- if self.features_df.is_empty():
- return
-
- # Check if we have the _right suffixed columns
- has_adduct_right = "adduct_right" in self.features_df.columns
- has_adduct_mass_right = "adduct_mass_right" in self.features_df.columns
- has_adduct = "adduct" in self.features_df.columns
- has_adduct_mass = "adduct_mass" in self.features_df.columns
-
- if has_adduct_right or has_adduct_mass_right:
- self.logger.info("Migrating adduct column names...")
-
- # Start with all columns except those we're replacing/dropping
- columns_to_keep = [
- col
- for col in self.features_df.columns
- if col not in ["adduct_right", "adduct_mass_right", "adduct", "adduct_mass"]
- ]
-
- # Add the migrated columns
- if has_adduct_right:
- columns_to_keep.append(pl.col("adduct_right").alias("adduct"))
- if has_adduct_mass_right:
- columns_to_keep.append(pl.col("adduct_mass_right").alias("adduct_mass"))
-
- # Apply the migration
- self.features_df = self.features_df.select(columns_to_keep)
-
- self.logger.success("Adduct column migration completed.")
- else:
- self.logger.info("No adduct column migration needed.")
- '''
-
  def set_default_folder(self, folder):
  """
  Set the default folder for saving and loading files.
@@ -448,6 +409,12 @@ def _get_sample_uids(self, samples=None, seed=42):
  sample_uids = list(set(sample_uids))
  return sample_uids
 
+ def get_orphans(self):
+ """
+ Get all features that are not in the consensus mapping.
+ """
+ not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+ return not_in_consensus
 
  def compress(self):
  """