masster 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +18 -2
- masster/sample/sample5_schema.json +76 -58
- masster/study/h5.py +317 -138
- masster/study/helpers.py +6 -39
- masster/study/load.py +23 -134
- masster/study/study.py +29 -11
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/METADATA +31 -55
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/RECORD +12 -12
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/WHEEL +0 -0
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/entry_points.txt +0 -0
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -2,25 +2,7 @@
 _study_h5.py
 
 This module provides HDF5-based save/load functionality for the Study class.
-It handles
-# elif col == "spectrum":
-# Handle single Spectrum objects
-data_as_str = []
-for item in data:
-    if item is not None:
-        data_as_str.append(item.to_json())
-    else:
-        data_as_str.append("None")
-group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
-data_as_str = []
-for item in data:
-    if item is not None:
-        data_as_str.append(item.to_json())
-    else:
-        data_as_str.append("None")
-group.create_dataset(col, data=data_as_str, **optimal_compression) else:
-    data_as_str.append("null")
-group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
+It handles serialization and deserialization of Polars DataFrames with complex objects
 like Chromatogram and Spectrum instances.
 
 Key Features:
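The removed lines above were leftover merge debris in the 0.2.0 module docstring; 0.2.2 restores the intended sentence. For readers unfamiliar with the pattern those fragments came from, here is a minimal, hypothetical sketch of storing an object column (Spectrum/Chromatogram) as JSON strings in a compressed HDF5 dataset. The `save_object_column` helper and the `optimal_compression` settings are illustrative assumptions, not masster's actual internals.

import h5py

# Hypothetical sketch of the object-column pattern visible in the removed
# fragments: objects exposing .to_json() are serialized to a string dataset.
optimal_compression = {"compression": "gzip", "compression_opts": 4}  # assumed settings

def save_object_column(group: h5py.Group, col: str, data: list) -> None:
    data_as_str = []
    for item in data:
        if item is not None:
            data_as_str.append(item.to_json())  # e.g. Spectrum/Chromatogram -> JSON
        else:
            data_as_str.append("None")  # sentinel string for missing objects
    # h5py stores a list of Python strings as a variable-length string dataset
    group.create_dataset(col, data=data_as_str, **optimal_compression)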
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.
+    self.logger.info(f"Saving study to {filename}")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -529,8 +511,7 @@ def _save_study5(self, filename=None):
         data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
         _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
 
-    self.logger.
-    self.logger.info(f"Study saved to {filename}")
+    self.logger.debug(f"Save completed for {filename}")
 
 
 def _load_study5(self, filename=None):
@@ -552,6 +533,11 @@ def _load_study5(self, filename=None):
     - Properly handles MS2 scan lists and spectrum lists
     - Restores parameters dictionary from JSON serialization
     """
+    from datetime import datetime
+    from tqdm import tqdm
+
+    self.logger.info(f"Loading study from {filename}")
+
     # Handle default filename
     if filename is None:
         if self.default_folder is not None:
@@ -574,134 +560,327 @@ def _load_study5(self, filename=None):
     if not schema:
         self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-                    if isinstance(parameters_data, bytes):
-                        parameters_data = parameters_data.decode("utf-8")
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df",
+        "consensus_ms2"
+    ]
+
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
-
-
-
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
+            disable=tdqm_disable,
+        ) as pbar:
+
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+                # Load parameters from JSON
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
                         self.history = {}
-
-                        self.logger.warning(f"Failed to deserialize parameters: {e}")
+                else:
                     self.history = {}
-            else:
-                self.history = {}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Reconstruct self.parameters from loaded history
+            from masster.study.defaults.study_def import study_defaults
+
+            # Always create a fresh study_defaults object to ensure we have all defaults
+            self.parameters = study_defaults()
+
+            # Update parameters from loaded history if available
+            if self.history and "study" in self.history:
+                study_params = self.history["study"]
+                if isinstance(study_params, dict):
+                    failed_params = self.parameters.set_from_dict(study_params, validate=False)
+                    if failed_params:
+                        self.logger.debug(f"Could not set study parameters: {failed_params}")
+                    else:
+                        self.logger.debug("Successfully updated parameters from loaded history")
                 else:
-                    self.logger.debug("
+                    self.logger.debug("Study parameters in history are not a valid dictionary")
             else:
-                self.logger.debug("
+                self.logger.debug("No study parameters found in history, using defaults")
+
+            # Synchronize instance attributes with parameters (similar to __init__)
+            # Note: default_folder and label are already loaded from metadata attributes above
+            # but we ensure they match the parameters for consistency
+            if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
+                self.default_folder = self.parameters.default_folder
+            if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+                self.label = self.parameters.label
+            if hasattr(self.parameters, 'log_level'):
+                self.log_level = self.parameters.log_level
+            if hasattr(self.parameters, 'log_label'):
+                self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+            if hasattr(self.parameters, 'log_sink'):
+                self.log_sink = self.parameters.log_sink
+            pbar.update(1)
+
+            # Load samples_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Initialize empty samples_df with the correct schema if no data exists
+                self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+                self.samples_df = pl.DataFrame(
+                    {
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    },
+                    schema={
+                        "sample_uid": pl.Int64,
+                        "sample_name": pl.Utf8,
+                        "sample_path": pl.Utf8,
+                        "sample_type": pl.Utf8,
+                        "size": pl.Int64,
+                        "map_id": pl.Utf8,
+                    },
+                )
+            pbar.update(1)
+
+            # Load features_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                object_columns = ["chrom", "ms2_scans", "ms2_specs"]
+                self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+            else:
+                self.features_df = None
+            pbar.update(1)
+
+            # Load consensus_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+
+            # Load consensus_mapping_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
+
+            # Load consensus_ms2
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+            if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
+                object_columns = ["spec"]
+                self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+            else:
+                self.consensus_ms2 = None
+            pbar.update(1)
+
+    self.logger.info(f"Study loaded from {filename}")
+
+
+def _load_h5(self, filename=None):
+    """
+    Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
+
+    This is a legacy method for loading older HDF5 format files. For new files,
+    use _load_study5() which has improved schema handling and performance.
+
+    Args:
+        filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
+
+    Returns:
+        None (modifies self in place)
+
+    Notes:
+        - Legacy format loader with basic DataFrame reconstruction
+        - Includes progress bar for loading steps
+        - For new projects, prefer _load_study5() method
+    """
+    from datetime import datetime
+    from tqdm import tqdm
+
+    # Handle default filename
+    if filename is None:
+        if self.default_folder is not None:
+            filename = os.path.join(self.default_folder, "study.h5")
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            self.logger.error("Either filename or default_folder must be provided")
+            return
+
+    # Add .h5 extension if not provided
+    if not filename.endswith(".h5"):
+        filename += ".h5"
+
+    if not os.path.exists(filename):
+        self.logger.error(f"File {filename} does not exist")
+        return
+
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df"
+    ]
+
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
+            disable=tdqm_disable,
+        ) as pbar:
+
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+                # Load parameters from JSON if available
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
+                        self.history = {}
+                else:
+                    self.history = {}
+            pbar.update(1)
+
+            # Load samples_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                samples_data = {}
+                for col in f["samples"].keys():
+                    column_data = f["samples"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    samples_data[col] = column_data
+
+                if samples_data:
+                    self.samples_df = pl.DataFrame(samples_data)
+                else:
+                    # Initialize empty samples_df
+                    self.samples_df = pl.DataFrame({
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    })
+            else:
+                self.samples_df = pl.DataFrame({
                     "sample_uid": [],
                     "sample_name": [],
                     "sample_path": [],
                     "sample_type": [],
                     "size": [],
                     "map_id": [],
-                }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                })
+            pbar.update(1)
+
+            # Load features_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                features_data = {}
+                for col in f["features"].keys():
+                    column_data = f["features"][col][:]
+                    # Handle special object columns
+                    if col in ["chrom", "ms2_specs"]:
+                        reconstructed_data = _reconstruct_object_column(column_data, col)
+                        features_data[col] = reconstructed_data
+                    else:
+                        # Handle byte strings
+                        if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                            column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                        features_data[col] = column_data
+
+                if features_data:
+                    # Create DataFrame with Object columns handled properly
+                    object_columns = ["chrom", "ms2_specs"]
+                    self.features_df = _create_dataframe_with_objects(features_data, object_columns)
+                else:
+                    self.features_df = None
+            else:
+                self.features_df = None
+            pbar.update(1)
+
+            # Load consensus_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                consensus_data = {}
+                for col in f["consensus"].keys():
+                    column_data = f["consensus"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    consensus_data[col] = column_data
+
+                if consensus_data:
+                    self.consensus_df = pl.DataFrame(consensus_data)
+                else:
+                    self.consensus_df = None
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+
+            # Load consensus_mapping_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                mapping_data = {}
+                for col in f["consensus_mapping"].keys():
+                    column_data = f["consensus_mapping"][col][:]
+                    mapping_data[col] = column_data
+
+                if mapping_data:
+                    self.consensus_mapping_df = pl.DataFrame(mapping_data)
+                else:
+                    self.consensus_mapping_df = None
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
 
-    self.logger.info(f"
+    self.logger.info(f"Legacy study loaded from {filename}")
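Both loaders now share the same progress-reporting pattern: a tqdm bar whose description mimics the logger's "timestamp | LEVEL | label" line format and which is silenced whenever the configured log level would not show INFO output (the release spells the flag `tdqm_disable`). A standalone sketch of that pattern; the `log_level` and `log_label` values here are assumptions for illustration, not masster's configuration API.

from datetime import datetime
from tqdm import tqdm

log_level = "INFO"      # assumed configuration value
log_label = "study | "  # assumed logger prefix

steps = ["metadata", "samples_df", "features_df"]
disable = log_level not in ["TRACE", "DEBUG", "INFO"]  # hide the bar for quiet levels

with tqdm(total=len(steps), disable=disable) as pbar:
    for step in steps:
        # Match the logger's "timestamp | LEVEL | label" line format
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        pbar.set_description(f"{ts} | INFO | {log_label}Loading {step}")
        # ... load the step here ...
        pbar.update(1)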
masster/study/helpers.py
CHANGED
@@ -114,45 +114,6 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted
 
-'''
-def migrate_adduct_columns(self):
-    """
-    Migrate adduct_right and adduct_mass_right columns to adduct and adduct_mass.
-    This fixes an issue where join operations created _right suffixed columns.
-    """
-    if self.features_df.is_empty():
-        return
-
-    # Check if we have the _right suffixed columns
-    has_adduct_right = "adduct_right" in self.features_df.columns
-    has_adduct_mass_right = "adduct_mass_right" in self.features_df.columns
-    has_adduct = "adduct" in self.features_df.columns
-    has_adduct_mass = "adduct_mass" in self.features_df.columns
-
-    if has_adduct_right or has_adduct_mass_right:
-        self.logger.info("Migrating adduct column names...")
-
-        # Start with all columns except those we're replacing/dropping
-        columns_to_keep = [
-            col
-            for col in self.features_df.columns
-            if col not in ["adduct_right", "adduct_mass_right", "adduct", "adduct_mass"]
-        ]
-
-        # Add the migrated columns
-        if has_adduct_right:
-            columns_to_keep.append(pl.col("adduct_right").alias("adduct"))
-        if has_adduct_mass_right:
-            columns_to_keep.append(pl.col("adduct_mass_right").alias("adduct_mass"))
-
-        # Apply the migration
-        self.features_df = self.features_df.select(columns_to_keep)
-
-        self.logger.success("Adduct column migration completed.")
-    else:
-        self.logger.info("No adduct column migration needed.")
-'''
-
 def set_default_folder(self, folder):
     """
     Set the default folder for saving and loading files.
@@ -448,6 +409,12 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids
 
+def get_orphans(self):
+    """
+    Get all features that are not in the consensus mapping.
+    """
+    not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+    return not_in_consensus
 
 def compress(self):
     """