masster-0.3.17-py3-none-any.whl → masster-0.3.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/load.py +2 -2
- masster/sample/plot.py +2 -1
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +275 -225
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +148 -121
- masster/study/plot.py +306 -106
- masster/study/processing.py +9 -5
- masster/study/study.py +2 -6
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/METADATA +1 -1
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/RECORD +18 -18
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/WHEEL +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/entry_points.txt +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -45,12 +45,12 @@ def add(
     """Add samples from a folder to the study.
 
     Args:
-        folder (str, optional): Path to folder containing sample files.
+        folder (str, optional): Path to folder containing sample files.
             Defaults to study folder or current working directory.
         reset (bool, optional): Whether to reset the study before adding samples.
             Defaults to False.
         adducts (optional): Adducts to use for sample loading. Defaults to None.
-        max_files (int, optional): Maximum number of files to process.
+        max_files (int, optional): Maximum number of files to process.
             Defaults to None (no limit).
         fast (bool, optional): Whether to use optimized loading that skips ms1_df
             for better performance. Defaults to True.
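For orientation, here is a minimal usage sketch of add() as documented above; the import path and folder name are assumptions for illustration, not taken from the diff:

from masster import Study  # assumed import path; Study lives in masster/study/study.py

study = Study()
study.add(
    folder="./raw_data",  # hypothetical folder of .mzML/.raw/.wiff/.sample5 files
    reset=False,          # keep samples already in the study
    adducts=None,         # default adduct set
    max_files=None,       # no cap on the number of files
    fast=True,            # skip ms1_df loading for speed
)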
@@ -104,21 +104,27 @@ def add(
         for file in files:
             if max_files is not None and counter >= max_files:
                 break
-
+
             # Get filename without extension for blacklist check
             basename = os.path.basename(file)
             filename_no_ext = os.path.splitext(basename)[0]
-
+
             # Check if this filename (without extension) is already in blacklist
             if filename_no_ext not in blacklist:
                 files_to_process.append(file)
-                if len(files_to_process) + counter >= (max_files or float("inf")):
+                if len(files_to_process) + counter >= (max_files or float("inf")):
                     break
-
+
         # Batch process all files of this extension using ultra-optimized method
         if files_to_process:
             self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
-            successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+            successful = self._add_samples_batch(
+                files_to_process,
+                reset=reset,
+                adducts=adducts,
+                blacklist=blacklist,
+                fast=fast,
+            )
             counter += successful
             if successful > 0:
                 not_zero = True
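The cap test above leans on `or` to substitute infinity when max_files is None. A self-contained sketch of that idiom, with a note on its one sharp edge (a falsy cap such as 0 also selects the fallback):

def reached_cap(pending: int, done: int, max_files: int | None) -> bool:
    # None -> no limit; note that 0 also falls through to float("inf")
    return pending + done >= (max_files or float("inf"))

assert reached_cap(3, 2, 5)         # 5 >= 5
assert not reached_cap(3, 2, None)  # no limit
assert not reached_cap(3, 2, 0)     # 0 silently behaves like "no limit"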
@@ -140,7 +146,7 @@ def add(
 def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
     """
     Add a single sample to the study.
-
+
     Args:
         file (str): Path to the sample file
         type (str, optional): File type to force. Defaults to None (auto-detect).
@@ -148,31 +154,31 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
         adducts (optional): Adducts to use for sample loading. Defaults to None.
         fast (bool, optional): Whether to use optimized loading that skips ms1_df
             for better performance. Defaults to True.
-
+
     Returns:
         bool: True if successful, False otherwise.
     """
     if fast:
         # Use optimized method for better performance
         success = self._add_sample_optimized(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True
+            skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
         )
     else:
         # Use standard method with full ms1_df loading
         success = self._add_sample_standard(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True
+            skip_schema_check=True,  # Skip schema check for performance
         )
-
+
     return success
 
 
@@ -1193,17 +1199,18 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
 
+
 def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
     """
     Optimized batch addition of samples.
-
+
     Args:
         files (list): List of file paths to process
         reset (bool): Whether to reset features before processing
         adducts: Adducts to use for sample loading
         blacklist (set): Set of filenames already processed
         fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
-
+
     Performance optimizations:
     1. No per-sample color reset
     2. No schema enforcement during addition
@@ -1212,126 +1219,135 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
     """
     if not files:
         return 0
-
+
     if blacklist is None:
         blacklist = set()
-
+
     self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
-
+
     successful_additions = 0
     failed_additions = 0
-
+
     # Progress reporting setup
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     for i, file in enumerate(
         tqdm(
             files,
             total=len(files),
             desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
             disable=tqdm_disable,
-        )
+        ),
     ):
         try:
             # Choose between optimized and standard loading
             if fast:
                 success = self._add_sample_optimized(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
             else:
                 success = self._add_sample_standard(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
-
+
             if success:
                 # Add to blacklist for filename tracking
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
                 blacklist.add(filename_no_ext)
                 successful_additions += 1
-
+
         except Exception as e:
             self.logger.warning(f"Failed to add sample {file}: {e}")
             failed_additions += 1
             continue
-
+
     # Final cleanup operations done once at the end
     if successful_additions > 0:
         self.logger.debug("Performing final batch cleanup...")
-
+
         # Optional: Only do schema enforcement if specifically needed (usually not required)
         # self._ensure_features_df_schema_order()
-
+
         # Color assignment done once for all samples
         self._sample_color_reset_optimized()
-
+
     self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
-
+
     return successful_additions
 
-def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+
+def _add_sample_optimized(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Optimized add_sample with performance improvements integrated.
-
+
     Removes:
     - Schema enforcement (_ensure_features_df_schema_order)
     - Complex column alignment and type casting
     - Per-addition color reset
     - Unnecessary column reordering
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding: {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method temporarily to test if this fixes the astuple error
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
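The batch loop drives tqdm with a timestamp-prefixed description so the progress bar visually matches the logger's line format, and disables it when the log level sits above INFO. A standalone sketch of the same pattern (the file list and label string are stand-ins for the study's own attributes):

from datetime import datetime

from tqdm import tqdm

files = ["a.mzML", "b.mzML", "c.mzML"]  # placeholder list
log_label = "Study | "                  # stand-in for self.log_label
tqdm_disable = False                    # True when the log level is above INFO

for file in tqdm(
    files,
    total=len(files),
    desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {log_label}Batch add",
    disable=tqdm_disable,  # suppress the bar entirely when quiet
):
    pass  # per-file work goes here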
@@ -1345,7 +1361,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1357,7 +1373,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1375,11 +1391,11 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1387,92 +1403,100 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # OPTIMIZED: Use diagonal concatenation without any schema enforcement
     # This is the fastest concatenation method in Polars and handles type mismatches automatically
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     # REMOVED ALL EXPENSIVE OPERATIONS:
-    # - No _ensure_features_df_schema_order()
+    # - No _ensure_features_df_schema_order()
     # - No complex column alignment
     # - No type casting loops
     # - No sample_color_reset()
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
     return True
 
 
-def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+def _add_sample_standard(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Standard add_sample method that uses full sample loading (includes ms1_df).
-
+
     This method uses the standard sample.load() method which loads all data
     including ms1_df, providing full functionality but potentially slower performance
     for large MS1 datasets.
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding (standard): {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample using standard method (includes ms1_df)
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
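Both code paths assign feature_uid by offsetting an integer range past the current maximum and then concatenate diagonally. A self-contained Polars sketch of those two steps on toy data:

import polars as pl

features = pl.DataFrame({"feature_uid": [1, 2], "mz": [100.1, 200.2]})
new = pl.DataFrame({"mz": [300.3], "rt": [12.5]})  # extra column, no uid yet

# Continue the uid sequence past the current maximum.
offset = features["feature_uid"].max() + 1
new = new.with_columns(pl.int_range(pl.len()).add(offset).alias("feature_uid"))

# how="diagonal" unions the column names; "rt" is null-filled for the old rows.
combined = pl.concat([features, new], how="diagonal")
print(combined)

One caveat to the in-code comment above: diagonal concatenation aligns column names, not dtypes; shared columns with mismatched dtypes still raise unless how="diagonal_relaxed" is used, which casts them to a common supertype.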
@@ -1486,7 +1510,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1498,7 +1522,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1516,11 +1540,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1528,52 +1552,53 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
+    ## COMMENT AR: Is this intentional?
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
@@ -1587,7 +1612,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1599,7 +1624,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1617,11 +1642,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1629,28 +1654,28 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
 
@@ -1662,36 +1687,38 @@ def _sample_color_reset_optimized(self):
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     # Cache the colormap if not already cached
-    if not hasattr(self, "_cached_colormap"):
+    if not hasattr(self, "_cached_colormap"):
         try:
             from cmap import Colormap
-            self._cached_colormap = Colormap("turbo")
+
+            self._cached_colormap = Colormap("turbo")
         except ImportError:
             self.logger.warning("cmap package not available, using default colors")
             return
-
+
     cm = self._cached_colormap
     n_samples = len(self.samples_df)
-
+
     # Pre-allocate colors list for better performance
     colors = [None] * n_samples
-
+
     # Vectorized color generation
     for i in range(n_samples):
         normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
         color_rgba = cm(normalized_value)
-
+
         if len(color_rgba) >= 3:
             r, g, b = color_rgba[:3]
             if max(color_rgba[:3]) <= 1.0:
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             colors[i] = f"#{r:02x}{g:02x}{b:02x}"
-
+
     # Update the sample_color column efficiently
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
+
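For context, the color assignment maps sample i of n into the middle 80% of the turbo colormap and renders it as a hex string. A minimal standalone sketch mirroring the arithmetic above, assuming the optional cmap package is installed:

from cmap import Colormap

cm = Colormap("turbo")
n_samples = 4

for i in range(n_samples):
    # Spread samples across [0.1, 0.9] of the colormap range.
    normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
    color_rgba = cm(normalized_value)
    r, g, b = color_rgba[:3]
    if max(color_rgba[:3]) <= 1.0:  # floats in [0, 1] -> 8-bit channels
        r, g, b = int(r * 255), int(g * 255), int(b * 255)
    print(f"#{r:02x}{g:02x}{b:02x}")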