masster 0.3.10-py3-none-any.whl → 0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

@@ -11,6 +11,7 @@ This module contains the optimized version of features_select that:

 import polars as pl

+
 def features_select_optimized(
     self,
     mz=None,
@@ -29,14 +30,14 @@ def features_select_optimized(
 ):
     """
     Optimized version of features_select with improved performance.
-
+
     Key optimizations:
     - Combines all filters into a single expression
     - Uses lazy evaluation for better performance
     - Reduces logging overhead
     - Pre-checks column existence once
     - Early return for no filters
-
+
     Args:
         mz: mass-to-charge ratio filter (tuple for range, single value for minimum)
         rt: retention time filter (tuple for range, single value for minimum)
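The tuple-vs-scalar semantics documented above can be illustrated with a hypothetical call; the `study` object and threshold values below are assumptions for illustration, not part of this diff:

```python
# Hypothetical usage of the documented filter semantics.
feats = study.features_select(
    mz=(100.0, 500.0),  # 2-tuple: keep features with 100 <= mz <= 500
    rt=60.0,            # scalar: keep features with rt >= 60
    quality=0.8,        # scalar: keep features with quality >= 0.8
)
```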
@@ -51,30 +52,42 @@ def features_select_optimized(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided
-    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
-                     feature_uid, filled, quality, chrom_coherence,
-                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -82,7 +95,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -90,7 +103,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -98,7 +111,7 @@ def features_select_optimized(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -111,24 +124,24 @@ def features_select_optimized(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -141,7 +154,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -154,7 +167,7 @@ def features_select_optimized(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -164,7 +177,7 @@ def features_select_optimized(
                 filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -175,75 +188,85 @@ def features_select_optimized(
             filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
            else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log warnings once at the end
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once if any exist
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation for better performance
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
         removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
-
+
     return feats

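For readers outside the diff context, the combine-then-filter pattern in this hunk can be sketched standalone; the toy DataFrame and thresholds below are assumptions:

```python
from functools import reduce

import polars as pl

# Toy data standing in for features_df.
df = pl.DataFrame({"mz": [150.0, 350.0, 800.0], "rt": [30.0, 90.0, 120.0]})

# Build independent boolean expressions, then AND-fold them into one predicate,
# mirroring the loop over filter_conditions above.
conditions = [pl.col("mz") >= 100.0, pl.col("mz") <= 500.0, pl.col("rt") >= 60.0]
combined = reduce(lambda acc, cond: acc & cond, conditions)

# Lazy evaluation hands the whole predicate to the query optimizer in one pass.
feats = df.lazy().filter(combined).collect()
print(feats)  # expected: the single row with mz=350.0, rt=90.0
```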
@@ -267,51 +290,70 @@ def features_select_benchmarked(
     Benchmarked version that compares old vs new implementation performance.
     """
     import time
-
+
     # Call the original method for comparison
     start_time = time.perf_counter()
     _ = self.features_select_original(
-        mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
-        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
-        quality=quality, chrom_coherence=chrom_coherence,
-        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
-        chrom_height_scaled=chrom_height_scaled
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     original_time = time.perf_counter() - start_time
-
+
     # Call the optimized method
     start_time = time.perf_counter()
     result_optimized = features_select_optimized(
-        self, mz=mz, rt=rt, inty=inty, sample_uid=sample_uid, sample_name=sample_name,
-        consensus_uid=consensus_uid, feature_uid=feature_uid, filled=filled,
-        quality=quality, chrom_coherence=chrom_coherence,
-        chrom_prominence=chrom_prominence, chrom_prominence_scaled=chrom_prominence_scaled,
-        chrom_height_scaled=chrom_height_scaled
+        self,
+        mz=mz,
+        rt=rt,
+        inty=inty,
+        sample_uid=sample_uid,
+        sample_name=sample_name,
+        consensus_uid=consensus_uid,
+        feature_uid=feature_uid,
+        filled=filled,
+        quality=quality,
+        chrom_coherence=chrom_coherence,
+        chrom_prominence=chrom_prominence,
+        chrom_prominence_scaled=chrom_prominence_scaled,
+        chrom_height_scaled=chrom_height_scaled,
     )
     optimized_time = time.perf_counter() - start_time
-
+
     # Log performance comparison
-    speedup = original_time / optimized_time if optimized_time > 0 else float('inf')
-    self.logger.info(f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x")
-
+    speedup = original_time / optimized_time if optimized_time > 0 else float("inf")
+    self.logger.info(
+        f"Performance comparison - Original: {original_time:.4f}s, Optimized: {optimized_time:.4f}s, Speedup: {speedup:.2f}x"
+    )
+
     return result_optimized


 def monkey_patch_study():
     """
     Apply the optimized features_select method to the Study class.
-
+
     Call this function to replace the original features_select with the optimized version.
     """
     from masster.study.study import Study
-
+
     # Store original method for benchmarking
     Study.features_select_original = Study.features_select
-
+
     # Replace with optimized version
     Study.features_select = features_select_optimized
-
+
     # Add benchmarked version as an option
     Study.features_select_benchmarked = features_select_benchmarked
-
+
     print("Successfully patched Study.features_select with optimized version")
masster/study/load.py CHANGED
@@ -48,10 +48,10 @@ def add(
         folder = os.getcwd()

     self.logger.debug(f"Adding files from: {folder}")
-
+
     # Define file extensions to search for in order of priority
     extensions = [".sample5", ".wiff", ".raw", ".mzML"]
-
+
     # Check if folder contains glob patterns
     if not any(char in folder for char in ["*", "?", "[", "]"]):
         search_folder = folder
@@ -68,7 +68,7 @@ def add(
     for ext in extensions:
         if max_files is not None and counter >= max_files:
             break
-
+
         # Build search pattern
         if any(char in folder for char in ["*", "?", "[", "]"]):
             # If folder already contains glob patterns, modify the extension
@@ -78,16 +78,16 @@ def add(
             pattern = os.path.join(search_folder, "**", f"*{ext}")
         else:
             pattern = os.path.join(search_folder, "**", f"*{ext}")
-
+
         files = glob.glob(pattern, recursive=True)
-
+
         if len(files) > 0:
             # Limit files if max_files is specified
             remaining_slots = max_files - counter if max_files is not None else len(files)
             files = files[:remaining_slots]
-
+
             self.logger.debug(f"Found {len(files)} {ext} files")
-
+
             # Process files
             for i, file in enumerate(
                 tqdm(
@@ -99,18 +99,18 @@ def add(
             ):
                 if max_files is not None and counter >= max_files:
                     break
-
+
                 # Get filename without extension for blacklist check
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
-
+
                 # Check if this filename (without extension) is already in blacklist
                 if filename_no_ext in blacklist:
                     self.logger.debug(f"Skipping {file} - filename already processed")
                     continue
-
+
                 self.logger.debug(f"Add file {counter + 1}: {file}")
-
+
                 # Try to add the sample
                 try:
                     self.add_sample(file=file, reset=reset, adducts=adducts)
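The discovery loop in this function searches each extension recursively, in priority order, and stops once `max_files` is reached; a standalone sketch of that pattern, where the folder path and cap are assumptions:

```python
import glob
import os

extensions = [".sample5", ".wiff", ".raw", ".mzML"]  # priority order from the hunk above
folder = "/data/my_study"  # assumed example path
max_files = 10             # assumed cap

found: list[str] = []
for ext in extensions:
    if len(found) >= max_files:
        break
    # "**" combined with recursive=True descends into subdirectories.
    pattern = os.path.join(folder, "**", f"*{ext}")
    files = glob.glob(pattern, recursive=True)
    found.extend(files[: max_files - len(found)])
```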
@@ -138,11 +138,11 @@ def add(
 # TODO type is not used
 def add_sample(self, file, type=None, reset=False, adducts=None):
     self.logger.debug(f"Adding: {file}")
-
+
     # Extract sample name by removing any known extension
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     # check if sample_name is already in the samples_df
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(
@@ -163,7 +163,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Load the sample based on file type
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
     if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         ddaobj.load(file)
     else:
@@ -178,7 +178,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
+        ddaobj.find_ms2()

     self.features_maps.append(ddaobj.features)

@@ -194,7 +194,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         # If input is already .sample5, keep it in original location
         final_sample_path = file
         self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
+
         # Check if there's a corresponding featureXML file in the same directory
         featurexml_path = file.replace(".sample5", ".featureXML")
         if os.path.exists(featurexml_path):
@@ -218,7 +218,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
     # Count MS1 and MS2 scans from the loaded sample
     ms1_count = 0
     ms2_count = 0
-    if hasattr(ddaobj, 'scans_df') and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
         ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
         ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)

@@ -230,7 +230,7 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
             "sample_type": [sample_type],
             "size": [int(ddaobj.features.size())],
             "map_id": [map_id_value],
-            "file_source": [getattr(ddaobj, 'file_source', file)],
+            "file_source": [getattr(ddaobj, "file_source", file)],
             "ms1": [ms1_count],
             "ms2": [ms2_count],
         },
@@ -304,8 +304,8 @@ def load(self, filename=None):
     else:
         self.logger.error("Either filename or folder must be provided")
         return
-
-    #self.logger.info(f"Loading study from {filename}")
+
+    # self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
     # After loading the study, check if consensus XML exists and load it
     consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -566,7 +566,20 @@ def _fill_chrom_single_impl(
             rows_to_add.append(new_row)

     # Create and add new DataFrame
-    new_df = pl.from_dicts(rows_to_add)
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)

     # Cast columns to match existing schema
     cast_exprs = []
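The replacement above guards against two failure modes of `pl.from_dicts`: mixed int/float values across rows, and schema inference that by default only examines a prefix of the input. A minimal reproduction sketch, with illustrative column names:

```python
import polars as pl

# An int in the first row and a float later would trip naive schema inference.
rows = [{"mz": 100, "rt": 1.5}, {"mz": 100.5, "rt": 2.0}]

# Pre-casting numeric fields row by row, as the diff does, makes types uniform...
for row in rows:
    for key in ("mz", "rt"):
        if row[key] is not None:
            row[key] = float(row[key])

# ...and infer_schema_length=len(rows) makes inference scan every row.
df = pl.from_dicts(rows, infer_schema_length=len(rows))
print(df.schema)  # mz and rt both inferred as Float64
```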
@@ -606,8 +619,9 @@ def fill_single(self, **kwargs):
     """
     # parameters initialization
     from masster.study.defaults import fill_defaults
+
     params = fill_defaults()
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -959,7 +973,20 @@ def _fill_chrom_impl(
             rows_to_add.append(new_row)

     # Create and add new DataFrame
-    new_df = pl.from_dicts(rows_to_add)
+    if rows_to_add:
+        # Ensure consistent data types by explicitly casting problematic columns
+        for row in rows_to_add:
+            # Cast numeric columns to ensure consistency
+            for key, value in row.items():
+                if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+                    row[key] = float(value)
+                elif key in ["sample_id", "feature_id"] and value is not None:
+                    row[key] = int(value)
+
+        new_df = pl.from_dicts(rows_to_add, infer_schema_length=len(rows_to_add))
+    else:
+        # Handle empty case - create empty DataFrame with proper schema
+        new_df = pl.DataFrame(schema=self.features_df.schema)

     # Cast columns to match existing schema
     cast_exprs = []
@@ -1001,7 +1028,7 @@ def fill(self, **kwargs):
     # parameters initialization
     params = fill_defaults()
     num_workers = kwargs.get("num_workers", 4)  # Default parameter not in defaults class
-
+
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
             params = value
@@ -1228,4 +1255,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-