PyPI - masster - Versions diffs - 0.3.16__tar.gz → 0.3.17__tar.gz - Mend

masster 0.3.16.tar.gz → 0.3.17.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (78) hide show

{masster-0.3.16 → masster-0.3.17}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.3.16
+Version: 0.3.17
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.3.16 → masster-0.3.17}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.3.16"
+version = "0.3.17"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.3.16 → masster-0.3.17}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.3.16"
+__version__ = "0.3.17"
 def get_version():

{masster-0.3.16 → masster-0.3.17}/src/masster/study/defaults/align_def.py RENAMED Viewed

@@ -24,6 +24,7 @@ class align_defaults:
         skip_blanks (bool): Whether to skip blank samples. Default is False.
         KD algorithm specific parameters:
+        min_samples (int): Minimum number of samples required for KD alignment. Default is 3.
         nr_partitions (int): Number of partitions in m/z dimension. Default is 100.
         warp_enabled (bool): Enable non-linear retention time transformation. Default is True.
         warp_rt_tol (float): RT tolerance for the LOWESS fit. Default is 5.0.
@@ -59,6 +60,7 @@ class align_defaults:
     algo: str = "pc"
     # KD algorithm specific parameters
+    min_samples: int = 3
     nr_partitions: int = 100
     warp_enabled: bool = True
     warp_rt_tol: float = 5.0
@@ -137,6 +139,13 @@ class align_defaults:
                 "allowed_values": ["pc", "kd"],
             },
             # KD algorithm specific parameters
+            "min_samples": {
+                "dtype": int,
+                "description": "Minimum number of samples required for KD alignment algorithm",
+                "default": 3,
+                "min_value": 2,
+                "max_value": 1000,
+            },
             "nr_partitions": {
                 "dtype": int,
                 "description": "Number of partitions in m/z dimension for KD algorithm",

{masster-0.3.16 → masster-0.3.17}/src/masster/study/load.py RENAMED Viewed

@@ -961,51 +961,96 @@ def _get_missing_consensus_sample_combinations(self, uids):
     """
     Efficiently identify which consensus_uid/sample combinations are missing.
     Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
+    Optimized for common scenarios:
+    - Early termination for fully-filled studies
+    - Efficient dictionary lookups instead of expensive DataFrame joins
+    - Smart handling of sparse vs dense missing data patterns
     """
-    # Get all consensus UIDs we're interested in
-    consensus_uids_set = set(uids)
-    # Get all sample UIDs and create lookup
-    all_sample_info = {}
-    for row in self.samples_df.select([
-        "sample_uid",
-        "sample_name",
-        "sample_path",
-    ]).iter_rows(named=True):
-        all_sample_info[row["sample_uid"]] = {
-            "sample_name": row["sample_name"],
-            "sample_path": row["sample_path"],
-        }
-    # Get existing consensus/sample combinations from consensus_mapping_df
-    existing_combinations = set()
-    consensus_mapping_filtered = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(list(consensus_uids_set)),
-    )
-    # Join with features_df to get sample_uid information
-    existing_features = consensus_mapping_filtered.join(
-        self.features_df.select(["feature_uid", "sample_uid"]),
-        on="feature_uid",
-        how="inner",
+    if not uids:
+        return []
+    n_consensus = len(uids)
+    n_samples = len(self.samples_df)
+    total_possible = n_consensus * n_samples
+    # Quick early termination check for fully/nearly filled studies
+    # This handles the common case where fill() is run on an already-filled study
+    consensus_counts = (
+        self.consensus_mapping_df
+        .filter(pl.col("consensus_uid").is_in(uids))
+        .group_by("consensus_uid")
+        .agg(pl.count("feature_uid").alias("count"))
     )
-    for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
-        existing_combinations.add((row[0], row[1]))  # (consensus_uid, sample_uid)
-    # Find missing combinations
-    missing_combinations = []
-    for consensus_uid in consensus_uids_set:
-        for sample_uid, sample_info in all_sample_info.items():
-            if (consensus_uid, sample_uid) not in existing_combinations:
-                missing_combinations.append((
-                    consensus_uid,
-                    sample_uid,
-                    sample_info["sample_name"],
-                    sample_info["sample_path"],
-                ))
-    return missing_combinations
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+    # If >95% filled, likely no gaps (common case)
+    if total_existing >= total_possible * 0.95:
+        self.logger.debug(f"Study appears {total_existing/total_possible*100:.1f}% filled, using sparse optimization")
+        # For sparse missing data, check each consensus feature individually
+        missing_combinations = []
+        uids_set = set(uids)
+        # Build efficient lookups
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+        # Get existing combinations for target UIDs only
+        existing_by_consensus = {}
+        for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows():
+            if consensus_uid in uids_set and feature_uid in feature_to_sample:
+                if consensus_uid not in existing_by_consensus:
+                    existing_by_consensus[consensus_uid] = set()
+                existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
+        # Get sample info once
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+        # Check for missing combinations
+        for consensus_uid in uids:
+            existing_samples = existing_by_consensus.get(consensus_uid, set())
+            for sample_uid, sample_name, sample_path in all_samples:
+                if sample_uid not in existing_samples:
+                    missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+        return missing_combinations
+    else:
+        # For studies with many gaps, use bulk operations
+        self.logger.debug(f"Study {total_existing/total_possible*100:.1f}% filled, using bulk optimization")
+        # Build efficient lookups
+        uids_set = set(uids)
+        feature_to_sample = dict(
+            self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+        )
+        # Build existing combinations set
+        existing_combinations = {
+            (consensus_uid, feature_to_sample[feature_uid])
+            for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows()
+            if consensus_uid in uids_set and feature_uid in feature_to_sample
+        }
+        # Get all sample info
+        all_samples = list(
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+        )
+        # Generate all missing combinations
+        missing_combinations = [
+            (consensus_uid, sample_uid, sample_name, sample_path)
+            for consensus_uid in uids
+            for sample_uid, sample_name, sample_path in all_samples
+            if (consensus_uid, sample_uid) not in existing_combinations
+        ]
+        return missing_combinations
 def sanitize(self):

{masster-0.3.16 → masster-0.3.17}/src/masster/study/processing.py RENAMED Viewed

@@ -33,6 +33,7 @@ def align(self, **kwargs):
         - algo (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).
         KD algorithm specific parameters:
+        - min_samples (int): Minimum number of samples required for KD alignment.
         - nr_partitions (int): Number of partitions in m/z dimension.
         - warp_enabled (bool): Enable non-linear retention time transformation.
         - warp_rt_tol (float): RT tolerance for the LOWESS fit.
@@ -87,131 +88,17 @@ def align(self, **kwargs):
     fmaps = self.features_maps
-    # Initialize OpenMS parameters
-    params_oms = oms.Param()
-    # Choose alignment algorithm based on parameter
+    # Choose alignment algorithm
     algo = params.get("algo").lower()
-    # Set common parameters for both algorithms
-    if algo == "pc":
-        # Parameters specific to PoseClustering
-        params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
-        params_oms.setValue("pairfinder:ignore_charge", "true")
-        params_oms.setValue("max_num_peaks_considered", 1000)
-        params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
-        params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
-        params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
-        params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
-        params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
-        params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
-        params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
-        """
-        {b'max_num_peaks_considered': 1000,
-        b'superimposer:mz_pair_max_distance': 0.5,
-        b'superimposer:rt_pair_distance_fraction': 0.1,
-        b'superimposer:num_used_points': 2000,
-        b'superimposer:scaling_bucket_size': 0.005,
-        b'superimposer:shift_bucket_size': 3.0,
-        b'superimposer:max_shift': 1000.0,
-        b'superimposer:max_scaling': 2.0,
-        b'superimposer:dump_buckets': '',
-        b'superimposer:dump_pairs': '',
-        b'pairfinder:second_nearest_gap': 2.0,
-        b'pairfinder:use_identifications': 'false',
-        b'pairfinder:ignore_charge': 'false',
-        b'pairfinder:ignore_adduct': 'true',
-        b'pairfinder:distance_RT:max_difference': 100.0,
-        b'pairfinder:distance_RT:exponent': 1.0,
-        b'pairfinder:distance_RT:weight': 1.0,
-        b'pairfinder:distance_MZ:max_difference': 0.3,
-        b'pairfinder:distance_MZ:unit': 'Da',
-        b'pairfinder:distance_MZ:exponent': 2.0,
-        b'pairfinder:distance_MZ:weight': 1.0,
-        b'pairfinder:distance_intensity:exponent': 1.0,
-        b'pairfinder:distance_intensity:weight': 0.0,
-        b'pairfinder:distance_intensity:log_transform': 'disabled'}
-        """
-    elif algo == "kd":
-        # Parameters specific to KD algorithm
-        params_oms.setValue("mz_unit", "Da")
-        params_oms.setValue("nr_partitions", params.get("nr_partitions"))
-        # Warp parameters for non-linear RT transformation
-        params_oms.setValue("warp:enabled", "true" if params.get("warp_enabled") else "false")
-        params_oms.setValue("warp:rt_tol", params.get("warp_rt_tol"))
-        params_oms.setValue("warp:mz_tol", params.get("warp_mz_tol"))
-        params_oms.setValue("warp:max_pairwise_log_fc", params.get("warp_max_pairwise_log_fc"))
-        params_oms.setValue("warp:min_rel_cc_size", params.get("warp_min_rel_cc_size"))
-        params_oms.setValue("warp:max_nr_conflicts", params.get("warp_max_nr_conflicts"))
-        # Link parameters
-        params_oms.setValue("link:rt_tol", params.get("link_rt_tol"))
-        params_oms.setValue("link:mz_tol", params.get("link_mz_tol"))
-        params_oms.setValue("link:charge_merging", params.get("link_charge_merging"))
-        params_oms.setValue("link:adduct_merging", params.get("link_adduct_merging"))
-        # Distance parameters
-        params_oms.setValue("distance_RT:exponent", params.get("distance_RT_exponent"))
-        params_oms.setValue("distance_RT:weight", params.get("distance_RT_weight"))
-        params_oms.setValue("distance_MZ:exponent", params.get("distance_MZ_exponent"))
-        params_oms.setValue("distance_MZ:weight", params.get("distance_MZ_weight"))
-        params_oms.setValue("distance_intensity:exponent", params.get("distance_intensity_exponent"))
-        params_oms.setValue("distance_intensity:weight", params.get("distance_intensity_weight"))
-        params_oms.setValue("distance_intensity:log_transform", params.get("distance_intensity_log_transform"))
-        # LOWESS parameters
-        params_oms.setValue("LOWESS:span", params.get("LOWESS_span"))
-        params_oms.setValue("LOWESS:num_iterations", params.get("LOWESS_num_iterations"))
-        params_oms.setValue("LOWESS:delta", params.get("LOWESS_delta"))
-        params_oms.setValue("LOWESS:interpolation_type", params.get("LOWESS_interpolation_type"))
-        params_oms.setValue("LOWESS:extrapolation_type", params.get("LOWESS_extrapolation_type"))
     if algo == "pc":
-        aligner = oms.MapAlignmentAlgorithmPoseClustering()
-        self.logger.info("Starting alignment with PoseClustering")
-        # set ref_index to feature map index with largest number of features
-        ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
-        self.logger.debug(
-            f"Reference map is {self.samples_df.row(ref_index, named=True)['sample_name']}",
-        )
-        aligner.setParameters(params_oms)
-        aligner.setReference(fmaps[ref_index])
-        self.logger.debug(f"Parameters for alignment: {params}")
-        # perform alignment and transformation of feature maps to the reference map (exclude reference map)
-        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-        for index, fm in tqdm(
-            list(enumerate(fmaps)),
-            total=len(fmaps),
-            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Align feature maps",
-            disable=tdqm_disable,
-        ):
-            if index == ref_index:
-                continue
-            if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
-                continue
-            trafo = oms.TransformationDescription()
-            aligner.align(fm, trafo)
-            transformer = oms.MapAlignmentTransformer()
-            transformer.transformRetentionTimes(fm, trafo, True)
-        self.alignment_ref_index = ref_index
+        _align_pose_clustering(self, fmaps, params)
     elif algo == "kd":
-        # KD algorithm requires num_maps and Param parameters
-        num_maps = len(fmaps)
-        aligner = oms.MapAlignmentAlgorithmKD(3, params_oms)
-        self.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
-        kdtree = oms.KDTreeFeatureMaps()
-        kdtree.addMaps(fmaps)  # Add all feature maps to the KDTree
-        # kdtree.optimizeTree()
-        aligner.addRTFitData(kdtree)
-        aligner.fitLOWESS()
-        aligner.transform(kdtree)
+        _align_kd_algorithm(self, fmaps, params)
     else:
         self.logger.error(f"Unknown alignment algorithm '{algo}'")
+        self.logger.error(f"Unknown alignment algorithm '{algo}'")
     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
@@ -1163,3 +1050,145 @@ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
             else:
                 break
     return chrom.rt[idx]
+def _align_pose_clustering(study_obj, fmaps, params):
+    """Perform alignment using PoseClustering algorithm."""
+    import pyopenms as oms
+    from tqdm import tqdm
+    from datetime import datetime
+    # Create PC-specific OpenMS parameters
+    params_oms = oms.Param()
+    params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
+    params_oms.setValue("pairfinder:ignore_charge", "true")
+    params_oms.setValue("max_num_peaks_considered", 1000)
+    params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+    params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
+    params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
+    params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
+    params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
+    params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
+    aligner = oms.MapAlignmentAlgorithmPoseClustering()
+    study_obj.logger.info("Starting alignment with PoseClustering")
+    # Set ref_index to feature map index with largest number of features
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+    study_obj.logger.debug(
+        f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
+    )
+    aligner.setParameters(params_oms)
+    aligner.setReference(fmaps[ref_index])
+    study_obj.logger.debug(f"Parameters for alignment: {params}")
+    # Perform alignment and transformation of feature maps to the reference map (exclude reference map)
+    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
+    for index, fm in tqdm(
+        list(enumerate(fmaps)),
+        total=len(fmaps),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {study_obj.log_label}Align feature maps",
+        disable=tdqm_disable,
+    ):
+        if index == ref_index:
+            continue
+        if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
+            continue
+        trafo = oms.TransformationDescription()
+        aligner.align(fm, trafo)
+        transformer = oms.MapAlignmentTransformer()
+        transformer.transformRetentionTimes(fm, trafo, True)
+    study_obj.alignment_ref_index = ref_index
+def _align_kd_algorithm(study_obj, fmaps, params):
+    """Perform alignment using KD algorithm."""
+    import pyopenms as oms
+    num_maps = len(fmaps)
+    study_obj.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
+    try:
+        # Use the EXACT approach from test_oms.py that works
+        # First parameter is DIMENSIONS (3), not min_samples!
+        study_obj.logger.debug("Creating MapAlignmentAlgorithmKD with 3 dimensions and empty parameters...")
+        empty_params = oms.Param()  # Empty params - this is what worked in test_oms.py!
+        aligner = oms.MapAlignmentAlgorithmKD(3, empty_params)  # 3 = dimensions, not min_samples
+        study_obj.logger.debug("Created MapAlignmentAlgorithmKD successfully")
+        # Create KD-tree structure
+        kdtree = oms.KDTreeFeatureMaps()
+        # Set all required warping parameters based on OpenMS requirements
+        kd_params = oms.Param()
+        # Core warp parameters that OpenMS expects
+        kd_params.setValue(b"warp:min_rel_cc_size", 0.2, b"Minimum relative connected component size")
+        kd_params.setValue(b"warp:max_ratio_small_big", 0.5, b"Maximum ratio of small to big connected component")
+        kd_params.setValue(b"warp:min_score", 0.3, b"Minimum score for warping")
+        kd_params.setValue(b"warp:rt_tol", 5.0, b"RT tolerance for feature matching")
+        kd_params.setValue(b"warp:mz_tol", 0.015, b"m/z tolerance for feature matching")
+        # Additional potentially required parameters
+        kd_params.setValue(b"warp:max_shift", 30.0, b"Maximum RT shift allowed")
+        kd_params.setValue(b"warp:bins", 100, b"Number of bins for warping")
+        kdtree.setParameters(kd_params)
+        # Add all feature maps to KD-tree (NO limiting - this worked with 38k features!)
+        study_obj.logger.debug("Adding maps to KD-tree structure...")
+        kdtree.addMaps(fmaps)
+        study_obj.logger.debug("Successfully added maps to KD-tree")
+        # Add RT fitting data (this is where the magic happens)
+        study_obj.logger.debug("Adding RT fitting data to aligner...")
+        aligner.addRTFitData(kdtree)
+        study_obj.logger.debug("Successfully added RT fitting data")
+        # Perform LOWESS fitting
+        study_obj.logger.debug("Performing LOWESS fitting...")
+        aligner.fitLOWESS()
+        study_obj.logger.debug("Successfully completed LOWESS fitting")
+        # Apply transformations to feature maps
+        study_obj.logger.debug("Applying transformations to feature maps...")
+        for i, fmap in enumerate(fmaps):
+            trafo = oms.TransformationDescription()
+            aligner.getTransformation(i, trafo)
+            oms.MapAlignmentTransformer.transformRetentionTimes(fmap, trafo, True)
+        study_obj.logger.info("KD alignment completed successfully")
+    except Exception as e:
+        study_obj.logger.error(f"KD alignment failed with error: {e}")
+        study_obj.logger.info("Falling back to PoseClustering alignment...")
+        # Fallback to pose clustering with basic parameters
+        _align_pose_clustering_fallback(study_obj, fmaps, params)
+def _align_pose_clustering_fallback(study_obj, fmaps, params):
+    """Fallback PoseClustering alignment with minimal parameters."""
+    import pyopenms as oms
+    aligner = oms.MapAlignmentAlgorithmPoseClustering()
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+    # Set up basic parameters for pose clustering
+    pc_params = oms.Param()
+    pc_params.setValue("max_num_peaks_considered", 1000)
+    pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+    aligner.setParameters(pc_params)
+    aligner.setReference(fmaps[ref_index])
+    for index, fm in enumerate(fmaps):
+        if index == ref_index:
+            continue
+        trafo = oms.TransformationDescription()
+        aligner.align(fm, trafo)
+        transformer = oms.MapAlignmentTransformer()
+        transformer.transformRetentionTimes(fm, trafo, True)
+    study_obj.alignment_ref_index = ref_index

{masster-0.3.16 → masster-0.3.17}/uv.lock RENAMED Viewed

@@ -1372,7 +1372,7 @@ wheels = [
 [[package]]
 name = "masster"
-version = "0.3.16"
+version = "0.3.17"
 source = { editable = "." }
 dependencies = [
     { name = "alphabase" },