masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl


@@ -17,7 +17,7 @@ from masster.study.defaults import (
 
 
 def align(self, **kwargs):
-    """Align feature maps using pose clustering and update feature RTs.
+    """Align feature maps using pose clustering or KD algorithm and update feature RTs.
 
     Parameters can be provided as an ``align_defaults`` instance or as
     individual keyword arguments; they are validated against the defaults class.
@@ -30,6 +30,32 @@ def align(self, **kwargs):
     - num_used_points (int): Number of points to use for alignment estimation.
     - save_features (bool): If True, save updated features after alignment.
     - skip_blanks (bool): If True, skip blank samples during alignment.
+    - algo (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).
+
+    KD algorithm specific parameters:
+    - nr_partitions (int): Number of partitions in m/z dimension.
+    - warp_enabled (bool): Enable non-linear retention time transformation.
+    - warp_rt_tol (float): RT tolerance for the LOWESS fit.
+    - warp_mz_tol (float): m/z tolerance for the LOWESS fit.
+    - warp_max_pairwise_log_fc (float): Maximum absolute log10 fold-change threshold for pairing.
+    - warp_min_rel_cc_size (float): Minimum relative connected component size.
+    - warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment.
+    - link_rt_tol (float): Width of RT tolerance window for linking features.
+    - link_mz_tol (float): m/z tolerance for linking features.
+    - link_charge_merging (str): Charge merging strategy for linking features.
+    - link_adduct_merging (str): Adduct merging strategy for linking features.
+    - distance_RT_exponent (float): Exponent for normalized RT differences.
+    - distance_RT_weight (float): Weight factor for final RT distances.
+    - distance_MZ_exponent (float): Exponent for normalized m/z differences.
+    - distance_MZ_weight (float): Weight factor for final m/z distances.
+    - distance_intensity_exponent (float): Exponent for differences in relative intensity.
+    - distance_intensity_weight (float): Weight factor for final intensity distances.
+    - distance_intensity_log_transform (str): Log-transform intensities.
+    - LOWESS_span (float): Fraction of datapoints for each local regression.
+    - LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting.
+    - LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes).
+    - LOWESS_interpolation_type (str): Method for interpolation between datapoints.
+    - LOWESS_extrapolation_type (str): Method for extrapolation outside data range.
     """
     # parameters initialization
     params = align_defaults()
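Taken together, the hunks above imply two call styles for the new `algo` switch. The sketch below is hedged: the `Study` import path and the sample-loading step are assumptions, while the keyword names come straight from the docstring:

```python
# Usage sketch (assumed API surface; parameter names from the docstring above).
from masster import Study  # import path is an assumption

study = Study()
# ... add samples and detect features here ...

# Pose clustering (the previous default behavior):
study.align(algo="pc", rt_max_diff=100.0, mz_max_diff=0.3)

# KD algorithm with non-linear LOWESS RT warping:
study.align(algo="kd", warp_enabled=True, link_rt_tol=30.0, link_mz_tol=0.01)
```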
@@ -57,78 +83,135 @@ def align(self, **kwargs):
     self.features_maps = []
     self.load_features()
 
-    self.logger.debug("Starting alignment")
+    # self.logger.debug("Starting alignment")
 
     fmaps = self.features_maps
-    # set ref_index to feature map index with largest number of features
-    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
-
-    self.logger.info(
-        f"Align on {self.samples_df.row(ref_index, named=True)['sample_name']}",
-    )
-
-    aligner = oms.MapAlignmentAlgorithmPoseClustering()
 
+    # Initialize OpenMS parameters
     params_oms = oms.Param()
-    params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
-    params_oms.setValue("pairfinder:ignore_charge", "true")
-    params_oms.setValue("max_num_peaks_considered", 1000)
-    params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
-    params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
-    params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
-    params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
-    params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
-    params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
-    params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
-    aligner.setParameters(params_oms)
-    """
-    {b'max_num_peaks_considered': 1000,
-    b'superimposer:mz_pair_max_distance': 0.5,
-    b'superimposer:rt_pair_distance_fraction': 0.1,
-    b'superimposer:num_used_points': 2000,
-    b'superimposer:scaling_bucket_size': 0.005,
-    b'superimposer:shift_bucket_size': 3.0,
-    b'superimposer:max_shift': 1000.0,
-    b'superimposer:max_scaling': 2.0,
-    b'superimposer:dump_buckets': '',
-    b'superimposer:dump_pairs': '',
-    b'pairfinder:second_nearest_gap': 2.0,
-    b'pairfinder:use_identifications': 'false',
-    b'pairfinder:ignore_charge': 'false',
-    b'pairfinder:ignore_adduct': 'true',
-    b'pairfinder:distance_RT:max_difference': 100.0,
-    b'pairfinder:distance_RT:exponent': 1.0,
-    b'pairfinder:distance_RT:weight': 1.0,
-    b'pairfinder:distance_MZ:max_difference': 0.3,
-    b'pairfinder:distance_MZ:unit': 'Da',
-    b'pairfinder:distance_MZ:exponent': 2.0,
-    b'pairfinder:distance_MZ:weight': 1.0,
-    b'pairfinder:distance_intensity:exponent': 1.0,
-    b'pairfinder:distance_intensity:weight': 0.0,
-    b'pairfinder:distance_intensity:log_transform': 'disabled'} """
-
-    aligner.setReference(fmaps[ref_index])
-
-    self.logger.debug(f"Parameters for alignment: {params}")
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    # perform alignment and transformation of feature maps to the reference map (exclude reference map)
-    for index, fm in tqdm(
-        list(enumerate(fmaps)),
-        total=len(fmaps),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
-        disable=tdqm_disable,
-    ):
-        if index == ref_index:
-            continue
-        if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
-            continue
-        trafo = oms.TransformationDescription()
-        aligner.align(fm, trafo)
-        transformer = oms.MapAlignmentTransformer()
-        transformer.transformRetentionTimes(fm, trafo, True)
+    # Choose alignment algorithm based on parameter
+    algo = params.get("algo").lower()
+
+    # Set common parameters for both algorithms
+    if algo == "pc":
+        # Parameters specific to PoseClustering
+        params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
+        params_oms.setValue("pairfinder:ignore_charge", "true")
+        params_oms.setValue("max_num_peaks_considered", 1000)
+        params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+        params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+        params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
+        params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
+        params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
+        params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
+        params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
+
+        """
+        {b'max_num_peaks_considered': 1000,
+        b'superimposer:mz_pair_max_distance': 0.5,
+        b'superimposer:rt_pair_distance_fraction': 0.1,
+        b'superimposer:num_used_points': 2000,
+        b'superimposer:scaling_bucket_size': 0.005,
+        b'superimposer:shift_bucket_size': 3.0,
+        b'superimposer:max_shift': 1000.0,
+        b'superimposer:max_scaling': 2.0,
+        b'superimposer:dump_buckets': '',
+        b'superimposer:dump_pairs': '',
+        b'pairfinder:second_nearest_gap': 2.0,
+        b'pairfinder:use_identifications': 'false',
+        b'pairfinder:ignore_charge': 'false',
+        b'pairfinder:ignore_adduct': 'true',
+        b'pairfinder:distance_RT:max_difference': 100.0,
+        b'pairfinder:distance_RT:exponent': 1.0,
+        b'pairfinder:distance_RT:weight': 1.0,
+        b'pairfinder:distance_MZ:max_difference': 0.3,
+        b'pairfinder:distance_MZ:unit': 'Da',
+        b'pairfinder:distance_MZ:exponent': 2.0,
+        b'pairfinder:distance_MZ:weight': 1.0,
+        b'pairfinder:distance_intensity:exponent': 1.0,
+        b'pairfinder:distance_intensity:weight': 0.0,
+        b'pairfinder:distance_intensity:log_transform': 'disabled'}
+        """
+    elif algo == "kd":
+        # Parameters specific to KD algorithm
+        params_oms.setValue("mz_unit", "Da")
+        params_oms.setValue("nr_partitions", params.get("nr_partitions"))
+
+        # Warp parameters for non-linear RT transformation
+        params_oms.setValue("warp:enabled", "true" if params.get("warp_enabled") else "false")
+        params_oms.setValue("warp:rt_tol", params.get("warp_rt_tol"))
+        params_oms.setValue("warp:mz_tol", params.get("warp_mz_tol"))
+        params_oms.setValue("warp:max_pairwise_log_fc", params.get("warp_max_pairwise_log_fc"))
+        params_oms.setValue("warp:min_rel_cc_size", params.get("warp_min_rel_cc_size"))
+        params_oms.setValue("warp:max_nr_conflicts", params.get("warp_max_nr_conflicts"))
+
+        # Link parameters
+        params_oms.setValue("link:rt_tol", params.get("link_rt_tol"))
+        params_oms.setValue("link:mz_tol", params.get("link_mz_tol"))
+        params_oms.setValue("link:charge_merging", params.get("link_charge_merging"))
+        params_oms.setValue("link:adduct_merging", params.get("link_adduct_merging"))
+
+        # Distance parameters
+        params_oms.setValue("distance_RT:exponent", params.get("distance_RT_exponent"))
+        params_oms.setValue("distance_RT:weight", params.get("distance_RT_weight"))
+        params_oms.setValue("distance_MZ:exponent", params.get("distance_MZ_exponent"))
+        params_oms.setValue("distance_MZ:weight", params.get("distance_MZ_weight"))
+        params_oms.setValue("distance_intensity:exponent", params.get("distance_intensity_exponent"))
+        params_oms.setValue("distance_intensity:weight", params.get("distance_intensity_weight"))
+        params_oms.setValue("distance_intensity:log_transform", params.get("distance_intensity_log_transform"))
+
+        # LOWESS parameters
+        params_oms.setValue("LOWESS:span", params.get("LOWESS_span"))
+        params_oms.setValue("LOWESS:num_iterations", params.get("LOWESS_num_iterations"))
+        params_oms.setValue("LOWESS:delta", params.get("LOWESS_delta"))
+        params_oms.setValue("LOWESS:interpolation_type", params.get("LOWESS_interpolation_type"))
+        params_oms.setValue("LOWESS:extrapolation_type", params.get("LOWESS_extrapolation_type"))
+
+    if algo == "pc":
+        aligner = oms.MapAlignmentAlgorithmPoseClustering()
+        self.logger.info("Starting alignment with PoseClustering")
+        # set ref_index to feature map index with largest number of features
+        ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+        self.logger.debug(
+            f"Reference map is {self.samples_df.row(ref_index, named=True)['sample_name']}",
+        )
+        aligner.setParameters(params_oms)
+        aligner.setReference(fmaps[ref_index])
+        self.logger.debug(f"Parameters for alignment: {params}")
+        # perform alignment and transformation of feature maps to the reference map (exclude reference map)
+        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+        for index, fm in tqdm(
+            list(enumerate(fmaps)),
+            total=len(fmaps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
+            disable=tdqm_disable,
+        ):
+            if index == ref_index:
+                continue
+            if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
+                continue
+            trafo = oms.TransformationDescription()
+            aligner.align(fm, trafo)
+            transformer = oms.MapAlignmentTransformer()
+            transformer.transformRetentionTimes(fm, trafo, True)
+
+        self.alignment_ref_index = ref_index
+
+    elif algo == "kd":
+        # KD algorithm requires num_maps and Param parameters
+        num_maps = len(fmaps)
+        aligner = oms.MapAlignmentAlgorithmKD(3, params_oms)
+        self.logger.info(f"Starting alignment with KD algorithm using {num_maps} maps")
+
+        kdtree = oms.KDTreeFeatureMaps()
+        kdtree.addMaps(fmaps)  # Add all feature maps to the KDTree
+        # kdtree.optimizeTree()
+        aligner.addRTFitData(kdtree)
+        aligner.fitLOWESS()
+        aligner.transform(kdtree)
 
-    self.alignment_ref_index = ref_index
+    else:
+        self.logger.error(f"Unknown alignment algorithm '{algo}'")
 
     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
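For orientation, the KD branch reduces to a short pyopenms sequence. The sketch below is standalone and hedged: the empty `FeatureMap` stand-ins and the use of `FeatureGroupingAlgorithmKD` defaults to seed the `Param` are assumptions. Note also that the hunk above passes a literal `3` as the first `MapAlignmentAlgorithmKD` argument even though it computes `num_maps = len(fmaps)`.

```python
# Minimal sketch of the KD alignment flow from the hunk above (pyopenms).
import pyopenms as oms

fmaps = [oms.FeatureMap() for _ in range(3)]  # stand-ins for real feature maps

# Assumption: seed the Param from the KD grouping defaults, then override
# the keys the hunk sets (warp:*, link:*, distance_*, LOWESS:*).
params_oms = oms.FeatureGroupingAlgorithmKD().getParameters()
params_oms.setValue("warp:enabled", "true")

aligner = oms.MapAlignmentAlgorithmKD(len(fmaps), params_oms)  # (num_maps, Param)
kdtree = oms.KDTreeFeatureMaps()
kdtree.addMaps(fmaps)         # index all feature maps in one KD-tree
aligner.addRTFitData(kdtree)  # collect RT fit points across maps
aligner.fitLOWESS()           # fit per-map LOWESS RT transformations
aligner.transform(kdtree)     # apply the transformations to the indexed features
```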
@@ -139,7 +222,7 @@ def align(self, **kwargs):
 
     # iterate through all feature_maps and add the transformed retention times to the features_df
 
-    # Build a fast lookup for (sample_uid, feature_uid) to index in features_df
+    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
     feats = self.features_df
 
     # Pre-build sample_uid lookup for faster access
@@ -150,7 +233,7 @@ def align(self, **kwargs):
 
     # Build the main lookup using feature_uid (not feature_id)
     if "feature_id" in feats.columns:
-        # Create lookup mapping (sample_uid, feature_uid) to DataFrame index using Polars
+        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
         # Since we need a pandas-style index lookup, we'll create a simple dict
         sample_uids = feats.get_column("sample_uid").to_list()
 
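The comments above describe a plain-dict row index. The loop body itself falls outside this hunk; the following is a hypothetical sketch of such a lookup, assuming `feats` is the Polars `features_df` with the columns named in the comments:

```python
# Illustrative only: map (sample_uid, feature_uid) -> row position for O(1) lookups.
sample_uids = feats.get_column("sample_uid").to_list()
feature_uids = feats.get_column("feature_uid").to_list()
row_index = {key: i for i, key in enumerate(zip(sample_uids, feature_uids))}
```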
@@ -206,14 +289,39 @@ def align(self, **kwargs):
 
     # Single batch update for all features at once
     if all_update_idx:
-        # Update "rt" column for specified indices using Polars
-        self.features_df = self.features_df.with_columns(
-            pl.when(pl.int_range(0, self.features_df.height).is_in(all_update_idx))
-            .then(pl.Series("rt", all_update_rt))
-            .otherwise(pl.col("rt"))
-            .alias("rt"),
-        )
-        # self.features_df.loc[all_update_idx, "rt_original"] = all_update_rt_original
+        # Build a full-length Python list of rt values, update specified indices,
+        # then replace the DataFrame column with a Series that has the same length
+        try:
+            current_rt = self.features_df["rt"].to_list()
+        except Exception:
+            current_rt = [None] * self.features_df.height
+
+        # Defensive: ensure list length equals dataframe height
+        if len(current_rt) != self.features_df.height:
+            current_rt = [None] * self.features_df.height
+
+        for idx, new_rt in zip(all_update_idx, all_update_rt):
+            current_rt[idx] = new_rt
+
+        new_cols = [pl.Series("rt", current_rt)]
+
+        # Update rt_original if corresponding updates were collected
+        if 'all_update_rt_original' in locals() and all_update_rt_original:
+            try:
+                current_rt_orig = self.features_df["rt_original"].to_list() if "rt_original" in self.features_df.columns else [None] * self.features_df.height
+            except Exception:
+                current_rt_orig = [None] * self.features_df.height
+
+            if len(current_rt_orig) != self.features_df.height:
+                current_rt_orig = [None] * self.features_df.height
+
+            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
+                current_rt_orig[idx] = new_orig
+
+            new_cols.append(pl.Series("rt_original", current_rt_orig))
+
+        # Replace columns in one call
+        self.features_df = self.features_df.with_columns(*new_cols)
 
     self.logger.debug("Alignment completed successfully.")
 
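The replacement above swaps a `pl.when(...).is_in(...)` expression for a plain list rebuild, which appears to sidestep length mismatches when `pl.Series("rt", all_update_rt)` is shorter than the frame. A self-contained Polars demo of the same pattern on toy data:

```python
import polars as pl

df = pl.DataFrame({"rt": [10.0, 20.0, 30.0, 40.0]})
all_update_idx = [1, 3]       # row positions to update
all_update_rt = [21.5, 39.8]  # new rt values for those rows

current_rt = df["rt"].to_list()  # full-length list, same height as df
for idx, new_rt in zip(all_update_idx, all_update_rt):
    current_rt[idx] = new_rt

df = df.with_columns(pl.Series("rt", current_rt))
print(df)  # rows 1 and 3 now hold 21.5 and 39.8
```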
@@ -238,8 +346,8 @@ def merge(self, **kwargs):
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
-
-    self.logger.info('Merging...')
+
+    self.logger.info("Merging...")
     # parameters initialization
     params = merge_defaults()
     for key, value in kwargs.items():
@@ -482,17 +590,17 @@ def merge(self, **kwargs):
     # Collect all adducts from feature_data_list to create consensus adduct information
     all_adducts = []
     adduct_masses = {}
-
+
     for fd in feature_data_list:
         # Get individual adduct and mass from each feature data (fd)
         adduct = fd.get("adduct")
         adduct_mass = fd.get("adduct_mass")
-
+
         if adduct is not None:
             all_adducts.append(adduct)
             if adduct_mass is not None:
                 adduct_masses[adduct] = adduct_mass
-
+
     # Calculate adduct_values for the consensus feature
     adduct_values = []
     if all_adducts:
@@ -506,9 +614,9 @@ def merge(self, **kwargs):
                 "adduct": str(adduct),
                 "count": int(count),
                 "percentage": float(round(percentage, 2)),
-                "mass": float(mass) if mass is not None else None
+                "mass": float(mass) if mass is not None else None,
             })
-
+
     # Sort adduct_values by count in descending order
     adduct_values.sort(key=lambda x: x["count"], reverse=True)  # type: ignore[arg-type,return-value]
     # Store adduct_values for use in metadata
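The `count`/`percentage` fields suggest a frequency tally over `all_adducts`. The tallying loop is elided from the diff; the following is a hypothetical reconstruction on toy values, not the package's code:

```python
from collections import Counter

all_adducts = ["[M+H]+", "[M+H]+", "[M+Na]+"]            # toy inputs
adduct_masses = {"[M+H]+": 1.00728, "[M+Na]+": 22.98922}

adduct_values = []
for adduct, count in Counter(all_adducts).items():
    percentage = 100.0 * count / len(all_adducts)
    mass = adduct_masses.get(adduct)
    adduct_values.append({
        "adduct": str(adduct),
        "count": int(count),
        "percentage": float(round(percentage, 2)),
        "mass": float(mass) if mass is not None else None,
    })

# Sort adduct_values by count in descending order, as in the hunk above
adduct_values.sort(key=lambda x: x["count"], reverse=True)
```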
@@ -613,7 +721,7 @@ def find_ms2(self, **kwargs):
     """
     # Reset consensus_ms2 DataFrame at the start
     self.consensus_ms2 = pl.DataFrame()
-
+
     # parameters initialization
     params = find_ms2_defaults()
     for key, value in kwargs.items():
masster/study/save.py CHANGED
@@ -21,7 +21,7 @@ def save(self, filename=None, add_timestamp=True, compress=False):
         filename (str, optional): Target file name. If None, uses default.
         add_timestamp (bool, optional): If True, appends timestamp to avoid overwriting.
             Default True for safety (original behavior).
-        compress (bool, optional): If True, uses compressed mode and skips 
+        compress (bool, optional): If True, uses compressed mode and skips
            some heavy columns for maximum speed. Default False.
     """
 
@@ -46,11 +46,11 @@ def save(self, filename=None, add_timestamp=True, compress=False):
         filename = f"{filename.replace('.study5', '')}_{timestamp}.study5"
 
     # Log file size information for performance monitoring
-    if hasattr(self, 'features_df') and not self.features_df.is_empty():
+    if hasattr(self, "features_df") and not self.features_df.is_empty():
         feature_count = len(self.features_df)
-        sample_count = len(self.samples_df) if hasattr(self, 'samples_df') and not self.samples_df.is_empty() else 0
+        sample_count = len(self.samples_df) if hasattr(self, "samples_df") and not self.samples_df.is_empty() else 0
         self.logger.info(f"Saving study with {sample_count} samples and {feature_count} features to {filename}")
-
+
     # Use compressed mode for large datasets
     if compress:
         self._save_study5_compressed(filename)
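For context, the timestamp suffix visible at the top of this hunk behaves as below; the exact `strftime` format is an assumption, since `timestamp` is assigned outside the hunk:

```python
from datetime import datetime

filename = "mystudy.study5"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")  # format assumed
filename = f"{filename.replace('.study5', '')}_{timestamp}.study5"
print(filename)  # e.g. mystudy_20250101-120000.study5
```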
@@ -106,7 +106,7 @@ def save_samples(self, samples=None):
         ddaobj.save()
         sample_name = sample_row.row(0, named=True)["sample_name"]
         sample_path = sample_row.row(0, named=True)["sample_path"]
-
+
         # Find the index of this sample in the original order for features_maps
         sample_index = next(
             (
@@ -116,7 +116,7 @@ def save_samples(self, samples=None):
             ),
             None,
         )
-
+
         # Determine where to save the featureXML file based on sample_path location
         if sample_path.endswith(".sample5"):
             # If sample_path is a .sample5 file, save featureXML in the same directory
@@ -135,7 +135,7 @@ def save_samples(self, samples=None):
                 sample_name + ".featureXML",
             )
             self.logger.debug(f"Saving featureXML to default location: {featurexml_filename}")
-
+
         fh = oms.FeatureXMLFile()
         if sample_index is not None and sample_index < len(self.features_maps):
             fh.store(featurexml_filename, self.features_maps[sample_index])
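The `fh.store(...)` call above is the standard pyopenms featureXML writer; a minimal self-contained round trip:

```python
import pyopenms as oms

fmap = oms.FeatureMap()               # empty map, just to demonstrate
fh = oms.FeatureXMLFile()
fh.store("sample.featureXML", fmap)   # write a feature map to featureXML
fh.load("sample.featureXML", fmap)    # read it back into the same object
```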