PyPI - masster - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (20)

masster/_version.py +1 -1
masster/sample/load.py +5 -4
masster/study/defaults/align_def.py +0 -204
masster/study/defaults/fill_def.py +9 -1
masster/study/defaults/merge_def.py +20 -69
masster/study/export.py +25 -5
masster/study/h5.py +160 -42
masster/study/helpers.py +430 -53
masster/study/load.py +986 -158
masster/study/merge.py +683 -1076
masster/study/plot.py +43 -38
masster/study/processing.py +337 -280
masster/study/study.py +58 -135
masster/wizard/wizard.py +20 -6
{masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
{masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
masster/study/defaults/fill_chrom_def.py +0 -260
{masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
{masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
{masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0

masster/_version.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.1"
+__version__ = "0.5.4"
 def get_version():

masster/sample/load.py CHANGED Viewed

@@ -155,13 +155,14 @@ def load_noms1(
     # check if file is mzML
     if filename.lower().endswith(".mzml"):
-        self._load_mzML(filename)
+        _load_mzML(self, filename)
     elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
-        self._load_wiff(filename)
+        _load_wiff(self, filename)
     elif filename.lower().endswith(".raw"):
-        self._load_raw(filename)
+        _load_raw(self, filename)
     elif filename.lower().endswith(".sample5"):
-        self._load_sample5_study(filename)  # Use optimized version for study loading
+        from masster.sample.h5 import _load_sample5_study
+        _load_sample5_study(self, filename)  # Use optimized version for study loading
     else:
         raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")

masster/study/defaults/align_def.py CHANGED Viewed

@@ -24,30 +24,7 @@ class align_defaults:
         skip_blanks (bool): Whether to skip blank samples. Default is False.
         KD algorithm specific parameters:
-        min_samples (int): Minimum number of samples required for KD alignment. Default is 3.
-        nr_partitions (int): Number of partitions in m/z dimension. Default is 100.
-        warp_enabled (bool): Enable non-linear retention time transformation. Default is True.
-        warp_rt_tol (float): RT tolerance for the LOWESS fit. Default is 5.0.
         warp_mz_tol (float): m/z tolerance for the LOWESS fit. Default is 0.05.
-        warp_max_pairwise_log_fc (float): Maximum absolute log10 fold change threshold for pairing. Default is 0.5.
-        warp_min_rel_cc_size (float): Minimum relative connected component size. Default is 0.5.
-        warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment. Default is 0.
-        link_rt_tol (float): Width of RT tolerance window for linking (seconds). Default is 30.0.
-        link_mz_tol (float): m/z tolerance for linking features (ppm or Da). Default is 10.0.
-        link_charge_merging (str): Charge merging strategy for linking. Default is "With_charge_zero".
-        link_adduct_merging (str): Adduct merging strategy for linking. Default is "Any".
-        distance_RT_exponent (float): Exponent for normalized RT differences. Default is 1.0.
-        distance_RT_weight (float): Weight factor for final RT distances. Default is 1.0.
-        distance_MZ_exponent (float): Exponent for normalized m/z differences. Default is 2.0.
-        distance_MZ_weight (float): Weight factor for final m/z distances. Default is 1.0.
-        distance_intensity_exponent (float): Exponent for differences in relative intensity. Default is 1.0.
-        distance_intensity_weight (float): Weight factor for final intensity distances. Default is 1.0.
-        distance_intensity_log_transform (str): Log-transform intensities. Default is "enabled".
-        LOWESS_span (float): Fraction of datapoints for each local regression. Default is 0.666666666666667.
-        LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting. Default is 3.
-        LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes). Default is -1.0.
-        LOWESS_interpolation_type (str): Method for interpolation between datapoints. Default is "cspline".
-        LOWESS_extrapolation_type (str): Method for extrapolation outside data range. Default is "four-point-linear".
     """
     rt_tol: float = 5.0
@@ -60,30 +37,7 @@ class align_defaults:
     algorithm: str = "kd"
     # KD algorithm specific parameters
-    min_samples: int = 3
-    nr_partitions: int = 100
-    warp_enabled: bool = True
-    warp_rt_tol: float = 5.0
     warp_mz_tol: float = 0.05
-    warp_max_pairwise_log_fc: float = 0.5
-    warp_min_rel_cc_size: float = 0.5
-    warp_max_nr_conflicts: int = 0
-    link_rt_tol: float = 30.0
-    link_mz_tol: float = 10.0
-    link_charge_merging: str = "With_charge_zero"
-    link_adduct_merging: str = "Any"
-    distance_RT_exponent: float = 1.0
-    distance_RT_weight: float = 1.0
-    distance_MZ_exponent: float = 2.0
-    distance_MZ_weight: float = 1.0
-    distance_intensity_exponent: float = 1.0
-    distance_intensity_weight: float = 1.0
-    distance_intensity_log_transform: str = "enabled"
-    LOWESS_span: float = 0.666666666666667
-    LOWESS_num_iterations: int = 3
-    LOWESS_delta: float = -1.0
-    LOWESS_interpolation_type: str = "cspline"
-    LOWESS_extrapolation_type: str = "four-point-linear"
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
@@ -139,32 +93,6 @@ class align_defaults:
                 "allowed_values": ["pc", "kd"],
             },
             # KD algorithm specific parameters
-            "min_samples": {
-                "dtype": int,
-                "description": "Minimum number of samples required for KD alignment algorithm",
-                "default": 3,
-                "min_value": 2,
-                "max_value": 1000,
-            },
-            "nr_partitions": {
-                "dtype": int,
-                "description": "Number of partitions in m/z dimension for KD algorithm",
-                "default": 100,
-                "min_value": 1,
-                "max_value": 1000,
-            },
-            "warp_enabled": {
-                "dtype": bool,
-                "description": "Enable non-linear retention time transformation for KD algorithm",
-                "default": True,
-            },
-            "warp_rt_tol": {
-                "dtype": float,
-                "description": "RT tolerance for the LOWESS fit in KD algorithm (seconds)",
-                "default": 5.0,
-                "min_value": 0.1,
-                "max_value": 60.0,
-            },
             "warp_mz_tol": {
                 "dtype": float,
                 "description": "m/z tolerance for the LOWESS fit in KD algorithm (Da)",
@@ -172,138 +100,6 @@ class align_defaults:
                 "min_value": 0.001,
                 "max_value": 1.0,
             },
-            "warp_max_pairwise_log_fc": {
-                "dtype": float,
-                "description": "Maximum absolute log10 fold change between two compatible signals during compatibility graph construction in KD algorithm",
-                "default": 0.5,
-                "min_value": -1.0,
-                "max_value": 10.0,
-            },
-            "warp_min_rel_cc_size": {
-                "dtype": float,
-                "description": "Minimum relative connected component size for KD algorithm",
-                "default": 0.5,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
-            "warp_max_nr_conflicts": {
-                "dtype": int,
-                "description": "Allow up to this many conflicts (features from the same map) per connected component to be used for alignment (-1 means allow any number of conflicts)",
-                "default": 0,
-                "min_value": -1,
-                "max_value": 1000,
-            },
-            "link_rt_tol": {
-                "dtype": float,
-                "description": "Width of RT tolerance window for linking in KD algorithm (seconds)",
-                "default": 30.0,
-                "min_value": 0.0,
-                "max_value": 300.0,
-            },
-            "link_mz_tol": {
-                "dtype": float,
-                "description": "m/z tolerance for linking features in KD algorithm (ppm or Da)",
-                "default": 10.0,
-                "min_value": 0.0,
-                "max_value": 100.0,
-            },
-            "link_charge_merging": {
-                "dtype": str,
-                "description": "Charge merging strategy for linking features in KD algorithm",
-                "default": "With_charge_zero",
-                "allowed_values": ["Identical", "With_charge_zero", "Any"],
-            },
-            "link_adduct_merging": {
-                "dtype": str,
-                "description": "Adduct merging strategy for linking features in KD algorithm",
-                "default": "Any",
-                "allowed_values": ["Identical", "With_unknown_adducts", "Any"],
-            },
-            "distance_RT_exponent": {
-                "dtype": float,
-                "description": "Normalized RT differences are raised to this power in KD algorithm",
-                "default": 1.0,
-                "min_value": 0.0,
-                "max_value": 10.0,
-            },
-            "distance_RT_weight": {
-                "dtype": float,
-                "description": "Final RT distances are weighted by this factor in KD algorithm",
-                "default": 1.0,
-                "min_value": 0.0,
-                "max_value": 100.0,
-            },
-            "distance_MZ_exponent": {
-                "dtype": float,
-                "description": "Normalized m/z differences are raised to this power in KD algorithm",
-                "default": 2.0,
-                "min_value": 0.0,
-                "max_value": 10.0,
-            },
-            "distance_MZ_weight": {
-                "dtype": float,
-                "description": "Final m/z distances are weighted by this factor in KD algorithm",
-                "default": 1.0,
-                "min_value": 0.0,
-                "max_value": 100.0,
-            },
-            "distance_intensity_exponent": {
-                "dtype": float,
-                "description": "Differences in relative intensity are raised to this power in KD algorithm",
-                "default": 1.0,
-                "min_value": 0.0,
-                "max_value": 10.0,
-            },
-            "distance_intensity_weight": {
-                "dtype": float,
-                "description": "Final intensity distances are weighted by this factor in KD algorithm",
-                "default": 1.0,
-                "min_value": 0.0,
-                "max_value": 100.0,
-            },
-            "distance_intensity_log_transform": {
-                "dtype": str,
-                "description": "Log-transform intensities in KD algorithm distance calculation",
-                "default": "enabled",
-                "allowed_values": ["enabled", "disabled"],
-            },
-            "LOWESS_span": {
-                "dtype": float,
-                "description": "Fraction of datapoints to use for each local regression in LOWESS fitting",
-                "default": 0.666666666666667,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
-            "LOWESS_num_iterations": {
-                "dtype": int,
-                "description": "Number of robustifying iterations for LOWESS fitting",
-                "default": 3,
-                "min_value": 0,
-                "max_value": 10,
-            },
-            "LOWESS_delta": {
-                "dtype": float,
-                "description": "Nonnegative parameter for LOWESS computations (negative value auto-computes)",
-                "default": -1.0,
-                "min_value": -1.0,
-                "max_value": 1000.0,
-            },
-            "LOWESS_interpolation_type": {
-                "dtype": str,
-                "description": "Method to use for interpolation between datapoints computed by LOWESS",
-                "default": "cspline",
-                "allowed_values": ["linear", "cspline", "akima"],
-            },
-            "LOWESS_extrapolation_type": {
-                "dtype": str,
-                "description": "Method to use for extrapolation outside the data range in LOWESS",
-                "default": "four-point-linear",
-                "allowed_values": [
-                    "two-point-linear",
-                    "four-point-linear",
-                    "global-linear",
-                ],
-            },
         },
         repr=False,
     )

masster/study/defaults/fill_def.py CHANGED Viewed

@@ -21,10 +21,11 @@ class fill_defaults:
     """
     uids: Optional[list] = None
-    mz_tol: float = 0.010
+    mz_tol: float = 0.050
     rt_tol: float = 10.0
     min_samples_rel: float = 0.00
     min_samples_abs: int = 5
+    threads: int = 6
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
@@ -61,6 +62,13 @@ class fill_defaults:
                 "min_value": 0,
                 "max_value": 100,
             },
+            "threads": {
+                "dtype": int,
+                "description": "Number of parallel threads",
+                "default": 6,
+                "min_value": 1,
+                "max_value": 32,
+            },
         },
         repr=False,
     )

masster/study/defaults/merge_def.py CHANGED Viewed

@@ -13,7 +13,7 @@ class merge_defaults:
     method selection, grouping tolerances, and algorithm-specific parameters.
     Attributes:
-        method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
+        method (str): Merge method to use ('kd', 'qt', 'kd_chunked', 'qt_chunked'). Default is "kd".
         min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
         rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
         mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
@@ -25,38 +25,31 @@ class merge_defaults:
         link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
     """
-    method: str = "qt"
+    method: str = "kd"
     min_samples: int = 2
     rt_tol: float = 5.0
-    mz_tol: float = 0.01
+    mz_tol: float = 0.05
     chunk_size: int = 500
     nr_partitions: int = 1000
     min_rel_cc_size: float = 0.1
     max_pairwise_log_fc: float = -1.0
     max_nr_conflicts: int = 0
     link_ms2: bool = True
+    extract_ms1: bool = True
+    # Cross-chunk merging parameters
+    dechunking: str = "hierarchical"
     # Parallel processing parameters
     threads: Optional[int] = None
-    # KD-Strict specific parameters
-    optimize_rt_tol: bool = False
-    rt_tol_range: tuple = (0.5, 4.0)
-    rt_tol_steps: int = 7
-    secondary_merge_rt_tol: float = 1.0
-    secondary_merge_mz_tol: float = 0.005
-    min_sample_overlap: float = 0.8
-    max_rt_spread: float = 2.0  # Will default to 2x rt_tol
-    min_coherence: float = 0.0
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "method": {
                 "dtype": str,
                 "description": "Merge method (algorithm) to use",
-                "default": "quality",
-                "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
-                                 "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+                "default": "kd",
+                "allowed_values": ["kd", "qt",
                                  "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
             },
             "min_samples": {
@@ -118,7 +111,17 @@ class merge_defaults:
                 "description": "Whether to link MS2 spectra to consensus features",
                 "default": True,
             },
-            # Parallel processing parameters
+            "extract_ms1": {
+                "dtype": bool,
+                "description": "Whether to extract MS1 chromatograms for consensus features",
+                "default": True,
+            },
+            "dechunking": {
+                "dtype": str,
+                "description": "Cross-chunk merging algorithm for chunked methods",
+                "default": "hierarchical",
+                "allowed_values": ["hierarchical", "kdtree"],
+            },
             "threads": {
                 "dtype": [int, type(None)],
                 "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
@@ -126,58 +129,6 @@ class merge_defaults:
                 "min_value": 1,
                 "max_value": 32,
             },
-            # KD-Strict specific parameters
-            "optimize_rt_tol": {
-                "dtype": bool,
-                "description": "Enable RT tolerance optimization for kd-strict method",
-                "default": False,
-            },
-            "rt_tol_range": {
-                "dtype": tuple,
-                "description": "RT tolerance range for optimization (min, max) in seconds",
-                "default": (0.8, 2.0),
-            },
-            "rt_tol_steps": {
-                "dtype": int,
-                "description": "Number of steps for RT tolerance optimization",
-                "default": 5,
-                "min_value": 3,
-                "max_value": 20,
-            },
-            "secondary_merge_rt_tol": {
-                "dtype": float,
-                "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
-                "default": 0.5,
-                "min_value": 0.1,
-                "max_value": 5.0,
-            },
-            "secondary_merge_mz_tol": {
-                "dtype": float,
-                "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
-                "default": 0.005,
-                "min_value": 0.001,
-                "max_value": 0.1,
-            },
-            "min_sample_overlap": {
-                "dtype": float,
-                "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
-                "default": 0.8,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
-            "max_rt_spread": {
-                "dtype": float,
-                "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
-                "default": None,
-                "min_value": 0.1,
-            },
-            "min_coherence": {
-                "dtype": float,
-                "description": "Minimum chromatographic coherence score (0.0 = disabled)",
-                "default": 0.0,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
         },
         repr=False,
     )

masster/study/export.py CHANGED Viewed

@@ -551,7 +551,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         else:
             self.logger.info("No identification data available for mzTab export")
     except Exception as e:
-        self.logger.warning(f"Could not retrieve identification data: {e}")
+        self.logger.debug(f"Could not retrieve identification data: {e}")
         id_data = None
         top_id_data = None
         full_id_data = None
@@ -1190,10 +1190,12 @@ def export_xlsx(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
-    The Excel file contains three worksheets:
-    - consensus_df: Consensus features dataframe
-    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
+    The Excel file contains five worksheets:
+    - samples: Samples dataframe
+    - consensus: Consensus features dataframe
     - identification: Identification results with library annotations (get_id)
+    - gaps: Gaps matrix showing filled vs non-filled features (get_gaps_matrix)
+    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
     Args:
         filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
@@ -1263,7 +1265,25 @@ def export_xlsx(self, filename: str | None = None) -> None:
             f"Error getting identification data: {e}. Skipping identification worksheet.",
         )
-    # 4. Consensus matrix (last worksheet)
+    # 4. Gaps matrix (filled vs non-filled features)
+    try:
+        gaps_df = self.get_gaps_matrix()
+        if gaps_df is not None and not gaps_df.is_empty():
+            gaps_pandas = gaps_df.to_pandas()
+            worksheets["gaps"] = gaps_pandas
+            self.logger.debug(
+                f"Added gaps worksheet with {len(gaps_pandas)} rows",
+            )
+        else:
+            self.logger.warning(
+                "get_gaps_matrix() returned empty data, skipping gaps worksheet",
+            )
+    except Exception as e:
+        self.logger.debug(
+            f"Error getting gaps data: {e}. Skipping gaps worksheet.",
+        )
+    # 5. Consensus matrix (last worksheet)
     try:
         matrix_df = self.get_consensus_matrix()
         if matrix_df is not None and not matrix_df.is_empty():

masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

Potentially problematic release.

masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl