PyPI - masster - Versions diffs - 0.5.0__tar.gz → 0.5.3__tar.gz - Mend

masster 0.5.0.tar.gz → 0.5.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (95)

{masster-0.5.0 → masster-0.5.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.0
+Version: 0.5.3
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.5.0 → masster-0.5.3}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.5.0"
+version = "0.5.3"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.5.0 → masster-0.5.3}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.0"
+__version__ = "0.5.4"
 def get_version():

masster-0.5.3/src/masster/data/libs/aa.csv ADDED Viewed

@@ -0,0 +1,22 @@
+name,smiles,inchikey,formula,db_id,db
+L-Glutamic acid,N[C@@H](CCC(O)=O)C(O)=O,WHUUTDBJXJRKMK-VKHMYHEASA-N,C5H9NO4,CID:33032,pubchem
+L-Tyrosine,N[C@@H](CC1=CC=C(O)C=C1)C(O)=O,OUYCCCASQSFEME-QMMMGPOBSA-N,C9H11NO3,CID:6057,pubchem
+L-Phenylalanine,N[C@@H](CC1=CC=CC=C1)C(O)=O,COLNVLDHVKWLRT-QMMMGPOBSA-N,C9H11NO2,CID:6140,pubchem
+L-Alanine,C[C@H](N)C(O)=O,QNAYBMKLOCPYGJ-REOHCLBHSA-N,C3H7NO2,CID:5950,pubchem
+L-Proline,OC(=O)[C@@H]1CCCN1,ONIBWKKTOPOVIA-BYPYZUCNSA-N,C5H9NO2,CID:145742,pubchem
+L-Threonine,C[C@@H](O)[C@H](N)C(O)=O,AYFVYJQAPQTCCC-GBXIJSLDSA-N,C4H9NO3,CID:6288,pubchem
+L-Asparagine,N[C@@H](CC(N)=O)C(O)=O,DCXYFEDJOCDNAF-REOHCLBHSA-N,C4H8N2O3,CID:6267,pubchem
+L-Isoleucine,CC[C@H](C)[C@H](N)C(O)=O,AGPKZVBTJJNPAG-WHFBIAKZSA-N,C6H13NO2,CID:6306,pubchem
+L-Histidine,N[C@@H](CC1=CN=CN1)C(O)=O,HNDVDQJCIGZPNO-YFKPBYRVSA-N,C6H9N3O2,CID:6274,pubchem
+L-Lysine,NCCCC[C@H](N)C(O)=O,KDXKERNSBIXSRK-YFKPBYRVSA-N,C6H14N2O2,CID:5962,pubchem
+L-Serine,N[C@@H](CO)C(O)=O,MTCFGRXMJLQNBG-REOHCLBHSA-N,C3H7NO3,CID:5951,pubchem
+L-Aspartic acid,N[C@@H](CC(O)=O)C(O)=O,CKLJMWTZIZZHCS-REOHCLBHSA-N,C4H7NO4,CID:5960,pubchem
+L-Cystine,N[C@@H](CSSC[C@H](N)C(O)=O)C(O)=O,LEVWYRKDKASIDU-IMJSIDKUSA-N,C6H12N2O4S2,CID:67678,pubchem
+L-Arginine,N[C@@H](CCCNC(N)=N)C(O)=O,ODKSFYDXXFIFQN-BYPYZUCNSA-N,C6H14N4O2,CID:6322,pubchem
+L-Cysteine,N[C@@H](CS)C(O)=O,XUJNEKJLAYXESH-REOHCLBHSA-N,C3H7NO2S,CID:5862,pubchem
+L-Glutamine,N[C@@H](CCC(N)=O)C(O)=O,ZDXPYRJPNDTMRX-VKHMYHEASA-N,C5H10N2O3,CID:5961,pubchem
+L-Leucine,CC(C)C[C@H](N)C(O)=O,ROHFNLRQFUQHCH-YFKPBYRVSA-N,C6H13NO2,CID:6106,pubchem
+L-Methionine,CSCC[C@H](N)C(O)=O,FFEARJCKVFRZRR-BYPYZUCNSA-N,C5H11NO2S,CID:6137,pubchem
+L-Valine,CC(C)[C@H](N)C(O)=O,KZSNJWFQEVHDMF-BYPYZUCNSA-N,C5H11NO2,CID:6287,pubchem
+L-Tryptophan,N[C@@H](CC1=CNC2=C1C=CC=C2)C(O)=O,QIVBCDIJIAJPQS-VIFPVBQESA-N,C11H12N2O2,CID:6305,pubchem
+Glycine,NCC(O)=O,QNAYBMKLOCPYGJ-UHFFFAOYSA-N,C2H5NO2,CID:750,Glycine

{masster-0.5.0 → masster-0.5.3}/src/masster/lib/lib.py RENAMED Viewed

@@ -123,11 +123,13 @@ class Lib:
             "inchi": pl.Series([], dtype=pl.Utf8),
             "inchikey": pl.Series([], dtype=pl.Utf8),
             "formula": pl.Series([], dtype=pl.Utf8),
+            "iso": pl.Series([], dtype=pl.Int64),
             "adduct": pl.Series([], dtype=pl.Utf8),
             "m": pl.Series([], dtype=pl.Float64),
             "z": pl.Series([], dtype=pl.Int8),
             "mz": pl.Series([], dtype=pl.Float64),
             "rt": pl.Series([], dtype=pl.Float64),
+            "quant_group": pl.Series([], dtype=pl.Int64),
             "db_id": pl.Series([], dtype=pl.Utf8),
             "db": pl.Series([], dtype=pl.Utf8),
         })
@@ -245,11 +247,13 @@ class Lib:
                 "inchi": compound_data.get("inchi", ""),
                 "inchikey": compound_data.get("inchikey", ""),
                 "formula": compound_data["formula"],
+                "iso": 0,  # Default to zero
                 "adduct": adduct,
                 "m": adducted_mass,
                 "z": charge,
                 "mz": mz,
                 "rt": compound_data.get("rt", None),
+                "quant_group": counter,  # Use same as lib_uid for default
                 "db_id": compound_data.get("db_id", None),
                 "db": compound_data.get("db", None),
             }
@@ -526,12 +530,14 @@ class Lib:
                     "source_id": match_row.get("source_id"),
                     "name": match_row["name"],
                     "formula": match_row["formula"],
+                    "iso": match_row.get("iso", 0),
                     "adduct": match_row["adduct"],
                     "smiles": match_row["smiles"],
                     "inchi": match_row["inchi"],
                     "inchikey": match_row["inchikey"],
                     "lib_mz": match_row["mz"],
                     "lib_rt": match_row["rt"],
+                    "quant_group": match_row.get("quant_group"),
                     "delta_mz": abs(feature_mz - match_row["mz"]),
                     "delta_rt": abs(feature_rt - match_row["rt"]) if feature_rt is not None and match_row["rt"] is not None else None,
                 }

{masster-0.5.0 → masster-0.5.3}/src/masster/sample/load.py RENAMED Viewed

@@ -155,13 +155,14 @@ def load_noms1(
     # check if file is mzML
     if filename.lower().endswith(".mzml"):
-        self._load_mzML(filename)
+        _load_mzML(self, filename)
     elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
-        self._load_wiff(filename)
+        _load_wiff(self, filename)
     elif filename.lower().endswith(".raw"):
-        self._load_raw(filename)
+        _load_raw(self, filename)
     elif filename.lower().endswith(".sample5"):
-        self._load_sample5_study(filename)  # Use optimized version for study loading
+        from masster.sample.h5 import _load_sample5_study
+        _load_sample5_study(self, filename)  # Use optimized version for study loading
     else:
         raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")

masster-0.5.0/src/masster/study/defaults/fill_chrom_def.py → masster-0.5.3/src/masster/study/defaults/align_def.py RENAMED Viewed

@@ -1,65 +1,104 @@
-"""Parameter class for Study fill_chrom method."""
+"""Parameter class for Study align method."""
 from dataclasses import dataclass, field
-from typing import Optional, Any
+from typing import Any
 @dataclass
-class fill_chrom_defaults:
+class align_defaults:
     """
-    Parameter class for Study fill_chrom method.
+    Parameter class for Study align method.
-    This class encapsulates parameters for filling missing chromatograms
-    by extracting them from raw data across samples.
+    This class encapsulates parameters for feature alignment across samples,
+    including retention time and m/z tolerances, warping parameters, and
+    alignment algorithm settings.
     Attributes:
-        uids (Optional[list]): List of consensus UIDs to process. Default is None (all).
-        mz_tol (float): m/z tolerance for chromatogram extraction (Da). Default is 0.010.
-        rt_tol (float): RT tolerance for chromatogram extraction (seconds). Default is 10.0.
-        min_samples_rel (float): Minimum relative samples threshold. Default is 0.05.
-        min_samples_abs (int): Minimum absolute samples threshold. Default is 5.
+        algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD). Default is 'pc'.
+        rt_tol (float): Maximum retention time difference for alignment. Default is 60.0.
+        mz_max_diff (float): Maximum m/z difference for alignment. Default is 0.02.
+        rt_pair_distance_frac (float): Fraction of RT difference for pair distance. Default is 0.2.
+        mz_pair_max_distance (float): Maximum m/z pair distance. Default is 0.01.
+        num_used_points (int): Number of points used for alignment. Default is 1000.
+        save_features (bool): Whether to save features after alignment. Default is False.
+        skip_blanks (bool): Whether to skip blank samples. Default is False.
+        KD algorithm specific parameters:
+        warp_mz_tol (float): m/z tolerance for the LOWESS fit. Default is 0.05.
     """
-    uids: Optional[list] = None
-    mz_tol: float = 0.010
-    rt_tol: float = 10.0
-    min_samples_rel: float = 0.05
-    min_samples_abs: int = 5
+    rt_tol: float = 5.0
+    mz_max_diff: float = 0.01
+    rt_pair_distance_frac: float = 0.5
+    mz_pair_max_distance: float = 0.01
+    num_used_points: int = 1000
+    save_features: bool = False
+    skip_blanks: bool = False
+    algorithm: str = "kd"
+    # KD algorithm specific parameters
+    warp_mz_tol: float = 0.05
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
-            "uids": {
-                "dtype": "Optional[list]",
-                "description": "List of consensus UIDs to process (None for all)",
-                "default": None,
+            "rt_tol": {
+                "dtype": float,
+                "description": "Maximum retention time difference for alignment (seconds)",
+                "default": 5.0,
+                "min_value": 1.0,
+                "max_value": 30.0,
             },
-            "mz_tol": {
+            "mz_max_diff": {
                 "dtype": float,
-                "description": "m/z tolerance for chromatogram extraction (Da)",
-                "default": 0.010,
+                "description": "Maximum m/z difference for alignment (Da)",
+                "default": 0.01,
                 "min_value": 0.001,
-                "max_value": 0.1,
+                "max_value": 0.05,
             },
-            "rt_tol": {
+            "rt_pair_distance_frac": {
                 "dtype": float,
-                "description": "RT tolerance for chromatogram extraction (seconds)",
-                "default": 10.0,
-                "min_value": 1.0,
-                "max_value": 300.0,
+                "description": "Fraction of RT difference for pair distance calculation",
+                "default": 0.2,
+                "min_value": 0.1,
+                "max_value": 1.0,
             },
-            "min_samples_rel": {
+            "mz_pair_max_distance": {
                 "dtype": float,
-                "description": "Minimum relative samples threshold (fraction)",
-                "default": 0.05,
-                "min_value": 0.01,
-                "max_value": 1.0,
+                "description": "Maximum m/z pair distance (Da)",
+                "default": 0.01,
+                "min_value": 0.001,
+                "max_value": 0.2,
             },
-            "min_samples_abs": {
+            "num_used_points": {
                 "dtype": int,
-                "description": "Minimum absolute samples threshold",
-                "default": 5,
-                "min_value": 1,
-                "max_value": 100,
+                "description": "Number of points used for alignment",
+                "default": 1000,
+                "min_value": 10,
+                "max_value": 10000,
+            },
+            "save_features": {
+                "dtype": bool,
+                "description": "Whether to save features after alignment",
+                "default": False,
+            },
+            "skip_blanks": {
+                "dtype": bool,
+                "description": "Whether to skip blank samples during alignment",
+                "default": False,
+            },
+            "algorithm": {
+                "dtype": str,
+                "description": "Alignment algorithm to use",
+                "default": "pc",
+                "allowed_values": ["pc", "kd"],
+            },
+            # KD algorithm specific parameters
+            "warp_mz_tol": {
+                "dtype": float,
+                "description": "m/z tolerance for the LOWESS fit in KD algorithm (Da)",
+                "default": 0.05,
+                "min_value": 0.001,
+                "max_value": 1.0,
             },
         },
         repr=False,
@@ -111,14 +150,6 @@ class fill_chrom_defaults:
         metadata = self._param_metadata[param_name]
         expected_dtype = metadata["dtype"]
-        # Handle optional types
-        if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional"):
-            if value is None:
-                return True
-            # Extract the inner type for validation
-            if "list" in expected_dtype:
-                expected_dtype = list
         # Type checking
         if expected_dtype is int:
             if not isinstance(value, int):
@@ -132,8 +163,8 @@ class fill_chrom_defaults:
                     value = float(value)
                 except (ValueError, TypeError):
                     return False
-        elif expected_dtype is list:
-            if not isinstance(value, list):
+        elif expected_dtype is bool:
+            if not isinstance(value, bool):
                 return False
         # Range validation for numeric types
@@ -143,6 +174,11 @@ class fill_chrom_defaults:
             if "max_value" in metadata and value > metadata["max_value"]:
                 return False
+        # Allowed values validation for string types
+        if expected_dtype is str and "allowed_values" in metadata:
+            if value not in metadata["allowed_values"]:
+                return False
         return True
     def set(self, param_name: str, value: Any, validate: bool = True) -> bool:
@@ -167,20 +203,18 @@ class fill_chrom_defaults:
         if param_name in self._param_metadata:
             expected_dtype = self._param_metadata[param_name]["dtype"]
-            # Handle optional types
-            if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional") and value is not None:
-                if "int" in expected_dtype and not isinstance(value, int):
-                    try:
-                        value = int(value)
-                    except (ValueError, TypeError):
-                        if validate:
-                            return False
-                elif "float" in expected_dtype and not isinstance(value, float):
-                    try:
-                        value = float(value)
-                    except (ValueError, TypeError):
-                        if validate:
-                            return False
+            if expected_dtype is int and not isinstance(value, int):
+                try:
+                    value = int(value)
+                except (ValueError, TypeError):
+                    if validate:
+                        return False
+            elif expected_dtype is float and not isinstance(value, float):
+                try:
+                    value = float(value)
+                except (ValueError, TypeError):
+                    if validate:
+                        return False
         setattr(self, param_name, value)
         return True

{masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/fill_def.py RENAMED Viewed

@@ -21,10 +21,11 @@ class fill_defaults:
     """
     uids: Optional[list] = None
-    mz_tol: float = 0.010
+    mz_tol: float = 0.050
     rt_tol: float = 10.0
     min_samples_rel: float = 0.00
     min_samples_abs: int = 5
+    threads: int = 6
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
@@ -58,9 +59,16 @@ class fill_defaults:
                 "dtype": int,
                 "description": "Minimum absolute samples threshold",
                 "default": 5,
-                "min_value": 1,
+                "min_value": 0,
                 "max_value": 100,
             },
+            "threads": {
+                "dtype": int,
+                "description": "Number of parallel threads",
+                "default": 6,
+                "min_value": 1,
+                "max_value": 32,
+            },
         },
         repr=False,
     )

{masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/merge_def.py RENAMED Viewed

@@ -13,7 +13,7 @@ class merge_defaults:
     method selection, grouping tolerances, and algorithm-specific parameters.
     Attributes:
-        method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
+        method (str): Merge method to use ('kd', 'qt', 'kd_chunked', 'qt_chunked'). Default is "kd".
         min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
         rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
         mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
@@ -25,38 +25,31 @@ class merge_defaults:
         link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
     """
-    method: str = "qt"
+    method: str = "kd"
     min_samples: int = 2
     rt_tol: float = 5.0
-    mz_tol: float = 0.01
+    mz_tol: float = 0.05
     chunk_size: int = 500
     nr_partitions: int = 1000
     min_rel_cc_size: float = 0.1
     max_pairwise_log_fc: float = -1.0
     max_nr_conflicts: int = 0
     link_ms2: bool = True
+    extract_ms1: bool = True
+    # Cross-chunk merging parameters
+    dechunking: str = "hierarchical"
     # Parallel processing parameters
     threads: Optional[int] = None
-    # KD-Strict specific parameters
-    optimize_rt_tol: bool = False
-    rt_tol_range: tuple = (0.5, 4.0)
-    rt_tol_steps: int = 7
-    secondary_merge_rt_tol: float = 1.0
-    secondary_merge_mz_tol: float = 0.005
-    min_sample_overlap: float = 0.8
-    max_rt_spread: float = 2.0  # Will default to 2x rt_tol
-    min_coherence: float = 0.0
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "method": {
                 "dtype": str,
                 "description": "Merge method (algorithm) to use",
-                "default": "quality",
-                "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
-                                 "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+                "default": "kd",
+                "allowed_values": ["kd", "qt",
                                  "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
             },
             "min_samples": {
@@ -118,7 +111,17 @@ class merge_defaults:
                 "description": "Whether to link MS2 spectra to consensus features",
                 "default": True,
             },
-            # Parallel processing parameters
+            "extract_ms1": {
+                "dtype": bool,
+                "description": "Whether to extract MS1 chromatograms for consensus features",
+                "default": True,
+            },
+            "dechunking": {
+                "dtype": str,
+                "description": "Cross-chunk merging algorithm for chunked methods",
+                "default": "hierarchical",
+                "allowed_values": ["hierarchical", "kdtree"],
+            },
             "threads": {
                 "dtype": [int, type(None)],
                 "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
@@ -126,58 +129,6 @@ class merge_defaults:
                 "min_value": 1,
                 "max_value": 32,
             },
-            # KD-Strict specific parameters
-            "optimize_rt_tol": {
-                "dtype": bool,
-                "description": "Enable RT tolerance optimization for kd-strict method",
-                "default": False,
-            },
-            "rt_tol_range": {
-                "dtype": tuple,
-                "description": "RT tolerance range for optimization (min, max) in seconds",
-                "default": (0.8, 2.0),
-            },
-            "rt_tol_steps": {
-                "dtype": int,
-                "description": "Number of steps for RT tolerance optimization",
-                "default": 5,
-                "min_value": 3,
-                "max_value": 20,
-            },
-            "secondary_merge_rt_tol": {
-                "dtype": float,
-                "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
-                "default": 0.5,
-                "min_value": 0.1,
-                "max_value": 5.0,
-            },
-            "secondary_merge_mz_tol": {
-                "dtype": float,
-                "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
-                "default": 0.005,
-                "min_value": 0.001,
-                "max_value": 0.1,
-            },
-            "min_sample_overlap": {
-                "dtype": float,
-                "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
-                "default": 0.8,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
-            "max_rt_spread": {
-                "dtype": float,
-                "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
-                "default": None,
-                "min_value": 0.1,
-            },
-            "min_coherence": {
-                "dtype": float,
-                "description": "Minimum chromatographic coherence score (0.0 = disabled)",
-                "default": 0.0,
-                "min_value": 0.0,
-                "max_value": 1.0,
-            },
         },
         repr=False,
     )

{masster-0.5.0 → masster-0.5.3}/src/masster/study/export.py RENAMED Viewed

@@ -551,7 +551,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         else:
             self.logger.info("No identification data available for mzTab export")
     except Exception as e:
-        self.logger.warning(f"Could not retrieve identification data: {e}")
+        self.logger.debug(f"Could not retrieve identification data: {e}")
         id_data = None
         top_id_data = None
         full_id_data = None
@@ -1190,10 +1190,12 @@ def export_xlsx(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
-    The Excel file contains three worksheets:
-    - consensus_df: Consensus features dataframe
-    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
+    The Excel file contains five worksheets:
+    - samples: Samples dataframe
+    - consensus: Consensus features dataframe
     - identification: Identification results with library annotations (get_id)
+    - gaps: Gaps matrix showing filled vs non-filled features (get_gaps_matrix)
+    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
     Args:
         filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
@@ -1263,7 +1265,25 @@ def export_xlsx(self, filename: str | None = None) -> None:
             f"Error getting identification data: {e}. Skipping identification worksheet.",
         )
-    # 4. Consensus matrix (last worksheet)
+    # 4. Gaps matrix (filled vs non-filled features)
+    try:
+        gaps_df = self.get_gaps_matrix()
+        if gaps_df is not None and not gaps_df.is_empty():
+            gaps_pandas = gaps_df.to_pandas()
+            worksheets["gaps"] = gaps_pandas
+            self.logger.debug(
+                f"Added gaps worksheet with {len(gaps_pandas)} rows",
+            )
+        else:
+            self.logger.warning(
+                "get_gaps_matrix() returned empty data, skipping gaps worksheet",
+            )
+    except Exception as e:
+        self.logger.debug(
+            f"Error getting gaps data: {e}. Skipping gaps worksheet.",
+        )
+    # 5. Consensus matrix (last worksheet)
     try:
         matrix_df = self.get_consensus_matrix()
         if matrix_df is not None and not matrix_df.is_empty():

masster 0.5.0__tar.gz → 0.5.3__tar.gz

Potentially problematic release.

masster 0.5.0.tar.gz → 0.5.3.tar.gz