masster 0.3.17__tar.gz → 0.3.18__tar.gz

This diff shows the content changes between publicly released versions of the package as they appear in their respective public registries. The information in this diff is provided for informational purposes only.

Potentially problematic release.


This version of masster has been flagged as potentially problematic; see the registry's advisory page for more details.

Files changed (78) hide show
  1. {masster-0.3.17 → masster-0.3.18}/PKG-INFO +1 -1
  2. {masster-0.3.17 → masster-0.3.18}/pyproject.toml +1 -1
  3. {masster-0.3.17 → masster-0.3.18}/src/masster/_version.py +1 -1
  4. {masster-0.3.17 → masster-0.3.18}/src/masster/study/helpers.py +55 -35
  5. {masster-0.3.17 → masster-0.3.18}/src/masster/study/load.py +4 -3
  6. {masster-0.3.17 → masster-0.3.18}/src/masster/study/plot.py +66 -5
  7. {masster-0.3.17 → masster-0.3.18}/uv.lock +1 -1
  8. {masster-0.3.17 → masster-0.3.18}/.github/workflows/publish.yml +0 -0
  9. {masster-0.3.17 → masster-0.3.18}/.github/workflows/security.yml +0 -0
  10. {masster-0.3.17 → masster-0.3.18}/.github/workflows/test.yml +0 -0
  11. {masster-0.3.17 → masster-0.3.18}/.gitignore +0 -0
  12. {masster-0.3.17 → masster-0.3.18}/.pre-commit-config.yaml +0 -0
  13. {masster-0.3.17 → masster-0.3.18}/LICENSE +0 -0
  14. {masster-0.3.17 → masster-0.3.18}/Makefile +0 -0
  15. {masster-0.3.17 → masster-0.3.18}/README.md +0 -0
  16. {masster-0.3.17 → masster-0.3.18}/TESTING.md +0 -0
  17. {masster-0.3.17 → masster-0.3.18}/demo/example_batch_process.py +0 -0
  18. {masster-0.3.17 → masster-0.3.18}/demo/example_sample_process.py +0 -0
  19. {masster-0.3.17 → masster-0.3.18}/src/masster/__init__.py +0 -0
  20. {masster-0.3.17 → masster-0.3.18}/src/masster/chromatogram.py +0 -0
  21. {masster-0.3.17 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
  22. {masster-0.3.17 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  23. {masster-0.3.17 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  24. {masster-0.3.17 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  25. {masster-0.3.17 → masster-0.3.18}/src/masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  26. {masster-0.3.17 → masster-0.3.18}/src/masster/logger.py +0 -0
  27. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/__init__.py +0 -0
  28. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/__init__.py +0 -0
  29. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  30. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/find_features_def.py +0 -0
  31. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  32. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  33. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/defaults/sample_def.py +0 -0
  34. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/h5.py +0 -0
  35. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/helpers.py +0 -0
  36. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/lib.py +0 -0
  37. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/load.py +0 -0
  38. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/parameters.py +0 -0
  39. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/plot.py +0 -0
  40. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/processing.py +0 -0
  41. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/quant.py +0 -0
  42. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/sample.py +0 -0
  43. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/sample5_schema.json +0 -0
  44. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/save.py +0 -0
  45. {masster-0.3.17 → masster-0.3.18}/src/masster/sample/sciex.py +0 -0
  46. {masster-0.3.17 → masster-0.3.18}/src/masster/spectrum.py +0 -0
  47. {masster-0.3.17 → masster-0.3.18}/src/masster/study/__init__.py +0 -0
  48. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/__init__.py +0 -0
  49. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/align_def.py +0 -0
  50. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/export_def.py +0 -0
  51. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  52. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/fill_def.py +0 -0
  53. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/find_consensus_def.py +0 -0
  54. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/find_ms2_def.py +0 -0
  55. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  56. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/integrate_def.py +0 -0
  57. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/merge_def.py +0 -0
  58. {masster-0.3.17 → masster-0.3.18}/src/masster/study/defaults/study_def.py +0 -0
  59. {masster-0.3.17 → masster-0.3.18}/src/masster/study/export.py +0 -0
  60. {masster-0.3.17 → masster-0.3.18}/src/masster/study/h5.py +0 -0
  61. {masster-0.3.17 → masster-0.3.18}/src/masster/study/helpers_optimized.py +0 -0
  62. {masster-0.3.17 → masster-0.3.18}/src/masster/study/parameters.py +0 -0
  63. {masster-0.3.17 → masster-0.3.18}/src/masster/study/processing.py +0 -0
  64. {masster-0.3.17 → masster-0.3.18}/src/masster/study/save.py +0 -0
  65. {masster-0.3.17 → masster-0.3.18}/src/masster/study/study.py +0 -0
  66. {masster-0.3.17 → masster-0.3.18}/src/masster/study/study5_schema.json +0 -0
  67. {masster-0.3.17 → masster-0.3.18}/tests/conftest.py +0 -0
  68. {masster-0.3.17 → masster-0.3.18}/tests/test_chromatogram.py +0 -0
  69. {masster-0.3.17 → masster-0.3.18}/tests/test_defaults.py +0 -0
  70. {masster-0.3.17 → masster-0.3.18}/tests/test_imports.py +0 -0
  71. {masster-0.3.17 → masster-0.3.18}/tests/test_integration.py +0 -0
  72. {masster-0.3.17 → masster-0.3.18}/tests/test_logger.py +0 -0
  73. {masster-0.3.17 → masster-0.3.18}/tests/test_parameters.py +0 -0
  74. {masster-0.3.17 → masster-0.3.18}/tests/test_sample.py +0 -0
  75. {masster-0.3.17 → masster-0.3.18}/tests/test_spectrum.py +0 -0
  76. {masster-0.3.17 → masster-0.3.18}/tests/test_study.py +0 -0
  77. {masster-0.3.17 → masster-0.3.18}/tests/test_version.py +0 -0
  78. {masster-0.3.17 → masster-0.3.18}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: masster
3
- Version: 0.3.17
3
+ Version: 0.3.18
4
4
  Summary: Mass spectrometry data analysis package
5
5
  Project-URL: homepage, https://github.com/zamboni-lab/masster
6
6
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -1,7 +1,7 @@
1
1
 
2
2
  [project]
3
3
  name = "masster"
4
- version = "0.3.17"
4
+ version = "0.3.18"
5
5
  description = "Mass spectrometry data analysis package"
6
6
  authors = [
7
7
  { name = "Zamboni Lab" }
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.3.17"
4
+ __version__ = "0.3.18"
5
5
 
6
6
 
7
7
  def get_version():
@@ -479,7 +479,9 @@ def get_consensus(self, quant="chrom_area"):
479
479
  # sort by consensus_id
480
480
  df1 = df1.sort_index()
481
481
 
482
- df2 = self.get_consensus_matrix(quant=quant)
482
+ df2_polars = self.get_consensus_matrix(quant=quant)
483
+ # Convert to pandas for merging (since the result is used for export)
484
+ df2 = df2_polars.to_pandas().set_index("consensus_uid")
483
485
  # sort df2 row by consensus_id
484
486
  df2 = df2.sort_index()
485
487
  # merge df and df2 on consensus_id
@@ -492,6 +494,7 @@ def get_consensus(self, quant="chrom_area"):
492
494
  def get_consensus_matrix(self, quant="chrom_area"):
493
495
  """
494
496
  Get a matrix of consensus features with samples as columns and consensus features as rows.
497
+ Optimized implementation that avoids expensive join operations.
495
498
  """
496
499
  if quant not in self.features_df.columns:
497
500
  self.logger.error(
@@ -499,41 +502,58 @@ def get_consensus_matrix(self, quant="chrom_area"):
499
502
  )
500
503
  return None
501
504
 
502
- # Use Polars join instead of pandas merge
503
- features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
504
- consensus_mapping_subset = self.consensus_mapping_df.select([
505
- "consensus_uid",
506
- "feature_uid",
507
- ])
508
-
509
- df1 = features_subset.join(
510
- consensus_mapping_subset,
511
- on="feature_uid",
512
- how="left",
513
- )
514
-
515
- # Convert to pandas for pivot operation (Polars pivot is still evolving)
516
- df1_pd = df1.to_pandas()
517
- df2 = df1_pd.pivot_table(
518
- index="consensus_uid",
519
- columns="sample_uid",
520
- values=quant,
521
- aggfunc="max",
522
- )
523
-
524
- # Create sample_uid to sample_name mapping using Polars
525
- sample_mapping = dict(
526
- self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
527
- )
528
- # replace sample_uid with sample_name in df2
529
- df2 = df2.rename(columns=sample_mapping)
505
+ # Create a lookup dictionary from features_df for O(1) value access
506
+ feature_values = {}
507
+ for row in self.features_df.iter_rows(named=True):
508
+ feature_uid = row['feature_uid']
509
+ sample_uid = row['sample_uid']
510
+ value = row[quant] if row[quant] is not None else 0
511
+ feature_values[(feature_uid, sample_uid)] = value
512
+
513
+ # Build consensus matrix directly using the consensus_mapping_df
514
+ matrix_dict = {}
515
+ sample_mapping = dict(self.samples_df.select(["sample_uid", "sample_name"]).iter_rows())
516
+
517
+ for row in self.consensus_mapping_df.iter_rows(named=True):
518
+ consensus_uid = row['consensus_uid']
519
+ sample_uid = row['sample_uid']
520
+ feature_uid = row['feature_uid']
521
+
522
+ # Look up the quantification value
523
+ key = (feature_uid, sample_uid)
524
+ value = feature_values.get(key, 0)
525
+
526
+ if consensus_uid not in matrix_dict:
527
+ matrix_dict[consensus_uid] = {}
528
+
529
+ sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
530
+
531
+ # Take max if multiple features map to same consensus/sample combination
532
+ if sample_name in matrix_dict[consensus_uid]:
533
+ matrix_dict[consensus_uid][sample_name] = max(matrix_dict[consensus_uid][sample_name], value)
534
+ else:
535
+ matrix_dict[consensus_uid][sample_name] = value
530
536
 
531
- # round to integer
532
- df2 = df2.round()
533
- # set consensus_id as uint64
534
- df2.index = df2.index.astype("uint64")
535
- # set index to consensus_id
536
- df2.index.name = "consensus_uid"
537
+ # Convert to Polars DataFrame with proper formatting
538
+ import polars as pl
539
+
540
+ # Convert matrix_dict to list of records for Polars
541
+ records = []
542
+ for consensus_uid, sample_values in matrix_dict.items():
543
+ record = {"consensus_uid": consensus_uid}
544
+ record.update(sample_values)
545
+ records.append(record)
546
+
547
+ # Create Polars DataFrame and set proper data types
548
+ df2 = pl.DataFrame(records)
549
+
550
+ # Fill null values with 0 and round numeric columns
551
+ numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
552
+ df2 = df2.with_columns([
553
+ pl.col("consensus_uid").cast(pl.UInt64),
554
+ *[pl.col(col).fill_null(0).round(0) for col in numeric_cols]
555
+ ])
556
+
537
557
  return df2
538
558
 
539
559
 
@@ -1379,7 +1379,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
1379
1379
  self.samples_df = pl.concat([self.samples_df, new_sample])
1380
1380
 
1381
1381
  # SIMPLIFIED feature processing
1382
- current_sample_uid = len(self.samples_df) - 1
1382
+ current_sample_uid = len(self.samples_df)
1383
1383
 
1384
1384
  # Add required columns with minimal operations
1385
1385
  columns_to_add = [
@@ -1520,7 +1520,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
1520
1520
  self.samples_df = pl.concat([self.samples_df, new_sample])
1521
1521
 
1522
1522
  # SIMPLIFIED feature processing
1523
- current_sample_uid = len(self.samples_df) - 1
1523
+ current_sample_uid = len(self.samples_df)
1524
1524
 
1525
1525
  # Add required columns with minimal operations
1526
1526
  columns_to_add = [
@@ -1621,7 +1621,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
1621
1621
  self.samples_df = pl.concat([self.samples_df, new_sample])
1622
1622
 
1623
1623
  # SIMPLIFIED feature processing
1624
- current_sample_uid = len(self.samples_df) - 1
1624
+ current_sample_uid = len(self.samples_df)
1625
1625
 
1626
1626
  # Add required columns with minimal operations
1627
1627
  columns_to_add = [
@@ -1695,3 +1695,4 @@ def _sample_color_reset_optimized(self):
1695
1695
  )
1696
1696
 
1697
1697
  self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
1698
+
@@ -17,7 +17,7 @@ hv.extension("bokeh")
17
17
  from bokeh.layouts import row as bokeh_row
18
18
 
19
19
 
20
- def plot_alignment(self, maps: bool = True, filename: str | None = None, width: int = 450, height: int = 450, markersize: int = 3):
20
+ def plot_alignment(self, maps: bool = True, samples: int | list[int | str] | None = None, filename: str | None = None, width: int = 450, height: int = 450, markersize: int = 3):
21
21
  """Visualize retention time alignment using two synchronized Bokeh scatter plots.
22
22
 
23
23
  - When ``maps=True`` the function reads ``self.features_maps`` (list of FeatureMap)
@@ -27,6 +27,11 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
27
27
 
28
28
  Parameters
29
29
  - maps: whether to use feature maps (default True).
30
+ - samples: Sample selection parameter, interpreted like in plot_samples_2d:
31
+ - None: show all samples
32
+ - int: show a random subset of N samples
33
+ - list of ints: show samples with these sample_uids
34
+ - list of strings: show samples with these sample_names
30
35
  - filename: optional HTML file path to save the plot.
31
36
  - width/height: pixel size of each subplot.
32
37
  - markersize: base marker size.
@@ -54,6 +59,32 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
54
59
  self.logger.error("No feature maps available for plotting.")
55
60
  return
56
61
 
62
+ # Get sample_uids to limit which samples to show
63
+ sample_uids_to_show = self._get_sample_uids(samples)
64
+
65
+ # Filter feature maps based on sample selection
66
+ if sample_uids_to_show is not None:
67
+ # Get sample indices for the selected sample_uids
68
+ selected_indices = []
69
+ if hasattr(self, 'samples_df') and self.samples_df is not None and not self.samples_df.is_empty():
70
+ samples_info = self.samples_df.to_pandas()
71
+ for idx, row in samples_info.iterrows():
72
+ if row.get('sample_uid') in sample_uids_to_show:
73
+ selected_indices.append(idx)
74
+ else:
75
+ # If no samples_df, just limit to the first N samples
76
+ if isinstance(samples, int):
77
+ selected_indices = list(range(min(samples, len(fmaps))))
78
+ else:
79
+ selected_indices = list(range(len(fmaps)))
80
+
81
+ # Filter feature maps to only include selected indices
82
+ fmaps = [fmaps[i] for i in selected_indices if i < len(fmaps)]
83
+
84
+ if not fmaps:
85
+ self.logger.error("No feature maps match the selected samples.")
86
+ return
87
+
57
88
  # Reference (first) sample: use current RT for both before and after
58
89
  ref = fmaps[0]
59
90
  ref_rt = [f.getRT() for f in ref]
@@ -143,6 +174,28 @@ def plot_alignment(self, maps: bool = True, filename: str | None = None, width:
143
174
  self.logger.error("No sample identifier column found in features_df.")
144
175
  return
145
176
 
177
+ # Get sample_uids to limit which samples to show
178
+ sample_uids_to_show = self._get_sample_uids(samples)
179
+
180
+ # Filter features_df based on sample selection if specified
181
+ if sample_uids_to_show is not None:
182
+ if sample_col == 'sample_uid':
183
+ features_df = features_df.filter(pl.col('sample_uid').is_in(sample_uids_to_show))
184
+ else:
185
+ # Need to convert sample names to sample_uids if using sample_name column
186
+ if 'sample_uid' in features_df.columns:
187
+ # Filter by sample_uid even though we're using sample_name as the primary column
188
+ features_df = features_df.filter(pl.col('sample_uid').is_in(sample_uids_to_show))
189
+ else:
190
+ # Convert sample_uids to sample_names and filter
191
+ sample_names_to_show = []
192
+ if hasattr(self, 'samples_df') and self.samples_df is not None:
193
+ for uid in sample_uids_to_show:
194
+ matching_rows = self.samples_df.filter(pl.col("sample_uid") == uid)
195
+ if not matching_rows.is_empty():
196
+ sample_names_to_show.append(matching_rows.row(0, named=True)["sample_name"])
197
+ features_df = features_df.filter(pl.col('sample_name').is_in(sample_names_to_show))
198
+
146
199
  # Get unique samples using Polars
147
200
  samples = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
148
201
 
@@ -1649,11 +1702,19 @@ def plot_pca(
1649
1702
 
1650
1703
  self.logger.debug(f"Performing PCA on consensus matrix with shape: {consensus_matrix.shape}")
1651
1704
 
1652
- # Convert consensus matrix to numpy if it's not already
1653
- if hasattr(consensus_matrix, "values"):
1705
+ # Convert consensus matrix to numpy - handle both Polars and pandas DataFrames
1706
+ if hasattr(consensus_matrix, "to_numpy"):
1707
+ # Polars or pandas DataFrame
1708
+ if hasattr(consensus_matrix, "select"):
1709
+ # Polars DataFrame - exclude the consensus_uid column
1710
+ numeric_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
1711
+ matrix_data = consensus_matrix.select(numeric_cols).to_numpy()
1712
+ else:
1713
+ # Pandas DataFrame
1714
+ matrix_data = consensus_matrix.to_numpy()
1715
+ elif hasattr(consensus_matrix, "values"):
1716
+ # Pandas DataFrame
1654
1717
  matrix_data = consensus_matrix.values
1655
- elif hasattr(consensus_matrix, "to_numpy"):
1656
- matrix_data = consensus_matrix.to_numpy()
1657
1718
  else:
1658
1719
  matrix_data = np.array(consensus_matrix)
1659
1720
 
@@ -1372,7 +1372,7 @@ wheels = [
1372
1372
 
1373
1373
  [[package]]
1374
1374
  name = "masster"
1375
- version = "0.3.17"
1375
+ version = "0.3.18"
1376
1376
  source = { editable = "." }
1377
1377
  dependencies = [
1378
1378
  { name = "alphabase" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes