masster 0.5.27-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

masster/lib/lib.py CHANGED
@@ -772,7 +772,7 @@ class Lib:
                 skipped_compounds += 1
                 continue
 
-            formula = compound_record.get("formula", "")
+            formula = compound_record.get("formula", compound_record.get("Formula", ""))
             if not formula or not isinstance(formula, str):
                 skipped_compounds += 1
                 continue
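
The new fallback makes the compound lookup tolerant of records that capitalize the key as "Formula". A minimal sketch of the changed line's behavior, with hypothetical records:

    record_a = {"formula": "C6H12O6"}
    record_b = {"Formula": "C6H12O6"}  # capitalized key

    for rec in (record_a, record_b, {}):
        # Try "formula" first, fall back to "Formula", else empty string
        formula = rec.get("formula", rec.get("Formula", ""))
        print(repr(formula))  # 'C6H12O6', 'C6H12O6', ''
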
masster/logger.py CHANGED
@@ -136,10 +136,8 @@ class MassterLogger:
             f"\x1b[90m{module_name}:{func_name}:{line_no}\x1b[0m | "  # dim gray for location info
         )
 
-        # Universal format: timestamp | level | location | label - message
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
@@ -200,7 +198,6 @@ class MassterLogger:
 
         level_str = record.levelname.ljust(8)
         level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
-        level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
         label_part = self.label + " | " if self.label else ""
 
         # For DEBUG and TRACE levels, add module/location information
@@ -221,7 +218,6 @@ class MassterLogger:
 
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
@@ -267,7 +263,6 @@ class MassterLogger:
 
         level_str = record.levelname.ljust(8)
         level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
-        level_color = level_colors.get(record.levelname, "\x1b[90m")  # default to gray instead of white
         label_part = self.label + " | " if self.label else ""
 
         # For DEBUG and TRACE levels, add module/location information
@@ -288,7 +283,6 @@ class MassterLogger:
 
         # Universal format: timestamp | level | location | label - message
         return (
-            f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"\x1b[90m{timestamp}\x1b[0m | "  # gray timestamp (universal for both themes)
             f"{level_color}{level_str}\x1b[0m | "  # colored level
             f"{location_info}"  # location info for DEBUG/TRACE
masster/sample/adducts.py CHANGED
@@ -137,7 +137,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
                     "formatted_name": formatted_name,
                     "total_mass_shift": spec["mass_shift"] * multiplier,
                     "total_charge": total_charge,
-                    "combined_probability": spec["probability"] ** multiplier,
+                    "combined_probability": (spec["probability"] ** multiplier) / 2.0,
                     "complexity": multiplier,
                 },
             )
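
The added / 2.0 halves the combined probability that _get_adducts assigns to multiplied adduct combinations. A quick worked example with hypothetical inputs:

    probability = 0.9  # spec["probability"] for the base adduct
    multiplier = 2     # e.g. a dimer combination

    old_score = probability ** multiplier           # 0.81 in 0.5.27
    new_score = (probability ** multiplier) / 2.0   # 0.405 in 0.6.0
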
@@ -356,7 +356,7 @@ class find_adducts_defaults:
             ]
         elif adducts in ["neg", "negative"]:
             return [
-                "-H:-1:0.9",
+                "-H:-1:0.90",
                 "+Cl:-1:0.1",
                 "+CH2O2:0:0.15",
                 "-H2O:0:0.15",
masster/sample/h5.py CHANGED
@@ -319,7 +319,78 @@ def _save_sample5(
         params_json = json.dumps(save_data, indent=2)
         metadata_group.attrs["parameters"] = params_json
 
-        # Store lib and lib_match - removed (no longer saving lib data)
+        # Store lib_df and id_df (identification DataFrames)
+        if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
+            lib_group = f.create_group("lib")
+            for col in self.lib_df.columns:
+                data = self.lib_df[col].to_numpy()
+                # Handle different data types safely
+                if data.dtype == object:
+                    try:
+                        str_data = np.array(
+                            ["" if x is None else str(x) for x in data],
+                            dtype="S",
+                        )
+                        lib_group.create_dataset(
+                            col,
+                            data=str_data,
+                            compression="gzip",
+                        )
+                        lib_group[col].attrs["dtype"] = "string_converted"
+                    except Exception:
+                        json_data = np.array(
+                            [json.dumps(x, default=str) for x in data],
+                            dtype="S",
+                        )
+                        lib_group.create_dataset(
+                            col,
+                            data=json_data,
+                            compression="gzip",
+                        )
+                        lib_group[col].attrs["dtype"] = "json"
+                else:
+                    lib_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="gzip",
+                    )
+            lib_group.attrs["columns"] = list(self.lib_df.columns)
+
+        if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
+            id_group = f.create_group("id")
+            for col in self.id_df.columns:
+                data = self.id_df[col].to_numpy()
+                # Handle different data types safely
+                if data.dtype == object:
+                    try:
+                        str_data = np.array(
+                            ["" if x is None else str(x) for x in data],
+                            dtype="S",
+                        )
+                        id_group.create_dataset(
+                            col,
+                            data=str_data,
+                            compression="gzip",
+                        )
+                        id_group[col].attrs["dtype"] = "string_converted"
+                    except Exception:
+                        json_data = np.array(
+                            [json.dumps(x, default=str) for x in data],
+                            dtype="S",
+                        )
+                        id_group.create_dataset(
+                            col,
+                            data=json_data,
+                            compression="gzip",
+                        )
+                        id_group[col].attrs["dtype"] = "json"
+                else:
+                    id_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="gzip",
+                    )
+            id_group.attrs["columns"] = list(self.id_df.columns)
 
         self.logger.success(f"Sample saved to {filename}")
         if save_featurexml:
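
The new save path writes each Polars column as its own HDF5 dataset: plain string conversion first, JSON as a fallback for values str() cannot represent, and raw numeric arrays otherwise. A self-contained sketch of that strategy (file name and data are hypothetical):

    import h5py
    import numpy as np
    import polars as pl

    df = pl.DataFrame({"name": ["glucose", None], "mz": [180.063, 181.071]})

    with h5py.File("example.h5", "w") as f:
        grp = f.create_group("lib")
        for col in df.columns:
            data = df[col].to_numpy()
            if data.dtype == object:
                # Strings (and None) become fixed-width bytes; the real code
                # falls back to json.dumps on failure
                str_data = np.array(["" if x is None else str(x) for x in data], dtype="S")
                grp.create_dataset(col, data=str_data, compression="gzip")
                grp[col].attrs["dtype"] = "string_converted"
            else:
                grp.create_dataset(col, data=data, compression="gzip")
        grp.attrs["columns"] = list(df.columns)
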
@@ -1004,8 +1075,87 @@ def _load_sample5(self, filename: str, map: bool = False):
         else:
             self.ms1_df = None
 
+        # Load lib_df (library DataFrame)
+        if "lib" in f:
+            lib_group = f["lib"]
+            data = {}
+
+            # Get all datasets in the lib group
+            for col in lib_group.keys():
+                data_col = lib_group[col][:]
+                # Handle string data
+                if hasattr(lib_group[col], "attrs") and lib_group[col].attrs.get("dtype") in ["string_converted", "json"]:
+                    data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
+                else:
+                    data[col] = data_col
+
+            if data:
+                # Create DataFrame directly with Polars
+                self.lib_df = pl.DataFrame(data)
+
+                # Apply schema if available
+                if "lib_df" in schema and "columns" in schema["lib_df"]:
+                    schema_columns = schema["lib_df"]["columns"]
+                    for col in self.lib_df.columns:
+                        if col in schema_columns:
+                            dtype_str = schema_columns[col]["dtype"]
+                            try:
+                                self.lib_df = self.lib_df.with_columns(
+                                    [pl.col(col).cast(eval(dtype_str), strict=False)]
+                                )
+                            except Exception as e:
+                                self.logger.warning(
+                                    f"Failed to apply schema type {dtype_str} to column {col}: {e}",
+                                )
+
+                # Convert "None" strings and NaN values to proper null values
+                self.lib_df = clean_null_values_polars(self.lib_df)
+            else:
+                self.lib_df = None
+        else:
+            self.lib_df = None
+
+        # Load id_df (identification results DataFrame)
+        if "id" in f:
+            id_group = f["id"]
+            data = {}
+
+            # Get all datasets in the id group
+            for col in id_group.keys():
+                data_col = id_group[col][:]
+                # Handle string data
+                if hasattr(id_group[col], "attrs") and id_group[col].attrs.get("dtype") in ["string_converted", "json"]:
+                    data[col] = [x.decode("utf-8") if isinstance(x, bytes) else x for x in data_col]
+                else:
+                    data[col] = data_col
+
+            if data:
+                # Create DataFrame directly with Polars
+                self.id_df = pl.DataFrame(data)
+
+                # Apply schema if available
+                if "id_df" in schema and "columns" in schema["id_df"]:
+                    schema_columns = schema["id_df"]["columns"]
+                    for col in self.id_df.columns:
+                        if col in schema_columns:
+                            dtype_str = schema_columns[col]["dtype"]
+                            try:
+                                self.id_df = self.id_df.with_columns(
+                                    [pl.col(col).cast(eval(dtype_str), strict=False)]
+                                )
+                            except Exception as e:
+                                self.logger.warning(
+                                    f"Failed to apply schema type {dtype_str} to column {col}: {e}",
+                                )
+
+                # Convert "None" strings and NaN values to proper null values
+                self.id_df = clean_null_values_polars(self.id_df)
+            else:
+                self.id_df = None
+        else:
+            self.id_df = None
+
         # Parameters are now loaded from metadata JSON (see above)
-        # Lib and lib_match are no longer saved/loaded
 
         # if map:
         #     featureXML = filename.replace(".sample5", ".featureXML")
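
On load, byte strings are decoded back to UTF-8 and column dtypes are restored from a stored schema whose entries are strings such as "pl.Float64" (turned back into Polars types via eval in the code above). A minimal decode sketch against the file written by the previous example:

    import h5py
    import polars as pl

    with h5py.File("example.h5", "r") as f:
        grp = f["lib"]
        data = {}
        for col in grp.keys():
            values = grp[col][:]
            if grp[col].attrs.get("dtype") in ["string_converted", "json"]:
                data[col] = [v.decode("utf-8") if isinstance(v, bytes) else v for v in values]
            else:
                data[col] = values

    lib_df = pl.DataFrame(data)
    # Restoring a schema dtype string like "pl.Float64":
    lib_df = lib_df.with_columns(pl.col("mz").cast(pl.Float64, strict=False))
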
masster/sample/helpers.py CHANGED
@@ -359,17 +359,22 @@ def features_select(
     uid=None,
     mz=None,
     rt=None,
-    coherence=None,
-    inty=None,
     rt_delta=None,
-    iso=None,
-    iso_of=None,
-    has_MS2=None,
+    inty=None,
+    coherence=None,
     prominence_scaled=None,
     prominence=None,
     height_scaled=None,
     height=None,
+    iso=None,
+    iso_of=None,
+    has_MS2=None,
     adduct_group=None,
+    id=None,
+    id_top_name=None,
+    id_top_class=None,
+    id_top_adduct=None,
+    id_top_score=None,
 ):
     """
     Select features based on specified criteria and return the filtered DataFrame.
@@ -389,6 +394,11 @@
         prominence: prominence filter (tuple for range, single value for minimum)
         height: height filter (tuple for range, single value for minimum)
         adduct_group: adduct group filter (single value for exact match, list of values for multiple groups, tuple for range, or None for all)
+        id: filter for features with/without identification (bool: True for identified, False for unidentified)
+        id_top_name: filter by top identification name using regex (str for regex pattern, list of str for multiple patterns combined with OR)
+        id_top_class: filter by top identification class using regex (str for regex pattern, list of str for multiple patterns combined with OR)
+        id_top_adduct: filter by top identification adduct (str for exact match, list of str for multiple adducts)
+        id_top_score: filter by top identification score (tuple for range, single value for minimum)
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
@@ -600,6 +610,82 @@
             f"Selected features by adduct_group. Features removed: {feats_len_before_filter - len(feats)}",
         )
 
+    if id is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_name" not in feats.columns:
+            self.logger.warning("No identification data found in features.")
+        else:
+            if id:
+                # Filter for features with identification (non-null id_top_name)
+                feats = feats.filter(pl.col("id_top_name").is_not_null())
+            else:
+                # Filter for features without identification (null id_top_name)
+                feats = feats.filter(pl.col("id_top_name").is_null())
+            self.logger.debug(
+                f"Selected features by identification presence. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_name is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_name" not in feats.columns:
+            self.logger.warning("No id_top_name data found in features.")
+        else:
+            if isinstance(id_top_name, list):
+                # Use regex matching for each pattern in the list (OR logic)
+                pattern = "|".join(id_top_name)
+                feats = feats.filter(pl.col("id_top_name").str.contains(pattern))
+            else:
+                # Use regex matching for single pattern
+                feats = feats.filter(pl.col("id_top_name").str.contains(id_top_name))
+            self.logger.debug(
+                f"Selected features by id_top_name (regex). Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_class is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_class" not in feats.columns:
+            self.logger.warning("No id_top_class data found in features.")
+        else:
+            if isinstance(id_top_class, list):
+                # Use regex matching for each pattern in the list (OR logic)
+                pattern = "|".join(id_top_class)
+                feats = feats.filter(pl.col("id_top_class").str.contains(pattern))
+            else:
+                # Use regex matching for single pattern
+                feats = feats.filter(pl.col("id_top_class").str.contains(id_top_class))
+            self.logger.debug(
+                f"Selected features by id_top_class (regex). Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_adduct is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_adduct" not in feats.columns:
+            self.logger.warning("No id_top_adduct data found in features.")
+        else:
+            if isinstance(id_top_adduct, list):
+                feats = feats.filter(pl.col("id_top_adduct").is_in(id_top_adduct))
+            else:
+                feats = feats.filter(pl.col("id_top_adduct") == id_top_adduct)
+            self.logger.debug(
+                f"Selected features by id_top_adduct. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
+    if id_top_score is not None:
+        feats_len_before_filter = len(feats)
+        if "id_top_score" not in feats.columns:
+            self.logger.warning("No id_top_score data found in features.")
+        else:
+            if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
+                min_score, max_score = id_top_score
+                feats = feats.filter(
+                    (pl.col("id_top_score") >= min_score) & (pl.col("id_top_score") <= max_score)
+                )
+            else:
+                feats = feats.filter(pl.col("id_top_score") >= id_top_score)
+            self.logger.debug(
+                f"Selected features by id_top_score. Features removed: {feats_len_before_filter - len(feats)}",
+            )
+
     if len(feats) == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
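
Taken together, the new parameters let identification results drive feature selection. A hedged usage sketch, assuming a Sample object whose features table already carries the id_top_* columns (the class and adduct values are hypothetical):

    hits = sample.features_select(
        id=True,                    # keep only identified features
        id_top_class="Flavonoid",   # regex match on the top hit's class
        id_top_adduct="[M-H]-",     # exact match on the top hit's adduct
        id_top_score=0.7,           # minimum top identification score
    )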