PyPI - masster - Versions diffs - 0.3.15__tar.gz → 0.3.16__tar.gz - Mend

masster 0.3.15__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (78) hide show

{masster-0.3.15 → masster-0.3.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.3.15
+Version: 0.3.16
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.3.15 → masster-0.3.16}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.3.15"
+version = "0.3.16"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.3.15 → masster-0.3.16}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.7"
+__version__ = "0.3.16"
 def get_version():

{masster-0.3.15 → masster-0.3.16}/src/masster/sample/h5.py RENAMED Viewed

@@ -897,6 +897,583 @@ def _load_sample5(self, filename: str, map: bool = True):
     self.logger.info(f"Sample loaded successfully from {filename}")
+def _load_sample5_study(self, filename: str, map: bool = True):
+    """
+    Load a .sample5 HDF5 file for study use, leaving out the MS1 data.
+    Study workflows only need metadata, scans and features, so this variant of
+    _load_sample5 never reads the (potentially large) ms1_df dataset.
+    Args:
+        filename (str): Path to the sample5 HDF5 file to load.
+        map (bool, optional): If True, look for a sibling .featureXML file and
+            load it when it exists. Defaults to True.
+    Returns:
+        None (modifies self in place)
+    Notes:
+        - file_path, file_source, file_type, label, parameters and history are
+          populated from the "metadata" group when that group exists
+        - scans_df / features_df are set to None when their HDF5 group is
+          missing or the schema lists no columns for them
+        - ms1_df is always set to None
+        - A missing schema file is tolerated: a warning is logged and no
+          columns are loaded
+    """
+    # The schema names, per DataFrame, the columns to read and their polars dtypes
+    schema_path = os.path.join(os.path.dirname(__file__), "sample5_schema.json")
+    try:
+        with open(schema_path) as f:
+            schema = json.load(f)
+    except FileNotFoundError:
+        self.logger.warning(
+            f"Schema file {schema_path} not found. Using default types.",
+        )
+        schema = {}
+    scans_schema = schema.get("scans_df", {}).get("columns", {})
+    features_schema = schema.get("features_df", {}).get("columns", {})
+    with h5py.File(filename, "r") as f:
+        # Load metadata
+        if "metadata" in f:
+            metadata_group = f["metadata"]
+            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
+            # Load file_source if it exists, otherwise set it equal to file_path
+            if "file_source" in metadata_group.attrs:
+                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
+            else:
+                self.file_source = self.file_path
+            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
+            self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
+            # Load parameters from JSON in metadata
+            loaded_data = load_parameters_from_metadata(metadata_group)
+            # Always start from a fresh sample_defaults object
+            from masster.sample.defaults.sample_def import sample_defaults
+            self.parameters = sample_defaults()
+            self.history = {}
+            if isinstance(loaded_data, dict):
+                # The stored JSON doubles as the processing history
+                self.history = loaded_data
+                sample_params = loaded_data.get("sample")
+                if isinstance(sample_params, dict):
+                    self.parameters.set_from_dict(sample_params, validate=False)
+        # Load scans_df
+        if "scans" in f:
+            scans_group = f["scans"]
+            scans_data: dict[str, Any] = {}
+            for col, spec in scans_schema.items():
+                if col not in scans_group:
+                    self.logger.debug(f"Column '{col}' not found in sample5/scans.")
+                    scans_data[col] = None
+                elif spec.get("dtype", "native") == "pl.Object":
+                    # scans are not expected to hold Object columns; leave empty
+                    self.logger.debug(f"Unexpected Object column '{col}'")
+                    scans_data[col] = None
+                else:
+                    scans_data[col] = scans_group[col][:]
+            if scans_data:
+                scans_df = pl.DataFrame(scans_data)
+                scans_df = _s5study_nullify(scans_df)
+                scans_df = _s5study_apply_schema_types(
+                    scans_df,
+                    scans_schema,
+                    "scans_df",
+                    self.logger,
+                )
+                self.scans_df = _s5study_order_like_schema(scans_df, scans_schema)
+            else:
+                # Nothing to load (e.g. schema missing): do not touch a DataFrame
+                # that was never built
+                self.scans_df = None
+        else:
+            self.scans_df = None
+        # Load features_df
+        if "features" in f:
+            features_group = f["features"]
+            features_data: dict[str, Any] = {}
+            for col, spec in features_schema.items():
+                if col not in features_group:
+                    self.logger.debug(
+                        f"Column '{col}' not found in sample5/features.",
+                    )
+                    features_data[col] = None
+                    continue
+                raw = features_group[col][:]
+                if spec.get("dtype", "native") != "pl.Object":
+                    features_data[col] = raw
+                    continue
+                # Object columns are stored as JSON strings and rebuilt here
+                match col:
+                    case "chrom":
+                        features_data[col] = _s5study_decode_items(
+                            raw,
+                            Chromatogram.from_json,
+                            (json.JSONDecodeError, ValueError),
+                        )
+                    case "ms2_scans":
+                        features_data[col] = _s5study_decode_items(
+                            raw,
+                            json.loads,
+                            (json.JSONDecodeError,),
+                        )
+                    case "ms2_specs":
+                        features_data[col] = _s5study_decode_items(
+                            raw,
+                            _s5study_parse_spectra,
+                            (json.JSONDecodeError, ValueError, TypeError),
+                        )
+                    case _:
+                        # Handle other Object columns as raw data
+                        features_data[col] = raw
+            if features_data:
+                features_df = pl.DataFrame(features_data, strict=False)
+                # Object columns are already reconstructed; keep them out of
+                # the first null-normalisation pass
+                object_columns = {
+                    col for col, spec in features_schema.items() if "Object" in spec.get("dtype", "")
+                }
+                features_df = _s5study_nullify(features_df, skip=object_columns)
+                features_df = _s5study_apply_schema_types(
+                    features_df,
+                    features_schema,
+                    "features_df",
+                    self.logger,
+                )
+                # FINAL null conversion pass - after all type casting is done,
+                # so "None" strings produced by a cast to Utf8 become nulls too
+                features_df = _s5study_nullify(features_df)
+                self.features_df = _s5study_order_like_schema(features_df, features_schema)
+            else:
+                self.features_df = None
+        else:
+            self.features_df = None
+        # OPTIMIZED: Skip loading ms1_df for study use - set to None for performance
+        self.ms1_df = None
+        # Parameters are loaded from the metadata JSON above;
+        # lib and lib_match are not stored in the file
+    if map:
+        featureXML = filename.replace(".sample5", ".featureXML")
+        if os.path.exists(featureXML):
+            self._load_featureXML(featureXML)
+            self._features_sync()
+        else:
+            self.logger.warning(
+                f"Feature XML file {featureXML} not found, skipping loading.",
+            )
+    # The sample now lives in the .sample5 file, whatever path was stored inside it
+    self.file_path = filename
+    # Fall back to the file's base name when no label was stored
+    if self.label is None or self.label == "":
+        self.label = os.path.splitext(os.path.basename(filename))[0]
+    self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
+def _s5study_decode_items(raw_items, parse, parse_errors):
+    """Decode stored strings with `parse`; empty, "None" or unparsable items become None."""
+    decoded: list[Any] = []
+    for item in raw_items:
+        if isinstance(item, bytes):
+            item = item.decode("utf-8")
+        if item == "None" or item == "":
+            decoded.append(None)
+            continue
+        try:
+            decoded.append(parse(item))
+        except parse_errors:
+            decoded.append(None)
+    return decoded
+def _s5study_parse_spectra(text):
+    """Rebuild a list of Spectrum objects (None entries kept) from its JSON text."""
+    return [
+        Spectrum.from_json(spec_data) if spec_data is not None else None
+        for spec_data in json.loads(text)
+    ]
+def _s5study_nullify(df, skip=()):
+    """Replace "None"/""/"null"/"NULL" strings and float NaNs by nulls, except in `skip` columns."""
+    replacements = []
+    for col in df.columns:
+        if col in skip:
+            continue
+        dtype = df[col].dtype
+        if dtype == pl.Utf8:  # String columns
+            replacements.append(
+                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                .then(None)
+                .otherwise(pl.col(col))
+                .alias(col),
+            )
+        elif dtype in [pl.Float64, pl.Float32]:  # Float columns
+            replacements.append(pl.col(col).fill_nan(None).alias(col))
+    # Each expression only rewrites its own column, so one pass is enough
+    return df.with_columns(replacements) if replacements else df
+def _s5study_resolve_dtype(dtype_str: str) -> Any:
+    """Map a schema dtype string such as "pl.Int64" to the polars dtype it names."""
+    attr = dtype_str.removeprefix("pl.")
+    if attr.isidentifier() and hasattr(pl, attr):
+        return getattr(pl, attr)
+    # NOTE(review): anything more complex than a bare "pl.<Name>" is still
+    # evaluated as Python, as before. That is only acceptable while the schema
+    # JSON ships with the package and is never user-supplied.
+    return eval(dtype_str)
+def _s5study_binary_aware_cast(column, current_dtype, target_dtype):
+    """Build a cast to `target_dtype`, decoding Binary data to text first when needed."""
+    if "Binary" not in str(current_dtype):
+        return column.cast(target_dtype)
+    as_text = column.map_elements(
+        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+        return_dtype=pl.Utf8,
+    )
+    if target_dtype == pl.Utf8:
+        return as_text.cast(target_dtype)
+    if "Int" in str(target_dtype):
+        return as_text.str.to_integer().cast(target_dtype)
+    if "Float" in str(target_dtype):
+        return as_text.str.to_decimal().cast(target_dtype)
+    # No text route for this target: try direct casting
+    return column.cast(target_dtype)
+def _s5study_cast_expr(col: str, current_dtype, dtype_str: str):
+    """Build the expression that converts column `col` to the schema dtype `dtype_str`."""
+    column = pl.col(col)
+    if "Int" in dtype_str:
+        target_dtype = _s5study_resolve_dtype(dtype_str)
+        if current_dtype == pl.Utf8:
+            return column.str.to_integer().cast(target_dtype)
+        if current_dtype in [pl.Float64, pl.Float32]:
+            # Non-strict: values that cannot be represented become null
+            return column.cast(target_dtype, strict=False)
+        return _s5study_binary_aware_cast(column, current_dtype, target_dtype)
+    if "Float" in dtype_str:
+        target_dtype = _s5study_resolve_dtype(dtype_str)
+        if current_dtype == pl.Utf8:
+            return column.str.to_decimal().cast(target_dtype)
+        return _s5study_binary_aware_cast(column, current_dtype, target_dtype)
+    if "Utf8" in dtype_str:
+        return column.cast(pl.Utf8)
+    return _s5study_binary_aware_cast(
+        column,
+        current_dtype,
+        _s5study_resolve_dtype(dtype_str),
+    )
+def _s5study_apply_schema_types(df, columns_schema, frame_name: str, logger):
+    """Cast every column of `df` to its schema dtype; a failed cast is logged and skipped."""
+    for col in df.columns:
+        if col not in columns_schema:
+            logger.warning(
+                f"Column '{col}' in {frame_name} not found in schema, keeping original type.",
+            )
+            continue
+        try:
+            dtype_str = columns_schema[col]["dtype"]
+            # Only "pl.*" dtypes are cast; Object columns are already reconstructed
+            if not dtype_str.startswith("pl.") or "Object" in dtype_str:
+                continue
+            df = df.with_columns(_s5study_cast_expr(col, df[col].dtype, dtype_str))
+        except Exception as e:
+            # Best effort: keep the column as loaded rather than abort the load
+            logger.warning(
+                f"Failed to cast column '{col}' in {frame_name}: {e}",
+            )
+    return df
+def _s5study_order_like_schema(df, columns_schema):
+    """Return `df` with its columns arranged in schema order (schema columns only)."""
+    ordered = [col for col in columns_schema if col in df.columns]
+    return df.select(ordered) if ordered else df
 def load_schema(schema_path: str) -> Dict[str, Any]:
     """
     Load schema from JSON file with error handling.

masster 0.3.15__tar.gz → 0.3.16__tar.gz

Potentially problematic release.

masster 0.3.15__tar.gz → 0.3.16__tar.gz