PyPI - masster - Versions diffs - 0.5.18__tar.gz → 0.5.20__tar.gz - Mend

masster 0.5.18__tar.gz → 0.5.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (98) hide show

{masster-0.5.18 → masster-0.5.20}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.18
+Version: 0.5.20
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.5.18 → masster-0.5.20}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.5.18"
+version = "0.5.20"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.5.18 → masster-0.5.20}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.18"
+__version__ = "0.5.19"
 def get_version():

{masster-0.5.18 → masster-0.5.20}/src/masster/lib/lib.py RENAMED Viewed

@@ -46,6 +46,7 @@ annotations = lib.annotate_features(sample.features_df)
 """
 import os
+import json
 from typing import Optional, Union, List, Dict, Any, TYPE_CHECKING
 import warnings
@@ -685,6 +686,142 @@ class Lib:
             if skipped_compounds > 0:
                 print(f"All {total_compounds} compounds were skipped due to invalid formulas")
+    def import_json(self,
+                   jsonfile: str,
+                   polarity: Optional[str] = None,
+                   adducts: Optional[List[str]] = None,
+                   min_probability: float = 0.03) -> None:
+        """
+        Import compound library from a JSON file created by csv_to_json.py.
+        This method reads a JSON file with the structure created by csv_to_json.py
+        and generates adduct variants for each compound.
+        Args:
+            jsonfile: Path to the JSON file
+            polarity: Ionization polarity ("positive", "negative", or None for positive)
+            adducts: Specific adducts to generate. If None, generates defaults for the polarity
+            min_probability: Minimum probability threshold for adduct filtering
+        Expected JSON structure:
+            {
+                "version": "1.0",
+                "creation_date": "2025-10-07T09:17:06.142290",
+                "description": "Converted from CSV file...",
+                "source_file": "filename.csv",
+                "record_count": 123,
+                "data": [
+                    {
+                        "name": "compound name",
+                        "smiles": "SMILES string",
+                        "inchikey": "InChI key",
+                        "formula": "molecular formula",
+                        "db_id": "database ID",
+                        "db": "database name"
+                    },
+                    ...
+                ]
+            }
+        Raises:
+            FileNotFoundError: If JSON file doesn't exist
+            ValueError: If JSON structure is invalid or required data is missing
+        """
+        if not os.path.exists(jsonfile):
+            raise FileNotFoundError(f"JSON file not found: {jsonfile}")
+        # Read and parse JSON file
+        try:
+            with open(jsonfile, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON file: {e}") from e
+        except Exception as e:
+            raise ValueError(f"Error reading JSON file: {e}") from e
+        # Validate JSON structure
+        if not isinstance(json_data, dict):
+            raise ValueError("JSON file must contain a dictionary at root level")
+        if "data" not in json_data:
+            raise ValueError("JSON file must contain a 'data' field with compound records")
+        data = json_data["data"]
+        if not isinstance(data, list):
+            raise ValueError("'data' field must be a list of compound records")
+        # Extract metadata for reporting
+        version = json_data.get("version", "unknown")
+        source_file = json_data.get("source_file", "unknown")
+        record_count = json_data.get("record_count", len(data))
+        print(f"Loading JSON library: version {version}, source: {source_file}, records: {record_count}")
+        # Process each compound
+        all_variants = []
+        cmpd_id_counter = 1
+        lib_id_counter = 1
+        total_compounds = 0
+        skipped_compounds = 0
+        for compound_record in data:
+            total_compounds += 1
+            # Validate required fields
+            if not isinstance(compound_record, dict):
+                skipped_compounds += 1
+                continue
+            formula = compound_record.get("formula", "")
+            if not formula or not isinstance(formula, str):
+                skipped_compounds += 1
+                continue
+            # Extract compound data, handling both CSV column names and JSON field names
+            compound_level_uid = cmpd_id_counter
+            cmpd_id_counter += 1
+            compound_data = {
+                "name": compound_record.get("name", compound_record.get("Name", "")),
+                "shortname": compound_record.get("shortname", ""),
+                "class": compound_record.get("class", ""),
+                "smiles": compound_record.get("smiles", compound_record.get("SMILES", "")),
+                "inchi": compound_record.get("inchi", compound_record.get("InChI", "")),
+                "inchikey": compound_record.get("inchikey", compound_record.get("InChIKey", "")),
+                "formula": formula,
+                "rt": self._safe_float_conversion(compound_record.get("rt", compound_record.get("RT", None))),
+                "db_id": compound_record.get("db_id", compound_record.get("database_id", None)),
+                "db": compound_record.get("db", compound_record.get("database", None)),
+                "cmpd_uid": compound_level_uid,
+            }
+            # Generate adduct variants
+            variants, lib_id_counter = self._generate_adduct_variants(
+                compound_data, adducts=adducts, polarity=polarity,
+                lib_id_counter=lib_id_counter, min_probability=min_probability
+            )
+            all_variants.extend(variants)
+            # Track if compound was skipped due to invalid formula
+            if len(variants) == 0:
+                skipped_compounds += 1
+        # Convert to DataFrame and store
+        if all_variants:
+            new_lib_df = pl.DataFrame(all_variants)
+            # Combine with existing data if any
+            if self.lib_df is not None and len(self.lib_df) > 0:
+                self.lib_df = pl.concat([self.lib_df, new_lib_df])
+            else:
+                self.lib_df = new_lib_df
+            print(f"Imported {len(all_variants)} library entries from {jsonfile}")
+        else:
+            print(f"No valid compounds found in {jsonfile}")
+            if skipped_compounds > 0:
+                print(f"All {total_compounds} compounds were skipped due to invalid formulas")
     def _map_csv_columns(self, columns: List[str]) -> Dict[str, str]:
         """
         Map CSV column names to standardized internal names (case-insensitive).

{masster-0.5.18 → masster-0.5.20}/src/masster/sample/defaults/find_ms2_def.py RENAMED Viewed

@@ -42,7 +42,7 @@ class find_ms2_defaults:
         - get_description(param_name): Get parameter description
         - get_info(param_name): Get full parameter metadata
         - list_parameters(): Get list of all parameter names
-        - get_mz_tolerance(file_type): Get appropriate m/z tolerance based on file type
+        - get_mz_tolerance(type): Get appropriate m/z tolerance based on type
     """
     # Core MS2 linking parameters
@@ -270,16 +270,16 @@ class find_ms2_defaults:
         return len(invalid_params) == 0, invalid_params
-    def get_mz_tolerance(self, file_type=None):
+    def get_mz_tolerance(self, type=None):
         """
-        Get the appropriate m/z tolerance based on file type.
+        Get the appropriate m/z tolerance based on type.
         Args:
-            file_type (str, optional): File type ('ztscan', 'dia', or other)
+            type (str, optional): Acquisition type ('ztscan', 'dia', or other)
         Returns:
             float: Appropriate m/z tolerance value
         """
-        if file_type is not None and file_type.lower() in ["ztscan", "dia"]:
+        if type is not None and type.lower() in ["ztscan", "dia"]:
             return self.get("mz_tol_ztscan")
         return self.get("mz_tol")

{masster-0.5.18 → masster-0.5.20}/src/masster/sample/h5.py RENAMED Viewed

@@ -1,4 +1,4 @@
-import json
+import json
 import os
 import h5py
@@ -94,8 +94,8 @@ def _save_sample5(
             metadata_group.attrs["file_source"] = str(self.file_source)
         else:
             metadata_group.attrs["file_source"] = ""
-        if hasattr(self, 'file_type') and self.file_type is not None:
-            metadata_group.attrs["file_type"] = str(self.file_type)
+        if hasattr(self, 'type') and self.type is not None:
+            metadata_group.attrs["file_type"] = str(self.type)
         else:
             metadata_group.attrs["file_type"] = ""
         if self.label is not None:
@@ -393,7 +393,7 @@ def _load_sample5(self, filename: str, map: bool = False):
             else:
                 self.file_source = self.file_path
-            self.file_type = decode_metadata_attr(
+            self.type = decode_metadata_attr(
                 metadata_group.attrs.get("file_type", ""),
             )
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
@@ -1160,7 +1160,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
             else:
                 self.file_source = self.file_path
-            self.file_type = decode_metadata_attr(
+            self.type = decode_metadata_attr(
                 metadata_group.attrs.get("file_type", ""),
             )
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
@@ -2302,7 +2302,7 @@ def create_h5_metadata_group(
     f: h5py.File,
     file_path: Optional[str],
     file_source: Optional[str],
-    file_type: Optional[str],
+    type: Optional[str],
     label: Optional[str],
 ) -> None:
     """
@@ -2312,7 +2312,7 @@ def create_h5_metadata_group(
         f: The HDF5 file object
         file_path: Source file path
         file_source: Original source file path
-        file_type: Source file type
+        type: Source file type
         label: Sample label
     """
     metadata_group = f.create_group("metadata")
@@ -2321,5 +2321,5 @@ def create_h5_metadata_group(
     metadata_group.attrs["file_source"] = (
         str(file_source) if file_source is not None else ""
     )
-    metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
+    metadata_group.attrs["file_type"] = str(type) if type is not None else ""
     metadata_group.attrs["label"] = str(label) if label is not None else ""

{masster-0.5.18 → masster-0.5.20}/src/masster/sample/processing.py RENAMED Viewed

@@ -1028,7 +1028,7 @@ def find_ms2(self, **kwargs):
     - mz_tol (float):
         Precursor m/z tolerance used for matching. The effective tolerance may be
-        adjusted by file type (the defaults class provides ``get_mz_tolerance(file_type)``).
+        adjusted by type (the defaults class provides ``get_mz_tolerance(type)``).
         Default: 0.5 (ztscan/DIA defaults may be larger).
     - centroid (bool):
@@ -1077,7 +1077,7 @@ def find_ms2(self, **kwargs):
     # Extract parameter values
     features = params.get("features")
-    mz_tol = params.get_mz_tolerance(self.file_type)
+    mz_tol = params.get_mz_tolerance(self.type)
     centroid = params.get("centroid")
     deisotope = params.get("deisotope")
     dia_stats = params.get("dia_stats")

{masster-0.5.18 → masster-0.5.20}/src/masster/sample/sciex.py RENAMED Viewed

@@ -379,7 +379,7 @@ class SciexWiffData:
         self._raw_file_path = ""
         self.centroided = centroided
         self.creation_time = ""
-        self.file_type = "sciex"
+        self.type = "sciex"
         self.instrument = "sciex"
         if self.centroided:
@@ -616,7 +616,7 @@ if __name__ == "__main__":
             print(f"  - Number of spectra: {len(wiff_data.spectrum_df)}")
             print(f"  - Number of peaks: {len(wiff_data.peak_df)}")
             print(f"  - Creation time: {wiff_data.creation_time}")
-            print(f"  - File type: {wiff_data.file_type}")
+            print(f"  - File type: {wiff_data.type}")
             print(f"  - Instrument: {wiff_data.instrument}")
             # Test getting peaks from first spectrum

{masster-0.5.18 → masster-0.5.20}/src/masster/sample/thermo.py RENAMED Viewed

@@ -524,7 +524,7 @@ class ThermoRawData:
         # File and instrument information
         self._raw_file_path = ""
         self.creation_time = ""
-        self.file_type = "thermo"
+        self.type = "thermo"
         self.instrument = "thermo"
         # Processing parameters

{masster-0.5.18 → masster-0.5.20}/src/masster/study/id.py RENAMED Viewed

@@ -21,10 +21,10 @@ def lib_load(
     Args:
         study: Study instance
-        lib_source: either a CSV file path (str) or a Lib instance
-        polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV path.
+        lib_source: either a CSV/JSON file path (str) or a Lib instance
+        polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
                  If None, uses study.polarity automatically.
-        adducts: specific adducts to generate - used when lib_source is a CSV path
+        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
         iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
     Side effects:
@@ -38,7 +38,7 @@ def lib_load(
         Lib = None
     if lib_source is None:
-        raise ValueError("lib_source must be a CSV file path (str) or a Lib instance")
+        raise ValueError("lib_source must be a CSV/JSON file path (str) or a Lib instance")
     # Use study polarity if not explicitly provided
     if polarity is None:
@@ -52,15 +52,23 @@ def lib_load(
             polarity = "positive"  # Default fallback
         study.logger.debug(f"Using study polarity: {polarity}")
-    # Handle string input (CSV file path)
+    # Handle string input (CSV or JSON file path)
     if isinstance(lib_source, str):
         if Lib is None:
             raise ImportError(
-                "Could not import masster.lib.lib.Lib - required for CSV loading",
+                "Could not import masster.lib.lib.Lib - required for CSV/JSON loading",
             )
         lib_obj = Lib()
-        lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
+        # Determine file type by extension
+        if lib_source.lower().endswith('.json'):
+            lib_obj.import_json(lib_source, polarity=polarity, adducts=adducts)
+        elif lib_source.lower().endswith('.csv'):
+            lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
+        else:
+            # Default to CSV behavior for backward compatibility
+            lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
     # Handle Lib instance
     elif Lib is not None and isinstance(lib_source, Lib):
@@ -72,7 +80,7 @@ def lib_load(
     else:
         raise TypeError(
-            "lib_source must be a CSV file path (str), a masster.lib.Lib instance, or have a 'lib_df' attribute",
+            "lib_source must be a CSV/JSON file path (str), a masster.lib.Lib instance, or have a 'lib_df' attribute",
         )
     # Ensure lib_df is populated
@@ -101,7 +109,7 @@ def lib_load(
     # Store pointer and DataFrame on study
     study._lib = lib_obj
-    # Add source_id column with filename (without path) if loading from CSV
+    # Add source_id column with filename (without path) if loading from CSV/JSON
     if isinstance(lib_source, str):
         import os
         filename_only = os.path.basename(lib_source)

{masster-0.5.18 → masster-0.5.20}/src/masster/wizard/README.md RENAMED Viewed

@@ -12,7 +12,7 @@ from masster import Wizard
 # Create wizard with minimal configuration
 wizard = Wizard(
     data_source="./raw_data",      # Directory with raw files
-    study_folder="./processed",    # Output directory
+    study_folder="./processed",    # Output directory
     polarity="positive",           # or "negative"
     num_cores=4                    # CPU cores to use
 )
@@ -35,22 +35,22 @@ params = wizard_def(
     study_folder="./processed_advanced",
     polarity="negative",
     num_cores=8,
     # File discovery
     file_extensions=[".wiff", ".raw", ".mzML"],
     search_subfolders=True,
     skip_patterns=["blank", "QC", "test"],
     # Processing parameters
     adducts=["H-1:-:0.95", "Cl:-:0.05", "CH2O2:0:0.2"],
     chrom_fwhm=0.15,
     noise_threshold=5e4,
     # Study assembly
     rt_tolerance=1.0,
     mz_tolerance=0.008,
     min_samples_for_merge=30,
     # Output options
     export_formats=["csv", "xlsx", "mgf", "parquet"],
     generate_plots=True,
@@ -73,7 +73,7 @@ wizard.run_full_pipeline()
 ### 💾 Intelligent Resume
 - **Checkpoint System**: Automatically saves progress at key points
-- **File Tracking**: Remembers which files have been processed successfully
+- **File Tracking**: Remembers which files have been processed successfully
 - **Smart Recovery**: Resumes from last successful step after interruption
 - **Validation**: Verifies existing outputs before skipping
@@ -112,7 +112,7 @@ wizard.run_full_pipeline()
 ### 4. Feature Alignment
 - **RT Alignment**: Corrects retention time shifts between samples
-- **Mass Alignment**: Accounts for mass calibration differences
+- **Mass Alignment**: Accounts for mass calibration differences
 - **Algorithm Selection**: Supports KD-tree, QT-clustering, and chunked methods
 - **Validation**: Reports alignment statistics and quality metrics
@@ -232,7 +232,7 @@ Returns detailed status dictionary with current step, processed files, timing, a
 - Ensure sufficient disk space in output directory
 - Close any applications that might lock files
-**Processing Failures**
+**Processing Failures**
 - Check individual file integrity
 - Review `skip_patterns` to exclude problematic files
 - Examine detailed logs in `wizard.log` and `processing.log`
@@ -249,7 +249,7 @@ The Wizard includes built-in validation at each step:
 - **File Validation**: Checks file accessibility and format compatibility
 - **Processing Validation**: Verifies sample5 outputs can be loaded
-- **Study Validation**: Ensures study assembly completed successfully
+- **Study Validation**: Ensures study assembly completed successfully
 - **Alignment Validation**: Reports alignment statistics and warnings
 - **Export Validation**: Confirms all requested outputs were created
@@ -257,7 +257,7 @@ The Wizard includes built-in validation at each step:
 ### System Requirements
 - **Minimum**: 4 CPU cores, 8 GB RAM
-- **Recommended**: 8+ CPU cores, 16+ GB RAM
+- **Recommended**: 8+ CPU cores, 16+ GB RAM
 - **Large Studies**: 16+ CPU cores, 32+ GB RAM
 - **Storage**: SSD recommended, ~2-3x raw data size free space
@@ -265,7 +265,7 @@ The Wizard includes built-in validation at each step:
 **For Small Studies (< 50 samples)**
 - Use `num_cores = 4-6`
-- Set `batch_size = 4-8`
+- Set `batch_size = 4-8`
 - Use `merge_method = "kd"`
 - Enable all export formats
@@ -297,7 +297,7 @@ if not wizard.study_folder_path.glob("*.sample5"):
 # Continue with study-level processing
 wizard.assemble_study()
-wizard.align_and_merge()
+wizard.align_and_merge()
 wizard.export_results()
 ```
@@ -312,7 +312,7 @@ studies = [
 for study_config in studies:
     wizard = Wizard(**study_config, num_cores=8)
     success = wizard.run_full_pipeline()
     if success:
         print(f"✅ {study_config['output']} completed")
     else:
@@ -338,7 +338,7 @@ if hasattr(wizard.study, 'features_filter'):
     )
     wizard.study.features_filter(selection)
-# Continue with standard pipeline
+# Continue with standard pipeline
 wizard.align_and_merge()
 wizard.generate_plots()
 ```
@@ -370,4 +370,4 @@ The Wizard generates several types of output files:
 - `sample_name.mgf` - Individual sample MS2 spectra
 - `sample_name_2d.html` - Individual sample 2D plot
-The Wizard provides a complete, automated solution for mass spectrometry data processing while maintaining flexibility for custom workflows and providing robust error handling and recovery capabilities.
+The Wizard provides a complete, automated solution for mass spectrometry data processing while maintaining flexibility for custom workflows and providing robust error handling and recovery capabilities.

masster 0.5.18__tar.gz → 0.5.20__tar.gz

Potentially problematic release.

masster 0.5.18__tar.gz → 0.5.20__tar.gz