PyPI - masster - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

masster/__init__.py +8 -8
masster/_version.py +1 -1
masster/chromatogram.py +3 -9
masster/data/libs/README.md +1 -1
masster/data/libs/ccm.csv +120 -120
masster/data/libs/ccm.py +116 -62
masster/data/libs/central_carbon_README.md +1 -1
masster/data/libs/urine.py +161 -65
masster/data/libs/urine_metabolites.csv +4693 -4693
masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
masster/logger.py +43 -78
masster/sample/__init__.py +1 -1
masster/sample/adducts.py +264 -338
masster/sample/defaults/find_adducts_def.py +8 -21
masster/sample/defaults/find_features_def.py +1 -6
masster/sample/defaults/get_spectrum_def.py +1 -5
masster/sample/defaults/sample_def.py +1 -5
masster/sample/h5.py +282 -561
masster/sample/helpers.py +75 -131
masster/sample/lib.py +17 -42
masster/sample/load.py +17 -31
masster/sample/parameters.py +2 -6
masster/sample/plot.py +27 -88
masster/sample/processing.py +87 -117
masster/sample/quant.py +51 -57
masster/sample/sample.py +90 -103
masster/sample/sample5_schema.json +44 -44
masster/sample/save.py +12 -35
masster/sample/sciex.py +19 -66
masster/spectrum.py +20 -58
masster/study/__init__.py +1 -1
masster/study/defaults/align_def.py +1 -5
masster/study/defaults/fill_chrom_def.py +1 -5
masster/study/defaults/fill_def.py +1 -5
masster/study/defaults/integrate_chrom_def.py +1 -5
masster/study/defaults/integrate_def.py +1 -5
masster/study/defaults/study_def.py +25 -58
masster/study/export.py +207 -233
masster/study/h5.py +136 -470
masster/study/helpers.py +202 -495
masster/study/helpers_optimized.py +13 -40
masster/study/id.py +110 -213
masster/study/load.py +143 -230
masster/study/plot.py +257 -518
masster/study/processing.py +257 -469
masster/study/save.py +5 -15
masster/study/study.py +276 -379
masster/study/study5_schema.json +96 -96
{masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
masster-0.4.1.dist-info/RECORD +67 -0
masster-0.4.0.dist-info/RECORD +0 -67
{masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
{masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
{masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0

masster/data/libs/ccm.py CHANGED Viewed

@@ -5,8 +5,8 @@ Workflow:
     amino acids, common organic acids, nucleotides, fatty acids, cofactors, sugars).
 - Query PubChem's PUG-REST for MolecularFormula, CanonicalSMILES and InChIKey for each name
     with retries and basic name normalization to improve matching.
-- Save results to `masster/data/examples/central_carbon_metabolites.csv`.
-- Test loading with `master.lib.Lib.import_csv`.
+- Save results to `masster/data/examples/ccm.csv`.
+- Test loading with `masster.lib.Lib.import_csv`.
 This is a best-effort programmatic lookup; ambiguous names may not resolve (those rows will
 have empty Formula/SMILES/InChIKey). For authoritative lists, prefer curated databases
@@ -70,6 +70,7 @@ CCM_METABOLITES = [
     "Pentose",
     "Acetaldehyde",
     "Acetic acid",
     # Proteinogenic amino acids (20 standard)
     "Alanine",
     "Arginine",
@@ -97,6 +98,7 @@ CCM_METABOLITES = [
     "Homocysteine",
     "S-adenosylmethionine",
     "S-adenosylhomocysteine",
     # Common organic acids / intermediates & related small metabolites
     "Formic acid",
     "Propionic acid",
@@ -108,6 +110,7 @@ CCM_METABOLITES = [
     "Beta-hydroxybutyrate",
     "Pyruvic acid",
     "Lactic acid",
     # Fatty acids (common)
     "Myristic acid",
     "Palmitic acid",
@@ -117,6 +120,7 @@ CCM_METABOLITES = [
     "Linoleic acid",
     "Alpha-linolenic acid",
     "Arachidonic acid",
     # Nucleobases and nucleosides
     "Adenine",
     "Guanine",
@@ -127,6 +131,7 @@ CCM_METABOLITES = [
     "Guanosine",
     "Cytidine",
     "Uridine",
     # Nucleotides (mono/di/tri)
     "AMP",
     "ADP",
@@ -140,6 +145,7 @@ CCM_METABOLITES = [
     "UMP",
     "UDP",
     "UTP",
     # Cofactors / common metabolites
     "NAD+",
     "NADH",
@@ -151,6 +157,7 @@ CCM_METABOLITES = [
     "Pantothenic acid",
     "Riboflavin",
     "Niacin",
     # Sugar and sugar derivatives
     "Fructose",
     "Mannose",
@@ -158,6 +165,7 @@ CCM_METABOLITES = [
     "Ribose",
     "Glucosamine",
     "N-acetylglucosamine",
     # Other common metabolites
     "Choline",
     "Betaine",
@@ -171,13 +179,40 @@ CCM_METABOLITES = [
 ]
+def canonicalize_smiles(smiles_str: str) -> str:
+    """
+    Canonicalize SMILES string using RDKit.
+    Args:
+        smiles_str: Input SMILES string
+    Returns:
+        Canonical SMILES string, or original string if canonicalization fails
+    """
+    if not smiles_str or not smiles_str.strip() or Chem is None:
+        return smiles_str
+    try:
+        mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
+        if mol is None:
+            return smiles_str
+        # Generate canonical SMILES with isomeric information
+        canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
+        return canonical_smiles
+    except Exception:
+        # If canonicalization fails, return the original SMILES
+        return smiles_str
 def fetch_from_pubchem(name: str):
-    """Fetch formula, smiles and inchikey from PubChem by compound name.
+    """Fetch formula, smiles, inchikey, and CID from PubChem by compound name.
     Uses basic normalization and retries with exponential backoff. Returns
-    (formula, smiles, inchikey) or (None, None, None) on failure.
+    (formula, smiles, inchikey, cid) or (None, None, None, None) on failure.
     """
-    props = (None, None, None)
+    props = (None, None, None, None)
     def normalize_name(n: str) -> str:
         if not n:
@@ -219,6 +254,21 @@ def fetch_from_pubchem(name: str):
             return None
         return None
+    def try_query_with_cid(q: str):
+        """Query compound by name and get CID along with properties."""
+        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(q)}/cids/JSON"
+        try:
+            r = requests.get(url, timeout=15)
+            if r.status_code == 200:
+                j = r.json()
+                if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
+                    cids = j['IdentifierList']['CID']
+                    if cids:
+                        return cids[0]  # Return the first CID
+        except Exception:
+            return None
+        return None
     def try_query_inchikey(ik: str):
         url = (
             f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{quote(ik)}/property/"
@@ -251,8 +301,8 @@ def fetch_from_pubchem(name: str):
             r = requests.get(url, timeout=15)
             if r.status_code == 200:
                 j = r.json()
-                if "IdentifierList" in j and "CID" in j["IdentifierList"]:
-                    return j["IdentifierList"]["CID"]
+                if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
+                    return j['IdentifierList']['CID']
         except Exception:
             return []
         return []
@@ -262,7 +312,13 @@ def fetch_from_pubchem(name: str):
     # exponential backoff attempts
     attempts = 3
+    cid = None
     for i in range(attempts):
+        # First try to get the CID
+        if not cid:
+            cid = try_query_with_cid(query)
         j = try_query(query)
         if j:
             try:
@@ -275,11 +331,7 @@ def fetch_from_pubchem(name: str):
                     # if SMILES missing, try a lookup by InChIKey (dedicated endpoint)
                     if not sm and ik:
                         j2 = try_query_inchikey(ik)
-                        if (
-                            j2
-                            and "PropertyTable" in j2
-                            and "Properties" in j2["PropertyTable"]
-                        ):
+                        if j2 and "PropertyTable" in j2 and "Properties" in j2["PropertyTable"]:
                             p2 = j2["PropertyTable"]["Properties"][0]
                             sm = p2.get("CanonicalSMILES") or sm
                             inchi = inchi or p2.get("InChI")
@@ -296,38 +348,39 @@ def fetch_from_pubchem(name: str):
                     # if still no SMILES, try fetching CIDs from InChIKey and query a CID record
                     if not sm and ik:
                         cids = try_get_cids_from_inchikey(ik)
-                        for cid in (cids or [])[:5]:
-                            j3 = try_query_cid(cid)
-                            if (
-                                j3
-                                and "PropertyTable" in j3
-                                and "Properties" in j3["PropertyTable"]
-                            ):
+                        for cid_from_ik in (cids or [])[:5]:
+                            if not cid:  # Only set CID if we don't have one yet
+                                cid = cid_from_ik
+                            j3 = try_query_cid(cid_from_ik)
+                            if j3 and "PropertyTable" in j3 and "Properties" in j3["PropertyTable"]:
                                 p3 = j3["PropertyTable"]["Properties"][0]
                                 sm = p3.get("CanonicalSMILES") or sm
                                 if sm:
                                     break
-                    return (mf, sm, ik)
+                    return (mf, sm, ik, cid)
             except Exception:
                 pass
-        time.sleep(1 + 2**i)
+        time.sleep(1 + 2 ** i)
     # final fallback: try raw name without normalization
+    if not cid:
+        cid = try_query_with_cid(name)
     j = try_query(name)
     if j and "PropertyTable" in j and "Properties" in j["PropertyTable"]:
         p = j["PropertyTable"]["Properties"][0]
-        return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"))
+        return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"), cid)
     return props
-def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
+def generate_csv(out_path: str = "ccm.csv"):
     rows = []
     for name in CCM_METABOLITES:
-        formula, smiles, inchikey = (None, None, None)
+        formula, smiles, inchikey, cid = (None, None, None, None)
         if requests is not None:
-            formula, smiles, inchikey = fetch_from_pubchem(name)
+            formula, smiles, inchikey, cid = fetch_from_pubchem(name)
         # Neutralize charged molecular formulas (e.g., trailing +, -, 2+, 3-)
         # by adjusting the hydrogen count accordingly and removing the explicit charge.
@@ -336,15 +389,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
                 return fmt
             s = fmt.strip()
             # normalize common unicode superscripts (²³¹⁺⁻) to ascii
-            sup_map = str.maketrans(
-                {
-                    "²": "2",
-                    "³": "3",
-                    "¹": "1",
-                    "⁺": "+",
-                    "⁻": "-",
-                },
-            )
+            sup_map = str.maketrans({
+                "²": "2",
+                "³": "3",
+                "¹": "1",
+                "⁺": "+",
+                "⁻": "-",
+            })
             s = s.translate(sup_map)
             # Remove enclosing brackets if present, e.g. [C6H5O7]2-
             if s.startswith("[") and s.endswith("]"):
@@ -361,17 +412,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
             # determine magnitude and sign for patterns like '2-' or '-2' or '+2' or '3+'
             sign = 1
             mag = 1
-            if charge_str[0] in "+-":
+            if charge_str[0] in '+-':
                 # formats like '-2' or '+2' or '-' or '+'
-                sign = -1 if charge_str[0] == "-" else 1
-                mag = (
-                    int(charge_str[1:])
-                    if len(charge_str) > 1 and charge_str[1:].isdigit()
-                    else 1
-                )
-            elif charge_str[-1] in "+-":
+                sign = -1 if charge_str[0] == '-' else 1
+                mag = int(charge_str[1:]) if len(charge_str) > 1 and charge_str[1:].isdigit() else 1
+            elif charge_str[-1] in '+-':
                 # formats like '2-' or '3+'
-                sign = -1 if charge_str[-1] == "-" else 1
+                sign = -1 if charge_str[-1] == '-' else 1
                 mag = int(charge_str[:-1]) if charge_str[:-1].isdigit() else 1
             # parse element counts from base formula
@@ -414,7 +461,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
             for el in elems:
                 if el in counts:
                     n = counts[el]
-                    parts.append(f"{el}{n if n != 1 else ''}")
+                    parts.append(f"{el}{n if n!=1 else ''}")
             new_formula = "".join(parts)
             return new_formula
@@ -443,11 +490,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
                     q = a.GetFormalCharge()
                     if q > 0:
                         # remove up to q hydrogen neighbors (by index)
-                        h_neighbors = [
-                            nbr.GetIdx()
-                            for nbr in a.GetNeighbors()
-                            if nbr.GetSymbol() == "H"
-                        ]
+                        h_neighbors = [nbr.GetIdx() for nbr in a.GetNeighbors() if nbr.GetSymbol() == "H"]
                         remove = h_neighbors[: min(len(h_neighbors), q)]
                         to_remove.extend(remove)
                     elif q < 0:
@@ -486,21 +529,32 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
             smiles = neutralize_smiles(smiles) if smiles else smiles
         except Exception:
             pass
-        rows.append(
-            {
-                "Name": name,
-                "Formula": formula or "",
-                "SMILES": smiles or "",
-                "InChIKey": inchikey or "",
-            },
-        )
+        # Canonicalize SMILES after neutralization
+        try:
+            smiles = canonicalize_smiles(smiles) if smiles else smiles
+        except Exception:
+            pass
+        # Format the database ID and database name
+        db_id = f"CID:{cid}" if cid else ""
+        db = "pubchem" if cid else ""
+        rows.append({
+            "Name": name,
+            "Formula": formula or "",
+            "SMILES": smiles or "",
+            "InChIKey": inchikey or "",
+            "db_id": db_id,
+            "db": db,
+        })
     # Ensure output directory exists (data/libs)
-    out_dir = os.path.join("master", "data", "libs")
+    out_dir = os.path.join('masster', 'data', 'libs')
     os.makedirs(out_dir, exist_ok=True)
     out_path_full = os.path.join(out_dir, os.path.basename(out_path))
-    fieldnames = ["Name", "Formula", "SMILES", "InChIKey"]
+    fieldnames = ["Name", "Formula", "SMILES", "InChIKey", "db_id", "db"]
     with open(out_path_full, "w", newline="", encoding="utf-8") as f:
         writer = csv.DictWriter(f, fieldnames=fieldnames)
         writer.writeheader()
@@ -512,11 +566,11 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
 def test_load_with_lib(csv_path: str):
-    """Try to load the generated CSV using master.lib.Lib.import_csv."""
+    """Try to load the generated CSV using masster.lib.Lib.import_csv."""
     try:
-        from master.lib import Lib
+        from masster.lib import Lib
     except Exception as e:
-        print(f"Cannot import master.lib.Lib: {e}")
+        print(f"Cannot import masster.lib.Lib: {e}")
         return False
     try:

masster/data/libs/central_carbon_README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 central_carbon_metabolites.csv
-This folder contains example compound lists used by the master package.
+This folder contains example compound lists used by the masster package.
 Files:
 - central_carbon_metabolites.csv: a best-effort list of central carbon metabolism related

masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl