harmonsmile 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ harmonsmile — Harmonize SMILES strings to canonical + isomeric + Kekulized convention.
4
+
5
+ Provides pipelines and utilities for standardizing SMILES strings using RDKit,
6
+ following the COCONUT 2.0 convention: canonical, isomeric, and Kekulized form.
7
+
8
+ Classes
9
+ -------
10
+ RDKitStandardizer
11
+ Standardize SMILES strings using RDKit.
12
+ Config
13
+ Immutable configuration for harmonsmile pipelines.
14
+ PubChemIngest
15
+ Pipeline for ingesting and harmonizing PubChem compound data.
16
+ ChEMBLIngest
17
+ Pipeline for ingesting and harmonizing ChEMBL compound data.
18
+ SMILESPrep
19
+ Pipeline for harmonizing SMILES from any tabular source.
20
+
21
+ Functions
22
+ ---------
23
+ load_table(path)
24
+ Load a tabular file into a DataFrame.
25
+ save_table(df, path)
26
+ Save a DataFrame to a CSV file.
27
+
28
+ Examples
29
+ --------
30
+ Standardize a single SMILES string:
31
+
32
+ >>> from harmonsmile import RDKitStandardizer
33
+ >>> std = RDKitStandardizer()
34
+ >>> std.to_iso_kek("c1ccccc1")
35
+ 'C1=CC=CC=C1'
36
+ >>> std.to_conn_kek("C[C@@H](O)F")
37
+ 'CC(O)F'
38
+
39
+ Harmonize a COCONUT or independent database:
40
+
41
+ >>> from harmonsmile import CoconutPrep
42
+ >>> CoconutPrep(
43
+ ... input_path="data/database.csv",
44
+ ... smiles_col="SMILES",
45
+ ... output_path="results/database_harmonized.csv",
46
+ ... ).run()
47
+
48
+ Fetch and harmonize PubChem data:
49
+
50
+ >>> from harmonsmile import PubChemIngest, Config
51
+ >>> cfg = Config(
52
+ ... input_path="data/database_pubchem.csv",
53
+ ... output_path="results/pubchem_harmonized.csv",
54
+ ... )
55
+ >>> PubChemIngest(cfg).run()
56
+ """
57
+
58
+ from .standardize import RDKitStandardizer
59
+ from .pipelines import PubChemIngest, ChEMBLIngest, SMILESPrep, CoconutPrep
60
+ from .config import Config
61
+ from .pubchem import PubChemClient
62
+ from .io import load_table, save_table
63
+ from .version import __version__, PROJECT_NAME, PROJECT_VERSION, PROJECT_STATUS
64
+
65
+ __author__ = "Flavio F. Contreras-Torres"
66
+
67
# Names re-exported by ``from harmonsmile import *``.
# ``CoconutPrep`` and ``PubChemClient`` are imported at package level above
# (``CoconutPrep`` is also used in the module docstring example), so they are
# listed here together with the other re-exported names.
__all__ = [
    "RDKitStandardizer",
    "PubChemIngest",
    "ChEMBLIngest",
    "SMILESPrep",
    "CoconutPrep",
    "Config",
    "PubChemClient",
    "load_table",
    "save_table",
    "__version__",
    "PROJECT_NAME",
    "PROJECT_VERSION",
    "PROJECT_STATUS",
]
@@ -0,0 +1,27 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ Entry point for running harmonsmile as a module.
4
+
5
+ Allows the package to be invoked directly from the command line
6
+ using ``python -m harmonsmile``. All arguments are forwarded to the
7
+ CLI defined in :mod:`harmonsmile._cli`.
8
+
9
+ See Also
10
+ --------
11
+ harmonsmile._cli.main : The CLI entry point function.
12
+
13
+ Examples
14
+ --------
15
+ Fetch PubChem properties and standardize SMILES::
16
+
17
+ python -m harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
18
+
19
+ Standardize an existing SMILES column::
20
+
21
+ python -m harmonsmile --coconut-in data/db.csv --coconut-smiles SMILES --coconut-out results/out.csv
22
+ """
23
+
24
+ from harmonsmile._cli import main
25
+
26
# Only start the CLI when this module is executed as a script
# (``python -m harmonsmile``); a plain import has no side effects.
if __name__ == "__main__":
    main()
harmonsmile/_cli.py ADDED
@@ -0,0 +1,127 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ Command-line interface for harmonsmile.
4
+
5
+ Implements the ``harmonsmile`` entry point and ``python -m harmonsmile``
6
+ invocation. Arguments are parsed and forwarded to
7
+ :class:`~harmonsmile.pipelines.PubChemIngest`,
8
+ :class:`~harmonsmile.pipelines.ChEMBLIngest`, and
9
+ :class:`~harmonsmile.pipelines.SMILESPrep`.
10
+
11
+ Examples
12
+ --------
13
+ ::
14
+
15
+ harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
16
+ harmonsmile --chembl-in data/db.csv --chembl-out results/out.csv
17
+ harmonsmile --coconut-in data/db.csv --coconut-smiles SMILES --coconut-out results/out.csv
18
+ python -m harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
19
+ """
20
+
21
+ from __future__ import annotations
22
+ from .version import __version__
23
+ import argparse
24
+ import os
25
+
26
+ from .config import Config
27
+ from .pipelines import PubChemIngest, ChEMBLIngest, SMILESPrep
28
+
29
+
30
def _ensure_dirs() -> None:
    """
    Create the working directories used by the CLI.

    Makes sure ``logs`` and ``results`` exist relative to the current
    working directory. Existing directories are left untouched.
    """
    os.makedirs("logs", exist_ok=True)
    os.makedirs("results", exist_ok=True)
33
+
34
+
35
def _parse(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Build the argument parser, parse ``argv`` and validate option pairing.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. Passed straight to
        :meth:`argparse.ArgumentParser.parse_args`.

    Returns
    -------
    argparse.Namespace
        Parsed options with the attributes ``pub_in``, ``pub_out``,
        ``pubchem_cidcol``, ``chembl_in``, ``chembl_out``, ``chembl_idcol``,
        ``coco_in``, ``coco_out`` and ``coco_smiles``.

    Notes
    -----
    Every ``--*-in`` option must be accompanied by its ``--*-out``
    counterpart (and vice versa), and ``--coconut-in`` additionally needs
    ``--coconut-smiles``. Violations are reported through
    :meth:`argparse.ArgumentParser.error`.
    """
    parser = argparse.ArgumentParser(
        prog="harmonsmile",
        description="Harmonize SMILES strings to canonical + isomeric + Kekulized convention.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    pubchem_group = parser.add_argument_group("PubChem")
    pubchem_group.add_argument("--pubchem-in", dest="pub_in", metavar="FILE")
    pubchem_group.add_argument("--pubchem-out", dest="pub_out", metavar="FILE")
    pubchem_group.add_argument(
        "--pubchem-cidcol",
        dest="pubchem_cidcol",
        default="PubChem CID",
        metavar="COL",
    )

    chembl_group = parser.add_argument_group("ChEMBL")
    chembl_group.add_argument("--chembl-in", dest="chembl_in", metavar="FILE")
    chembl_group.add_argument("--chembl-out", dest="chembl_out", metavar="FILE")
    chembl_group.add_argument(
        "--chembl-idcol",
        dest="chembl_idcol",
        default="ChEMBL ID",
        metavar="COL",
    )

    coconut_group = parser.add_argument_group("COCONUT / independent")
    coconut_group.add_argument("--coconut-in", dest="coco_in", metavar="FILE")
    coconut_group.add_argument("--coconut-out", dest="coco_out", metavar="FILE")
    coconut_group.add_argument("--coconut-smiles", dest="coco_smiles", metavar="COL")

    ns = parser.parse_args(argv)

    # Input and output options only make sense together; the checks run in
    # this order so the first offending pair is the one reported.
    paired_options = (
        (ns.pub_in, ns.pub_out,
         "--pubchem-in and --pubchem-out must be provided together."),
        (ns.chembl_in, ns.chembl_out,
         "--chembl-in and --chembl-out must be provided together."),
        (ns.coco_in, ns.coco_out,
         "--coconut-in and --coconut-out must be provided together."),
    )
    for source, target, message in paired_options:
        if bool(source) != bool(target):
            parser.error(message)

    if ns.coco_in and not ns.coco_smiles:
        parser.error("--coconut-smiles is required when --coconut-in is provided.")

    return ns
69
+
70
+
71
def main(argv: list[str] | None = None) -> None:
    """
    Run the harmonsmile command-line interface.

    Each pipeline whose options were supplied is executed in the fixed
    order PubChem, ChEMBL, COCONUT/independent. The ``logs`` and
    ``results`` directories are ensured before every pipeline run.

    Parameters
    ----------
    argv : list of str, optional
        Argument list forwarded to the parser. When None, argparse reads
        the process command line.

    Raises
    ------
    SystemExit
        If no pipeline was selected by the given arguments.

    Examples
    --------
    Programmatic invocation with PubChem pipeline:

    >>> from harmonsmile._cli import main
    >>> main(["--pubchem-in", "data/db.csv", "--pubchem-out", "results/out.csv"])  # doctest: +SKIP

    Programmatic invocation with ChEMBL pipeline:

    >>> main(["--chembl-in", "data/db.csv", "--chembl-out", "results/out.csv"])  # doctest: +SKIP

    Programmatic invocation with COCONUT pipeline:

    >>> main(["--coconut-in", "data/db.csv", "--coconut-smiles", "SMILES",
    ...       "--coconut-out", "results/out.csv"])  # doctest: +SKIP
    """
    opts = _parse(argv)
    executed = 0

    # PubChem: configuration is carried by a Config object.
    if opts.pub_in and opts.pub_out:
        _ensure_dirs()
        pubchem_cfg = Config(
            input_path=opts.pub_in,
            output_path=opts.pub_out,
            cid_col=opts.pubchem_cidcol,
        )
        PubChemIngest(pubchem_cfg).run()
        executed += 1

    # ChEMBL: the pipeline takes its paths and ID column directly.
    if opts.chembl_in and opts.chembl_out:
        _ensure_dirs()
        chembl_job = ChEMBLIngest(
            input_path=opts.chembl_in,
            output_path=opts.chembl_out,
            chembl_id_col=opts.chembl_idcol,
        )
        chembl_job.run()
        executed += 1

    # COCONUT / independent tables: positional (input, SMILES column, output).
    if opts.coco_in and opts.coco_out and opts.coco_smiles:
        _ensure_dirs()
        smiles_job = SMILESPrep(opts.coco_in, opts.coco_smiles, opts.coco_out)
        smiles_job.run()
        executed += 1

    if executed == 0:
        raise SystemExit(
            "Nothing to run. Provide --pubchem-*, --chembl-*, and/or --coconut-* arguments.\n"
            "Run 'harmonsmile --help' for usage."
        )
harmonsmile/chembl.py ADDED
@@ -0,0 +1,179 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ ChEMBL REST API client for harmonsmile.
4
+
5
+ Provides :class:`_ChEMBLClient` for fetching compound properties from the
6
+ ChEMBL REST API, with exponential backoff and persistent connection reuse.
7
+ """
8
+
9
+ from __future__ import annotations
10
+ import logging
11
+ import re
12
+ import time
13
+ from typing import Any, Callable
14
+
15
+ import requests
16
+
17
# Accepts only identifiers of the form ``CHEMBL<digits>`` (case-sensitive).
_CHEMBL_ID_RE = re.compile(r"^CHEMBL\d+$")

# Keys read from the top level of the molecule JSON document.
_ROOT_FIELDS: tuple[str, ...] = (
    "molecule_chembl_id",
    "pref_name",
)
# Keys read from the nested ``molecule_structures`` object.
_STRUCT_FIELDS: tuple[str, ...] = (
    "canonical_smiles",
    "standard_inchi",
    "standard_inchi_key",
)
# Keys read from the nested ``molecule_properties`` object.
_PROP_FIELDS: tuple[str, ...] = (
    "alogp",
    "full_mwt",
    "full_molformula",
    "hba",
    "hbd",
    "heavy_atoms",
    "psa",
    "qed_weighted",
    "num_ro5_violations",
    "rtb",
)
# Every output key, in the order root -> structures -> properties.
_ALL_FIELDS: tuple[str, ...] = (*_ROOT_FIELDS, *_STRUCT_FIELDS, *_PROP_FIELDS)
27
+
28
+
29
class _ChEMBLClient:
    """
    Client for fetching compound properties from the ChEMBL REST API.

    Uses exponential backoff on transient failures and a persistent
    requests.Session for connection reuse across multiple compounds.
    Permanent client errors (HTTP 4xx other than 408/429, e.g. 404 for an
    unknown ID) are not retried.

    Parameters
    ----------
    logger : Callable[[str], None] or None, optional
        Callable for error reporting. Defaults to the module logger warning.
    sleep : float, optional
        Base sleep time in seconds between requests. Must be within
        [0.1, 10.0]. Defaults to 0.2.
    retries : int, optional
        Maximum number of attempts per compound. Must be within [1, 10].
        Defaults to 3.

    Raises
    ------
    ValueError
        If ``sleep`` or ``retries`` is outside its allowed range.

    Examples
    --------
    >>> client = _ChEMBLClient()
    >>> props = client.fetch_props("CHEMBL25")  # doctest: +SKIP
    >>> client.close()
    """

    _BASE_URL = "https://www.ebi.ac.uk/chembl/api/data/molecule"
    # 4xx statuses that describe a temporary condition and are worth retrying.
    _RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})

    def __init__(
        self,
        logger: Callable[[str], None] | None = None,
        sleep: float = 0.2,
        retries: int = 3,
    ) -> None:
        if not 0.1 <= sleep <= 10.0:
            raise ValueError("sleep must be between 0.1 and 10.0 seconds.")
        if not 1 <= retries <= 10:
            raise ValueError("retries must be between 1 and 10.")
        self.log = logger or (lambda m: logging.getLogger(__name__).warning(m))
        self.sleep = sleep
        self.retries = retries
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": "harmonsmile (python-requests)"})

    @staticmethod
    def _is_retryable(exc: BaseException) -> bool:
        """
        Tell whether a failed attempt is worth repeating.

        An exception carrying a ``response`` with a 4xx ``status_code`` is
        treated as permanent unless the status is 408 or 429. Everything
        else (5xx, connection problems, malformed payloads, ...) is
        considered transient, which matches the previous retry behaviour.
        """
        response = getattr(exc, "response", None)
        status = getattr(response, "status_code", None)
        if isinstance(status, int) and 400 <= status < 500:
            return status in _ChEMBLClient._RETRYABLE_CLIENT_STATUSES
        return True

    def fetch_props(self, chembl_id: str | None) -> dict[str, Any]:
        """
        Fetch compound properties from ChEMBL by ChEMBL ID.

        Parameters
        ----------
        chembl_id : str or None
            ChEMBL compound identifier (e.g. 'CHEMBL25'). Whitespace is
            stripped; IDs not matching ``CHEMBL\\d+`` return all-None values
            without issuing a request.

        Returns
        -------
        dict[str, Any]
            Dictionary of 15 extracted properties. Values are None if the
            fetch failed or the identifier is missing or invalid.

        Examples
        --------
        >>> client = _ChEMBLClient()
        >>> client.fetch_props("CHEMBL25")  # doctest: +SKIP
        {'molecule_chembl_id': 'CHEMBL25', 'pref_name': 'ASPIRIN', ...}
        >>> client.fetch_props("")  # doctest: +ELLIPSIS
        {'molecule_chembl_id': None, 'pref_name': None, ...}
        >>> client.close()
        """
        null = {f: None for f in _ALL_FIELDS}
        if not chembl_id:
            return null
        chembl_id = str(chembl_id).strip()
        if not _CHEMBL_ID_RE.match(chembl_id):
            return null

        url = f"{self._BASE_URL}/{chembl_id}.json"
        for attempt in range(self.retries):
            try:
                r = self._session.get(url, timeout=12)
                r.raise_for_status()
                data = r.json()
                # Either nested object may be null in the payload.
                structs = data.get("molecule_structures") or {}
                props = data.get("molecule_properties") or {}
                # Built from the shared field tuples so the keys can never
                # drift from the all-None fallback dictionary.
                result: dict[str, Any] = {f: data.get(f) for f in _ROOT_FIELDS}
                result.update((f, structs.get(f)) for f in _STRUCT_FIELDS)
                result.update((f, props.get(f)) for f in _PROP_FIELDS)
                # Pause after a successful call to pace consecutive requests.
                time.sleep(self.sleep)
                return result
            except Exception as e:  # best-effort boundary: report, never raise
                out_of_attempts = attempt + 1 == self.retries
                if out_of_attempts or not self._is_retryable(e):
                    self.log(f"[ChEMBL] {chembl_id}: {e}")
                    return null
                # Exponential backoff: sleep, 2*sleep, 4*sleep, ...
                time.sleep(self.sleep * (2 ** attempt))
        return null

    def __enter__(self) -> _ChEMBLClient:
        """
        Enter the context manager.

        Returns
        -------
        _ChEMBLClient
            The client instance itself.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """
        Exit the context manager and release resources.

        Parameters
        ----------
        exc_type : type or None
            Exception type, if any.
        exc_val : BaseException or None
            Exception value, if any.
        exc_tb : traceback or None
            Exception traceback, if any.

        Returns
        -------
        bool
            Always False; exceptions are not suppressed.
        """
        self.close()
        return False

    def close(self) -> None:
        """
        Close the underlying HTTP session.

        Should be called when the client is no longer needed to release
        connection resources.

        Examples
        --------
        >>> client = _ChEMBLClient()
        >>> client.close()
        """
        self._session.close()
harmonsmile/config.py ADDED
@@ -0,0 +1,60 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ Configuration dataclass for harmonsmile pipelines.
4
+
5
+ Defines the immutable :class:`Config` object used by
6
+ :class:`~harmonsmile.pipelines.PubChemIngest` to parameterize
7
+ input/output paths, PubChem column names, and properties to fetch.
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from dataclasses import dataclass
12
+
13
# Property names accepted in ``Config.props``; ``Config.__post_init__``
# rejects any name that is not a member of this set.
VALID_PUBCHEM_PROPS: frozenset[str] = frozenset(
    (
        "SMILES",
        "ConnectivitySMILES",
        "MolecularWeight",
        "MolecularFormula",
        "InChI",
        "InChIKey",
        "XLogP",
        "TPSA",
        "HBondDonorCount",
        "HBondAcceptorCount",
        "RotatableBondCount",
        "HeavyAtomCount",
        "Charge",
    )
)
19
+
20
+
21
@dataclass(frozen=True)
class Config:
    """
    Immutable configuration for harmonsmile pipelines.

    Parameters
    ----------
    input_path : str
        Path to the input file (CSV, TSV, XLSX).
    output_path : str
        Path to the output CSV file. Must not contain a ``..`` path
        component.
    error_log : str, optional
        Path to the error log file. Defaults to 'logs/errors.txt'.
    cid_col : str, optional
        Name of the PubChem CID column. Defaults to 'PubChem CID'.
    props : tuple of str, optional
        PubChem properties to fetch. Defaults to all available properties.

    Raises
    ------
    ValueError
        If ``input_path`` or ``output_path`` is empty, if ``output_path``
        contains a ``..`` component, if ``props`` is empty, or if ``props``
        contains a name that is not in ``VALID_PUBCHEM_PROPS``.
    """

    input_path: str
    output_path: str
    error_log: str = "logs/errors.txt"
    cid_col: str = "PubChem CID"
    props: tuple[str, ...] = (
        "SMILES", "ConnectivitySMILES", "MolecularFormula",
        "MolecularWeight", "InChI", "InChIKey", "XLogP", "TPSA",
        "Charge", "HBondDonorCount", "HBondAcceptorCount",
        "RotatableBondCount", "HeavyAtomCount",
    )

    def __post_init__(self) -> None:
        if not self.input_path:
            raise ValueError("input_path must not be empty.")
        if not self.output_path:
            raise ValueError("output_path must not be empty.")
        # Compare whole path components rather than substrings, so that a
        # file name such as "v1..2.csv" is accepted while "../x.csv" is not.
        # Both "/" and "\\" are treated as separators.
        components = str(self.output_path).replace("\\", "/").split("/")
        if ".." in components:
            raise ValueError("output_path must not contain path traversal patterns ('..').")
        if not self.props:
            raise ValueError("props must contain at least one PubChem property.")
        invalid = {p for p in self.props if p not in VALID_PUBCHEM_PROPS}
        if invalid:
            raise ValueError(f"Invalid PubChem properties: {sorted(invalid)}")
harmonsmile/io.py ADDED
@@ -0,0 +1,116 @@
1
+ # SPDX-License-Identifier: LGPL-3.0-or-later
2
+ """
3
+ Table I/O utilities for harmonsmile.
4
+
5
+ Provides :func:`load_table` and :func:`save_table` for reading and writing
6
+ tabular chemical data.
7
+ """
8
+
9
+ from __future__ import annotations
10
+ import os
11
+ from typing import Any
12
+ import pandas as pd
13
+
14
+
15
def _sanitize_cid(x: Any) -> str | None:
    """
    Sanitize a PubChem CID value to a clean numeric string.

    Floats are truncated to their integer part. Strings that are formatted
    as a plain decimal number (``"2723949.0"``) are treated the same way, so
    the fractional digits never leak into the identifier. For any other
    input all non-digit characters are dropped.

    Parameters
    ----------
    x : Any
        Raw CID value (int, float, str, or NaN).

    Returns
    -------
    str or None
        Numeric string CID, or None if the value is missing or contains
        no digits.

    Examples
    --------
    >>> _sanitize_cid(2723949.0)
    '2723949'
    >>> _sanitize_cid("2723949.0")
    '2723949'
    >>> _sanitize_cid(" 12345 ")
    '12345'
    >>> _sanitize_cid(None)
    """
    if pd.isna(x):
        return None
    if isinstance(x, float):
        try:
            x = int(x)
        except (ValueError, OverflowError):
            # Infinite values cannot be converted to an integer.
            return None
    s = str(x).strip()
    # "12345.0" (a float that reached us as text, or a non-``float`` numeric
    # type) must yield "12345"; the digit filter below would otherwise
    # produce "123450".
    whole, dot, fraction = s.partition(".")
    if dot and whole.isdigit() and (not fraction or fraction.isdigit()):
        s = whole
    digits = "".join(ch for ch in s if ch.isdigit())
    return digits or None
47
+
48
+
49
def load_table(path: str | os.PathLike) -> pd.DataFrame:
    """
    Read a tabular file into a DataFrame.

    The reader is chosen from the file extension (case-insensitive):

    * ``.csv``, ``.tsv``, ``.txt`` -- read as delimited text with delimiter
      sniffing and ``utf-8-sig`` encoding; if that attempt raises, the file
      is re-read with ``;`` as separator and ``latin-1`` encoding.
    * ``.xlsx``, ``.xlsm``, ``.xls`` -- read with :func:`pandas.read_excel`.

    After loading, an ``id`` column (if present) is coerced to nullable
    ``Int64`` with unparsable values becoming missing, and a
    ``PubChem CID`` column (if present) is passed through
    :func:`_sanitize_cid`.

    Parameters
    ----------
    path : str or os.PathLike
        Path to the input file.

    Returns
    -------
    pd.DataFrame
        The loaded table with the clean-ups described above applied.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.

    Examples
    --------
    >>> df = load_table("data/database_pubchem.csv")  # doctest: +SKIP
    >>> df = load_table("data/database_coconut.xlsx")  # doctest: +SKIP
    """
    text_suffixes = (".csv", ".tsv", ".txt")
    excel_suffixes = (".xlsx", ".xlsm", ".xls")

    suffix = os.path.splitext(path)[1].lower()
    if suffix not in text_suffixes and suffix not in excel_suffixes:
        raise ValueError(f"Unsupported format: {path}")

    if suffix in excel_suffixes:
        table = pd.read_excel(path)
    else:
        try:
            # sep=None lets the python engine sniff the delimiter.
            table = pd.read_csv(path, engine="python", sep=None, encoding="utf-8-sig")
        except Exception:
            # Fallback for files the first attempt could not read.
            table = pd.read_csv(path, sep=";", encoding="latin-1")

    if "id" in table.columns:
        numeric_ids = pd.to_numeric(table["id"], errors="coerce")
        table["id"] = numeric_ids.astype("Int64")
    if "PubChem CID" in table.columns:
        table["PubChem CID"] = table["PubChem CID"].apply(_sanitize_cid)
    return table
94
+
95
+
96
def save_table(df: pd.DataFrame, path: str | os.PathLike) -> None:
    """
    Write a DataFrame to a UTF-8 encoded CSV file without the index.

    Missing parent directories of ``path`` are created first.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to save.
    path : str or os.PathLike
        Output file path.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"SMILES": ["C1=CC=CC=C1"], "SMILES_RDKit": ["C1=CC=CC=C1"]})
    >>> save_table(df, "results/output.csv")  # doctest: +SKIP
    """
    parent_dir = os.path.dirname(os.fspath(path))
    if not parent_dir:
        # A bare file name has no directory part; use the current directory.
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8")