harmonsmile 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harmonsmile/__init__.py +79 -0
- harmonsmile/__main__.py +27 -0
- harmonsmile/_cli.py +127 -0
- harmonsmile/chembl.py +179 -0
- harmonsmile/config.py +60 -0
- harmonsmile/io.py +116 -0
- harmonsmile/pipelines.py +319 -0
- harmonsmile/pubchem.py +151 -0
- harmonsmile/standardize.py +85 -0
- harmonsmile/version.py +9 -0
- harmonsmile-0.1.1.dist-info/METADATA +251 -0
- harmonsmile-0.1.1.dist-info/RECORD +17 -0
- harmonsmile-0.1.1.dist-info/WHEEL +4 -0
- harmonsmile-0.1.1.dist-info/entry_points.txt +2 -0
- harmonsmile-0.1.1.dist-info/licenses/COPYING +674 -0
- harmonsmile-0.1.1.dist-info/licenses/COPYING.LESSER +165 -0
- harmonsmile-0.1.1.dist-info/licenses/LICENSE +8 -0
harmonsmile/__init__.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
harmonsmile — Harmonize SMILES strings to canonical + isomeric + Kekulized convention.
|
|
4
|
+
|
|
5
|
+
Provides pipelines and utilities for standardizing SMILES strings using RDKit,
|
|
6
|
+
following the COCONUT 2.0 convention: canonical, isomeric, and Kekulized form.
|
|
7
|
+
|
|
8
|
+
Classes
|
|
9
|
+
-------
|
|
10
|
+
RDKitStandardizer
|
|
11
|
+
Standardize SMILES strings using RDKit.
|
|
12
|
+
Config
|
|
13
|
+
Immutable configuration for harmonsmile pipelines.
|
|
14
|
+
PubChemIngest
|
|
15
|
+
Pipeline for ingesting and harmonizing PubChem compound data.
|
|
16
|
+
ChEMBLIngest
|
|
17
|
+
Pipeline for ingesting and harmonizing ChEMBL compound data.
|
|
18
|
+
SMILESPrep
|
|
19
|
+
Pipeline for harmonizing SMILES from any tabular source.
|
|
20
|
+
|
|
21
|
+
Functions
|
|
22
|
+
---------
|
|
23
|
+
load_table(path)
|
|
24
|
+
Load a tabular file into a DataFrame.
|
|
25
|
+
save_table(df, path)
|
|
26
|
+
Save a DataFrame to a CSV file.
|
|
27
|
+
|
|
28
|
+
Examples
|
|
29
|
+
--------
|
|
30
|
+
Standardize a single SMILES string:
|
|
31
|
+
|
|
32
|
+
>>> from harmonsmile import RDKitStandardizer
|
|
33
|
+
>>> std = RDKitStandardizer()
|
|
34
|
+
>>> std.to_iso_kek("c1ccccc1")
|
|
35
|
+
'C1=CC=CC=C1'
|
|
36
|
+
>>> std.to_conn_kek("C[C@@H](O)F")
|
|
37
|
+
'CC(O)F'
|
|
38
|
+
|
|
39
|
+
Harmonize a COCONUT or independent database:
|
|
40
|
+
|
|
41
|
+
>>> from harmonsmile import CoconutPrep
|
|
42
|
+
>>> CoconutPrep(
|
|
43
|
+
... input_path="data/database.csv",
|
|
44
|
+
... smiles_col="SMILES",
|
|
45
|
+
... output_path="results/database_harmonized.csv",
|
|
46
|
+
... ).run()
|
|
47
|
+
|
|
48
|
+
Fetch and harmonize PubChem data:
|
|
49
|
+
|
|
50
|
+
>>> from harmonsmile import PubChemIngest, Config
|
|
51
|
+
>>> cfg = Config(
|
|
52
|
+
... input_path="data/database_pubchem.csv",
|
|
53
|
+
... output_path="results/pubchem_harmonized.csv",
|
|
54
|
+
... )
|
|
55
|
+
>>> PubChemIngest(cfg).run()
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
from .standardize import RDKitStandardizer
|
|
59
|
+
from .pipelines import PubChemIngest, ChEMBLIngest, SMILESPrep, CoconutPrep
|
|
60
|
+
from .config import Config
|
|
61
|
+
from .pubchem import PubChemClient
|
|
62
|
+
from .io import load_table, save_table
|
|
63
|
+
from .version import __version__, PROJECT_NAME, PROJECT_VERSION, PROJECT_STATUS
|
|
64
|
+
|
|
65
|
+
__author__ = "Flavio F. Contreras-Torres"

# Names exported by ``from harmonsmile import *``.
# Every class imported at package level above is listed, including
# ``CoconutPrep`` (used in the module docstring example) and ``PubChemClient``,
# so the star-import surface matches what the package actually exposes.
__all__ = [
    "RDKitStandardizer",
    "PubChemIngest",
    "ChEMBLIngest",
    "SMILESPrep",
    "CoconutPrep",
    "Config",
    "PubChemClient",
    "load_table",
    "save_table",
    "__version__",
    "PROJECT_NAME",
    "PROJECT_VERSION",
    "PROJECT_STATUS",
]
|
harmonsmile/__main__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
Entry point for running harmonsmile as a module.
|
|
4
|
+
|
|
5
|
+
Allows the package to be invoked directly from the command line
|
|
6
|
+
using ``python -m harmonsmile``. All arguments are forwarded to the
|
|
7
|
+
CLI defined in :mod:`harmonsmile._cli`.
|
|
8
|
+
|
|
9
|
+
See Also
|
|
10
|
+
--------
|
|
11
|
+
harmonsmile._cli.main : The CLI entry point function.
|
|
12
|
+
|
|
13
|
+
Examples
|
|
14
|
+
--------
|
|
15
|
+
Fetch PubChem properties and standardize SMILES::
|
|
16
|
+
|
|
17
|
+
python -m harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
|
|
18
|
+
|
|
19
|
+
Standardize an existing SMILES column::
|
|
20
|
+
|
|
21
|
+
python -m harmonsmile --coconut-in data/db.csv --coconut-smiles SMILES --coconut-out results/out.csv
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from harmonsmile._cli import main
|
|
25
|
+
|
|
26
|
+
# Invoke the CLI only when this module is executed as a script
# (e.g. ``python -m harmonsmile``); importing the module has no side effect.
if __name__ == "__main__":
    main()
|
harmonsmile/_cli.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
Command-line interface for harmonsmile.
|
|
4
|
+
|
|
5
|
+
Implements the ``harmonsmile`` entry point and ``python -m harmonsmile``
|
|
6
|
+
invocation. Arguments are parsed and forwarded to
|
|
7
|
+
:class:`~harmonsmile.pipelines.PubChemIngest`,
|
|
8
|
+
:class:`~harmonsmile.pipelines.ChEMBLIngest`, and
|
|
9
|
+
:class:`~harmonsmile.pipelines.SMILESPrep`.
|
|
10
|
+
|
|
11
|
+
Examples
|
|
12
|
+
--------
|
|
13
|
+
::
|
|
14
|
+
|
|
15
|
+
harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
|
|
16
|
+
harmonsmile --chembl-in data/db.csv --chembl-out results/out.csv
|
|
17
|
+
harmonsmile --coconut-in data/db.csv --coconut-smiles SMILES --coconut-out results/out.csv
|
|
18
|
+
python -m harmonsmile --pubchem-in data/db.csv --pubchem-out results/out.csv
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
from .version import __version__
|
|
23
|
+
import argparse
|
|
24
|
+
import os
|
|
25
|
+
|
|
26
|
+
from .config import Config
|
|
27
|
+
from .pipelines import PubChemIngest, ChEMBLIngest, SMILESPrep
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _ensure_dirs() -> None:
    """
    Create the ``logs`` and ``results`` directories if they are missing.

    Both directories are created relative to the current working directory.
    Existing directories are left untouched.
    """
    os.makedirs("logs", exist_ok=True)
    os.makedirs("results", exist_ok=True)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _parse(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Parse and validate the harmonsmile command-line arguments.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse. When None, argparse reads ``sys.argv[1:]``.

    Returns
    -------
    argparse.Namespace
        Parsed options with attributes ``pub_in``, ``pub_out``,
        ``pubchem_cidcol``, ``chembl_in``, ``chembl_out``, ``chembl_idcol``,
        ``coco_in``, ``coco_out`` and ``coco_smiles``.

    Notes
    -----
    Each ``*-in`` option must be accompanied by its ``*-out`` counterpart
    (and vice versa), and ``--coconut-in`` additionally requires
    ``--coconut-smiles``. Violations are reported through
    ``ArgumentParser.error``, which prints usage and exits.
    """
    parser = argparse.ArgumentParser(
        prog="harmonsmile",
        description="Harmonize SMILES strings to canonical + isomeric + Kekulized convention.",
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    pubchem_group = parser.add_argument_group("PubChem")
    pubchem_group.add_argument("--pubchem-in", dest="pub_in", metavar="FILE")
    pubchem_group.add_argument("--pubchem-out", dest="pub_out", metavar="FILE")
    pubchem_group.add_argument(
        "--pubchem-cidcol", dest="pubchem_cidcol", default="PubChem CID", metavar="COL"
    )

    chembl_group = parser.add_argument_group("ChEMBL")
    chembl_group.add_argument("--chembl-in", dest="chembl_in", metavar="FILE")
    chembl_group.add_argument("--chembl-out", dest="chembl_out", metavar="FILE")
    chembl_group.add_argument(
        "--chembl-idcol", dest="chembl_idcol", default="ChEMBL ID", metavar="COL"
    )

    coconut_group = parser.add_argument_group("COCONUT / independent")
    coconut_group.add_argument("--coconut-in", dest="coco_in", metavar="FILE")
    coconut_group.add_argument("--coconut-out", dest="coco_out", metavar="FILE")
    coconut_group.add_argument("--coconut-smiles", dest="coco_smiles", metavar="COL")

    ns = parser.parse_args(argv)

    # Input/output options only make sense in pairs; the first violation found
    # (in PubChem, ChEMBL, COCONUT order) aborts parsing.
    paired_options = (
        (ns.pub_in, ns.pub_out, "--pubchem-in and --pubchem-out must be provided together."),
        (ns.chembl_in, ns.chembl_out, "--chembl-in and --chembl-out must be provided together."),
        (ns.coco_in, ns.coco_out, "--coconut-in and --coconut-out must be provided together."),
    )
    for source, target, message in paired_options:
        if bool(source) != bool(target):
            parser.error(message)

    if ns.coco_in and not ns.coco_smiles:
        parser.error("--coconut-smiles is required when --coconut-in is provided.")

    return ns
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def main(argv: list[str] | None = None) -> None:
    """
    Run the harmonsmile command-line interface.

    The requested pipelines are executed in a fixed order: PubChem, then
    ChEMBL, then COCONUT / independent. Before each pipeline the ``logs`` and
    ``results`` directories are created if needed.

    Parameters
    ----------
    argv : list of str, optional
        Argument list. When None, the arguments are taken from the command
        line (``sys.argv[1:]``).

    Raises
    ------
    SystemExit
        If no pipeline was requested by the given arguments.

    Examples
    --------
    Programmatic invocation with PubChem pipeline:

    >>> from harmonsmile._cli import main
    >>> main(["--pubchem-in", "data/db.csv", "--pubchem-out", "results/out.csv"])

    Programmatic invocation with ChEMBL pipeline:

    >>> main(["--chembl-in", "data/db.csv", "--chembl-out", "results/out.csv"])

    Programmatic invocation with COCONUT pipeline:

    >>> main(["--coconut-in", "data/db.csv", "--coconut-smiles", "SMILES",
    ...       "--coconut-out", "results/out.csv"])
    """
    opts = _parse(argv)
    executed = 0

    if opts.pub_in and opts.pub_out:
        _ensure_dirs()
        pubchem_cfg = Config(
            input_path=opts.pub_in,
            output_path=opts.pub_out,
            cid_col=opts.pubchem_cidcol,
        )
        PubChemIngest(pubchem_cfg).run()
        executed += 1

    if opts.chembl_in and opts.chembl_out:
        _ensure_dirs()
        chembl_pipeline = ChEMBLIngest(
            input_path=opts.chembl_in,
            output_path=opts.chembl_out,
            chembl_id_col=opts.chembl_idcol,
        )
        chembl_pipeline.run()
        executed += 1

    if opts.coco_in and opts.coco_out and opts.coco_smiles:
        _ensure_dirs()
        smiles_pipeline = SMILESPrep(opts.coco_in, opts.coco_smiles, opts.coco_out)
        smiles_pipeline.run()
        executed += 1

    if executed:
        return

    # Nothing was requested: exit with a usage hint instead of succeeding silently.
    raise SystemExit(
        "Nothing to run. Provide --pubchem-*, --chembl-*, and/or --coconut-* arguments.\n"
        "Run 'harmonsmile --help' for usage."
    )
|
harmonsmile/chembl.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
ChEMBL REST API client for harmonsmile.
|
|
4
|
+
|
|
5
|
+
Provides :class:`_ChEMBLClient` for fetching compound properties from the
|
|
6
|
+
ChEMBL REST API, with exponential backoff and persistent connection reuse.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
import logging
|
|
11
|
+
import re
|
|
12
|
+
import time
|
|
13
|
+
from typing import Any, Callable
|
|
14
|
+
|
|
15
|
+
import requests
|
|
16
|
+
|
|
17
|
+
_CHEMBL_ID_RE = re.compile(r"^CHEMBL\d+$")

# Fields read from the top level of the ChEMBL molecule JSON document.
_ROOT_FIELDS: tuple[str, ...] = ("molecule_chembl_id", "pref_name")
# Fields read from the nested "molecule_structures" object.
_STRUCT_FIELDS: tuple[str, ...] = ("canonical_smiles", "standard_inchi", "standard_inchi_key")
# Fields read from the nested "molecule_properties" object.
_PROP_FIELDS: tuple[str, ...] = (
    "alogp", "full_mwt", "full_molformula",
    "hba", "hbd", "heavy_atoms",
    "psa", "qed_weighted", "num_ro5_violations", "rtb",
)
_ALL_FIELDS: tuple[str, ...] = _ROOT_FIELDS + _STRUCT_FIELDS + _PROP_FIELDS

# 4xx statuses that describe a transient condition and are still worth retrying
# (request timeout, too early, too many requests).
_RETRYABLE_CLIENT_STATUS: frozenset[int] = frozenset({408, 425, 429})


class _ChEMBLClient:
    """
    Client for fetching compound properties from the ChEMBL REST API.

    Uses exponential backoff on failure and a persistent requests.Session
    for efficient connection reuse across multiple compounds.

    Parameters
    ----------
    logger : Callable[[str], None] or None, optional
        Callable for error reporting. Defaults to the module logger warning.
    sleep : float, optional
        Base sleep time in seconds between requests. Must be between 0.1 and
        10.0. Defaults to 0.2.
    retries : int, optional
        Maximum number of attempts per compound. Must be between 1 and 10.
        Defaults to 3.
    timeout : float, optional
        Per-request timeout in seconds passed to ``Session.get``. Must be
        positive. Defaults to 12.0.

    Raises
    ------
    ValueError
        If ``sleep``, ``retries`` or ``timeout`` is out of range.

    Examples
    --------
    >>> client = _ChEMBLClient()
    >>> props = client.fetch_props("CHEMBL25")  # doctest: +SKIP
    >>> client.close()
    """

    _BASE_URL = "https://www.ebi.ac.uk/chembl/api/data/molecule"

    def __init__(
        self,
        logger: Callable[[str], None] | None = None,
        sleep: float = 0.2,
        retries: int = 3,
        timeout: float = 12.0,
    ) -> None:
        # Validate before opening the session so a bad argument leaks nothing.
        if not 0.1 <= sleep <= 10.0:
            raise ValueError("sleep must be between 0.1 and 10.0 seconds.")
        if not 1 <= retries <= 10:
            raise ValueError("retries must be between 1 and 10.")
        if not timeout > 0:
            raise ValueError("timeout must be a positive number of seconds.")
        self.log = logger or (lambda m: logging.getLogger(__name__).warning(m))
        self.sleep = sleep
        self.retries = retries
        self.timeout = timeout
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": "harmonsmile (python-requests)"})

    def fetch_props(self, chembl_id: str | None) -> dict[str, Any]:
        """
        Fetch compound properties from ChEMBL by ChEMBL ID.

        Parameters
        ----------
        chembl_id : str or None
            ChEMBL compound identifier (e.g. 'CHEMBL25'). Whitespace is
            stripped; IDs not matching ``CHEMBL\\d+`` return all-None values
            without any request being made.

        Returns
        -------
        dict[str, Any]
            Dictionary of 15 extracted properties. Values are None if the
            fetch failed or the identifier is missing or invalid.

        Notes
        -----
        Failures are retried with exponential backoff up to ``retries``
        attempts. An error carrying a 4xx HTTP status (other than 408, 425
        and 429) is treated as permanent and is not retried, because
        repeating the request cannot change the answer. The final error is
        reported through the configured logger; no exception is propagated.

        Examples
        --------
        >>> client = _ChEMBLClient()
        >>> client.fetch_props("CHEMBL25")  # doctest: +SKIP
        {'molecule_chembl_id': 'CHEMBL25', 'pref_name': 'ASPIRIN', ...}
        >>> client.fetch_props("")  # doctest: +ELLIPSIS
        {'molecule_chembl_id': None, 'pref_name': None, ...}
        >>> client.close()
        """
        null: dict[str, Any] = dict.fromkeys(_ALL_FIELDS)
        if not chembl_id:
            return null
        chembl_id = str(chembl_id).strip()
        if not _CHEMBL_ID_RE.match(chembl_id):
            return null

        url = f"{self._BASE_URL}/{chembl_id}.json"
        for attempt in range(self.retries):
            try:
                response = self._session.get(url, timeout=self.timeout)
                response.raise_for_status()
                data = response.json()
                # Either nested object may be null in the payload.
                structs = data.get("molecule_structures") or {}
                props = data.get("molecule_properties") or {}
                result: dict[str, Any] = {f: data.get(f) for f in _ROOT_FIELDS}
                result.update((f, structs.get(f)) for f in _STRUCT_FIELDS)
                result.update((f, props.get(f)) for f in _PROP_FIELDS)
            except Exception as e:  # best effort: one bad compound must not abort a batch
                # HTTP errors raised by raise_for_status() carry the response.
                status = getattr(getattr(e, "response", None), "status_code", None)
                permanent = (
                    isinstance(status, int)
                    and 400 <= status < 500
                    and status not in _RETRYABLE_CLIENT_STATUS
                )
                if permanent or attempt + 1 >= self.retries:
                    self.log(f"[ChEMBL] {chembl_id}: {e}")
                    return null
                time.sleep(self.sleep * (2 ** attempt))
            else:
                # Throttle successful requests to stay polite to the API.
                time.sleep(self.sleep)
                return result
        return null

    def __enter__(self) -> _ChEMBLClient:
        """
        Enter the context manager.

        Returns
        -------
        _ChEMBLClient
            The client instance itself.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """
        Exit the context manager and release resources.

        Parameters
        ----------
        exc_type : type or None
            Exception type, if any.
        exc_val : BaseException or None
            Exception value, if any.
        exc_tb : traceback or None
            Exception traceback, if any.

        Returns
        -------
        bool
            Always False; exceptions are not suppressed.
        """
        self.close()
        return False

    def close(self) -> None:
        """
        Close the underlying HTTP session.

        Should be called when the client is no longer needed to release
        connection resources.

        Examples
        --------
        >>> client = _ChEMBLClient()
        >>> client.close()
        """
        self._session.close()
|
harmonsmile/config.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
Configuration dataclass for harmonsmile pipelines.
|
|
4
|
+
|
|
5
|
+
Defines the immutable :class:`Config` object used by
|
|
6
|
+
:class:`~harmonsmile.pipelines.PubChemIngest` to parameterize
|
|
7
|
+
input/output paths, PubChem column names, and properties to fetch.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
# PubChem property names accepted in ``Config.props``.
VALID_PUBCHEM_PROPS: frozenset[str] = frozenset({
    "SMILES", "ConnectivitySMILES", "MolecularWeight",
    "MolecularFormula", "InChI", "InChIKey", "XLogP", "TPSA",
    "HBondDonorCount", "HBondAcceptorCount", "RotatableBondCount",
    "HeavyAtomCount", "Charge",
})


@dataclass(frozen=True)
class Config:
    """
    Immutable configuration for harmonsmile pipelines.

    Parameters
    ----------
    input_path : str
        Path to the input file (CSV, TSV, XLSX).
    output_path : str
        Path to the output CSV file. No path component may be ``..``.
    error_log : str, optional
        Path to the error log file. Defaults to 'logs/errors.txt'.
    cid_col : str, optional
        Name of the PubChem CID column. Defaults to 'PubChem CID'.
    props : tuple of str, optional
        PubChem properties to fetch. Defaults to all available properties.

    Raises
    ------
    ValueError
        If ``input_path`` or ``output_path`` is empty, if ``output_path``
        has a ``..`` path component, or if ``props`` is empty, is a bare
        string, or names a property not in ``VALID_PUBCHEM_PROPS``.
    """

    input_path: str
    output_path: str
    error_log: str = "logs/errors.txt"
    cid_col: str = "PubChem CID"
    props: tuple[str, ...] = ("SMILES", "ConnectivitySMILES", "MolecularFormula",
                              "MolecularWeight", "InChI", "InChIKey", "XLogP", "TPSA",
                              "Charge", "HBondDonorCount", "HBondAcceptorCount",
                              "RotatableBondCount", "HeavyAtomCount",)

    def __post_init__(self) -> None:
        if not self.input_path:
            raise ValueError("input_path must not be empty.")
        if not self.output_path:
            raise ValueError("output_path must not be empty.")
        # Compare whole path components (both separator styles) rather than
        # substrings, so a file name such as "out..v2.csv" is not mistaken for
        # a traversal while "../x", "a/../b" and "a\\..\\b" are still rejected.
        components = str(self.output_path).replace("\\", "/").split("/")
        if ".." in components:
            raise ValueError("output_path must not contain path traversal patterns ('..').")
        if not self.props:
            raise ValueError("props must contain at least one PubChem property.")
        if isinstance(self.props, str):
            # A bare string would otherwise be validated character by character.
            raise ValueError(
                "props must be a sequence of PubChem property names, not a single string."
            )
        invalid = {p for p in self.props if p not in VALID_PUBCHEM_PROPS}
        if invalid:
            raise ValueError(f"Invalid PubChem properties: {sorted(invalid)}")
|
harmonsmile/io.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
"""
|
|
3
|
+
Table I/O utilities for harmonsmile.
|
|
4
|
+
|
|
5
|
+
Provides :func:`load_table` and :func:`save_table` for reading and writing
|
|
6
|
+
tabular chemical data.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
import os
|
|
11
|
+
from typing import Any
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _sanitize_cid(x: Any) -> str | None:
    """
    Sanitize a PubChem CID value to a clean numeric string.

    Parameters
    ----------
    x : Any
        Raw CID value (int, float, str, or NaN).

    Returns
    -------
    str or None
        Numeric string CID, or None if the value is missing or invalid.

    Notes
    -----
    Python floats are truncated to an integer. Text of the form
    ``"<digits>.<zeros>"`` (an integer that was stringified as a float, e.g.
    ``"2723949.0"``) is reduced to its integer part. For any other text, all
    non-digit characters are dropped.

    Examples
    --------
    >>> _sanitize_cid(2723949.0)
    '2723949'
    >>> _sanitize_cid(" 12345 ")
    '12345'
    >>> _sanitize_cid("2723949.0")
    '2723949'
    >>> _sanitize_cid(None)
    """
    if pd.isna(x):
        return None
    try:
        if isinstance(x, float):
            x = int(x)
        text = str(x).strip()
        # Drop an all-zero fractional part before digit extraction; otherwise
        # "12345.0" would collapse to the different CID "123450".
        whole, dot, fraction = text.partition(".")
        if dot and whole.isdigit() and not fraction.strip("0"):
            text = whole
        digits = "".join(ch for ch in text if ch.isdigit())
        return digits or None
    except Exception:
        # Best effort: an unconvertible value (e.g. infinity) counts as missing.
        return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_table(path: str | os.PathLike) -> pd.DataFrame:
    """
    Read a tabular file into a DataFrame.

    The format is chosen from the file extension (case-insensitive):
    ``.csv``, ``.tsv`` and ``.txt`` are read as delimited text, while
    ``.xlsx``, ``.xlsm`` and ``.xls`` are read as Excel workbooks. Text files
    are first read as UTF-8 (BOM tolerated) with automatic delimiter
    detection; if that attempt raises, the file is re-read as
    semicolon-separated latin-1.

    Parameters
    ----------
    path : str or os.PathLike
        Path to the input file.

    Returns
    -------
    pd.DataFrame
        The loaded table. An 'id' column, if present, is converted to the
        nullable ``Int64`` dtype (unparseable values become missing), and a
        'PubChem CID' column, if present, is passed through
        ``_sanitize_cid``.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.

    Examples
    --------
    >>> df = load_table("data/database_pubchem.csv")
    >>> df = load_table("data/database_coconut.xlsx")
    """
    text_suffixes = (".csv", ".tsv", ".txt")
    excel_suffixes = (".xlsx", ".xlsm", ".xls")

    suffix = os.path.splitext(path)[1].lower()
    if suffix not in text_suffixes and suffix not in excel_suffixes:
        raise ValueError(f"Unsupported format: {path}")

    if suffix in excel_suffixes:
        table = pd.read_excel(path)
    else:
        try:
            # sep=None lets the python engine sniff the delimiter.
            table = pd.read_csv(path, engine="python", sep=None, encoding="utf-8-sig")
        except Exception:
            table = pd.read_csv(path, sep=";", encoding="latin-1")

    if "id" in table.columns:
        numeric_ids = pd.to_numeric(table["id"], errors="coerce")
        table["id"] = numeric_ids.astype("Int64")
    if "PubChem CID" in table.columns:
        table["PubChem CID"] = table["PubChem CID"].apply(_sanitize_cid)
    return table
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def save_table(df: pd.DataFrame, path: str | os.PathLike) -> None:
    """
    Write a DataFrame to a UTF-8 encoded CSV file without the index column.

    Missing parent directories of ``path`` are created first.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to save.
    path : str or os.PathLike
        Output file path.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"SMILES": ["C1=CC=CC=C1"], "SMILES_RDKit": ["C1=CC=CC=C1"]})
    >>> save_table(df, "results/output.csv")
    """
    parent = os.path.dirname(os.fspath(path))
    if not parent:
        # A bare file name has no directory part; target the current directory.
        parent = "."
    os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8")
|