harmonsmile 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harmonsmile-0.1.1/.github/workflows/ci.yml +59 -0
- harmonsmile-0.1.1/.github/workflows/publish-to-pypi.yml +91 -0
- harmonsmile-0.1.1/.gitignore +28 -0
- harmonsmile-0.1.1/.vscode/settings.json +5 -0
- harmonsmile-0.1.1/CHANGELOG.md +105 -0
- harmonsmile-0.1.1/CITATION.cff +29 -0
- harmonsmile-0.1.1/COPYING +674 -0
- harmonsmile-0.1.1/COPYING.LESSER +165 -0
- harmonsmile-0.1.1/LICENSE +8 -0
- harmonsmile-0.1.1/PKG-INFO +251 -0
- harmonsmile-0.1.1/README.md +212 -0
- harmonsmile-0.1.1/environment.yml +12 -0
- harmonsmile-0.1.1/harmonsmile/__init__.py +79 -0
- harmonsmile-0.1.1/harmonsmile/__main__.py +27 -0
- harmonsmile-0.1.1/harmonsmile/_cli.py +127 -0
- harmonsmile-0.1.1/harmonsmile/chembl.py +179 -0
- harmonsmile-0.1.1/harmonsmile/config.py +60 -0
- harmonsmile-0.1.1/harmonsmile/io.py +116 -0
- harmonsmile-0.1.1/harmonsmile/pipelines.py +319 -0
- harmonsmile-0.1.1/harmonsmile/pubchem.py +151 -0
- harmonsmile-0.1.1/harmonsmile/standardize.py +85 -0
- harmonsmile-0.1.1/harmonsmile/version.py +9 -0
- harmonsmile-0.1.1/pyproject.toml +54 -0
- harmonsmile-0.1.1/requirements-dev.txt +9 -0
- harmonsmile-0.1.1/tests/__init__.py +1 -0
- harmonsmile-0.1.1/tests/test_chembl.py +152 -0
- harmonsmile-0.1.1/tests/test_config.py +57 -0
- harmonsmile-0.1.1/tests/test_io.py +139 -0
- harmonsmile-0.1.1/tests/test_pubchem.py +80 -0
- harmonsmile-0.1.1/tests/test_security.py +110 -0
- harmonsmile-0.1.1/tests/test_standardize.py +81 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
|
2
|
+
name: CI
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
pull_request:
|
|
6
|
+
push:
|
|
7
|
+
branches:
|
|
8
|
+
- main
|
|
9
|
+
- "dev-v*"
|
|
10
|
+
- "release/*"
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test-build-smoke:
|
|
14
|
+
name: Test, build, and smoke install
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Check out repository
|
|
19
|
+
uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.11"
|
|
25
|
+
cache: pip
|
|
26
|
+
|
|
27
|
+
- name: Install package and validation tools
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
python -m pip install "setuptools>=68" wheel
|
|
31
|
+
python -m pip install -e ".[dev]" build twine
|
|
32
|
+
|
|
33
|
+
- name: Run tests
|
|
34
|
+
run: python -m pytest tests -p no:cacheprovider --basetemp .pytest_tmp
|
|
35
|
+
|
|
36
|
+
- name: Check CLI version
|
|
37
|
+
run: harmonsmile --version
|
|
38
|
+
|
|
39
|
+
- name: Check public import boundary
|
|
40
|
+
run: python -c "import harmonsmile; print(harmonsmile.__version__, harmonsmile.__all__, hasattr(harmonsmile, 'RDKitStandardizer'))"
|
|
41
|
+
|
|
42
|
+
- name: Build distributions
|
|
43
|
+
run: python -m build
|
|
44
|
+
|
|
45
|
+
- name: Check distributions
|
|
46
|
+
run: python -m twine check dist/*
|
|
47
|
+
|
|
48
|
+
- name: Smoke install built wheel
|
|
49
|
+
shell: bash
|
|
50
|
+
run: |
|
|
51
|
+
set -euo pipefail
|
|
52
|
+
WHEEL="$(ls dist/*.whl | head -n 1)"
|
|
53
|
+
python -m venv .smoke_venv
|
|
54
|
+
.smoke_venv/bin/python -m pip install --upgrade pip
|
|
55
|
+
.smoke_venv/bin/python -m pip install "$WHEEL"
|
|
56
|
+
mkdir -p .smoke_outside_checkout
|
|
57
|
+
cd .smoke_outside_checkout
|
|
58
|
+
../.smoke_venv/bin/harmonsmile --version
|
|
59
|
+
../.smoke_venv/bin/python -c "import harmonsmile; print(harmonsmile.__version__, harmonsmile.__all__, hasattr(harmonsmile, 'RDKitStandardizer'))"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
name: Publish Python package to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
inputs:
|
|
6
|
+
tag:
|
|
7
|
+
description: "Release tag to publish, for example v0.1.0"
|
|
8
|
+
required: true
|
|
9
|
+
type: string
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
build:
|
|
16
|
+
name: Build distribution
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Check out release tag
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
with:
|
|
23
|
+
ref: ${{ inputs.tag }}
|
|
24
|
+
persist-credentials: false
|
|
25
|
+
|
|
26
|
+
- name: Validate release tag
|
|
27
|
+
run: |
|
|
28
|
+
case "${{ inputs.tag }}" in
|
|
29
|
+
v[0-9]*.[0-9]*.[0-9]*) ;;
|
|
30
|
+
*)
|
|
31
|
+
echo "Input tag must look like vX.Y.Z, for example v0.1.1"
|
|
32
|
+
exit 1
|
|
33
|
+
;;
|
|
34
|
+
esac
|
|
35
|
+
|
|
36
|
+
git fetch --tags --force
|
|
37
|
+
git rev-parse -q --verify "refs/tags/${{ inputs.tag }}" >/dev/null
|
|
38
|
+
|
|
39
|
+
- name: Verify package version matches tag
|
|
40
|
+
run: |
|
|
41
|
+
expected="${{ inputs.tag }}"
|
|
42
|
+
expected="${expected#v}"
|
|
43
|
+
actual="$(python -c "import runpy; print(runpy.run_path('harmonsmile/version.py')['__version__'])")"
|
|
44
|
+
|
|
45
|
+
if [ "$actual" != "$expected" ]; then
|
|
46
|
+
echo "Package version $actual does not match tag ${{ inputs.tag }}"
|
|
47
|
+
exit 1
|
|
48
|
+
fi
|
|
49
|
+
|
|
50
|
+
- name: Set up Python
|
|
51
|
+
uses: actions/setup-python@v5
|
|
52
|
+
with:
|
|
53
|
+
python-version: "3.11"
|
|
54
|
+
|
|
55
|
+
- name: Install build tools
|
|
56
|
+
run: python -m pip install --upgrade pip build twine
|
|
57
|
+
|
|
58
|
+
- name: Build wheel and source distribution
|
|
59
|
+
run: python -m build
|
|
60
|
+
|
|
61
|
+
- name: Check distribution metadata
|
|
62
|
+
run: python -m twine check dist/*
|
|
63
|
+
|
|
64
|
+
- name: Upload distribution artifact
|
|
65
|
+
uses: actions/upload-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: python-package-distributions
|
|
68
|
+
path: dist/
|
|
69
|
+
if-no-files-found: error
|
|
70
|
+
|
|
71
|
+
publish:
|
|
72
|
+
name: Publish distribution to PyPI
|
|
73
|
+
needs: build
|
|
74
|
+
runs-on: ubuntu-latest
|
|
75
|
+
|
|
76
|
+
environment:
|
|
77
|
+
name: pypi
|
|
78
|
+
url: https://pypi.org/p/harmonsmile
|
|
79
|
+
|
|
80
|
+
permissions:
|
|
81
|
+
id-token: write
|
|
82
|
+
|
|
83
|
+
steps:
|
|
84
|
+
- name: Download distribution artifact
|
|
85
|
+
uses: actions/download-artifact@v4
|
|
86
|
+
with:
|
|
87
|
+
name: python-package-distributions
|
|
88
|
+
path: dist/
|
|
89
|
+
|
|
90
|
+
- name: Publish distribution to PyPI
|
|
91
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
.pytest_cache/
|
|
5
|
+
|
|
6
|
+
# Environments
|
|
7
|
+
.venv/
|
|
8
|
+
.env
|
|
9
|
+
|
|
10
|
+
# Local data and outputs
|
|
11
|
+
data/*
|
|
12
|
+
results/*
|
|
13
|
+
logs/*
|
|
14
|
+
|
|
15
|
+
# Keep otherwise-empty folders tracked (via .gitkeep)
|
|
16
|
+
!data/.gitkeep
|
|
17
|
+
!results/.gitkeep
|
|
18
|
+
!logs/.gitkeep
|
|
19
|
+
|
|
20
|
+
# Distribution and build files
|
|
21
|
+
dist/
|
|
22
|
+
*.egg-info/
|
|
23
|
+
build/
|
|
24
|
+
__pycache__/
|
|
25
|
+
*.pyc
|
|
26
|
+
.pytest_tmp/
|
|
27
|
+
.pytest_cache/
|
|
28
|
+
tests_output.txt
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## [Unreleased]
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## [0.1.1] - 2026-05-18
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
- `RDKitStandardizer` class with two SMILES normalization methods:
|
|
18
|
+
- `to_iso_kek()` — canonical + isomeric + Kekulized SMILES (COCONUT 2.0 convention)
|
|
19
|
+
- `to_conn_kek()` — canonical + connectivity-only + Kekulized SMILES
|
|
20
|
+
- `PubChemIngest` pipeline: fetches all available properties from PubChem REST API
|
|
21
|
+
(SMILES, ConnectivitySMILES, MolecularWeight, MolecularFormula, InChI, InChIKey,
|
|
22
|
+
XLogP, TPSA, Charge, HBondDonorCount, HBondAcceptorCount, RotatableBondCount,
|
|
23
|
+
HeavyAtomCount) and appends standardized `SMILES_RDKit` column.
|
|
24
|
+
- `ChEMBLIngest` pipeline: fetches properties from ChEMBL REST API by ChEMBL ID
|
|
25
|
+
(canonical_smiles, InChI, InChIKey, MW, MolecularFormula, ALogP, TPSA, HBA, HBD,
|
|
26
|
+
RotatableBonds, HeavyAtoms, QED, Ro5Violations) and appends standardized `SMILES_RDKit` column.
|
|
27
|
+
- `SMILESPrep` pipeline: standardizes SMILES from any CSV/Excel file using RDKit —
|
|
28
|
+
accepts any tabular source (COCONUT, ChEMBL downloads, in-house databases, etc.).
|
|
29
|
+
- `_PubChemClient` with configurable retries, exponential backoff, persistent
|
|
30
|
+
`requests.Session`, context manager protocol, and pluggable logger.
|
|
31
|
+
- `_ChEMBLClient` with same design as `_PubChemClient` — ChEMBL ID format validation,
|
|
32
|
+
exponential backoff, context manager protocol.
|
|
33
|
+
- `Config` frozen dataclass for pipeline configuration with `__post_init__` validation.
|
|
34
|
+
- `load_table()` and `save_table()` I/O utilities supporting CSV, TSV, XLSX, and XLS
|
|
35
|
+
formats, with `PathLike` support.
|
|
36
|
+
- `version.py` as single source of truth for package metadata (`__version__`,
|
|
37
|
+
`PROJECT_NAME`, `PROJECT_VERSION`, `PROJECT_STATUS`).
|
|
38
|
+
- Command-line interface via `harmonsmile` entry point and `python -m harmonsmile`,
|
|
39
|
+
with paired argument validation, grouped help output, and `--version` flag.
|
|
40
|
+
- `pyproject.toml` for PyPI packaging (build backend: hatchling).
|
|
41
|
+
- SPDX license headers (`LGPL-3.0-or-later`) in all source files.
|
|
42
|
+
- NumPy-style docstrings with Examples in all public modules and classes.
|
|
43
|
+
- `CITATION.cff` for software citation.
|
|
44
|
+
- `CHANGELOG.md` following Keep a Changelog format.
|
|
45
|
+
- `environment.yml` and `requirements-dev.txt` for reproducible environments.
|
|
46
|
+
- Unit test suite with pytest covering standardize, config, io,
|
|
47
|
+
pubchem, chembl, and security.
|
|
48
|
+
|
|
49
|
+
### Security
|
|
50
|
+
- `_PubChemClient`: bounds validation on `sleep` (0.1–10.0 s) and `retries` (1–10).
|
|
51
|
+
- `_PubChemClient.fetch_props()`: CID sanitization strips non-numeric characters
|
|
52
|
+
before URL construction.
|
|
53
|
+
- `_ChEMBLClient`: same bounds validation; ChEMBL ID format validated against
|
|
54
|
+
`^CHEMBL\d+$` regex before network calls.
|
|
55
|
+
- `Config`: path traversal guard rejects `output_path` containing `..`.
|
|
56
|
+
- `Config`: `VALID_PUBCHEM_PROPS` allowlist validates requested PubChem properties.
|
|
57
|
+
|
|
58
|
+
### Changed
|
|
59
|
+
- License changed from MIT to GNU Lesser General Public License v3.0 or later
|
|
60
|
+
(LGPL-3.0-or-later).
|
|
61
|
+
- `__main__.py` now delegates to `harmonsmile._cli` instead of `cli.harmonize`,
|
|
62
|
+
making the package self-contained and installable from PyPI.
|
|
63
|
+
- Console status messages changed to English for an international audience.
|
|
64
|
+
- `Config` is now immutable (`frozen=True`).
|
|
65
|
+
- `CoconutPrep` renamed to `SMILESPrep` to reflect its universal scope. `CoconutPrep`
|
|
66
|
+
remains available as a deprecated alias and will be removed in a future release.
|
|
67
|
+
- `PubChemClient` renamed to `_PubChemClient` (private) to prevent direct use that
|
|
68
|
+
could abuse the PubChem REST API. `PubChemClient` remains available as a deprecated
|
|
69
|
+
alias and will be removed in a future release.
|
|
70
|
+
- Default `props` in `Config` expanded to include all available PubChem properties.
|
|
71
|
+
- Development status set to Alpha (`3 - Alpha`) reflecting first public release.
|
|
72
|
+
|
|
73
|
+
### Fixed
|
|
74
|
+
- Double `time.sleep()` call in `_PubChemClient.fetch_props()` that caused unnecessary
|
|
75
|
+
delays on successful requests.
|
|
76
|
+
- Missing column validation in `PubChemIngest.run()` before initiating network calls.
|
|
77
|
+
- Incorrect guard condition for `SMILES_RDKit` counter in `PubChemIngest.run()`.
|
|
78
|
+
- Unguarded `Chem.MolToSmiles()` call in `RDKitStandardizer` that could raise unhandled
|
|
79
|
+
C++ exceptions for unusual aromaticity models.
|
|
80
|
+
- Fallback encoding in `load_table()` changed to `latin-1` to correctly handle
|
|
81
|
+
non-UTF-8 encoded files.
|
|
82
|
+
|
|
83
|
+
### Removed
|
|
84
|
+
- Redundant `cli/` scripts (`harmonize.py`, `ingest_pubchem.py`, `prep_coconut.py`)
|
|
85
|
+
superseded by the unified `harmonsmile` entry point.
|
|
86
|
+
- Unused `id_col` field from `Config` dataclass.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Future Releases (Planned)
|
|
91
|
+
|
|
92
|
+
### [0.2.0] — COCONUT source
|
|
93
|
+
- Add `CoconutIngest` pipeline with built-in support for the COCONUT 2.0 schema
|
|
94
|
+
(`canonical_smiles` column, `identifier`, molecular properties).
|
|
95
|
+
- Optional COCONUT REST API integration (authenticated).
|
|
96
|
+
|
|
97
|
+
### [0.3.0] — ML-ready features
|
|
98
|
+
- Standardized pipeline to generate ECFP fingerprints (with/without chirality).
|
|
99
|
+
- InChI / InChIKey generation for deduplication and robust cross-database matching.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
[Unreleased]: https://github.com/NanoBiostructuresRG/harmonsmile/compare/v0.1.1...HEAD
|
|
104
|
+
[0.1.1]: https://github.com/NanoBiostructuresRG/harmonsmile/releases/tag/v0.1.1
|
|
105
|
+
[0.1.0]: https://github.com/NanoBiostructuresRG/harmonsmile/releases/tag/v0.1.0
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use this software, please cite it as below."
|
|
3
|
+
type: software
|
|
4
|
+
title: "HARMONSMILE: Harmonize SMILES Strings for Cheminformatics and Machine Learning"
|
|
5
|
+
version: "0.1.1"
|
|
6
|
+
date-released: "2026-05-18"
|
|
7
|
+
authors:
|
|
8
|
+
- family-names: "Contreras-Torres"
|
|
9
|
+
given-names: "Flavio F."
|
|
10
|
+
orcid: "https://orcid.org/0000-0003-2375-131X"
|
|
11
|
+
affiliation: "Tecnologico de Monterrey"
|
|
12
|
+
repository-code: "https://github.com/NanoBiostructuresRG/harmonsmile"
|
|
13
|
+
license: "LGPL-3.0-or-later"
|
|
14
|
+
abstract: >-
|
|
15
|
+
HARMONSMILE is a toolkit for harmonizing SMILES strings to a consistent
|
|
16
|
+
convention: canonical + isomeric + Kekulized, as used by RDKit and COCONUT 2.0.
|
|
17
|
+
It automates the preparation of SMILES for cheminformatics workflows and
|
|
18
|
+
machine learning applications within the computational drug discovery pipeline,
|
|
19
|
+
enabling standardized comparison across PubChem, COCONUT, and independent
|
|
20
|
+
molecular databases.
|
|
21
|
+
keywords:
|
|
22
|
+
- cheminformatics
|
|
23
|
+
- SMILES
|
|
24
|
+
- SMILES standardization
|
|
25
|
+
- RDKit
|
|
26
|
+
- COCONUT
|
|
27
|
+
- PubChem
|
|
28
|
+
- drug discovery
|
|
29
|
+
- molecular datasets
|