harmonsmile 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
#
# Continuous integration: run the test suite, build the source distribution
# and wheel, and smoke-test the built wheel in a fresh virtual environment.
name: CI

on:
  pull_request:
  push:
    branches:
      - main
      - "dev-v*"
      - "release/*"

# Least privilege: every step below only needs to read the repository.
permissions:
  contents: read

jobs:
  test-build-smoke:
    name: Test, build, and smoke install
    runs-on: ubuntu-latest

    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # No later step talks to the git remote, so the token does not need
          # to stay in the local git configuration.
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: pip

      - name: Install package and validation tools
        run: |
          python -m pip install --upgrade pip
          python -m pip install "setuptools>=68" wheel
          python -m pip install -e ".[dev]" build twine

      # The pytest cache plugin is disabled and temporary files are kept
      # under .pytest_tmp inside the workspace.
      - name: Run tests
        run: python -m pytest tests -p no:cacheprovider --basetemp .pytest_tmp

      - name: Check CLI version
        run: harmonsmile --version

      # Prints the version, __all__, and whether RDKitStandardizer is exposed.
      # NOTE(review): this step only prints; it fails on an import error but
      # does not assert anything about the printed values.
      - name: Check public import boundary
        run: python -c "import harmonsmile; print(harmonsmile.__version__, harmonsmile.__all__, hasattr(harmonsmile, 'RDKitStandardizer'))"

      - name: Build distributions
        run: python -m build

      - name: Check distributions
        run: python -m twine check dist/*

      # Installs the first built wheel into a fresh virtual environment and
      # runs the CLI and import checks from a separate directory.
      # NOTE(review): presumably so that `import harmonsmile` resolves to the
      # installed wheel rather than the checkout - confirm.
      - name: Smoke install built wheel
        shell: bash
        run: |
          set -euo pipefail
          WHEEL="$(ls dist/*.whl | head -n 1)"
          python -m venv .smoke_venv
          .smoke_venv/bin/python -m pip install --upgrade pip
          .smoke_venv/bin/python -m pip install "$WHEEL"
          mkdir -p .smoke_outside_checkout
          cd .smoke_outside_checkout
          ../.smoke_venv/bin/harmonsmile --version
          ../.smoke_venv/bin/python -c "import harmonsmile; print(harmonsmile.__version__, harmonsmile.__all__, hasattr(harmonsmile, 'RDKitStandardizer'))"
@@ -0,0 +1,91 @@
# Manually triggered release workflow: checks out an existing release tag,
# verifies that the tag matches the package version, builds the
# distributions, and publishes them to PyPI.
name: Publish Python package to PyPI

on:
  workflow_dispatch:
    inputs:
      tag:
        description: "Release tag to publish, for example v0.1.0"
        required: true
        type: string

permissions:
  contents: read

jobs:
  build:
    name: Build distribution
    runs-on: ubuntu-latest

    env:
      # The tag is user-supplied input. Shell steps read it from this
      # environment variable instead of expanding the workflow expression
      # inline, so its text is never pasted into the script source.
      RELEASE_TAG: ${{ inputs.tag }}

    steps:
      - name: Check out release tag
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}
          persist-credentials: false

      # Rejects anything that does not match the vX.Y.Z glob, then confirms
      # that the tag exists in the repository.
      - name: Validate release tag
        run: |
          case "$RELEASE_TAG" in
            v[0-9]*.[0-9]*.[0-9]*) ;;
            *)
              echo "Input tag must look like vX.Y.Z, for example v0.1.1"
              exit 1
              ;;
          esac

          git fetch --tags --force
          git rev-parse -q --verify "refs/tags/$RELEASE_TAG" >/dev/null

      # Set up before the version check so that check runs on the pinned
      # interpreter rather than whatever `python` the runner image provides.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      # Compares the tag, with its leading "v" removed, against __version__
      # read from harmonsmile/version.py.
      - name: Verify package version matches tag
        run: |
          expected="${RELEASE_TAG#v}"
          actual="$(python -c "import runpy; print(runpy.run_path('harmonsmile/version.py')['__version__'])")"

          if [ "$actual" != "$expected" ]; then
            echo "Package version $actual does not match tag $RELEASE_TAG"
            exit 1
          fi

      - name: Install build tools
        run: python -m pip install --upgrade pip build twine

      - name: Build wheel and source distribution
        run: python -m build

      - name: Check distribution metadata
        run: python -m twine check dist/*

      # Hands the built files to the publish job; fails if dist/ is empty.
      - name: Upload distribution artifact
        uses: actions/upload-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
          if-no-files-found: error

  publish:
    name: Publish distribution to PyPI
    needs: build
    runs-on: ubuntu-latest

    environment:
      name: pypi
      url: https://pypi.org/p/harmonsmile

    # NOTE(review): the publish step is given no API token, so this
    # presumably relies on PyPI trusted publishing through the OIDC token
    # that id-token: write makes available - confirm.
    permissions:
      id-token: write

    steps:
      - name: Download distribution artifact
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish distribution to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,28 @@
# Python bytecode and pytest state
__pycache__/
*.pyc
.pytest_cache/
.pytest_tmp/

# Environments
.venv/
.env

# Local data and outputs: ignore everything inside these directories ...
data/*
results/*
logs/*

# ... except the .gitkeep placeholders that keep the empty folders tracked
!data/.gitkeep
!results/.gitkeep
!logs/.gitkeep

# Distribution and build files
dist/
build/
*.egg-info/

# Other local files
tests_output.txt
@@ -0,0 +1,5 @@
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
3
+ "python-envs.defaultPackageManager": "ms-python.python:conda",
4
+ "python-envs.pythonProjects": []
5
+ }
@@ -0,0 +1,105 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ---
9
+
10
+ ## [Unreleased]
11
+
12
+ ---
13
+
14
+ ## [0.1.1] - 2026-05-18
15
+
16
+ ### Added
17
+ - `RDKitStandardizer` class with two SMILES normalization methods:
18
+ - `to_iso_kek()` — canonical + isomeric + Kekulized SMILES (COCONUT 2.0 convention)
19
+ - `to_conn_kek()` — canonical + connectivity-only + Kekulized SMILES
20
+ - `PubChemIngest` pipeline: fetches all available properties from PubChem REST API
21
+ (SMILES, ConnectivitySMILES, MolecularWeight, MolecularFormula, InChI, InChIKey,
22
+ XLogP, TPSA, Charge, HBondDonorCount, HBondAcceptorCount, RotatableBondCount,
23
+ HeavyAtomCount) and appends standardized `SMILES_RDKit` column.
24
+ - `ChEMBLIngest` pipeline: fetches properties from ChEMBL REST API by ChEMBL ID
25
+ (canonical_smiles, InChI, InChIKey, MW, MolecularFormula, ALogP, TPSA, HBA, HBD,
26
+ RotatableBonds, HeavyAtoms, QED, Ro5Violations) and appends standardized `SMILES_RDKit` column.
27
+ - `SMILESPrep` pipeline: standardizes SMILES from any CSV/Excel file using RDKit —
28
+ accepts any tabular source (COCONUT, ChEMBL downloads, in-house databases, etc.).
29
+ - `_PubChemClient` with configurable retries, exponential backoff, persistent
30
+ `requests.Session`, context manager protocol, and pluggable logger.
31
+ - `_ChEMBLClient` with same design as `_PubChemClient` — ChEMBL ID format validation,
32
+ exponential backoff, context manager protocol.
33
+ - `Config` frozen dataclass for pipeline configuration with `__post_init__` validation.
34
+ - `load_table()` and `save_table()` I/O utilities supporting CSV, TSV, XLSX, and XLS
35
+ formats, with `PathLike` support.
36
+ - `version.py` as single source of truth for package metadata (`__version__`,
37
+ `PROJECT_NAME`, `PROJECT_VERSION`, `PROJECT_STATUS`).
38
+ - Command-line interface via `harmonsmile` entry point and `python -m harmonsmile`,
39
+ with paired argument validation, grouped help output, and `--version` flag.
40
+ - `pyproject.toml` for PyPI packaging (build backend: hatchling).
41
+ - SPDX license headers (`LGPL-3.0-or-later`) in all source files.
42
+ - NumPy-style docstrings with Examples in all public modules and classes.
43
+ - `CITATION.cff` for software citation.
44
+ - `CHANGELOG.md` following Keep a Changelog format.
45
+ - `environment.yml` and `requirements-dev.txt` for reproducible environments.
46
+ - Unit test suite with pytest covering standardize, config, io,
47
+ pubchem, chembl, and security.
48
+
49
+ ### Security
50
+ - `_PubChemClient`: bounds validation on `sleep` (0.1–10.0 s) and `retries` (1–10).
51
+ - `_PubChemClient.fetch_props()`: CID sanitization strips non-numeric characters
52
+ before URL construction.
53
+ - `_ChEMBLClient`: same bounds validation; ChEMBL ID format validated against
54
+ `^CHEMBL\d+$` regex before network calls.
55
+ - `Config`: path traversal guard rejects `output_path` containing `..`.
56
+ - `Config`: `VALID_PUBCHEM_PROPS` allowlist validates requested PubChem properties.
57
+
58
+ ### Changed
59
+ - License changed from MIT to GNU Lesser General Public License v3.0 or later
60
+ (LGPL-3.0-or-later).
61
+ - `__main__.py` now delegates to `harmonsmile._cli` instead of `cli.harmonize`,
62
+ making the package self-contained and installable from PyPI.
63
+ - Console status messages changed to English for international audience.
64
+ - `Config` is now immutable (`frozen=True`).
65
+ - `CoconutPrep` renamed to `SMILESPrep` to reflect its universal scope. `CoconutPrep`
66
+ remains available as a deprecated alias and will be removed in a future release.
67
+ - `PubChemClient` renamed to `_PubChemClient` (private) to prevent direct use that
68
+ could abuse the PubChem REST API. `PubChemClient` remains available as a deprecated
69
+ alias and will be removed in a future release.
70
+ - Default `props` in `Config` expanded to include all available PubChem properties.
71
+ - Development status set to Alpha (`3 - Alpha`) reflecting first public release.
72
+
73
+ ### Fixed
74
+ - Double `time.sleep()` call in `_PubChemClient.fetch_props()` that caused unnecessary
75
+ delays on successful requests.
76
+ - Missing column validation in `PubChemIngest.run()` before initiating network calls.
77
+ - Incorrect guard condition for `SMILES_RDKit` counter in `PubChemIngest.run()`.
78
+ - Unguarded `Chem.MolToSmiles()` call in `RDKitStandardizer` that could raise unhandled
79
+ C++ exceptions for unusual aromaticity models.
80
+ - Fallback encoding in `load_table()` changed to `latin-1` to correctly handle
81
+ non-UTF-8 encoded files.
82
+
83
+ ### Removed
84
+ - Redundant `cli/` scripts (`harmonize.py`, `ingest_pubchem.py`, `prep_coconut.py`)
85
+ superseded by the unified `harmonsmile` entry point.
86
+ - Unused `id_col` field from `Config` dataclass.
87
+
88
+ ---
89
+
90
+ ## Future Releases (Planned)
91
+
92
+ ### [0.2.0] — COCONUT source
93
+ - Add `CoconutIngest` pipeline — knows COCONUT 2.0 schema automatically
94
+ (`canonical_smiles` column, `identifier`, molecular properties).
95
+ - Optional COCONUT REST API integration (authenticated).
96
+
97
+ ### [0.3.0] — ML-ready features
98
+ - Standardized pipeline to generate ECFP fingerprints (with/without chirality).
99
+ - InChI / InChIKey generation for deduplication and robust cross-database matching.
100
+
101
+ ---
102
+
103
+ [Unreleased]: https://github.com/NanoBiostructuresRG/harmonsmile/compare/v0.1.1...HEAD
104
+ [0.1.1]: https://github.com/NanoBiostructuresRG/harmonsmile/releases/tag/v0.1.1
105
+ [0.1.0]: https://github.com/NanoBiostructuresRG/harmonsmile/releases/tag/v0.1.0
@@ -0,0 +1,29 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ type: software
4
+ title: "HARMONSMILE: Harmonize SMILES Strings for Cheminformatics and Machine Learning"
5
+ version: "0.1.1"
6
+ date-released: "2026-05-18"
7
+ authors:
8
+ - family-names: "Contreras-Torres"
9
+ given-names: "Flavio F."
10
+ orcid: "https://orcid.org/0000-0003-2375-131X"
11
+ affiliation: "Tecnologico de Monterrey"
12
+ repository-code: "https://github.com/NanoBiostructuresRG/harmonsmile"
13
+ license: "LGPL-3.0-or-later"
14
+ abstract: >-
15
+ HARMONSMILE is a toolkit for harmonizing SMILES strings to a consistent
16
+ convention: canonical + isomeric + Kekulized, as used by RDKit and COCONUT 2.0.
17
+ It automates the preparation of SMILES for cheminformatics workflows and
18
+ machine learning applications within the computational drug discovery pipeline,
19
+ enabling standardized comparison across PubChem, COCONUT, and independent
20
+ molecular databases.
21
+ keywords:
22
+ - cheminformatics
23
+ - SMILES
24
+ - SMILES standardization
25
+ - RDKit
26
+ - COCONUT
27
+ - PubChem
28
+ - drug discovery
29
+ - molecular datasets