hashmol3d 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hashmol3d/__init__.py +32 -0
- hashmol3d/cli.py +109 -0
- hashmol3d/core.py +293 -0
- hashmol3d/io.py +80 -0
- hashmol3d/periodic_table.py +169 -0
- hashmol3d/version.py +3 -0
- hashmol3d-0.5.0.dist-info/METADATA +152 -0
- hashmol3d-0.5.0.dist-info/RECORD +12 -0
- hashmol3d-0.5.0.dist-info/WHEEL +5 -0
- hashmol3d-0.5.0.dist-info/entry_points.txt +2 -0
- hashmol3d-0.5.0.dist-info/licenses/LICENSE +3 -0
- hashmol3d-0.5.0.dist-info/top_level.txt +1 -0
hashmol3d/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""HashMol3D: deterministic 3D molecular geometry hashing."""
|
|
2
|
+
|
|
3
|
+
from .core import (
|
|
4
|
+
DESCRIPTOR_VERSION,
|
|
5
|
+
HashMol3DResult,
|
|
6
|
+
generate_hashmol3d,
|
|
7
|
+
hash_molecule,
|
|
8
|
+
)
|
|
9
|
+
from .io import read_xyz
|
|
10
|
+
from .version import __version__
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def hash_xyz(path, **kwargs) -> HashMol3DResult:
    """Hash the molecular geometry stored in an XYZ file.

    Thin convenience wrapper: the file is parsed with :func:`read_xyz`
    and the parsed atomic numbers and coordinates are passed to
    :func:`hash_molecule`.

    Args:
        path: Location of the XYZ file to read.
        **kwargs: Forwarded unchanged to :func:`hash_molecule`.

    Returns:
        The :class:`HashMol3DResult` produced by :func:`hash_molecule`.
    """
    numbers, positions = read_xyz(path)
    result = hash_molecule(numbers, positions, **kwargs)
    return result
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"DESCRIPTOR_VERSION",
|
|
26
|
+
"HashMol3DResult",
|
|
27
|
+
"__version__",
|
|
28
|
+
"generate_hashmol3d",
|
|
29
|
+
"hash_molecule",
|
|
30
|
+
"hash_xyz",
|
|
31
|
+
"read_xyz",
|
|
32
|
+
]
|
hashmol3d/cli.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Command-line interface for HashMol3D."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from .core import hash_molecule
|
|
10
|
+
from .io import read_xyz
|
|
11
|
+
from .version import __version__
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``hashmol3d`` command.

    The parser takes one positional XYZ file path plus the optional
    ``--precision``, ``--charge``, ``--multiplicity``, ``--length`` and
    ``--verbose`` switches, and a ``--version`` action.

    Returns:
        The configured :class:`argparse.ArgumentParser`.
    """
    summary = (
        "Deterministic 3D molecular geometry hash. "
        "Reads an XYZ file and prints the HashMol3D identifier."
    )
    cmd = argparse.ArgumentParser(prog="hashmol3d", description=summary)
    add = cmd.add_argument

    add("--version", action="version", version=f"hashmol3d {__version__}")
    add("file", help="Path to a molecular geometry file (.xyz)")
    add(
        "-p",
        "--precision",
        type=float,
        default=1e-4,
        metavar="Å",
        help="Distance precision in angstroms (default: 1e-4)",
    )
    add("-c", "--charge", type=int, default=0, help="Total formal charge (default: 0)")
    add(
        "-m",
        "--multiplicity",
        type=int,
        default=None,
        help="Spin multiplicity (default: inferred from electron count)",
    )
    add(
        "-l",
        "--length",
        type=int,
        default=None,
        help="Number of hex characters in the geometry hash, 1-64 "
        "(default: auto-scaled as clip(N, 16, 64))",
    )
    add(
        "-v",
        "--verbose",
        action="store_true",
        help="Also print the canonical descriptor and metadata",
    )
    return cmd
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def cli(argv: Sequence[str] | None = None) -> int:
    """Run the HashMol3D command-line interface.

    Args:
        argv: Argument list to parse. Mainly useful for tests; when
            ``None``, argparse falls back to ``sys.argv[1:]``.

    Returns:
        The process exit code: ``0`` on success, ``1`` when the input
        file cannot be read or hashed.
    """
    opts = _build_parser().parse_args(argv)

    try:
        numbers, positions = read_xyz(opts.file)
        outcome = hash_molecule(
            numbers,
            positions,
            precision=opts.precision,
            charge=opts.charge,
            multiplicity=opts.multiplicity,
            length=opts.length,
        )
    except (ValueError, OSError) as err:
        # FileNotFoundError is an OSError subclass, so a missing file is
        # reported here too: message on stderr, no traceback.
        print(f"hashmol3d: {err}", file=sys.stderr)
        return 1

    if not opts.verbose:
        print(outcome.hash_str)
        return 0

    report = [
        f"identifier: {outcome.hash_str}",
        f"formula: {outcome.formula}",
        f"geometry_hash: {outcome.geometry_hash}",
        f"descriptor: {outcome.descriptor}",
        f"version: {outcome.version}",
        f"precision: {outcome.precision}",
        f"charge: {outcome.charge}",
        f"multiplicity: {outcome.multiplicity}",
    ]
    for entry in report:
        print(entry)
    return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main() -> None:
    """Console-script entry point: run :func:`cli` and exit with its code."""
    exit_code = cli()
    sys.exit(exit_code)
|
hashmol3d/core.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HashMol3D core: a deterministic identifier for 3D molecular conformers.
|
|
3
|
+
|
|
4
|
+
The identifier has the form::
|
|
5
|
+
|
|
6
|
+
<Hill formula><state tag>-<geometry hash>
|
|
7
|
+
|
|
8
|
+
For example, ``H2Oq0m1-a1b28135...`` for neutral singlet water.
|
|
9
|
+
|
|
10
|
+
The trailing hexadecimal hash is invariant under exactly the operations
|
|
11
|
+
that leave the non-relativistic molecular Hamiltonian's eigenvalues
|
|
12
|
+
unchanged:
|
|
13
|
+
|
|
14
|
+
* rigid translation of the coordinates
|
|
15
|
+
* rigid rotation of the coordinates
|
|
16
|
+
* permutation (relabeling) of atom indices
|
|
17
|
+
* spatial inversion / reflection (parity)
|
|
18
|
+
|
|
19
|
+
It depends on atomic numbers, pairwise distances (rounded to a user
|
|
20
|
+
specified precision), and the descriptor version. Total charge and
|
|
21
|
+
spin multiplicity are encoded in the readable prefix, not inside the
|
|
22
|
+
hash, so changing charge or multiplicity only changes the prefix.
|
|
23
|
+
|
|
24
|
+
The implementation has no RDKit dependency; it uses only NumPy and the
|
|
25
|
+
Python standard library.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import hashlib
|
|
31
|
+
import warnings
|
|
32
|
+
from collections import Counter
|
|
33
|
+
from dataclasses import dataclass
|
|
34
|
+
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
37
|
+
from .periodic_table import get_symbol
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"DESCRIPTOR_VERSION",
|
|
41
|
+
"HashMol3DResult",
|
|
42
|
+
"generate_hashmol3d",
|
|
43
|
+
"hash_molecule",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# The descriptor version is part of the hashed payload. Bump it whenever
|
|
48
|
+
# the descriptor format changes in a way that would alter hashes.
|
|
49
|
+
DESCRIPTOR_VERSION = "4-GEOM-SHA256"
|
|
50
|
+
|
|
51
|
+
# Auto-scaled hash length. The number of distinguishable conformers grows
|
|
52
|
+
# (roughly) exponentially with N, so log2 of it grows linearly with N;
|
|
53
|
+
# growing the hash length linearly with N keeps birthday-collision risk
|
|
54
|
+
# constant. 16 hex chars (64 bits) is the floor for very small molecules;
|
|
55
|
+
# SHA-256 caps us at 64 hex chars (256 bits).
|
|
56
|
+
_MIN_LENGTH = 16
|
|
57
|
+
_MAX_LENGTH = 64
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class HashMol3DResult:
    """The result of hashing a molecular geometry.

    Instances are immutable (``frozen=True``). ``str(result)`` returns
    the identifier stored in ``hash_str``.
    """

    # Full readable identifier: formula + state tag + "-" + geometry hash.
    hash_str: str
    # Molecular formula used as the readable prefix.
    formula: str
    # Hex digest (possibly truncated) of the canonical descriptor.
    geometry_hash: str
    # Descriptor version tag that was part of the hashed payload.
    version: str
    # Distance precision in angstroms used when rounding distances.
    precision: float
    # Total charge encoded in the state tag.
    charge: int
    # Spin multiplicity encoded in the state tag (given or inferred).
    multiplicity: int
    # Canonical descriptor string that was fed to the hash function.
    descriptor: str

    def __str__(self) -> str:
        # Printing a result shows the identifier itself.
        return self.hash_str
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _precision_to_decimals(precision: float) -> int:
|
|
78
|
+
"""Number of decimal places implied by a distance precision in Å."""
|
|
79
|
+
if not np.isfinite(precision) or precision <= 0:
|
|
80
|
+
raise ValueError(f"precision must be a positive finite number, got {precision!r}")
|
|
81
|
+
return int(max(0, round(-np.log10(precision))))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _infer_multiplicity(atomic_nums: np.ndarray, charge: int, multiplicity: int | None) -> int:
|
|
85
|
+
"""Use the caller-supplied multiplicity, or infer one from electron count."""
|
|
86
|
+
if multiplicity is not None:
|
|
87
|
+
m = int(multiplicity)
|
|
88
|
+
if m < 1:
|
|
89
|
+
raise ValueError(f"multiplicity must be >= 1, got {m}")
|
|
90
|
+
return m
|
|
91
|
+
electrons = int(np.sum(atomic_nums)) - int(charge)
|
|
92
|
+
return 1 if electrons % 2 == 0 else 2
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _hill_formula(atomic_nums: np.ndarray) -> str:
    """Render the molecular formula in the package's Hill-style order.

    Carbon comes first (if present), then hydrogen (if present), then
    the remaining elements alphabetically by symbol. A count of 1 is
    omitted (e.g. ``H2O``, ``CHBrClF``). Hydrogen is placed ahead of the
    other elements even when no carbon is present, so hydrogen chloride
    renders as ``HCl``.

    Args:
        atomic_nums: Iterable of atomic numbers, one entry per atom.

    Returns:
        The formula string.

    Raises:
        ValueError: if an atomic number has no known element symbol.
    """
    counts: Counter = Counter(int(z) for z in atomic_nums)

    ordered: list[tuple[str, int]] = []
    # C and H are pulled out first, in that order; pop() removes them so
    # they are not repeated in the alphabetical remainder.
    for z, sym in ((6, "C"), (1, "H")):
        if z in counts:
            ordered.append((sym, counts.pop(z)))

    rest: list[tuple[str, int]] = []
    for z, n in counts.items():
        try:
            rest.append((get_symbol(z), n))
        except KeyError as err:
            # Report an unknown element as an invalid-input error, like every
            # other validation failure, instead of leaking a bare KeyError.
            raise ValueError(f"unknown atomic number: {z}") from err
    rest.sort(key=lambda item: item[0])
    ordered.extend(rest)

    return "".join(sym if n == 1 else f"{sym}{n}" for sym, n in ordered)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _state_tag(charge: int, multiplicity: int) -> str:
|
|
116
|
+
"""Render the readable charge/multiplicity suffix, e.g. ``q1m2``.
|
|
117
|
+
|
|
118
|
+
Positive and zero charges are written without a sign (``q0``, ``q1``);
|
|
119
|
+
only negative charges carry a leading ``-`` (``q-1``).
|
|
120
|
+
"""
|
|
121
|
+
return f"q{charge}m{multiplicity}"
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _auto_length(n_atoms: int) -> int:
    """Default hash length in hex chars: ``n_atoms`` clamped to the allowed range.

    The value grows one hex character per atom, but never drops below
    ``_MIN_LENGTH`` nor exceeds ``_MAX_LENGTH``.
    """
    if n_atoms <= _MIN_LENGTH:
        return _MIN_LENGTH
    if n_atoms >= _MAX_LENGTH:
        return _MAX_LENGTH
    return n_atoms
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _pair_signature(
|
|
130
|
+
atomic_nums: np.ndarray, coords: np.ndarray, decimals: int
|
|
131
|
+
) -> tuple[tuple[int, ...], list[tuple[int, int, float]]]:
|
|
132
|
+
"""Build the permutation-invariant fingerprint of the geometry.
|
|
133
|
+
|
|
134
|
+
Returns ``(z_sorted, pairs)`` where ``z_sorted`` is a sorted tuple of
|
|
135
|
+
atomic numbers and ``pairs`` is a sorted list of
|
|
136
|
+
``(Z_min, Z_max, rounded_distance)`` triples over every unordered pair
|
|
137
|
+
of atoms. Both objects are invariant under any relabeling of atoms
|
|
138
|
+
(multisets) and under any rigid motion or reflection (functions only
|
|
139
|
+
of Z and pairwise distances).
|
|
140
|
+
"""
|
|
141
|
+
n = atomic_nums.shape[0]
|
|
142
|
+
z_sorted = tuple(sorted(int(z) for z in atomic_nums))
|
|
143
|
+
|
|
144
|
+
if n < 2:
|
|
145
|
+
return z_sorted, []
|
|
146
|
+
|
|
147
|
+
diff = coords[:, None, :] - coords[None, :, :]
|
|
148
|
+
dmat = np.linalg.norm(diff, axis=-1)
|
|
149
|
+
iu, ju = np.triu_indices(n, k=1)
|
|
150
|
+
dvals = np.round(dmat[iu, ju], decimals=decimals)
|
|
151
|
+
|
|
152
|
+
z_i = atomic_nums[iu].astype(int)
|
|
153
|
+
z_j = atomic_nums[ju].astype(int)
|
|
154
|
+
za = np.minimum(z_i, z_j)
|
|
155
|
+
zb = np.maximum(z_i, z_j)
|
|
156
|
+
|
|
157
|
+
pairs = [(int(a), int(b), float(d)) for a, b, d in zip(za, zb, dvals)]
|
|
158
|
+
pairs.sort()
|
|
159
|
+
return z_sorted, pairs
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _format_descriptor(
|
|
163
|
+
version: str,
|
|
164
|
+
precision: float,
|
|
165
|
+
decimals: int,
|
|
166
|
+
z_sorted: tuple[int, ...],
|
|
167
|
+
pairs: list[tuple[int, int, float]],
|
|
168
|
+
) -> str:
|
|
169
|
+
"""Render the canonical descriptor string that is fed to SHA-256.
|
|
170
|
+
|
|
171
|
+
Charge and multiplicity are *not* included: they are part of the
|
|
172
|
+
readable prefix of the final identifier, not of the hashed payload.
|
|
173
|
+
"""
|
|
174
|
+
prec_str = f"{precision:.1e}"
|
|
175
|
+
z_part = ",".join(str(z) for z in z_sorted)
|
|
176
|
+
fmt = f"{{:.{decimals}f}}"
|
|
177
|
+
d_part = ",".join(f"{a}-{b}:{fmt.format(d)}" for a, b, d in pairs)
|
|
178
|
+
return "|".join(
|
|
179
|
+
[
|
|
180
|
+
"V:" + version,
|
|
181
|
+
"P:" + prec_str,
|
|
182
|
+
"Z:" + z_part,
|
|
183
|
+
"D:" + d_part,
|
|
184
|
+
]
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def hash_molecule(
    atomic_nums,
    coords,
    *,
    precision: float = 1e-4,
    charge: int = 0,
    multiplicity: int | None = None,
    length: int | None = None,
) -> HashMol3DResult:
    """Compute the HashMol3D identifier for a 3D molecular geometry.

    The identifier has the form ``<Hill formula><state tag>-<geom hash>``,
    e.g. ``H2Oq0m1-a1b28135...``. Charge and multiplicity appear in the
    readable prefix; only the geometry contributes to the hash.

    Args:
        atomic_nums: integer array-like of atomic numbers, shape ``(N,)``.
        coords: float array-like of Cartesian coordinates in Å, shape
            ``(N, 3)``.
        precision: distance precision in Å (default ``1e-4``).
        charge: total formal charge (default ``0``).
        multiplicity: spin multiplicity (``1`` = singlet, ``2`` = doublet,
            ...). If ``None``, inferred as singlet/doublet from the
            electron count.
        length: number of hex characters retained from the SHA-256 digest.
            Must be in ``[1, 64]``; Python and NumPy integers are both
            accepted. If ``None`` (default), scales with the number of
            atoms as ``clip(N, 16, 64)`` so collision risk stays roughly
            constant as molecules grow.

    Returns:
        :class:`HashMol3DResult`.

    Raises:
        ValueError: if the molecule is empty, ``coords`` is not ``(N, 3)``
            or does not match ``atomic_nums`` in length, an atomic number
            is not positive, a coordinate is non-finite, or ``length``,
            ``precision`` or ``multiplicity`` is out of range.
    """
    atomic_nums = np.asarray(atomic_nums, dtype=int).reshape(-1)
    coords = np.asarray(coords, dtype=float)

    if atomic_nums.size == 0:
        raise ValueError("molecule must contain at least one atom")
    if coords.ndim != 2 or coords.shape[1] != 3:
        raise ValueError(f"coords must have shape (N, 3); got {coords.shape}")
    if coords.shape[0] != atomic_nums.size:
        raise ValueError(
            f"atomic_nums has {atomic_nums.size} entries but coords has {coords.shape[0]} rows"
        )
    if not np.all(atomic_nums > 0):
        raise ValueError("atomic numbers must be positive integers")
    if not np.all(np.isfinite(coords)):
        raise ValueError("coords contain non-finite values")

    if length is None:
        length = _auto_length(int(atomic_nums.size))
    elif isinstance(length, (int, np.integer)) and 1 <= length <= 64:
        # NumPy integer scalars are not instances of ``int``; accept them
        # and normalise so the slice below always sees a plain int.
        length = int(length)
    else:
        raise ValueError("length must be an int in [1, 64]")

    charge = int(charge)
    decimals = _precision_to_decimals(precision)
    used_mult = _infer_multiplicity(atomic_nums, charge, multiplicity)

    # Only the geometry descriptor is hashed; charge and multiplicity are
    # added to the readable prefix afterwards.
    z_sorted, pairs = _pair_signature(atomic_nums, coords, decimals)
    descriptor = _format_descriptor(DESCRIPTOR_VERSION, precision, decimals, z_sorted, pairs)
    digest = hashlib.sha256(descriptor.encode("utf-8")).hexdigest()[:length]

    formula = _hill_formula(atomic_nums)
    identifier = f"{formula}{_state_tag(charge, used_mult)}-{digest}"

    return HashMol3DResult(
        hash_str=identifier,
        formula=formula,
        geometry_hash=digest,
        version=DESCRIPTOR_VERSION,
        precision=precision,
        charge=charge,
        multiplicity=used_mult,
        descriptor=descriptor,
    )
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def generate_hashmol3d(
    atomic_nums,
    coords,
    precision: float = 1e-4,
    charge: int = 0,
    multiplicity: int | None = None,
    hash_length: int = 32,
) -> HashMol3DResult:
    """Deprecated alias for :func:`hash_molecule`.

    Emits a :class:`DeprecationWarning` and then delegates, passing
    ``hash_length`` on as ``length``.

    .. deprecated:: 0.4.0
        Use :func:`hash_molecule` instead. The ``hash_length`` keyword is
        renamed to ``length`` in the new function. Note that as of 0.5.0
        the returned ``hash_str`` is the full readable identifier
        (``<formula><state>-<hash>``), not just the hex digest.
    """
    notice = (
        "generate_hashmol3d() is deprecated; use hash_molecule() instead "
        "(the hash_length kwarg is now called length)."
    )
    # stacklevel=2 attributes the warning to the caller, not this shim.
    warnings.warn(notice, DeprecationWarning, stacklevel=2)

    forwarded = {
        "precision": precision,
        "charge": charge,
        "multiplicity": multiplicity,
        "length": hash_length,
    }
    return hash_molecule(atomic_nums, coords, **forwarded)
|
hashmol3d/io.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""File I/O helpers for HashMol3D."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .periodic_table import get_atomic_num
|
|
8
|
+
|
|
9
|
+
__all__ = ["read_xyz"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_xyz(path: str) -> tuple[np.ndarray, np.ndarray]:
    """Read the first frame of a standard XYZ file.

    The first line must contain the atom count; the second line is a
    free-form comment; subsequent lines contain ``symbol x y z`` (or
    ``Z x y z``). Blank lines between atom lines are skipped, extra
    columns after ``z`` are ignored, and reading stops as soon as the
    declared number of atoms has been collected, so anything after the
    first frame is not examined.

    The file is decoded as UTF-8 (an optional BOM is dropped and
    undecodable bytes are replaced), so parsing does not depend on the
    machine's locale and a non-ASCII comment line cannot make it fail.

    Args:
        path: Path to the XYZ file.

    Returns:
        ``(atomic_nums, coords)`` as NumPy arrays.

    Raises:
        ValueError: if the file is malformed (too short, bad atom count,
            too few atom lines, malformed atom line, unknown element
            symbol, atomic number out of range, non-numeric coords).
        OSError: if the file cannot be opened.
    """
    z_list: list[int] = []
    coords_list: list[list[float]] = []

    with open(path, encoding="utf-8-sig", errors="replace") as f:
        header = f.readline()
        comment = f.readline()
        # readline() returns "" only at end of file, so either one being
        # empty means fewer than two lines exist.
        if not header or not comment:
            raise ValueError("XYZ file is too short (need at least 2 header lines)")

        try:
            n_declared = int(header.strip())
        except ValueError as err:
            raise ValueError(
                f"first line of an XYZ file must be the atom count; got {header.strip()!r}"
            ) from err
        if n_declared <= 0:
            raise ValueError(f"XYZ atom count must be positive, got {n_declared}")

        # Stream the atom lines so trailing content (e.g. later frames) is
        # never loaded into memory.
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) < 4:
                raise ValueError(f"malformed XYZ atom line: {line!r}")

            sym = parts[0]
            if sym.lstrip("-").isdigit():
                try:
                    z = int(sym)
                except ValueError as err:
                    # e.g. "--5" passes the digit test but is not an integer.
                    raise ValueError(f"malformed atomic number in XYZ line: {line!r}") from err
                if z <= 0 or z > 118:
                    raise ValueError(f"atomic number out of range: {z}")
            else:
                z = get_atomic_num(sym)
                if z == 0:
                    raise ValueError(f"unknown element symbol: {sym!r}")

            try:
                xyz = [float(parts[1]), float(parts[2]), float(parts[3])]
            except ValueError as err:
                raise ValueError(f"malformed coordinates in XYZ line: {line!r}") from err

            z_list.append(z)
            coords_list.append(xyz)
            if len(z_list) == n_declared:
                break

    if len(z_list) != n_declared:
        raise ValueError(
            f"XYZ header declares {n_declared} atoms but only "
            f"{len(z_list)} valid atom lines were found"
        )

    return np.array(z_list, dtype=int), np.array(coords_list, dtype=float)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Periodic table data and utilities for atomic number lookup.
|
|
2
|
+
|
|
3
|
+
Element symbols are stored in their canonical mixed-case form
|
|
4
|
+
(``"H"``, ``"He"``, ``"Co"``, ...) so that ``Co`` (cobalt) and ``CO``
|
|
5
|
+
(carbon + oxygen, which is not a single element) can be told apart.
|
|
6
|
+
|
|
7
|
+
``get_atomic_num`` accepts any case (``"co"``, ``"CO"``, ``"Co"``) and
|
|
8
|
+
normalizes to the canonical capitalization before lookup.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Dict
|
|
12
|
+
|
|
13
|
+
# Symbol -> atomic number, using canonical capitalization.
|
|
14
|
+
PTABLE: Dict[str, int] = {
    # Period 1
    "H": 1, "He": 2,
    # Period 2
    "Li": 3, "Be": 4, "B": 5, "C": 6, "N": 7, "O": 8, "F": 9, "Ne": 10,
    # Period 3
    "Na": 11, "Mg": 12, "Al": 13, "Si": 14, "P": 15, "S": 16, "Cl": 17, "Ar": 18,
    # Period 4
    "K": 19, "Ca": 20, "Sc": 21, "Ti": 22, "V": 23, "Cr": 24, "Mn": 25, "Fe": 26, "Co": 27,
    "Ni": 28, "Cu": 29, "Zn": 30, "Ga": 31, "Ge": 32, "As": 33, "Se": 34, "Br": 35, "Kr": 36,
    # Period 5
    "Rb": 37, "Sr": 38, "Y": 39, "Zr": 40, "Nb": 41, "Mo": 42, "Tc": 43, "Ru": 44, "Rh": 45,
    "Pd": 46, "Ag": 47, "Cd": 48, "In": 49, "Sn": 50, "Sb": 51, "Te": 52, "I": 53, "Xe": 54,
    # Period 6 (including the lanthanides)
    "Cs": 55, "Ba": 56, "La": 57, "Ce": 58, "Pr": 59, "Nd": 60, "Pm": 61, "Sm": 62,
    "Eu": 63, "Gd": 64, "Tb": 65, "Dy": 66, "Ho": 67, "Er": 68, "Tm": 69, "Yb": 70,
    "Lu": 71, "Hf": 72, "Ta": 73, "W": 74, "Re": 75, "Os": 76, "Ir": 77, "Pt": 78,
    "Au": 79, "Hg": 80, "Tl": 81, "Pb": 82, "Bi": 83, "Po": 84, "At": 85, "Rn": 86,
    # Period 7 (including the actinides)
    "Fr": 87, "Ra": 88, "Ac": 89, "Th": 90, "Pa": 91, "U": 92, "Np": 93, "Pu": 94,
    "Am": 95, "Cm": 96, "Bk": 97, "Cf": 98, "Es": 99, "Fm": 100, "Md": 101, "No": 102,
    "Lr": 103, "Rf": 104, "Db": 105, "Sg": 106, "Bh": 107, "Hs": 108, "Mt": 109, "Ds": 110,
    "Rg": 111, "Cn": 112, "Nh": 113, "Fl": 114, "Mc": 115, "Lv": 116, "Ts": 117, "Og": 118,
}


# Reverse lookup: atomic number -> canonical symbol, derived from PTABLE.
SYMBOL_BY_Z: Dict[int, str] = dict(zip(PTABLE.values(), PTABLE.keys()))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_symbol(atomic_num: int) -> str:
    """Return the canonical element symbol for an atomic number.

    The argument is converted with ``int()`` before the lookup in
    ``SYMBOL_BY_Z``.

    Raises:
        KeyError: if ``atomic_num`` is not in ``[1, 118]``.
    """
    z = int(atomic_num)
    symbol = SYMBOL_BY_Z[z]
    return symbol
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _canonicalize_symbol(symbol: str) -> str:
|
|
150
|
+
"""Normalize an element symbol to canonical capitalization."""
|
|
151
|
+
s = symbol.strip()
|
|
152
|
+
if not s:
|
|
153
|
+
return s
|
|
154
|
+
if len(s) == 1:
|
|
155
|
+
return s.upper()
|
|
156
|
+
return s[0].upper() + s[1:].lower()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_atomic_num(symbol: str) -> int:
    """
    Look up the atomic number for an element symbol.

    Args:
        symbol: Element symbol in any capitalization; it is normalized
            with ``_canonicalize_symbol`` before the lookup.

    Returns:
        The atomic number, or 0 when the symbol is not in ``PTABLE``.
    """
    canonical = _canonicalize_symbol(symbol)
    if canonical in PTABLE:
        return PTABLE[canonical]
    return 0
|
hashmol3d/version.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hashmol3d
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Deterministic 3D molecular geometry hashing standard
|
|
5
|
+
Author: Murat Keçeli
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Provides-Extra: test
|
|
11
|
+
Requires-Dist: pytest; extra == "test"
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# HashMol3D
|
|
15
|
+
|
|
16
|
+
**HashMol3D** is a standard, deterministic 3D molecular geometry identifier
|
|
17
|
+
for computational chemistry, machine learning, and HPC workflows.
|
|
18
|
+
|
|
19
|
+
It produces a **readable** identifier of the form
|
|
20
|
+
|
|
21
|
+
<Hill formula><state tag>-<geometry hash>
|
|
22
|
+
|
|
23
|
+
e.g. `H2Oq0m1-68936c504bf5fa3b` for neutral singlet water. The trailing
|
|
24
|
+
geometry hash is **rotation-, translation-, permutation-, and
|
|
25
|
+
parity-invariant** (matching the invariances of the eigenvalues of the
|
|
26
|
+
non-relativistic molecular Hamiltonian), and depends on:
|
|
27
|
+
|
|
28
|
+
- atomic numbers
|
|
29
|
+
- pairwise distances rounded to a user-specified precision
|
|
30
|
+
- a descriptor version tag
|
|
31
|
+
|
|
32
|
+
Charge and spin multiplicity live in the readable prefix, **not** in
|
|
33
|
+
the hash, so two states of the same geometry share the same hex tail
|
|
34
|
+
and can be grouped by suffix matching:
|
|
35
|
+
|
|
36
|
+
```text
|
|
37
|
+
H2Oq0m1-68936c504bf5fa3b # neutral singlet water
|
|
38
|
+
H2Oq1m2-68936c504bf5fa3b # water cation, same geometry → same hex tail
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
The hash length auto-scales with the number of atoms (`clip(N, 16, 64)`
|
|
42
|
+
hex chars) so collision risk stays roughly constant as molecules grow;
|
|
43
|
+
pass `length=` to pin a fixed value.
|
|
44
|
+
|
|
45
|
+
It deliberately does **not** distinguish enantiomers (which share their
|
|
46
|
+
Hamiltonian eigenvalues). The reference implementation depends only on
|
|
47
|
+
NumPy.
|
|
48
|
+
|
|
49
|
+
HashMol3D IDs are **stable across machines**, **reproducible**, and ideal for:
|
|
50
|
+
- workflow deduplication
|
|
51
|
+
- caching
|
|
52
|
+
- large QC datasets
|
|
53
|
+
- MD conformer tracking
|
|
54
|
+
- ML potential datasets
|
|
55
|
+
- LLM scientific agents
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
### Using pip
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install hashmol3d
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Using uv
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Install from PyPI
|
|
69
|
+
uv pip install hashmol3d
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Install from source
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Clone the repository
|
|
76
|
+
git clone https://github.com/yourusername/HashMol3D.git
|
|
77
|
+
cd HashMol3D
|
|
78
|
+
|
|
79
|
+
# Create and activate a virtual environment (recommended)
|
|
80
|
+
# Using uv:
|
|
81
|
+
uv venv
|
|
82
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
83
|
+
|
|
84
|
+
# Or using standard Python:
|
|
85
|
+
python -m venv .venv
|
|
86
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
87
|
+
|
|
88
|
+
# Install the package in editable mode
|
|
89
|
+
uv pip install -e . # Or: pip install -e .
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Usage (CLI)
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
$ hashmol3d water.xyz
|
|
96
|
+
H2Oq0m1-68936c504bf5fa3b
|
|
97
|
+
|
|
98
|
+
# Cation with explicit multiplicity — only the prefix changes.
|
|
99
|
+
$ hashmol3d -c 1 -m 2 water.xyz
|
|
100
|
+
H2Oq1m2-68936c504bf5fa3b
|
|
101
|
+
|
|
102
|
+
# Pin a fixed hash length and a coarser precision.
|
|
103
|
+
$ hashmol3d -p 1e-3 -l 32 benzene.xyz
|
|
104
|
+
|
|
105
|
+
# Verbose: also print formula, geometry hash, descriptor, and metadata.
|
|
106
|
+
$ hashmol3d -v water.xyz
|
|
107
|
+
|
|
108
|
+
# Show the package version.
|
|
109
|
+
$ hashmol3d --version
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Short flags: `-p/--precision`, `-c/--charge`, `-m/--multiplicity`,
|
|
113
|
+
`-l/--length`, `-v/--verbose`. Errors on missing or malformed input go
|
|
114
|
+
to stderr with exit code 1 (no Python traceback).
|
|
115
|
+
|
|
116
|
+
## Usage (Python)
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
import numpy as np
|
|
120
|
+
from hashmol3d import hash_molecule
|
|
121
|
+
|
|
122
|
+
atomic_nums = np.array([8, 1, 1])
|
|
123
|
+
coords = np.array([
|
|
124
|
+
[ 0.0000, 0.0000, 0.0],
|
|
125
|
+
[ 0.7572, 0.5860, 0.0],
|
|
126
|
+
[-0.7572, 0.5860, 0.0],
|
|
127
|
+
])
|
|
128
|
+
res = hash_molecule(atomic_nums, coords)
|
|
129
|
+
print(res.hash_str) # H2Oq0m1-68936c504bf5fa3b
|
|
130
|
+
print(res.formula) # H2O
|
|
131
|
+
print(res.geometry_hash) # 68936c504bf5fa3b
|
|
132
|
+
print(res.charge, res.multiplicity) # 0 1
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
All optional arguments are keyword-only: `precision`, `charge`,
|
|
136
|
+
`multiplicity`, `length`.
|
|
137
|
+
|
|
138
|
+
Or read straight from a file:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from hashmol3d import hash_xyz
|
|
142
|
+
|
|
143
|
+
print(hash_xyz("water.xyz").hash_str) # H2Oq0m1-68936c504bf5fa3b
|
|
144
|
+
print(hash_xyz("water.xyz", charge=1, multiplicity=2).hash_str)
|
|
145
|
+
# H2Oq1m2-68936c504bf5fa3b
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
See [`docs/`](docs/) for the full
|
|
149
|
+
[specification](docs/specification.md),
|
|
150
|
+
[API reference](docs/api_reference.md), and
|
|
151
|
+
[CLI guide](docs/cli_usage.md).
|
|
152
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
hashmol3d/__init__.py,sha256=p7-ZEIqORy-Ag48OH7I_q5yLN2avJLmB_hISa7l-tMc,746
|
|
2
|
+
hashmol3d/cli.py,sha256=sHBz5ehATwvrv7x-iJACh6tX1Y4pMGjFagjGfR6bXPE,3021
|
|
3
|
+
hashmol3d/core.py,sha256=Ni0sWPBPw9IOGh_EZSiqo5DuppN9BFwQ-IZ3bMny6nc,9852
|
|
4
|
+
hashmol3d/io.py,sha256=HtN0EuDBQADqNGQxqxanZ6Old2BvV2_j3mpHlOBybO4,2349
|
|
5
|
+
hashmol3d/periodic_table.py,sha256=7K8fZ-6m-oVzIcrDxaOfGhx5bT19qc_9tgZ3wDU06MM,3066
|
|
6
|
+
hashmol3d/version.py,sha256=vjyKlU95LK1slAc40jPFhbpCONJ-peuFyEHdOGuTbYA,64
|
|
7
|
+
hashmol3d-0.5.0.dist-info/licenses/LICENSE,sha256=KYkSY4pbu9rhIRi9_q7SuwZcRrrKUqV6iiSUNKyujiU,23
|
|
8
|
+
hashmol3d-0.5.0.dist-info/METADATA,sha256=i7Ff13vFq8WfKpuhDKCuStMMaAEzWPHFHg10L7q7aaA,3982
|
|
9
|
+
hashmol3d-0.5.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
hashmol3d-0.5.0.dist-info/entry_points.txt,sha256=j1uNhM8Nqdk1TAN1fS9oDSMUrw6weW83_18BqMwr_9w,49
|
|
11
|
+
hashmol3d-0.5.0.dist-info/top_level.txt,sha256=tTvyNJglebM26-3e-0bBBMF-GuXdkKTEa0grZX6UF6I,10
|
|
12
|
+
hashmol3d-0.5.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hashmol3d
|