hashmol3d 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hashmol3d/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """HashMol3D: deterministic 3D molecular geometry hashing."""
2
+
3
+ from .core import (
4
+ DESCRIPTOR_VERSION,
5
+ HashMol3DResult,
6
+ generate_hashmol3d,
7
+ hash_molecule,
8
+ )
9
+ from .io import read_xyz
10
+ from .version import __version__
11
+
12
+
13
def hash_xyz(path, **kwargs) -> HashMol3DResult:
    """Hash the geometry stored in an XYZ file.

    The file at ``path`` is parsed with :func:`read_xyz`, and the resulting
    atomic numbers and coordinates are handed to :func:`hash_molecule`.
    Every keyword argument is passed through to :func:`hash_molecule`
    unchanged.
    """
    geometry = read_xyz(path)
    numbers, positions = geometry
    return hash_molecule(numbers, positions, **kwargs)
22
+
23
+
24
+ __all__ = [
25
+ "DESCRIPTOR_VERSION",
26
+ "HashMol3DResult",
27
+ "__version__",
28
+ "generate_hashmol3d",
29
+ "hash_molecule",
30
+ "hash_xyz",
31
+ "read_xyz",
32
+ ]
hashmol3d/cli.py ADDED
@@ -0,0 +1,109 @@
1
+ """Command-line interface for HashMol3D."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from typing import Sequence
8
+
9
+ from .core import hash_molecule
10
+ from .io import read_xyz
11
+ from .version import __version__
12
+
13
+
14
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``hashmol3d`` command.

    The parser exposes ``--version``, one positional ``file`` argument and
    the optional ``-p/-c/-m/-l/-v`` flags. Options are registered in a
    fixed order so the generated ``--help`` text is stable.
    """
    parser = argparse.ArgumentParser(
        prog="hashmol3d",
        description=(
            "Deterministic 3D molecular geometry hash. "
            "Reads an XYZ file and prints the HashMol3D identifier."
        ),
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"hashmol3d {__version__}",
    )
    parser.add_argument("file", help="Path to a molecular geometry file (.xyz)")

    # (flags, add_argument keyword settings) for every optional flag.
    optional_flags = (
        (
            ("-p", "--precision"),
            {
                "type": float,
                "default": 1e-4,
                "metavar": "Å",
                "help": "Distance precision in angstroms (default: 1e-4)",
            },
        ),
        (
            ("-c", "--charge"),
            {
                "type": int,
                "default": 0,
                "help": "Total formal charge (default: 0)",
            },
        ),
        (
            ("-m", "--multiplicity"),
            {
                "type": int,
                "default": None,
                "help": "Spin multiplicity (default: inferred from electron count)",
            },
        ),
        (
            ("-l", "--length"),
            {
                "type": int,
                "default": None,
                "help": "Number of hex characters in the geometry hash, 1-64 "
                "(default: auto-scaled as clip(N, 16, 64))",
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "store_true",
                "help": "Also print the canonical descriptor and metadata",
            },
        ),
    )
    for flags, settings in optional_flags:
        parser.add_argument(*flags, **settings)
    return parser
65
+
66
+
67
def cli(argv: Sequence[str] | None = None) -> int:
    """Execute the HashMol3D command-line interface.

    Args:
        argv: Argument list to parse. When ``None``, argparse falls back to
            ``sys.argv[1:]``; passing an explicit list is useful in tests.

    Returns:
        ``0`` on success, ``1`` when reading or hashing the input fails
        with a ``ValueError`` or ``OSError`` (the message is written to
        stderr).
    """
    args = _build_parser().parse_args(argv)

    try:
        atomic_nums, coords = read_xyz(args.file)
        result = hash_molecule(
            atomic_nums,
            coords,
            precision=args.precision,
            charge=args.charge,
            multiplicity=args.multiplicity,
            length=args.length,
        )
    except (ValueError, OSError) as err:
        # FileNotFoundError is an OSError subclass, so a missing file is
        # reported by this same handler.
        print(f"hashmol3d: {err}", file=sys.stderr)
        return 1

    if not args.verbose:
        print(result.hash_str)
        return 0

    report = (
        f"identifier: {result.hash_str}",
        f"formula: {result.formula}",
        f"geometry_hash: {result.geometry_hash}",
        f"descriptor: {result.descriptor}",
        f"version: {result.version}",
        f"precision: {result.precision}",
        f"charge: {result.charge}",
        f"multiplicity: {result.multiplicity}",
    )
    for line in report:
        print(line)
    return 0
105
+
106
+
107
def main() -> None:
    """Console-script entry point: run :func:`cli` and exit with its code."""
    exit_code = cli()
    raise SystemExit(exit_code)
hashmol3d/core.py ADDED
@@ -0,0 +1,293 @@
1
+ """
2
+ HashMol3D core: a deterministic identifier for 3D molecular conformers.
3
+
4
+ The identifier has the form::
5
+
6
+ <Hill formula><state tag>-<geometry hash>
7
+
8
+ For example, ``H2Oq0m1-a1b28135...`` for neutral singlet water.
9
+
10
+ The trailing hexadecimal hash is invariant under exactly the operations
11
+ that leave the non-relativistic molecular Hamiltonian's eigenvalues
12
+ unchanged:
13
+
14
+ * rigid translation of the coordinates
15
+ * rigid rotation of the coordinates
16
+ * permutation (relabeling) of atom indices
17
+ * spatial inversion / reflection (parity)
18
+
19
+ It depends on atomic numbers, pairwise distances (rounded to a user
20
+ specified precision), and the descriptor version. Total charge and
21
+ spin multiplicity are encoded in the readable prefix, not inside the
22
+ hash, so changing charge or multiplicity only changes the prefix.
23
+
24
+ The implementation has no RDKit dependency; it uses only NumPy and the
25
+ Python standard library.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import hashlib
31
+ import warnings
32
+ from collections import Counter
33
+ from dataclasses import dataclass
34
+
35
+ import numpy as np
36
+
37
+ from .periodic_table import get_symbol
38
+
39
+ __all__ = [
40
+ "DESCRIPTOR_VERSION",
41
+ "HashMol3DResult",
42
+ "generate_hashmol3d",
43
+ "hash_molecule",
44
+ ]
45
+
46
+
47
+ # The descriptor version is part of the hashed payload. Bump it whenever
48
+ # the descriptor format changes in a way that would alter hashes.
49
+ DESCRIPTOR_VERSION = "4-GEOM-SHA256"
50
+
51
+ # Auto-scaled hash length. The number of distinguishable conformers grows
52
+ # (roughly) exponentially with N, so log2 of it grows linearly with N;
53
+ # growing the hash length linearly with N keeps birthday-collision risk
54
+ # constant. 16 hex chars (64 bits) is the floor for very small molecules;
55
+ # SHA-256 caps us at 64 hex chars (256 bits).
56
+ _MIN_LENGTH = 16
57
+ _MAX_LENGTH = 64
58
+
59
+
60
@dataclass(frozen=True)
class HashMol3DResult:
    """Immutable record describing one hashed molecular geometry.

    Converting an instance with ``str()`` yields :attr:`hash_str`.
    """

    # Full identifier, ``<formula><state tag>-<geometry hash>``.
    hash_str: str
    # Molecular formula used as the readable prefix.
    formula: str
    # Truncated hexadecimal SHA-256 digest of the descriptor.
    geometry_hash: str
    # Descriptor version string that was hashed.
    version: str
    # Distance precision (in angstroms) supplied by the caller.
    precision: float
    # Total charge encoded in the state tag.
    charge: int
    # Spin multiplicity encoded in the state tag (given or inferred).
    multiplicity: int
    # Canonical descriptor string that was fed to SHA-256.
    descriptor: str

    def __str__(self) -> str:
        identifier = self.hash_str
        return identifier
75
+
76
+
77
def _precision_to_decimals(precision: float) -> int:
    """Convert a distance precision in Å into a count of decimal places.

    The count is ``round(-log10(precision))``, floored at zero.

    Raises:
        ValueError: if ``precision`` is not finite or not strictly positive.
    """
    usable = np.isfinite(precision) and precision > 0
    if not usable:
        raise ValueError(f"precision must be a positive finite number, got {precision!r}")
    places = round(-np.log10(precision))
    # Precisions of 1 Å or coarser map to zero decimals, never a negative count.
    return int(places) if places > 0 else 0
82
+
83
+
84
def _infer_multiplicity(atomic_nums: np.ndarray, charge: int, multiplicity: int | None) -> int:
    """Return the spin multiplicity to use for the state tag.

    When ``multiplicity`` is ``None`` the electron count (sum of atomic
    numbers minus ``charge``) decides: an even count gives ``1`` and an
    odd count gives ``2``. Otherwise the supplied value is converted with
    ``int()`` and returned.

    Raises:
        ValueError: if a supplied multiplicity is smaller than 1.
    """
    if multiplicity is None:
        n_electrons = int(np.sum(atomic_nums)) - int(charge)
        return 2 if n_electrons % 2 else 1

    value = int(multiplicity)
    if value < 1:
        raise ValueError(f"multiplicity must be >= 1, got {value}")
    return value
93
+
94
+
95
def _hill_formula(atomic_nums: np.ndarray) -> str:
    """Build the molecular formula string used as the identifier prefix.

    Carbon comes first when present, hydrogen second when present, and all
    other elements follow sorted alphabetically by symbol. A count of one
    is left implicit (``H2O``, ``CHBrClF``).

    NOTE(review): hydrogen is placed first even when no carbon is present,
    whereas the strict Hill convention sorts hydrogen alphabetically in
    carbon-free formulas. Changing this would alter existing identifiers.

    Raises:
        KeyError: propagated from ``get_symbol`` for an atomic number it
            does not know.
    """
    tally: Counter = Counter(int(z) for z in atomic_nums)

    pieces: list[tuple[str, int]] = []
    for leading_z, leading_symbol in ((6, "C"), (1, "H")):
        if leading_z in tally:
            pieces.append((leading_symbol, tally.pop(leading_z)))

    remaining = [(get_symbol(z), count) for z, count in tally.items()]
    remaining.sort(key=lambda item: item[0])
    pieces += remaining

    return "".join(f"{symbol}{count}" if count != 1 else symbol for symbol, count in pieces)
113
+
114
+
115
def _state_tag(charge: int, multiplicity: int) -> str:
    """Format the charge/multiplicity part of the identifier, e.g. ``q1m2``.

    The charge is rendered as-is, so zero and positive charges carry no
    sign (``q0``, ``q1``) while negative charges keep their ``-``
    (``q-1``).
    """
    return "q{}m{}".format(charge, multiplicity)
122
+
123
+
124
def _auto_length(n_atoms: int) -> int:
    """Default hash length in hex characters for a molecule of ``n_atoms``.

    The length equals the atom count, clamped to the inclusive range
    ``[_MIN_LENGTH, _MAX_LENGTH]``.
    """
    capped = n_atoms if n_atoms < _MAX_LENGTH else _MAX_LENGTH
    return capped if capped > _MIN_LENGTH else _MIN_LENGTH
127
+
128
+
129
def _pair_signature(
    atomic_nums: np.ndarray, coords: np.ndarray, decimals: int
) -> tuple[tuple[int, ...], list[tuple[int, int, float]]]:
    """Compute the order-independent fingerprint of a geometry.

    Args:
        atomic_nums: atomic numbers, one per atom.
        coords: Cartesian coordinates, one row per atom.
        decimals: number of decimal places each distance is rounded to.

    Returns:
        ``(z_sorted, pairs)``. ``z_sorted`` is the sorted tuple of atomic
        numbers. ``pairs`` is the sorted list of
        ``(Z_min, Z_max, rounded_distance)`` triples, one for every
        unordered pair of atoms; it is empty for fewer than two atoms.
        Because only atomic numbers and interatomic distances enter, the
        result does not depend on atom ordering, rigid motion or
        reflection.
    """
    n_atoms = atomic_nums.shape[0]
    z_sorted = tuple(sorted(int(z) for z in atomic_nums))
    if n_atoms < 2:
        return z_sorted, []

    # Full distance matrix; only the strict upper triangle (i < j) is used.
    separation = coords[:, None, :] - coords[None, :, :]
    distances = np.linalg.norm(separation, axis=-1)
    rows, cols = np.triu_indices(n_atoms, k=1)
    rounded = np.round(distances[rows, cols], decimals=decimals)

    # Order each element pair as (smaller Z, larger Z) so i/j labels drop out.
    z_rows = atomic_nums[rows].astype(int)
    z_cols = atomic_nums[cols].astype(int)
    z_low = np.minimum(z_rows, z_cols)
    z_high = np.maximum(z_rows, z_cols)

    triples = sorted(
        (int(lo), int(hi), float(dist)) for lo, hi, dist in zip(z_low, z_high, rounded)
    )
    return z_sorted, triples
160
+
161
+
162
def _format_descriptor(
    version: str,
    precision: float,
    decimals: int,
    z_sorted: tuple[int, ...],
    pairs: list[tuple[int, int, float]],
) -> str:
    """Serialize the geometry fingerprint into the string that is hashed.

    The result has four ``|``-separated sections: ``V:`` (descriptor
    version), ``P:`` (precision in ``.1e`` notation), ``Z:`` (comma-joined
    atomic numbers) and ``D:`` (comma-joined ``Zmin-Zmax:distance`` items,
    each distance printed with ``decimals`` fixed decimal places).

    Charge and multiplicity are deliberately absent: they belong to the
    readable identifier prefix, not to the hashed payload.
    """
    distance_items = [f"{z_lo}-{z_hi}:{dist:.{decimals}f}" for z_lo, z_hi, dist in pairs]
    sections = (
        ("V", version),
        ("P", f"{precision:.1e}"),
        ("Z", ",".join(map(str, z_sorted))),
        ("D", ",".join(distance_items)),
    )
    return "|".join(tag + ":" + payload for tag, payload in sections)
186
+
187
+
188
def hash_molecule(
    atomic_nums,
    coords,
    *,
    precision: float = 1e-4,
    charge: int = 0,
    multiplicity: int | None = None,
    length: int | None = None,
) -> HashMol3DResult:
    """Compute the HashMol3D identifier for a 3D molecular geometry.

    The identifier has the form ``<formula><state tag>-<geometry hash>``.
    Charge and multiplicity appear only in the readable prefix; the hash
    is computed from the descriptor version, the precision, the atomic
    numbers and the rounded pairwise distances.

    Args:
        atomic_nums: integer array-like of atomic numbers; flattened to
            shape ``(N,)``.
        coords: float array-like of Cartesian coordinates in Å, shape
            ``(N, 3)``.
        precision: distance precision in Å (default ``1e-4``).
        charge: total formal charge (default ``0``).
        multiplicity: spin multiplicity (``1`` = singlet, ``2`` = doublet,
            ...). If ``None``, inferred as singlet/doublet from the
            electron count.
        length: number of hex characters retained from the SHA-256 digest,
            in ``[1, 64]``. Python and NumPy integers are both accepted.
            If ``None`` (default), the length scales with the number of
            atoms as ``clip(N, 16, 64)``.

    Returns:
        :class:`HashMol3DResult`.

    Raises:
        ValueError: if the molecule is empty, the array shapes are
            inconsistent, an atomic number is non-positive or has no known
            element symbol, the coordinates are not finite, or
            ``precision``, ``multiplicity`` or ``length`` is out of range.
    """
    atomic_nums = np.asarray(atomic_nums, dtype=int).reshape(-1)
    coords = np.asarray(coords, dtype=float)

    if atomic_nums.size == 0:
        raise ValueError("molecule must contain at least one atom")
    if coords.ndim != 2 or coords.shape[1] != 3:
        raise ValueError(f"coords must have shape (N, 3); got {coords.shape}")
    if coords.shape[0] != atomic_nums.size:
        raise ValueError(
            f"atomic_nums has {atomic_nums.size} entries but coords has {coords.shape[0]} rows"
        )
    if not np.all(atomic_nums > 0):
        raise ValueError("atomic numbers must be positive integers")
    if not np.all(np.isfinite(coords)):
        raise ValueError("coords contain non-finite values")

    if length is None:
        length = _auto_length(int(atomic_nums.size))
    elif isinstance(length, (int, np.integer)) and 1 <= length <= 64:
        # NumPy integer scalars (e.g. values pulled out of an array) are
        # legitimate lengths; normalize them to a plain int.
        length = int(length)
    else:
        raise ValueError("length must be an int in [1, 64]")

    charge = int(charge)
    decimals = _precision_to_decimals(precision)
    used_mult = _infer_multiplicity(atomic_nums, charge, multiplicity)

    # Build the formula before the O(N^2) distance work so that an unknown
    # element fails fast, and report it as a ValueError like every other
    # invalid input instead of leaking the lookup table's KeyError.
    try:
        formula = _hill_formula(atomic_nums)
    except KeyError as err:
        raise ValueError(f"unsupported atomic number: {err.args[0]}") from err

    z_sorted, pairs = _pair_signature(atomic_nums, coords, decimals)
    descriptor = _format_descriptor(DESCRIPTOR_VERSION, precision, decimals, z_sorted, pairs)
    digest = hashlib.sha256(descriptor.encode("utf-8")).hexdigest()[:length]

    identifier = f"{formula}{_state_tag(charge, used_mult)}-{digest}"

    return HashMol3DResult(
        hash_str=identifier,
        formula=formula,
        geometry_hash=digest,
        version=DESCRIPTOR_VERSION,
        precision=precision,
        charge=charge,
        multiplicity=used_mult,
        descriptor=descriptor,
    )
262
+
263
+
264
def generate_hashmol3d(
    atomic_nums,
    coords,
    precision: float = 1e-4,
    charge: int = 0,
    multiplicity: int | None = None,
    hash_length: int = 32,
) -> HashMol3DResult:
    """Deprecated alias for :func:`hash_molecule`.

    Emits a :class:`DeprecationWarning` and then delegates to
    :func:`hash_molecule`, passing ``hash_length`` as ``length``.

    .. deprecated:: 0.4.0
        Use :func:`hash_molecule` instead. The ``hash_length`` keyword is
        renamed to ``length`` in the new function. Note that as of 0.5.0
        the returned ``hash_str`` is the full readable identifier
        (``<formula><state>-<hash>``), not just the hex digest.
    """
    message = (
        "generate_hashmol3d() is deprecated; use hash_molecule() instead "
        "(the hash_length kwarg is now called length)."
    )
    # stacklevel=2 attributes the warning to the caller of this alias.
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    forwarded = {
        "precision": precision,
        "charge": charge,
        "multiplicity": multiplicity,
        "length": hash_length,
    }
    return hash_molecule(atomic_nums, coords, **forwarded)
hashmol3d/io.py ADDED
@@ -0,0 +1,80 @@
1
+ """File I/O helpers for HashMol3D."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+ from .periodic_table import get_atomic_num
8
+
9
+ __all__ = ["read_xyz"]
10
+
11
+
12
def read_xyz(path: str) -> tuple[np.ndarray, np.ndarray]:
    """Read the first frame of a standard XYZ file.

    The first line must contain the atom count; the second line is a
    free-form comment; subsequent lines contain ``symbol x y z`` (or
    ``Z x y z``). Blank lines are skipped and columns after the fourth
    are ignored. Reading stops as soon as the declared number of atoms
    has been collected, so any trailing content (for example further
    frames of a trajectory) is neither parsed nor validated.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    result does not depend on the platform's default encoding and a
    comment line in another encoding cannot abort the read.

    Args:
        path: Path to the XYZ file (passed to :func:`open`).

    Returns:
        ``(atomic_nums, coords)`` as NumPy arrays of dtype ``int`` and
        ``float``.

    Raises:
        ValueError: if the file is malformed (fewer than two header
            lines, bad atom count, too few atom lines, unknown element
            symbol, atomic number out of range, non-numeric coords).
        OSError: if the file cannot be opened.
    """
    z_list: list[int] = []
    coords_list: list[list[float]] = []

    with open(path, encoding="utf-8", errors="replace") as f:
        count_line = f.readline()
        comment_line = f.readline()
        # readline() returns "" only at end of file, so an empty result
        # means the corresponding header line does not exist.
        if not count_line or not comment_line:
            raise ValueError("XYZ file is too short (need at least 2 header lines)")

        try:
            n_declared = int(count_line.strip())
        except ValueError as err:
            raise ValueError(
                f"first line of an XYZ file must be the atom count; got {count_line.strip()!r}"
            ) from err
        if n_declared <= 0:
            raise ValueError(f"XYZ atom count must be positive, got {n_declared}")

        # Stream the atom lines so a long trajectory is not loaded into
        # memory when only its first frame is needed.
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) < 4:
                raise ValueError(f"malformed XYZ atom line: {line!r}")

            sym = parts[0]
            if sym.lstrip("-").isdigit():
                try:
                    z = int(sym)
                except ValueError as err:
                    # e.g. "--5" or digit-like characters int() rejects.
                    raise ValueError(f"malformed atomic number in XYZ line: {line!r}") from err
                if z <= 0 or z > 118:
                    raise ValueError(f"atomic number out of range: {z}")
            else:
                z = get_atomic_num(sym)
                if z == 0:
                    raise ValueError(f"unknown element symbol: {sym!r}")

            try:
                xyz = [float(parts[1]), float(parts[2]), float(parts[3])]
            except ValueError as err:
                raise ValueError(f"malformed coordinates in XYZ line: {line!r}") from err

            z_list.append(z)
            coords_list.append(xyz)
            if len(z_list) == n_declared:
                break

    if len(z_list) != n_declared:
        raise ValueError(
            f"XYZ header declares {n_declared} atoms but only "
            f"{len(z_list)} valid atom lines were found"
        )

    return np.array(z_list, dtype=int), np.array(coords_list, dtype=float)
@@ -0,0 +1,169 @@
1
+ """Periodic table data and utilities for atomic number lookup.
2
+
3
+ Element symbols are stored in their canonical mixed-case form
4
+ (``"H"``, ``"He"``, ``"Co"``, ...) so that ``Co`` (cobalt) and ``CO``
5
+ (carbon + oxygen, which is not a single element) can be told apart.
6
+
7
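+ ``get_atomic_num`` accepts any case (``"co"``, ``"CO"``, ``"Co"``),
8
+ normalizes it to the canonical capitalization, and therefore resolves all three to cobalt (Z = 27); it looks up a single element and never parses multi-element formulas.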
9
+ """
10
+
11
+ from typing import Dict
12
+
13
# Canonical element symbols listed in order of increasing atomic number,
# one row per period; the position in this list (starting at 1) is Z.
_SYMBOLS_IN_Z_ORDER = (
    "H He "
    "Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr "
    "Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og"
).split()

# Symbol -> atomic number, using canonical capitalization.
PTABLE: Dict[str, int] = {
    symbol: z for z, symbol in enumerate(_SYMBOLS_IN_Z_ORDER, start=1)
}


# Atomic number -> canonical symbol (inverse of PTABLE).
SYMBOL_BY_Z: Dict[int, str] = dict(zip(PTABLE.values(), PTABLE.keys()))
138
+
139
+
140
def get_symbol(atomic_num: int) -> str:
    """Return the canonical element symbol for ``atomic_num``.

    The argument is converted with ``int()`` before the lookup.

    Raises:
        KeyError: if ``atomic_num`` is not in ``[1, 118]``.
    """
    z = int(atomic_num)
    return SYMBOL_BY_Z[z]
147
+
148
+
149
def _canonicalize_symbol(symbol: str) -> str:
    """Return ``symbol`` with canonical element capitalization.

    Surrounding whitespace is removed, the first character is upper-cased
    and all following characters are lower-cased. A blank input yields an
    empty string.
    """
    trimmed = symbol.strip()
    # Slicing keeps the empty and single-character cases correct without
    # separate branches.
    head, tail = trimmed[:1], trimmed[1:]
    return head.upper() + tail.lower()
157
+
158
+
159
def get_atomic_num(symbol: str) -> int:
    """
    Look up the atomic number for an element symbol.

    Args:
        symbol: Element symbol; its capitalization is normalized before
            the lookup, so any case is accepted.

    Returns:
        The atomic number, or 0 when the symbol is not in the table.
    """
    canonical = _canonicalize_symbol(symbol)
    return PTABLE.get(canonical, 0)
hashmol3d/version.py ADDED
@@ -0,0 +1,3 @@
1
+ """Version information for HashMol3D."""
2
+
3
+ __version__ = "0.5.0"
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: hashmol3d
3
+ Version: 0.5.0
4
+ Summary: Deterministic 3D molecular geometry hashing standard
5
+ Author: Murat Keçeli
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: numpy
10
+ Provides-Extra: test
11
+ Requires-Dist: pytest; extra == "test"
12
+ Dynamic: license-file
13
+
14
+ # HashMol3D
15
+
16
+ **HashMol3D** is a standard, deterministic 3D molecular geometry identifier
17
+ for computational chemistry, machine learning, and HPC workflows.
18
+
19
+ It produces a **readable** identifier of the form
20
+
21
+ <Hill formula><state tag>-<geometry hash>
22
+
23
+ e.g. `H2Oq0m1-68936c504bf5fa3b` for neutral singlet water. The trailing
24
+ geometry hash is **rotation-, translation-, permutation-, and
25
+ parity-invariant** (matching the invariances of the eigenvalues of the
26
+ non-relativistic molecular Hamiltonian), and depends on:
27
+
28
+ - atomic numbers
29
+ - pairwise distances rounded to a user-specified precision
30
+ - a descriptor version tag
31
+
32
+ Charge and spin multiplicity live in the readable prefix, **not** in
33
+ the hash, so two states of the same geometry share the same hex tail
34
+ and can be grouped by suffix matching:
35
+
36
+ ```text
37
+ H2Oq0m1-68936c504bf5fa3b # neutral singlet water
38
+ H2Oq1m2-68936c504bf5fa3b # water cation, same geometry → same hex tail
39
+ ```
40
+
41
+ The hash length auto-scales with the number of atoms (`clip(N, 16, 64)`
42
+ hex chars) so collision risk stays roughly constant as molecules grow;
43
+ pass `length=` to pin a fixed value.
44
+
45
+ It deliberately does **not** distinguish enantiomers (which share their
46
+ Hamiltonian eigenvalues). The reference implementation depends only on
47
+ NumPy.
48
+
49
+ HashMol3D IDs are **stable across machines**, **reproducible**, and ideal for:
50
+ - workflow deduplication
51
+ - caching
52
+ - large QC datasets
53
+ - MD conformer tracking
54
+ - ML potential datasets
55
+ - LLM scientific agents
56
+
57
+ ## Install
58
+
59
+ ### Using pip
60
+
61
+ ```bash
62
+ pip install hashmol3d
63
+ ```
64
+
65
+ ### Using uv
66
+
67
+ ```bash
68
+ # Install from PyPI
69
+ uv pip install hashmol3d
70
+ ```
71
+
72
+ ### Install from source
73
+
74
+ ```bash
75
+ # Clone the repository
76
+ git clone https://github.com/yourusername/HashMol3D.git
77
+ cd HashMol3D
78
+
79
+ # Create and activate a virtual environment (recommended)
80
+ # Using uv:
81
+ uv venv
82
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
83
+
84
+ # Or using standard Python:
85
+ python -m venv .venv
86
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
87
+
88
+ # Install the package in editable mode
89
+ uv pip install -e . # Or: pip install -e .
90
+ ```
91
+
92
+ ## Usage (CLI)
93
+
94
+ ```bash
95
+ $ hashmol3d water.xyz
96
+ H2Oq0m1-68936c504bf5fa3b
97
+
98
+ # Cation with explicit multiplicity — only the prefix changes.
99
+ $ hashmol3d -c 1 -m 2 water.xyz
100
+ H2Oq1m2-68936c504bf5fa3b
101
+
102
+ # Pin a fixed hash length and a coarser precision.
103
+ $ hashmol3d -p 1e-3 -l 32 benzene.xyz
104
+
105
+ # Verbose: also print formula, geometry hash, descriptor, and metadata.
106
+ $ hashmol3d -v water.xyz
107
+
108
+ # Show the package version.
109
+ $ hashmol3d --version
110
+ ```
111
+
112
+ Short flags: `-p/--precision`, `-c/--charge`, `-m/--multiplicity`,
113
+ `-l/--length`, `-v/--verbose`. Errors on missing or malformed input go
114
+ to stderr with exit code 1 (no Python traceback).
115
+
116
+ ## Usage (Python)
117
+
118
+ ```python
119
+ import numpy as np
120
+ from hashmol3d import hash_molecule
121
+
122
+ atomic_nums = np.array([8, 1, 1])
123
+ coords = np.array([
124
+ [ 0.0000, 0.0000, 0.0],
125
+ [ 0.7572, 0.5860, 0.0],
126
+ [-0.7572, 0.5860, 0.0],
127
+ ])
128
+ res = hash_molecule(atomic_nums, coords)
129
+ print(res.hash_str) # H2Oq0m1-68936c504bf5fa3b
130
+ print(res.formula) # H2O
131
+ print(res.geometry_hash) # 68936c504bf5fa3b
132
+ print(res.charge, res.multiplicity) # 0 1
133
+ ```
134
+
135
+ All optional arguments are keyword-only: `precision`, `charge`,
136
+ `multiplicity`, `length`.
137
+
138
+ Or read straight from a file:
139
+
140
+ ```python
141
+ from hashmol3d import hash_xyz
142
+
143
+ print(hash_xyz("water.xyz").hash_str) # H2Oq0m1-68936c504bf5fa3b
144
+ print(hash_xyz("water.xyz", charge=1, multiplicity=2).hash_str)
145
+ # H2Oq1m2-68936c504bf5fa3b
146
+ ```
147
+
148
+ See [`docs/`](docs/) for the full
149
+ [specification](docs/specification.md),
150
+ [API reference](docs/api_reference.md), and
151
+ [CLI guide](docs/cli_usage.md).
152
+
@@ -0,0 +1,12 @@
1
+ hashmol3d/__init__.py,sha256=p7-ZEIqORy-Ag48OH7I_q5yLN2avJLmB_hISa7l-tMc,746
2
+ hashmol3d/cli.py,sha256=sHBz5ehATwvrv7x-iJACh6tX1Y4pMGjFagjGfR6bXPE,3021
3
+ hashmol3d/core.py,sha256=Ni0sWPBPw9IOGh_EZSiqo5DuppN9BFwQ-IZ3bMny6nc,9852
4
+ hashmol3d/io.py,sha256=HtN0EuDBQADqNGQxqxanZ6Old2BvV2_j3mpHlOBybO4,2349
5
+ hashmol3d/periodic_table.py,sha256=7K8fZ-6m-oVzIcrDxaOfGhx5bT19qc_9tgZ3wDU06MM,3066
6
+ hashmol3d/version.py,sha256=vjyKlU95LK1slAc40jPFhbpCONJ-peuFyEHdOGuTbYA,64
7
+ hashmol3d-0.5.0.dist-info/licenses/LICENSE,sha256=KYkSY4pbu9rhIRi9_q7SuwZcRrrKUqV6iiSUNKyujiU,23
8
+ hashmol3d-0.5.0.dist-info/METADATA,sha256=i7Ff13vFq8WfKpuhDKCuStMMaAEzWPHFHg10L7q7aaA,3982
9
+ hashmol3d-0.5.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ hashmol3d-0.5.0.dist-info/entry_points.txt,sha256=j1uNhM8Nqdk1TAN1fS9oDSMUrw6weW83_18BqMwr_9w,49
11
+ hashmol3d-0.5.0.dist-info/top_level.txt,sha256=tTvyNJglebM26-3e-0bBBMF-GuXdkKTEa0grZX6UF6I,10
12
+ hashmol3d-0.5.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ hashmol3d = hashmol3d.cli:main
@@ -0,0 +1,3 @@
1
+ MIT License
2
+
3
+ Copyright.
@@ -0,0 +1 @@
1
+ hashmol3d