mlxmolkit-rdkit 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. mlxmolkit_rdkit-0.3.0/PKG-INFO +345 -0
  2. mlxmolkit_rdkit-0.3.0/README.md +312 -0
  3. mlxmolkit_rdkit-0.3.0/mlxmolkit/__init__.py +47 -0
  4. mlxmolkit_rdkit-0.3.0/mlxmolkit/bfgs_batch_metal.py +743 -0
  5. mlxmolkit_rdkit-0.3.0/mlxmolkit/bfgs_metal.py +806 -0
  6. mlxmolkit_rdkit-0.3.0/mlxmolkit/butina.py +127 -0
  7. mlxmolkit_rdkit-0.3.0/mlxmolkit/butina_metal.py +106 -0
  8. mlxmolkit_rdkit-0.3.0/mlxmolkit/conformer_metal.py +736 -0
  9. mlxmolkit_rdkit-0.3.0/mlxmolkit/conformer_pipeline.py +301 -0
  10. mlxmolkit_rdkit-0.3.0/mlxmolkit/conformer_pipeline_v2.py +397 -0
  11. mlxmolkit_rdkit-0.3.0/mlxmolkit/dg_energy_metal.py +260 -0
  12. mlxmolkit_rdkit-0.3.0/mlxmolkit/dg_extract.py +362 -0
  13. mlxmolkit_rdkit-0.3.0/mlxmolkit/dg_minimize_metal.py +361 -0
  14. mlxmolkit_rdkit-0.3.0/mlxmolkit/energy_distgeom.py +142 -0
  15. mlxmolkit_rdkit-0.3.0/mlxmolkit/etk_energy_metal.py +361 -0
  16. mlxmolkit_rdkit-0.3.0/mlxmolkit/etk_extract.py +321 -0
  17. mlxmolkit_rdkit-0.3.0/mlxmolkit/etk_metal.py +626 -0
  18. mlxmolkit_rdkit-0.3.0/mlxmolkit/etk_minimize_metal.py +288 -0
  19. mlxmolkit_rdkit-0.3.0/mlxmolkit/fp_uint32.py +34 -0
  20. mlxmolkit_rdkit-0.3.0/mlxmolkit/fused_tanimoto_nlist.py +289 -0
  21. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_batch_optimizer.py +455 -0
  22. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_batching_shivam.py +283 -0
  23. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_bfgs_header.metal +454 -0
  24. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_bfgs_shivam.py +304 -0
  25. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_bfgs_source.metal +262 -0
  26. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_bfgs_source_tg.metal +359 -0
  27. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_energy_mlx.py +286 -0
  28. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_energy_native.py +320 -0
  29. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_energy_vectorized.py +323 -0
  30. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_extract_shivam.py +471 -0
  31. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_lbfgs_source_tg.metal +283 -0
  32. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_metal_kernel.py +1150 -0
  33. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_metal_optimizer.py +589 -0
  34. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_minimize.py +333 -0
  35. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_mlx_optimizer.py +213 -0
  36. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_native_batch_optimizer.py +348 -0
  37. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_native_batch_optimizer_v2.py +283 -0
  38. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_optimizer.py +239 -0
  39. mlxmolkit_rdkit-0.3.0/mlxmolkit/mmff_params.py +320 -0
  40. mlxmolkit_rdkit-0.3.0/mlxmolkit/morgan_cpu.py +92 -0
  41. mlxmolkit_rdkit-0.3.0/mlxmolkit/native_metal.py +100 -0
  42. mlxmolkit_rdkit-0.3.0/mlxmolkit/shared_batch.py +264 -0
  43. mlxmolkit_rdkit-0.3.0/mlxmolkit/stereo_checks.py +386 -0
  44. mlxmolkit_rdkit-0.3.0/mlxmolkit/tanimoto_blockwise.py +254 -0
  45. mlxmolkit_rdkit-0.3.0/mlxmolkit/tanimoto_metal_u32.py +368 -0
  46. mlxmolkit_rdkit-0.3.0/mlxmolkit_rdkit.egg-info/PKG-INFO +345 -0
  47. mlxmolkit_rdkit-0.3.0/mlxmolkit_rdkit.egg-info/SOURCES.txt +56 -0
  48. mlxmolkit_rdkit-0.3.0/mlxmolkit_rdkit.egg-info/dependency_links.txt +1 -0
  49. mlxmolkit_rdkit-0.3.0/mlxmolkit_rdkit.egg-info/requires.txt +12 -0
  50. mlxmolkit_rdkit-0.3.0/mlxmolkit_rdkit.egg-info/top_level.txt +1 -0
  51. mlxmolkit_rdkit-0.3.0/pyproject.toml +54 -0
  52. mlxmolkit_rdkit-0.3.0/setup.cfg +4 -0
  53. mlxmolkit_rdkit-0.3.0/tests/test_bfgs.py +561 -0
  54. mlxmolkit_rdkit-0.3.0/tests/test_butina.py +205 -0
  55. mlxmolkit_rdkit-0.3.0/tests/test_conformer_metal.py +423 -0
  56. mlxmolkit_rdkit-0.3.0/tests/test_dg_core.py +261 -0
  57. mlxmolkit_rdkit-0.3.0/tests/test_mmff_mlx.py +255 -0
  58. mlxmolkit_rdkit-0.3.0/tests/test_tanimoto.py +52 -0
@@ -0,0 +1,345 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlxmolkit-rdkit
3
+ Version: 0.3.0
4
+ Summary: GPU-accelerated molecular toolkit on Apple Silicon: conformer generation (ETKDG + MMFF94) and Butina clustering via Metal/MLX
5
+ Author: Guillaume Osmo
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/guillaume-osmo/mlxmolkit
8
+ Project-URL: Repository, https://github.com/guillaume-osmo/mlxmolkit
9
+ Project-URL: Issues, https://github.com/guillaume-osmo/mlxmolkit/issues
10
+ Keywords: molecular,conformer,ETKDG,MMFF94,clustering,Tanimoto,Butina,MLX,Metal,Apple Silicon,GPU
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
21
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.10.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Provides-Extra: rdkit
27
+ Requires-Dist: rdkit; extra == "rdkit"
28
+ Provides-Extra: test
29
+ Requires-Dist: pytest; extra == "test"
30
+ Requires-Dist: rdkit; extra == "test"
31
+ Provides-Extra: all
32
+ Requires-Dist: rdkit; extra == "all"
33
+
34
+ # mlxmolkit — GPU-accelerated molecular toolkit on Apple Silicon
35
+
36
+ Port of [nvMolKit](https://github.com/NVIDIA-Digital-Bio/nvMolKit) (CUDA) to Apple Metal via [MLX](https://github.com/ml-explore/mlx). Two pipelines:
37
+
38
+ 1. **Molecular Clustering** — Morgan FP → Tanimoto similarity → Butina clustering
39
+ 2. **3D Conformer Generation** — DG (4D) → ETK (3D) → MMFF94 optimization
40
+
41
+ ## Features
42
+
43
+ - **Conformer Generation** — Drop-in replacement for RDKit's ETKDG (`EmbedMolecules`). Supports ETKDG, ETKDGv2, ETKDGv3, srETKDGv3, KDG, ETDG, and pure DG.
44
+ - **MMFF94 Optimization** — GPU-accelerated force field optimization (`MMFFOptimizeMoleculesConfs`). All 7 MMFF energy terms with fused Metal kernel. Full BFGS or L-BFGS in-kernel (zero CPU round-trips).
45
+ - **Molecular Clustering** — Butina clustering at 150k+ molecules with divide-and-conquer memory management.
46
+ - **N x k Parallel** — Generate k conformers for N molecules simultaneously. Constraints shared across conformers (`conf_to_mol` indirection, 50% memory savings).
47
+
48
+ ## Performance
49
+
50
+ ### Conformer Generation (N=20 molecules, k=50 conformers = 1000 total)
51
+
52
+ | Pipeline | Time | Throughput | GPU Memory |
53
+ |----------|------|-----------|------------|
54
+ | DG only | 0.13s | 7,549 conf/s | 2.6 MB |
55
+ | DG + ETK | 0.16s | 6,228 conf/s | 2.6 MB |
56
+ | DG + ETK + MMFF | 0.52s | 1,908 conf/s | 5.1 MB |
57
+
58
+ ### Conformer Memory Scaling (DG + ETK + MMFF, batch=500)
59
+
60
+ | Conformers | Batch | GPU (BFGS) | GPU (L-BFGS) | Time | Throughput |
61
+ |-----------|-------|-----------|-------------|------|-----------|
62
+ | 1,000 | 1000 | 5.1 MB | 2.9 MB | 0.43s | 2,342/s |
63
+ | 2,000 | 500 | 2.6 MB | 1.5 MB | 1.43s | 1,402/s |
64
+ | 4,000 | 500 | 2.6 MB | 1.5 MB | 1.91s | 2,094/s |
65
+ | 10,000 | 500 | **2.6 MB** | **1.5 MB** | 4.82s | 2,075/s |
66
+
67
+ GPU memory stays constant regardless of total conformers thanks to divide-and-conquer batching.
68
+
69
+ ### Scale Test: 1000 Molecules x 10 Conformers = 10,000 Total
70
+
71
+ | Pipeline | Time | Throughput | Convergence | Batches |
72
+ |----------|------|-----------|-------------|---------|
73
+ | DG + ETK | 3.1s | **3,210 conf/s** | 99.8% | 1 |
74
+ | DG + ETK + MMFF | 6.5s | **1,536 conf/s** | 99.8% | 1 |
75
+
76
+ 1000 drug-like molecules with explicit H (6-33 atoms, mean 14.2). All 10,000 conformers in a single GPU batch. 100% valid 3D coordinates.
77
+
78
+ ### Batch Size Impact (N=20, k=50, C=1000)
79
+
80
+ | Batch | Batches | Time | conf/s |
81
+ |------:|--------:|-----:|-------:|
82
+ | 100 | 10 | 0.62s | 1,610 |
83
+ | 500 | 2 | 0.29s | 3,394 |
84
+ | 1000+ | 1 | 0.22s | 4,442 |
85
+
86
+ Larger batches = fewer kernel launches = higher throughput. Auto-sizing (default) picks the largest batch that fits in free memory.
87
+
88
+ ### GPU Memory per Conformer
89
+
90
+ | Atoms | DG (4D) | ETK (3D) | MMFF (BFGS) | MMFF (L-BFGS) |
91
+ |------:|--------:|---------:|------------:|--------------:|
92
+ | 5 | 1.9 KB | 1.4 KB | 1.3 KB | 1.4 KB |
93
+ | 12 | 4.4 KB | 3.3 KB | 6.0 KB | 3.3 KB |
94
+ | 21 | 7.6 KB | 5.7 KB | 17.2 KB | 5.7 KB |
95
+ | 30 | 10.8 KB | 8.1 KB | 34.1 KB | 8.1 KB |
96
+ | 50 | 18.0 KB | 13.5 KB | 92.0 KB | 13.5 KB |
97
+ | 64 | 23.1 KB | 17.3 KB | **149.2 KB** | 17.3 KB |
98
+
99
+ MMFF BFGS memory grows as O(n^2) due to the dense Hessian (n_atoms x 3)^2. **BFGS is faster than L-BFGS at all typical drug-like sizes** (up to 74 atoms with H) because the better curvature information requires fewer iterations. L-BFGS is only needed for very large molecules (>150 atoms) where the Hessian exceeds ~1 MB per conformer.
100
+
101
+ | Molecule | Atoms (with H) | L-BFGS | BFGS | Winner |
102
+ |----------|---------------|------|--------|--------|
103
+ | Methane | 5 | 0.255s | 0.215s | BFGS |
104
+ | Benzene | 12 | 0.213s | 0.222s | ~tie |
105
+ | Aspirin | 21 | 0.241s | 0.230s | ~tie |
106
+ | Testosterone | 49 | 0.364s | 0.335s | BFGS |
107
+ | Cholesterol | 74 | 0.590s | 0.486s | BFGS |
108
+
109
+ Recommendation: use BFGS (default) for all molecules <150 atoms with H. The pipeline auto-switches to L-BFGS at 150+ atoms (`mmff_use_lbfgs=None`, the default).
110
+
111
+ **Important:** Always add explicit hydrogens (`Chem.AddHs`) before conformer generation. Convergence is significantly better with explicit H because the distance geometry constraints are more complete and the force field terms (bond/angle/torsion) are fully defined. The pipeline calls `AddHs` automatically.
112
+
113
+ With 64 GB unified memory, a single batch can hold:
114
+
115
+ | Molecule size | DG/ETK | MMFF (BFGS) | MMFF (L-BFGS) |
116
+ |--------------|-------:|------------:|--------------:|
117
+ | 12 atoms | ~9.8M conformers | ~7.4M | ~9.8M |
118
+ | 30 atoms | ~3.9M conformers | ~1.3M | ~3.9M |
119
+ | 64 atoms | ~1.8M conformers | **~300K** | ~1.8M |
120
+
121
+ The divide-and-conquer queue automatically splits into multiple batches when total exceeds free memory.
122
+
123
+ ### Clustering (Enamine REAL subset, Apple M3 Max)
124
+
125
+ | N | Fused sim→CSR | Butina | **Total** | vs RDKit | Memory |
126
+ |---|---|---|---|---|---|
127
+ | 20k | 0.26s | 0.09s | **0.35s** | **152x** | 0.1 MB |
128
+ | 50k | 1.26s | 0.36s | **1.62s** | — | 0.5 MB |
129
+ | 100k | 4.87s | 0.97s | **5.84s** | — | 1.3 MB |
130
+ | 150k+ | blockwise | — | scales | — | bounded |
131
+
132
+ ### ETKDG Variant Comparison (N=20, k=50)
133
+
134
+ | Variant | conf/s | Convergence |
135
+ |---------|--------|-------------|
136
+ | DG | 7,549 | 96.6% |
137
+ | KDG | 7,243 | 96.6% |
138
+ | ETDG | 1,844 | 96.6% |
139
+ | ETKDG | 6,064 | 96.6% |
140
+ | ETKDGv2 | 6,228 | 96.6% |
141
+ | ETKDGv3 | 6,636 | 96.6% |
142
+ | srETKDGv3 | 6,678 | 96.6% |
143
+
144
+ ## Architecture
145
+
146
+ ### Conformer Generation (N x k parallel)
147
+
148
+ ```
149
+ SMILES x N
150
+ |
151
+ [RDKit CPU] Extract params ONCE per molecule
152
+ |
153
+ [Pack] SharedConstraintBatch (conf_to_mol indirection)
154
+ |
155
+ +-- Stage 1: DG minimize (4D, Metal TPM=32) --------+
156
+ | One threadgroup per conformer |
157
+ | L-BFGS in-kernel, GPU-parallel line search |
158
+ | Shared constraints via conf_to_mol |
159
+ +----------------------------------------------------+
160
+ |
161
+ [Extract 3D] Drop 4th coordinate
162
+ |
163
+ +-- Stage 2: ETK minimize (3D, Metal TPM=32) --------+
164
+ | CSD torsion + improper + 1-4 distance |
165
+ | Optional parallel_grad for large molecules |
166
+ +-----------------------------------------------------+
167
+ |
168
+ +-- Stage 3: MMFF94 optimize (Metal, in-kernel) ------+
169
+ | 7 energy terms: bond, angle, stretch-bend, |
170
+ | OOP, torsion, vdW, electrostatic |
171
+ | BFGS (default) or L-BFGS option |
172
+ +------------------------------------------------------+
173
+ |
174
+ Optimized 3D conformers
175
+ ```
176
+
177
+ ### Clustering (divide-and-conquer for 150k+)
178
+
179
+ ```
180
+ Morgan FP (RDKit CPU)
181
+ |
182
+ uint8 -> uint32 packing
183
+ |
184
+ +-- N <= 100k: Fused Metal Kernel ------+
185
+ | Single dispatch, no NxN matrix |
186
+ +-- N > 100k: Blockwise D&C ------------+
187
+ | Tile both dimensions (auto-sized) |
188
+ | mx.eval() between tiles (free GPU) |
189
+ +----------------------------------------+
190
+ |
191
+ Butina greedy (CPU, numpy CSR)
192
+ |
193
+ Clusters
194
+ ```
195
+
196
+ ## Adaptive Iteration Scaling
197
+
198
+ Iterations auto-scale by molecule complexity (default). Small molecules converge early via in-kernel TOLX/gradient checks — no wasted GPU compute.
199
+
200
+ Formula: `max_iters = base + scale * max(n_atoms, sqrt(n_constraints))`
201
+
202
+ | Molecule | Atoms | Constraints | DG iters | ETK iters | MMFF iters |
203
+ |----------|-------|-------------|----------|-----------|------------|
204
+ | Methane | 5 | 10 | 400 | 200 | 275 |
205
+ | Benzene | 12 | 66 | 540 | 270 | 380 |
206
+ | Aspirin | 21 | 210 | 720 | 360 | 515 |
207
+ | Testosterone | 49 | 1176 | 1280 | 640 | 935 |
208
+ | 64-atom | 64 | 2016 | 1580 | 790 | 1160 |
209
+
210
+ Override with explicit values when needed:
211
+
212
+ ```python
213
+ # Auto (default) — scales with molecule size
214
+ result = generate_conformers_nk(smiles_list, n_confs_per_mol=10)
215
+
216
+ # Fixed iterations for fine control
217
+ result = generate_conformers_nk(smiles_list, n_confs_per_mol=10,
218
+ dg_max_iters=1000, etk_max_iters=500, mmff_max_iters=400)
219
+ ```
220
+
221
+ ## Optimization Options
222
+
223
+ | Option | Flag | Effect | When to use |
224
+ |--------|------|--------|-------------|
225
+ | Auto iterations (default) | `dg_max_iters=0` | Scales with molecule size | Always (default) |
226
+ | Warm-start retry | automatic | Re-runs non-converged with 2x iters | Always (automatic) |
227
+ | ETK parallel gradient | `parallel_grad=True` | 1.18x ETK speedup | Many distance constraints |
228
+ | DG parallel gradient | `parallel_grad=True` | Parallelizes dist gradient | >500 distance constraints |
229
+ | MMFF94s variant | `mmff_variant="MMFF94s"` | Softer torsion barriers | Conjugated/aromatic molecules |
230
+ | MMFF L-BFGS | `mmff_use_lbfgs=True` | 5x less memory | Molecules >50 atoms |
231
+ | MMFF BFGS (default) | `mmff_use_lbfgs=False` | 2x faster for small mols | Molecules <50 atoms |
232
+ | ETKDG variant | `variant="ETKDGv3"` | 7 variants supported | Choose per use case |
233
+
234
+ ### MMFF94 Force Field Variants
235
+
236
+ | Variant | Flag | Torsion barriers | Best for |
237
+ |---------|------|-----------------|----------|
238
+ | MMFF94 (default) | `mmff_variant="MMFF94"` | Standard | General molecules |
239
+ | MMFF94s | `mmff_variant="MMFF94s"` | Softer for conjugated systems | Aromatic, planar, conjugated |
240
+
241
+ ## Usage
242
+
243
+ ### 3D Conformer Generation
244
+
245
+ ```python
246
+ from mlxmolkit.conformer_pipeline_v2 import generate_conformers_nk
247
+
248
+ # Basic: 10 conformers per molecule, ETKDGv2
249
+ result = generate_conformers_nk(
250
+ smiles_list=["c1ccccc1", "CC(=O)O", "CC(=O)Oc1ccccc1C(=O)O"],
251
+ n_confs_per_mol=10,
252
+ )
253
+ for mol in result.molecules:
254
+ print(f"{mol.n_atoms} atoms, {len(mol.positions_3d)} conformers")
255
+
256
+ # Full pipeline with MMFF94
257
+ result = generate_conformers_nk(
258
+ smiles_list=["c1ccccc1", "CC(=O)O"],
259
+ n_confs_per_mol=50,
260
+ variant="ETKDGv3",
261
+ run_mmff=True,
262
+ mmff_use_lbfgs=False, # BFGS (default, fast for <50 atoms)
263
+ max_confs_per_batch=500, # divide-and-conquer batch size
264
+ )
265
+
266
+ # Pure distance geometry (no torsion refinement)
267
+ result = generate_conformers_nk(
268
+ smiles_list=["c1ccccc1"],
269
+ n_confs_per_mol=100,
270
+ variant="DG",
271
+ )
272
+
273
+ # Large molecules: use L-BFGS for MMFF
274
+ result = generate_conformers_nk(
275
+ smiles_list=[large_smiles],
276
+ n_confs_per_mol=20,
277
+ run_mmff=True,
278
+ mmff_use_lbfgs=True, # L-BFGS for >50 atoms
279
+ )
280
+ ```
281
+
282
+ ### Example Script
283
+
284
+ ```bash
285
+ # Basic: 20 molecules x 10 conformers
286
+ python examples/conf3d_example.py
287
+
288
+ # Scale test: 1000 molecules x 10 conformers with MMFF
289
+ python examples/conf3d_example.py --n-mols 1000 --n-confs 10 --mmff
290
+
291
+ # All options
292
+ python examples/conf3d_example.py --n-mols 100 --n-confs 20 --variant ETKDGv3 \
293
+ --mmff --mmff-variant MMFF94s --batch-size 200
294
+
295
+ # Custom SMILES
296
+ python examples/conf3d_example.py --smiles "c1ccccc1" "CC(=O)O" --n-confs 50 --mmff
297
+ ```
298
+
299
+ ### Molecular Clustering
300
+
301
+ ```python
302
+ from mlxmolkit import butina_tanimoto_mlx
303
+ import mlx.core as mx
304
+
305
+ # Automatic: fused kernel for N<=100k, blockwise for N>100k
306
+ result = butina_tanimoto_mlx(mx.array(fp_bytes), cutoff=0.4)
307
+ print(f"{len(result.clusters)} clusters")
308
+ ```
309
+
310
+ ### Low-level API
311
+
312
+ ```python
313
+ from mlxmolkit import (
314
+ fp_uint8_to_uint32,
315
+ fused_neighbor_list_metal,
316
+ tanimoto_neighbors_blockwise,
317
+ butina_from_neighbor_list_csr,
318
+ )
319
+
320
+ fp_u32 = fp_uint8_to_uint32(mx.array(fp_bytes))
321
+
322
+ # Small N: fused single-dispatch
323
+ offsets, indices = fused_neighbor_list_metal(fp_u32, cutoff=0.4)
324
+
325
+ # Large N (150k+): divide-and-conquer blockwise
326
+ offsets, indices = tanimoto_neighbors_blockwise(fp_u32, cutoff=0.4)
327
+
328
+ result = butina_from_neighbor_list_csr(offsets, indices, N, cutoff=0.4)
329
+ ```
330
+
331
+ ## Tests
332
+
333
+ ```bash
334
+ pip install -e ".[test]"
335
+ pytest tests/ -v
336
+ ```
337
+
338
+ ## References
339
+
340
+ - [nvMolKit](https://github.com/NVIDIA-Digital-Bio/nvMolKit) — NVIDIA's CUDA implementation (Apache 2.0)
341
+ - [shivampatel10/mlxmolkit](https://github.com/shivampatel10/mlxmolkit) — TPM threadgroup kernels and MMFF Metal implementation
342
+ - [RDKit blog: Butina clustering with nvMolKit](https://greglandrum.github.io/rdkit-blog/posts/2026-02-28-nvmolkit-clustering.html)
343
+ - [MLX](https://github.com/ml-explore/mlx) — Apple's ML framework with Metal kernel support
344
+ - [MMFF94](https://doi.org/10.1002/(SICI)1096-987X(199604)17:5/6%3C490::AID-JCC1%3E3.0.CO;2-P) — Halgren, J. Comput. Chem. 1996
345
+ - [Butina, D. (1999)](https://doi.org/10.1021/ci9803381) — Unsupervised Data Base Clustering Based on Daylight's Fingerprint and Tanimoto Similarity, J. Chem. Inf. Comput. Sci. 39, 747–750
@@ -0,0 +1,312 @@
1
+ # mlxmolkit — GPU-accelerated molecular toolkit on Apple Silicon
2
+
3
+ Port of [nvMolKit](https://github.com/NVIDIA-Digital-Bio/nvMolKit) (CUDA) to Apple Metal via [MLX](https://github.com/ml-explore/mlx). Two pipelines:
4
+
5
+ 1. **Molecular Clustering** — Morgan FP → Tanimoto similarity → Butina clustering
6
+ 2. **3D Conformer Generation** — DG (4D) → ETK (3D) → MMFF94 optimization
7
+
8
+ ## Features
9
+
10
+ - **Conformer Generation** — Drop-in replacement for RDKit's ETKDG (`EmbedMolecules`). Supports ETKDG, ETKDGv2, ETKDGv3, srETKDGv3, KDG, ETDG, and pure DG.
11
+ - **MMFF94 Optimization** — GPU-accelerated force field optimization (`MMFFOptimizeMoleculesConfs`). All 7 MMFF energy terms with fused Metal kernel. Full BFGS or L-BFGS in-kernel (zero CPU round-trips).
12
+ - **Molecular Clustering** — Butina clustering at 150k+ molecules with divide-and-conquer memory management.
13
+ - **N x k Parallel** — Generate k conformers for N molecules simultaneously. Constraints shared across conformers (`conf_to_mol` indirection, 50% memory savings).
14
+
15
+ ## Performance
16
+
17
+ ### Conformer Generation (N=20 molecules, k=50 conformers = 1000 total)
18
+
19
+ | Pipeline | Time | Throughput | GPU Memory |
20
+ |----------|------|-----------|------------|
21
+ | DG only | 0.13s | 7,549 conf/s | 2.6 MB |
22
+ | DG + ETK | 0.16s | 6,228 conf/s | 2.6 MB |
23
+ | DG + ETK + MMFF | 0.52s | 1,908 conf/s | 5.1 MB |
24
+
25
+ ### Conformer Memory Scaling (DG + ETK + MMFF, batch=500)
26
+
27
+ | Conformers | Batch | GPU (BFGS) | GPU (L-BFGS) | Time | Throughput |
28
+ |-----------|-------|-----------|-------------|------|-----------|
29
+ | 1,000 | 1000 | 5.1 MB | 2.9 MB | 0.43s | 2,342/s |
30
+ | 2,000 | 500 | 2.6 MB | 1.5 MB | 1.43s | 1,402/s |
31
+ | 4,000 | 500 | 2.6 MB | 1.5 MB | 1.91s | 2,094/s |
32
+ | 10,000 | 500 | **2.6 MB** | **1.5 MB** | 4.82s | 2,075/s |
33
+
34
+ GPU memory stays constant regardless of total conformers thanks to divide-and-conquer batching.
35
+
36
+ ### Scale Test: 1000 Molecules x 10 Conformers = 10,000 Total
37
+
38
+ | Pipeline | Time | Throughput | Convergence | Batches |
39
+ |----------|------|-----------|-------------|---------|
40
+ | DG + ETK | 3.1s | **3,210 conf/s** | 99.8% | 1 |
41
+ | DG + ETK + MMFF | 6.5s | **1,536 conf/s** | 99.8% | 1 |
42
+
43
+ 1000 drug-like molecules with explicit H (6-33 atoms, mean 14.2). All 10,000 conformers in a single GPU batch. 100% valid 3D coordinates.
44
+
45
+ ### Batch Size Impact (N=20, k=50, C=1000)
46
+
47
+ | Batch | Batches | Time | conf/s |
48
+ |------:|--------:|-----:|-------:|
49
+ | 100 | 10 | 0.62s | 1,610 |
50
+ | 500 | 2 | 0.29s | 3,394 |
51
+ | 1000+ | 1 | 0.22s | 4,442 |
52
+
53
+ Larger batches = fewer kernel launches = higher throughput. Auto-sizing (default) picks the largest batch that fits in free memory.
54
+
55
+ ### GPU Memory per Conformer
56
+
57
+ | Atoms | DG (4D) | ETK (3D) | MMFF (BFGS) | MMFF (L-BFGS) |
58
+ |------:|--------:|---------:|------------:|--------------:|
59
+ | 5 | 1.9 KB | 1.4 KB | 1.3 KB | 1.4 KB |
60
+ | 12 | 4.4 KB | 3.3 KB | 6.0 KB | 3.3 KB |
61
+ | 21 | 7.6 KB | 5.7 KB | 17.2 KB | 5.7 KB |
62
+ | 30 | 10.8 KB | 8.1 KB | 34.1 KB | 8.1 KB |
63
+ | 50 | 18.0 KB | 13.5 KB | 92.0 KB | 13.5 KB |
64
+ | 64 | 23.1 KB | 17.3 KB | **149.2 KB** | 17.3 KB |
65
+
66
+ MMFF BFGS memory grows as O(n^2) due to the dense Hessian (n_atoms x 3)^2. **BFGS is faster than L-BFGS at all typical drug-like sizes** (up to 74 atoms with H) because the better curvature information requires fewer iterations. L-BFGS is only needed for very large molecules (>150 atoms) where the Hessian exceeds ~1 MB per conformer.
67
+
68
+ | Molecule | Atoms (with H) | L-BFGS | BFGS | Winner |
69
+ |----------|---------------|------|--------|--------|
70
+ | Methane | 5 | 0.255s | 0.215s | BFGS |
71
+ | Benzene | 12 | 0.213s | 0.222s | ~tie |
72
+ | Aspirin | 21 | 0.241s | 0.230s | ~tie |
73
+ | Testosterone | 49 | 0.364s | 0.335s | BFGS |
74
+ | Cholesterol | 74 | 0.590s | 0.486s | BFGS |
75
+
76
+ Recommendation: use BFGS (default) for all molecules <150 atoms with H. The pipeline auto-switches to L-BFGS at 150+ atoms (`mmff_use_lbfgs=None`, the default).
77
+
78
+ **Important:** Always add explicit hydrogens (`Chem.AddHs`) before conformer generation. Convergence is significantly better with explicit H because the distance geometry constraints are more complete and the force field terms (bond/angle/torsion) are fully defined. The pipeline calls `AddHs` automatically.
79
+
80
+ With 64 GB unified memory, a single batch can hold:
81
+
82
+ | Molecule size | DG/ETK | MMFF (BFGS) | MMFF (L-BFGS) |
83
+ |--------------|-------:|------------:|--------------:|
84
+ | 12 atoms | ~9.8M conformers | ~7.4M | ~9.8M |
85
+ | 30 atoms | ~3.9M conformers | ~1.3M | ~3.9M |
86
+ | 64 atoms | ~1.8M conformers | **~300K** | ~1.8M |
87
+
88
+ The divide-and-conquer queue automatically splits into multiple batches when total exceeds free memory.
89
+
90
+ ### Clustering (Enamine REAL subset, Apple M3 Max)
91
+
92
+ | N | Fused sim→CSR | Butina | **Total** | vs RDKit | Memory |
93
+ |---|---|---|---|---|---|
94
+ | 20k | 0.26s | 0.09s | **0.35s** | **152x** | 0.1 MB |
95
+ | 50k | 1.26s | 0.36s | **1.62s** | — | 0.5 MB |
96
+ | 100k | 4.87s | 0.97s | **5.84s** | — | 1.3 MB |
97
+ | 150k+ | blockwise | — | scales | — | bounded |
98
+
99
+ ### ETKDG Variant Comparison (N=20, k=50)
100
+
101
+ | Variant | conf/s | Convergence |
102
+ |---------|--------|-------------|
103
+ | DG | 7,549 | 96.6% |
104
+ | KDG | 7,243 | 96.6% |
105
+ | ETDG | 1,844 | 96.6% |
106
+ | ETKDG | 6,064 | 96.6% |
107
+ | ETKDGv2 | 6,228 | 96.6% |
108
+ | ETKDGv3 | 6,636 | 96.6% |
109
+ | srETKDGv3 | 6,678 | 96.6% |
110
+
111
+ ## Architecture
112
+
113
+ ### Conformer Generation (N x k parallel)
114
+
115
+ ```
116
+ SMILES x N
117
+ |
118
+ [RDKit CPU] Extract params ONCE per molecule
119
+ |
120
+ [Pack] SharedConstraintBatch (conf_to_mol indirection)
121
+ |
122
+ +-- Stage 1: DG minimize (4D, Metal TPM=32) --------+
123
+ | One threadgroup per conformer |
124
+ | L-BFGS in-kernel, GPU-parallel line search |
125
+ | Shared constraints via conf_to_mol |
126
+ +----------------------------------------------------+
127
+ |
128
+ [Extract 3D] Drop 4th coordinate
129
+ |
130
+ +-- Stage 2: ETK minimize (3D, Metal TPM=32) --------+
131
+ | CSD torsion + improper + 1-4 distance |
132
+ | Optional parallel_grad for large molecules |
133
+ +-----------------------------------------------------+
134
+ |
135
+ +-- Stage 3: MMFF94 optimize (Metal, in-kernel) ------+
136
+ | 7 energy terms: bond, angle, stretch-bend, |
137
+ | OOP, torsion, vdW, electrostatic |
138
+ | BFGS (default) or L-BFGS option |
139
+ +------------------------------------------------------+
140
+ |
141
+ Optimized 3D conformers
142
+ ```
143
+
144
+ ### Clustering (divide-and-conquer for 150k+)
145
+
146
+ ```
147
+ Morgan FP (RDKit CPU)
148
+ |
149
+ uint8 -> uint32 packing
150
+ |
151
+ +-- N <= 100k: Fused Metal Kernel ------+
152
+ | Single dispatch, no NxN matrix |
153
+ +-- N > 100k: Blockwise D&C ------------+
154
+ | Tile both dimensions (auto-sized) |
155
+ | mx.eval() between tiles (free GPU) |
156
+ +----------------------------------------+
157
+ |
158
+ Butina greedy (CPU, numpy CSR)
159
+ |
160
+ Clusters
161
+ ```
162
+
163
+ ## Adaptive Iteration Scaling
164
+
165
+ Iterations auto-scale by molecule complexity (default). Small molecules converge early via in-kernel TOLX/gradient checks — no wasted GPU compute.
166
+
167
+ Formula: `max_iters = base + scale * max(n_atoms, sqrt(n_constraints))`
168
+
169
+ | Molecule | Atoms | Constraints | DG iters | ETK iters | MMFF iters |
170
+ |----------|-------|-------------|----------|-----------|------------|
171
+ | Methane | 5 | 10 | 400 | 200 | 275 |
172
+ | Benzene | 12 | 66 | 540 | 270 | 380 |
173
+ | Aspirin | 21 | 210 | 720 | 360 | 515 |
174
+ | Testosterone | 49 | 1176 | 1280 | 640 | 935 |
175
+ | 64-atom | 64 | 2016 | 1580 | 790 | 1160 |
176
+
177
+ Override with explicit values when needed:
178
+
179
+ ```python
180
+ # Auto (default) — scales with molecule size
181
+ result = generate_conformers_nk(smiles_list, n_confs_per_mol=10)
182
+
183
+ # Fixed iterations for fine control
184
+ result = generate_conformers_nk(smiles_list, n_confs_per_mol=10,
185
+ dg_max_iters=1000, etk_max_iters=500, mmff_max_iters=400)
186
+ ```
187
+
188
+ ## Optimization Options
189
+
190
+ | Option | Flag | Effect | When to use |
191
+ |--------|------|--------|-------------|
192
+ | Auto iterations (default) | `dg_max_iters=0` | Scales with molecule size | Always (default) |
193
+ | Warm-start retry | automatic | Re-runs non-converged with 2x iters | Always (automatic) |
194
+ | ETK parallel gradient | `parallel_grad=True` | 1.18x ETK speedup | Many distance constraints |
195
+ | DG parallel gradient | `parallel_grad=True` | Parallelizes dist gradient | >500 distance constraints |
196
+ | MMFF94s variant | `mmff_variant="MMFF94s"` | Softer torsion barriers | Conjugated/aromatic molecules |
197
+ | MMFF L-BFGS | `mmff_use_lbfgs=True` | 5x less memory | Molecules >50 atoms |
198
+ | MMFF BFGS (default) | `mmff_use_lbfgs=False` | 2x faster for small mols | Molecules <50 atoms |
199
+ | ETKDG variant | `variant="ETKDGv3"` | 7 variants supported | Choose per use case |
200
+
201
+ ### MMFF94 Force Field Variants
202
+
203
+ | Variant | Flag | Torsion barriers | Best for |
204
+ |---------|------|-----------------|----------|
205
+ | MMFF94 (default) | `mmff_variant="MMFF94"` | Standard | General molecules |
206
+ | MMFF94s | `mmff_variant="MMFF94s"` | Softer for conjugated systems | Aromatic, planar, conjugated |
207
+
208
+ ## Usage
209
+
210
+ ### 3D Conformer Generation
211
+
212
+ ```python
213
+ from mlxmolkit.conformer_pipeline_v2 import generate_conformers_nk
214
+
215
+ # Basic: 10 conformers per molecule, ETKDGv2
216
+ result = generate_conformers_nk(
217
+ smiles_list=["c1ccccc1", "CC(=O)O", "CC(=O)Oc1ccccc1C(=O)O"],
218
+ n_confs_per_mol=10,
219
+ )
220
+ for mol in result.molecules:
221
+ print(f"{mol.n_atoms} atoms, {len(mol.positions_3d)} conformers")
222
+
223
+ # Full pipeline with MMFF94
224
+ result = generate_conformers_nk(
225
+ smiles_list=["c1ccccc1", "CC(=O)O"],
226
+ n_confs_per_mol=50,
227
+ variant="ETKDGv3",
228
+ run_mmff=True,
229
+ mmff_use_lbfgs=False, # BFGS (default, fast for <50 atoms)
230
+ max_confs_per_batch=500, # divide-and-conquer batch size
231
+ )
232
+
233
+ # Pure distance geometry (no torsion refinement)
234
+ result = generate_conformers_nk(
235
+ smiles_list=["c1ccccc1"],
236
+ n_confs_per_mol=100,
237
+ variant="DG",
238
+ )
239
+
240
+ # Large molecules: use L-BFGS for MMFF
241
+ result = generate_conformers_nk(
242
+ smiles_list=[large_smiles],
243
+ n_confs_per_mol=20,
244
+ run_mmff=True,
245
+ mmff_use_lbfgs=True, # L-BFGS for >50 atoms
246
+ )
247
+ ```
248
+
249
+ ### Example Script
250
+
251
+ ```bash
252
+ # Basic: 20 molecules x 10 conformers
253
+ python examples/conf3d_example.py
254
+
255
+ # Scale test: 1000 molecules x 10 conformers with MMFF
256
+ python examples/conf3d_example.py --n-mols 1000 --n-confs 10 --mmff
257
+
258
+ # All options
259
+ python examples/conf3d_example.py --n-mols 100 --n-confs 20 --variant ETKDGv3 \
260
+ --mmff --mmff-variant MMFF94s --batch-size 200
261
+
262
+ # Custom SMILES
263
+ python examples/conf3d_example.py --smiles "c1ccccc1" "CC(=O)O" --n-confs 50 --mmff
264
+ ```
265
+
266
+ ### Molecular Clustering
267
+
268
+ ```python
269
+ from mlxmolkit import butina_tanimoto_mlx
270
+ import mlx.core as mx
271
+
272
+ # Automatic: fused kernel for N<=100k, blockwise for N>100k
273
+ result = butina_tanimoto_mlx(mx.array(fp_bytes), cutoff=0.4)
274
+ print(f"{len(result.clusters)} clusters")
275
+ ```
276
+
277
+ ### Low-level API
278
+
279
+ ```python
280
+ from mlxmolkit import (
281
+ fp_uint8_to_uint32,
282
+ fused_neighbor_list_metal,
283
+ tanimoto_neighbors_blockwise,
284
+ butina_from_neighbor_list_csr,
285
+ )
286
+
287
+ fp_u32 = fp_uint8_to_uint32(mx.array(fp_bytes))
288
+
289
+ # Small N: fused single-dispatch
290
+ offsets, indices = fused_neighbor_list_metal(fp_u32, cutoff=0.4)
291
+
292
+ # Large N (150k+): divide-and-conquer blockwise
293
+ offsets, indices = tanimoto_neighbors_blockwise(fp_u32, cutoff=0.4)
294
+
295
+ result = butina_from_neighbor_list_csr(offsets, indices, N, cutoff=0.4)
296
+ ```
297
+
298
+ ## Tests
299
+
300
+ ```bash
301
+ pip install -e ".[test]"
302
+ pytest tests/ -v
303
+ ```
304
+
305
+ ## References
306
+
307
+ - [nvMolKit](https://github.com/NVIDIA-Digital-Bio/nvMolKit) — NVIDIA's CUDA implementation (Apache 2.0)
308
+ - [shivampatel10/mlxmolkit](https://github.com/shivampatel10/mlxmolkit) — TPM threadgroup kernels and MMFF Metal implementation
309
+ - [RDKit blog: Butina clustering with nvMolKit](https://greglandrum.github.io/rdkit-blog/posts/2026-02-28-nvmolkit-clustering.html)
310
+ - [MLX](https://github.com/ml-explore/mlx) — Apple's ML framework with Metal kernel support
311
+ - [MMFF94](https://doi.org/10.1002/(SICI)1096-987X(199604)17:5/6%3C490::AID-JCC1%3E3.0.CO;2-P) — Halgren, J. Comput. Chem. 1996
312
+ - [Butina, D. (1999)](https://doi.org/10.1021/ci9803381) — Unsupervised Data Base Clustering Based on Daylight's Fingerprint and Tanimoto Similarity, J. Chem. Inf. Comput. Sci. 39, 747–750
@@ -0,0 +1,47 @@
1
+ """
2
+ mlxmolkit — GPU-accelerated molecular toolkit on Apple Silicon.
3
+
4
+ Two pipelines:
5
+ 1. Conformer generation: DG (4D) → ETK (3D) → MMFF94 optimization
6
+ 2. Molecular clustering: Morgan FP → Tanimoto → Butina
7
+ """
8
+
9
+ __version__ = "0.3.0"
10
+
11
+ # --- Clustering ---
12
+ from mlxmolkit.tanimoto_metal_u32 import tanimoto_matrix_metal_u32
13
+ from mlxmolkit.fused_tanimoto_nlist import fused_neighbor_list_metal
14
+ from mlxmolkit.tanimoto_blockwise import tanimoto_neighbors_blockwise
15
+ from mlxmolkit.fp_uint32 import fp_uint8_to_uint32
16
+ from mlxmolkit.butina import (
17
+ ButinaResult,
18
+ butina_from_neighbor_list_csr,
19
+ butina_from_similarity_matrix,
20
+ butina_tanimoto_mlx,
21
+ )
22
+ from mlxmolkit.morgan_cpu import morgan_fp_bytes_from_mols, morgan_fp_bytes_from_smiles
23
+
24
+ # --- Conformer generation ---
25
+ from mlxmolkit.conformer_pipeline_v2 import (
26
+ generate_conformers_nk,
27
+ ConformerResult,
28
+ PipelineResult,
29
+ )
30
+
31
+ __all__ = [
32
+ # Conformer generation
33
+ "generate_conformers_nk",
34
+ "ConformerResult",
35
+ "PipelineResult",
36
+ # Clustering
37
+ "tanimoto_matrix_metal_u32",
38
+ "fused_neighbor_list_metal",
39
+ "tanimoto_neighbors_blockwise",
40
+ "fp_uint8_to_uint32",
41
+ "butina_from_neighbor_list_csr",
42
+ "butina_from_similarity_matrix",
43
+ "butina_tanimoto_mlx",
44
+ "ButinaResult",
45
+ "morgan_fp_bytes_from_mols",
46
+ "morgan_fp_bytes_from_smiles",
47
+ ]