molscope 0.6.2__tar.gz → 0.8.0__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {molscope-0.6.2 → molscope-0.8.0}/PKG-INFO +127 -22
  2. {molscope-0.6.2 → molscope-0.8.0}/README.md +111 -21
  3. {molscope-0.6.2 → molscope-0.8.0}/molscope/__init__.py +24 -6
  4. molscope-0.8.0/molscope/chem.py +204 -0
  5. molscope-0.8.0/molscope/cif.py +154 -0
  6. {molscope-0.6.2 → molscope-0.8.0}/molscope/contactmap.py +5 -3
  7. {molscope-0.6.2 → molscope-0.8.0}/molscope/descriptors.py +202 -11
  8. molscope-0.8.0/molscope/dssp.py +232 -0
  9. molscope-0.8.0/molscope/graph.py +305 -0
  10. molscope-0.8.0/molscope/io.py +690 -0
  11. {molscope-0.6.2 → molscope-0.8.0}/molscope/molecule.py +234 -19
  12. {molscope-0.6.2 → molscope-0.8.0}/molscope/plotting.py +7 -2
  13. {molscope-0.6.2 → molscope-0.8.0}/molscope.egg-info/PKG-INFO +127 -22
  14. {molscope-0.6.2 → molscope-0.8.0}/molscope.egg-info/SOURCES.txt +7 -0
  15. molscope-0.8.0/molscope.egg-info/requires.txt +34 -0
  16. {molscope-0.6.2 → molscope-0.8.0}/pyproject.toml +14 -2
  17. molscope-0.8.0/tests/test_chem.py +51 -0
  18. molscope-0.8.0/tests/test_cif_validation.py +69 -0
  19. molscope-0.8.0/tests/test_descriptors.py +159 -0
  20. molscope-0.8.0/tests/test_dssp.py +67 -0
  21. molscope-0.8.0/tests/test_extras.py +28 -0
  22. {molscope-0.6.2 → molscope-0.8.0}/tests/test_features.py +73 -0
  23. molscope-0.8.0/tests/test_graph.py +193 -0
  24. {molscope-0.6.2 → molscope-0.8.0}/tests/test_io.py +65 -1
  25. {molscope-0.6.2 → molscope-0.8.0}/tests/test_molecule.py +24 -0
  26. molscope-0.6.2/molscope/graph.py +0 -151
  27. molscope-0.6.2/molscope/io.py +0 -342
  28. molscope-0.6.2/molscope.egg-info/requires.txt +0 -14
  29. molscope-0.6.2/tests/test_descriptors.py +0 -73
  30. molscope-0.6.2/tests/test_graph.py +0 -99
  31. {molscope-0.6.2 → molscope-0.8.0}/LICENSE +0 -0
  32. {molscope-0.6.2 → molscope-0.8.0}/molscope/__main__.py +0 -0
  33. {molscope-0.6.2 → molscope-0.8.0}/molscope/cli.py +0 -0
  34. {molscope-0.6.2 → molscope-0.8.0}/molscope/coarsegrain.py +0 -0
  35. {molscope-0.6.2 → molscope-0.8.0}/molscope/elements.py +0 -0
  36. {molscope-0.6.2 → molscope-0.8.0}/molscope/ensemble.py +0 -0
  37. {molscope-0.6.2 → molscope-0.8.0}/molscope.egg-info/dependency_links.txt +0 -0
  38. {molscope-0.6.2 → molscope-0.8.0}/molscope.egg-info/entry_points.txt +0 -0
  39. {molscope-0.6.2 → molscope-0.8.0}/molscope.egg-info/top_level.txt +0 -0
  40. {molscope-0.6.2 → molscope-0.8.0}/setup.cfg +0 -0
  41. {molscope-0.6.2 → molscope-0.8.0}/tests/test_clustering.py +0 -0
  42. {molscope-0.6.2 → molscope-0.8.0}/tests/test_coarsegrain.py +0 -0
  43. {molscope-0.6.2 → molscope-0.8.0}/tests/test_contactmap.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: molscope
3
- Version: 0.6.2
3
+ Version: 0.8.0
4
4
  Summary: Lightweight molecular structure analysis, visualisation, graph export, and coarse-graining in Python.
5
5
  Author-email: Roshan Shrestha <roshanpra@gmail.com>
6
6
  License-Expression: MIT
@@ -23,6 +23,21 @@ Provides-Extra: viz
23
23
  Requires-Dist: py3Dmol>=2.0; extra == "viz"
24
24
  Provides-Extra: graph
25
25
  Requires-Dist: networkx>=2.6; extra == "graph"
26
+ Provides-Extra: chem
27
+ Requires-Dist: rdkit>=2023.9; extra == "chem"
28
+ Provides-Extra: cif
29
+ Requires-Dist: gemmi>=0.7; extra == "cif"
30
+ Provides-Extra: pyg
31
+ Requires-Dist: torch>=2.0; extra == "pyg"
32
+ Requires-Dist: torch-geometric>=2.3; extra == "pyg"
33
+ Provides-Extra: dgl
34
+ Requires-Dist: torch>=2.0; extra == "dgl"
35
+ Requires-Dist: dgl>=1.1; extra == "dgl"
36
+ Provides-Extra: gnn
37
+ Requires-Dist: networkx>=2.6; extra == "gnn"
38
+ Requires-Dist: torch>=2.0; extra == "gnn"
39
+ Requires-Dist: torch-geometric>=2.3; extra == "gnn"
40
+ Requires-Dist: dgl>=1.1; extra == "gnn"
26
41
  Dynamic: license-file
27
42
 
28
43
  # MolScope
@@ -35,29 +50,65 @@ Dynamic: license-file
35
50
  Lightweight molecular structure analysis, visualisation, graph export, and
36
51
  coarse-graining in Python. Read `.xyz`, `.pdb`, `.cif` and `.sdf` files
37
52
  (optionally gzip-compressed), select and analyse atoms, and visualise them in
38
- 3D. The `.cif` reader is a basic mmCIF parser for standard `_atom_site`
39
- coordinate loops, not a full mmCIF syntax implementation.
53
+ 3D. The `.cif` reader handles standard `_atom_site` coordinate loops, including
54
+ quoted values; optional Gemmi-backed validation is available through
55
+ `pip install "molscope[cif]"`.
40
56
 
41
- | 3D structure rendering | Residue contact map | Coarse-grained beads |
42
- | --- | --- | --- |
43
- | ![Aquaporin-1 rendered as a 3D element-coloured molecular structure](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/aquaporin-structure-v2.png) | ![Residue-level contact map heatmap for Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/residue-contact-map.png) | ![Coarse-grained bead model of Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/coarse-grained-beads-v2.png) |
57
+ | 3D structure (element) | Secondary structure (DSSP) | Residue contact map | Coarse-grained beads |
58
+ | --- | --- | --- | --- |
59
+ | ![Aquaporin-1 rendered as a 3D element-coloured molecular structure](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/aquaporin-structure-v2.png) | ![Aquaporin-1 coloured by DSSP secondary structure: helices red, turns cyan, coil grey](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/secondary-structure.png) | ![Residue-level contact map heatmap for Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/residue-contact-map.png) | ![Coarse-grained bead model of Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/coarse-grained-beads-v2.png) |
44
60
 
45
61
  ## What it does
46
62
 
47
- - **Read and write** XYZ, PDB, mmCIF and SDF (gzip-aware), fetch structures by
48
- id from RCSB, and load multi-model NMR ensembles.
63
+ - **Read and write** XYZ, PDB, mmCIF and SDF (gzip-aware), preserve SDF/PDB
64
+ explicit bonds and SDF formal charges where present, fetch structures by id
65
+ from RCSB, and load multi-model NMR ensembles.
66
+ - **Validate mmCIF** syntax, atom-site coordinate columns, and supplied
67
+ dictionary files with optional Gemmi support.
49
68
  - **Select and measure** by chain, element or residue; compute distances,
50
69
  angles, dihedrals and Kabsch-aligned RMSD.
51
- - **Analyse** centroids, radius of gyration, the inertia tensor, inferred bonds
52
- and contacts.
70
+ - **Analyse** centroids, radius of gyration, the inertia tensor,
71
+ explicit/inferred bonds, and contacts.
53
72
  - **Contact maps** at atom or residue level, with heatmap plots.
73
+ - **Secondary structure** via a self-contained, pure-NumPy DSSP, with
74
+ `plot(color_by="ss")`.
54
75
  - **Ensembles**: pairwise RMSD, RMSF, averaging, and conformer clustering.
55
76
  - **Export for ML**: flat structural descriptors and molecular graphs for
56
77
  NetworkX, PyTorch Geometric and DGL.
78
+ - **Chemical perception and descriptors**: optional RDKit-backed formal charge,
79
+ valence, aromaticity and scalar descriptor features with
80
+ `pip install "molscope[chem]"`.
57
81
  - **Coarse-grain** onto residue, Martini-style or custom bead mappings.
58
82
  - **Visualise** with 3D matplotlib plots, an interactive py3Dmol viewer, spin
59
83
  GIFs, and a command-line interface.
60
84
 
85
+ ## Why MolScope?
86
+
87
+ MolScope is **not** intended to replace full molecular-simulation or
88
+ cheminformatics frameworks. It is a lightweight **educational and prototyping**
89
+ toolkit for reading common molecular structure files, performing simple
90
+ structural analysis, exporting graph representations for ML workflows, and
91
+ experimenting with coarse-grained mappings. Its core depends only on NumPy and
92
+ Matplotlib, and the API is Python-first and scriptable.
93
+
94
+ In particular, the coarse-graining tools are for **educational CG mapping and
95
+ bead-graph prototyping**: useful for exploring mappings before moving to a
96
+ production Martini workflow. They are not a validated Martini force-field
97
+ generator.
98
+
99
+ | Tool | Main focus | How MolScope differs |
100
+ | --- | --- | --- |
101
+ | RDKit | Cheminformatics | MolScope leans toward structure visualisation, protein/PDB-style metadata, and CG prototyping |
102
+ | MDAnalysis | MD trajectories | MolScope is lighter and easier for static structures and teaching |
103
+ | MDTraj | Trajectory analysis | MolScope is simpler and graph/CG oriented |
104
+ | Biopython | Structure parsing / bioinformatics | MolScope adds 3D analysis, ML-graph export, and coarse-graining |
105
+ | PyMOL / VMD | Interactive visualisation | MolScope is Python-first, scriptable, and ML-export friendly |
106
+ | nglview | Notebook structure viewer | MolScope also does analysis, descriptors, graphs and CG, not just viewing |
107
+
108
+ Reach for those tools when you need their depth and validation. Reach for
109
+ MolScope when you want something small, readable, and quick to teach or
110
+ prototype with.
111
+
61
112
  ## Install
62
113
 
63
114
  With [uv](https://docs.astral.sh/uv/) (recommended):
@@ -144,6 +195,7 @@ mol.radius_of_gyration # compactness (angstrom)
144
195
  mol.dimensions, mol.formula # bounding box, Hill-order formula
145
196
  mol.bonds() # inferred bond index pairs (KD-tree if scipy)
146
197
  mol.contacts(cutoff=5.0) # atom pairs within a distance
198
+ mol.contact_count(cutoff=5.0) # count pairs without returning them
147
199
 
148
200
  mol.distance(i, j) # bond length
149
201
  mol.angle(i, j, k) # bond angle (degrees)
@@ -156,12 +208,14 @@ a.alpha_carbons().rmsd(b.alpha_carbons(), align=True) # CA-RMSD after Kabsch f
156
208
 
157
209
  ```python
158
210
  features = mol.descriptors() # flat dict of scalar/vector descriptors
211
+ features = mol.descriptors(preset="native-3d")
159
212
  features["radius_of_gyration"]
160
213
  features["principal_moments"] # 3 values
161
214
  features["distance_histogram"] # fixed-size histogram
162
215
 
163
216
  X, names = ms.featurize_many(
164
217
  ["a.pdb", "b.pdb", "c.xyz"],
218
+ preset="native-basic",
165
219
  return_names=True,
166
220
  ) # numeric matrix + column names
167
221
  ```
@@ -170,7 +224,17 @@ Descriptors include atom/residue counts, element counts, molecular mass,
170
224
  centres, radius of gyration, bounding-box dimensions, inertia tensor, principal
171
225
  moments/axes, shape anisotropy, compactness, distance histograms, bond-length
172
226
  summary statistics, and atom/residue contact summaries. Full contact maps remain
173
- available through `mol.contact_map(...)`.
227
+ available through `mol.contact_map(...)`. With `pip install "molscope[chem]"`,
228
+ you can also request RDKit descriptors directly:
229
+
230
+ ```python
231
+ mol.rdkit_descriptors(names=["MolWt", "TPSA"])
232
+ mol.descriptors(include_rdkit=True, rdkit_descriptor_names=["MolWt", "TPSA"])
233
+ ```
234
+
235
+ For reproducible ML columns, use descriptor presets: `native-basic`,
236
+ `native-3d`, or `rdkit-basic`. Inspect the flattened column order with
237
+ `ms.descriptor_feature_names(...)`.
174
238
 
175
239
  ### Contact maps
176
240
 
@@ -184,6 +248,29 @@ mol.contact_map(level="residue", method="min") # closest inter-residue at
184
248
  mol.contact_map(level="residue", method="com") # residue centre of mass
185
249
  ```
186
250
 
251
+ ### Secondary structure (DSSP)
252
+
253
+ Assign protein secondary structure from backbone hydrogen-bond patterns with a
254
+ self-contained, pure-NumPy DSSP (no external `mkdssp` binary needed):
255
+
256
+ ```python
257
+ mol = ms.read("1fqy.pdb")
258
+ ss = mol.secondary_structure() # SecondaryStructure, one code per residue
259
+
260
+ ss.string # e.g. '--HHHHHHHH--SS--EEEE--'
261
+ ss.codes # per-residue array
262
+ ss.summary() # helix/strand/coil counts and fractions
263
+
264
+ mol.plot(color_by="ss") # colour the 3D view by secondary structure
265
+ ```
266
+
267
+ Codes follow DSSP: `H`/`G`/`I` helices, `E`/`B` strands, `T` turn, `S` bend,
268
+ `-` coil. This is a simplified **educational** implementation: it reproduces the
269
+ main classes from the Kabsch-Sander hydrogen-bond model but is not bit-identical
270
+ to the reference `mkdssp` on every edge case. It needs backbone N/CA/C/O atoms,
271
+ so use PDB/mmCIF input (not a bare `.xyz`). The secondary-structure render in the
272
+ showcase above (helices red, turns cyan, coil grey) is produced this way.
273
+
187
274
  ### NMR ensembles
188
275
 
189
276
  ```python
@@ -232,9 +319,9 @@ spin_gif(mol, "spin.gif") # rotating animation
232
319
 
233
320
  ### Molecular graphs (for machine learning)
234
321
 
235
- Turn 3D coordinates plus inferred bonds into a graph, then export to the common
236
- ML frameworks. The base `to_graph()` needs no extra dependencies; each exporter
237
- imports its backend lazily.
322
+ Turn 3D coordinates plus explicit or inferred bonds into a graph, then export
323
+ to the common ML frameworks. The base `to_graph()` needs no extra dependencies;
324
+ each exporter imports its backend lazily.
238
325
 
239
326
  ```python
240
327
  mol = ms.read("1fqy.pdb")
@@ -243,18 +330,28 @@ g = mol.to_graph() # MolecularGraph: nodes + edges, no deps
243
330
  g.n_atoms, g.n_bonds # counts
244
331
  g.atomic_numbers, g.masses # per-node arrays
245
332
  g.node_features() # (N, 2) default features [atomic_number, mass]
333
+ g.node_features("ml") # stable ML node preset
334
+ g.edge_features("ml") # stable ML edge preset
246
335
 
247
336
  G = mol.to_networkx() # networkx.Graph with node/edge attributes
248
337
  data = mol.to_pyg_data() # torch_geometric.data.Data (x, pos, edge_index, edge_attr, z)
249
338
  dglg = mol.to_dgl_graph() # dgl.DGLGraph with ndata/edata tensors
250
339
  ```
251
340
 
252
- Nodes carry element, atomic number, mass, coordinates and (from PDB/mmCIF) atom
253
- name, residue and chain. Edges carry the bonded pair, interatomic distance, and
254
- bond order (`1.0` for geometrically inferred bonds). Install backends as needed:
255
- `pip install "molscope[graph]"` installs only NetworkX. PyTorch Geometric and
256
- DGL are optional manual installs: `pip install torch torch_geometric` or
257
- `pip install dgl` after choosing the right PyTorch build for your platform.
341
+ Nodes carry element, atomic number, mass, coordinates, formal charge, and (from
342
+ PDB/mmCIF) atom name, residue and chain. Edges carry the bonded pair,
343
+ interatomic distance, and bond order from SDF where available (`1.0` for
344
+ PDB/CONECT or geometrically inferred bonds). Install backends as needed:
345
+ `pip install "molscope[graph]"` for NetworkX, `"molscope[pyg]"` for PyTorch
346
+ Geometric, `"molscope[dgl]"` for DGL, or `"molscope[gnn]"` for all graph
347
+ backends. For custom CUDA, ROCm, Apple Silicon, or cluster builds, install the
348
+ matching PyTorch stack first.
349
+
350
+ Graph feature presets are also available through
351
+ `mol.to_pyg_data(node_preset="ml", edge_preset="ml")` and
352
+ `mol.to_dgl_graph(node_preset="ml", edge_preset="ml")`. Use
353
+ `mol.to_graph(include_chemical_features=True)` to attach optional RDKit-backed
354
+ aromatic atom and bond flags.
258
355
 
259
356
  ### Coarse-graining
260
357
 
@@ -314,12 +411,20 @@ python -m molscope 1fqy.pdb # equivalent if not pip-installed
314
411
  - PDB files are parsed by **fixed columns**, not whitespace splitting, so atoms
315
412
  with touching coordinate fields (large or negative values) read correctly.
316
413
  - Alternate conformations (altLoc) other than the primary one are skipped.
414
+ Use `read_pdb(..., altloc="first"|"highest_occupancy"|"all")` to select a
415
+ different policy.
317
416
  - `read_pdb` returns a single model (`model=1` by default); use `read_pdb_models`
318
417
  for the whole ensemble.
418
+ - SDF/MOL V2000 bond blocks, formal charges, and PDB `CONECT` records are
419
+ preserved. PDB output writes explicit bonds back as `CONECT` records.
319
420
  - Bond inference uses a `scipy.spatial.cKDTree` when available; without scipy it
320
421
  falls back to a dense `O(n^2)` search that is refused above ~8000 atoms.
321
- - Optional extras: `pip install "molscope[fast]"` (scipy, faster bonds/contacts)
322
- and `"molscope[viz]"` (py3Dmol, for `Molecule.view`).
422
+ - Optional extras: `pip install "molscope[fast]"` (scipy, faster bonds/contacts),
423
+ `"molscope[viz]"` (py3Dmol, for `Molecule.view`), `"molscope[graph]"`
424
+ (NetworkX), `"molscope[chem]"` (RDKit), `"molscope[cif]"` (Gemmi),
425
+ `"molscope[pyg]"`, `"molscope[dgl]"`, or `"molscope[gnn]"`. For custom CUDA,
426
+ ROCm, Apple Silicon, or cluster builds, install the matching PyTorch stack
427
+ first.
323
428
 
324
429
  ## Tests and linting
325
430
 
@@ -8,29 +8,65 @@
8
8
  Lightweight molecular structure analysis, visualisation, graph export, and
9
9
  coarse-graining in Python. Read `.xyz`, `.pdb`, `.cif` and `.sdf` files
10
10
  (optionally gzip-compressed), select and analyse atoms, and visualise them in
11
- 3D. The `.cif` reader is a basic mmCIF parser for standard `_atom_site`
12
- coordinate loops, not a full mmCIF syntax implementation.
11
+ 3D. The `.cif` reader handles standard `_atom_site` coordinate loops, including
12
+ quoted values; optional Gemmi-backed validation is available through
13
+ `pip install "molscope[cif]"`.
13
14
 
14
- | 3D structure rendering | Residue contact map | Coarse-grained beads |
15
- | --- | --- | --- |
16
- | ![Aquaporin-1 rendered as a 3D element-coloured molecular structure](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/aquaporin-structure-v2.png) | ![Residue-level contact map heatmap for Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/residue-contact-map.png) | ![Coarse-grained bead model of Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/coarse-grained-beads-v2.png) |
15
+ | 3D structure (element) | Secondary structure (DSSP) | Residue contact map | Coarse-grained beads |
16
+ | --- | --- | --- | --- |
17
+ | ![Aquaporin-1 rendered as a 3D element-coloured molecular structure](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/aquaporin-structure-v2.png) | ![Aquaporin-1 coloured by DSSP secondary structure: helices red, turns cyan, coil grey](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/secondary-structure.png) | ![Residue-level contact map heatmap for Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/residue-contact-map.png) | ![Coarse-grained bead model of Aquaporin-1](https://raw.githubusercontent.com/roshan2004/molscope/main/docs/assets/readme/coarse-grained-beads-v2.png) |
17
18
 
18
19
  ## What it does
19
20
 
20
- - **Read and write** XYZ, PDB, mmCIF and SDF (gzip-aware), fetch structures by
21
- id from RCSB, and load multi-model NMR ensembles.
21
+ - **Read and write** XYZ, PDB, mmCIF and SDF (gzip-aware), preserve SDF/PDB
22
+ explicit bonds and SDF formal charges where present, fetch structures by id
23
+ from RCSB, and load multi-model NMR ensembles.
24
+ - **Validate mmCIF** syntax, atom-site coordinate columns, and supplied
25
+ dictionary files with optional Gemmi support.
22
26
  - **Select and measure** by chain, element or residue; compute distances,
23
27
  angles, dihedrals and Kabsch-aligned RMSD.
24
- - **Analyse** centroids, radius of gyration, the inertia tensor, inferred bonds
25
- and contacts.
28
+ - **Analyse** centroids, radius of gyration, the inertia tensor,
29
+ explicit/inferred bonds, and contacts.
26
30
  - **Contact maps** at atom or residue level, with heatmap plots.
31
+ - **Secondary structure** via a self-contained, pure-NumPy DSSP, with
32
+ `plot(color_by="ss")`.
27
33
  - **Ensembles**: pairwise RMSD, RMSF, averaging, and conformer clustering.
28
34
  - **Export for ML**: flat structural descriptors and molecular graphs for
29
35
  NetworkX, PyTorch Geometric and DGL.
36
+ - **Chemical perception and descriptors**: optional RDKit-backed formal charge,
37
+ valence, aromaticity and scalar descriptor features with
38
+ `pip install "molscope[chem]"`.
30
39
  - **Coarse-grain** onto residue, Martini-style or custom bead mappings.
31
40
  - **Visualise** with 3D matplotlib plots, an interactive py3Dmol viewer, spin
32
41
  GIFs, and a command-line interface.
33
42
 
43
+ ## Why MolScope?
44
+
45
+ MolScope is **not** intended to replace full molecular-simulation or
46
+ cheminformatics frameworks. It is a lightweight **educational and prototyping**
47
+ toolkit for reading common molecular structure files, performing simple
48
+ structural analysis, exporting graph representations for ML workflows, and
49
+ experimenting with coarse-grained mappings. Its core depends only on NumPy and
50
+ Matplotlib, and the API is Python-first and scriptable.
51
+
52
+ In particular, the coarse-graining tools are for **educational CG mapping and
53
+ bead-graph prototyping**: useful for exploring mappings before moving to a
54
+ production Martini workflow. They are not a validated Martini force-field
55
+ generator.
56
+
57
+ | Tool | Main focus | How MolScope differs |
58
+ | --- | --- | --- |
59
+ | RDKit | Cheminformatics | MolScope leans toward structure visualisation, protein/PDB-style metadata, and CG prototyping |
60
+ | MDAnalysis | MD trajectories | MolScope is lighter and easier for static structures and teaching |
61
+ | MDTraj | Trajectory analysis | MolScope is simpler and graph/CG oriented |
62
+ | Biopython | Structure parsing / bioinformatics | MolScope adds 3D analysis, ML-graph export, and coarse-graining |
63
+ | PyMOL / VMD | Interactive visualisation | MolScope is Python-first, scriptable, and ML-export friendly |
64
+ | nglview | Notebook structure viewer | MolScope also does analysis, descriptors, graphs and CG, not just viewing |
65
+
66
+ Reach for those tools when you need their depth and validation. Reach for
67
+ MolScope when you want something small, readable, and quick to teach or
68
+ prototype with.
69
+
34
70
  ## Install
35
71
 
36
72
  With [uv](https://docs.astral.sh/uv/) (recommended):
@@ -117,6 +153,7 @@ mol.radius_of_gyration # compactness (angstrom)
117
153
  mol.dimensions, mol.formula # bounding box, Hill-order formula
118
154
  mol.bonds() # inferred bond index pairs (KD-tree if scipy)
119
155
  mol.contacts(cutoff=5.0) # atom pairs within a distance
156
+ mol.contact_count(cutoff=5.0) # count pairs without returning them
120
157
 
121
158
  mol.distance(i, j) # bond length
122
159
  mol.angle(i, j, k) # bond angle (degrees)
@@ -129,12 +166,14 @@ a.alpha_carbons().rmsd(b.alpha_carbons(), align=True) # CA-RMSD after Kabsch f
129
166
 
130
167
  ```python
131
168
  features = mol.descriptors() # flat dict of scalar/vector descriptors
169
+ features = mol.descriptors(preset="native-3d")
132
170
  features["radius_of_gyration"]
133
171
  features["principal_moments"] # 3 values
134
172
  features["distance_histogram"] # fixed-size histogram
135
173
 
136
174
  X, names = ms.featurize_many(
137
175
  ["a.pdb", "b.pdb", "c.xyz"],
176
+ preset="native-basic",
138
177
  return_names=True,
139
178
  ) # numeric matrix + column names
140
179
  ```
@@ -143,7 +182,17 @@ Descriptors include atom/residue counts, element counts, molecular mass,
143
182
  centres, radius of gyration, bounding-box dimensions, inertia tensor, principal
144
183
  moments/axes, shape anisotropy, compactness, distance histograms, bond-length
145
184
  summary statistics, and atom/residue contact summaries. Full contact maps remain
146
- available through `mol.contact_map(...)`.
185
+ available through `mol.contact_map(...)`. With `pip install "molscope[chem]"`,
186
+ you can also request RDKit descriptors directly:
187
+
188
+ ```python
189
+ mol.rdkit_descriptors(names=["MolWt", "TPSA"])
190
+ mol.descriptors(include_rdkit=True, rdkit_descriptor_names=["MolWt", "TPSA"])
191
+ ```
192
+
193
+ For reproducible ML columns, use descriptor presets: `native-basic`,
194
+ `native-3d`, or `rdkit-basic`. Inspect the flattened column order with
195
+ `ms.descriptor_feature_names(...)`.
147
196
 
148
197
  ### Contact maps
149
198
 
@@ -157,6 +206,29 @@ mol.contact_map(level="residue", method="min") # closest inter-residue at
157
206
  mol.contact_map(level="residue", method="com") # residue centre of mass
158
207
  ```
159
208
 
209
+ ### Secondary structure (DSSP)
210
+
211
+ Assign protein secondary structure from backbone hydrogen-bond patterns with a
212
+ self-contained, pure-NumPy DSSP (no external `mkdssp` binary needed):
213
+
214
+ ```python
215
+ mol = ms.read("1fqy.pdb")
216
+ ss = mol.secondary_structure() # SecondaryStructure, one code per residue
217
+
218
+ ss.string # e.g. '--HHHHHHHH--SS--EEEE--'
219
+ ss.codes # per-residue array
220
+ ss.summary() # helix/strand/coil counts and fractions
221
+
222
+ mol.plot(color_by="ss") # colour the 3D view by secondary structure
223
+ ```
224
+
225
+ Codes follow DSSP: `H`/`G`/`I` helices, `E`/`B` strands, `T` turn, `S` bend,
226
+ `-` coil. This is a simplified **educational** implementation: it reproduces the
227
+ main classes from the Kabsch-Sander hydrogen-bond model but is not bit-identical
228
+ to the reference `mkdssp` on every edge case. It needs backbone N/CA/C/O atoms,
229
+ so use PDB/mmCIF input (not a bare `.xyz`). The secondary-structure render in the
230
+ showcase above (helices red, turns cyan, coil grey) is produced this way.
231
+
160
232
  ### NMR ensembles
161
233
 
162
234
  ```python
@@ -205,9 +277,9 @@ spin_gif(mol, "spin.gif") # rotating animation
205
277
 
206
278
  ### Molecular graphs (for machine learning)
207
279
 
208
- Turn 3D coordinates plus inferred bonds into a graph, then export to the common
209
- ML frameworks. The base `to_graph()` needs no extra dependencies; each exporter
210
- imports its backend lazily.
280
+ Turn 3D coordinates plus explicit or inferred bonds into a graph, then export
281
+ to the common ML frameworks. The base `to_graph()` needs no extra dependencies;
282
+ each exporter imports its backend lazily.
211
283
 
212
284
  ```python
213
285
  mol = ms.read("1fqy.pdb")
@@ -216,18 +288,28 @@ g = mol.to_graph() # MolecularGraph: nodes + edges, no deps
216
288
  g.n_atoms, g.n_bonds # counts
217
289
  g.atomic_numbers, g.masses # per-node arrays
218
290
  g.node_features() # (N, 2) default features [atomic_number, mass]
291
+ g.node_features("ml") # stable ML node preset
292
+ g.edge_features("ml") # stable ML edge preset
219
293
 
220
294
  G = mol.to_networkx() # networkx.Graph with node/edge attributes
221
295
  data = mol.to_pyg_data() # torch_geometric.data.Data (x, pos, edge_index, edge_attr, z)
222
296
  dglg = mol.to_dgl_graph() # dgl.DGLGraph with ndata/edata tensors
223
297
  ```
224
298
 
225
- Nodes carry element, atomic number, mass, coordinates and (from PDB/mmCIF) atom
226
- name, residue and chain. Edges carry the bonded pair, interatomic distance, and
227
- bond order (`1.0` for geometrically inferred bonds). Install backends as needed:
228
- `pip install "molscope[graph]"` installs only NetworkX. PyTorch Geometric and
229
- DGL are optional manual installs: `pip install torch torch_geometric` or
230
- `pip install dgl` after choosing the right PyTorch build for your platform.
299
+ Nodes carry element, atomic number, mass, coordinates, formal charge, and (from
300
+ PDB/mmCIF) atom name, residue and chain. Edges carry the bonded pair,
301
+ interatomic distance, and bond order from SDF where available (`1.0` for
302
+ PDB/CONECT or geometrically inferred bonds). Install backends as needed:
303
+ `pip install "molscope[graph]"` for NetworkX, `"molscope[pyg]"` for PyTorch
304
+ Geometric, `"molscope[dgl]"` for DGL, or `"molscope[gnn]"` for all graph
305
+ backends. For custom CUDA, ROCm, Apple Silicon, or cluster builds, install the
306
+ matching PyTorch stack first.
307
+
308
+ Graph feature presets are also available through
309
+ `mol.to_pyg_data(node_preset="ml", edge_preset="ml")` and
310
+ `mol.to_dgl_graph(node_preset="ml", edge_preset="ml")`. Use
311
+ `mol.to_graph(include_chemical_features=True)` to attach optional RDKit-backed
312
+ aromatic atom and bond flags.
231
313
 
232
314
  ### Coarse-graining
233
315
 
@@ -287,12 +369,20 @@ python -m molscope 1fqy.pdb # equivalent if not pip-installed
287
369
  - PDB files are parsed by **fixed columns**, not whitespace splitting, so atoms
288
370
  with touching coordinate fields (large or negative values) read correctly.
289
371
  - Alternate conformations (altLoc) other than the primary one are skipped.
372
+ Use `read_pdb(..., altloc="first"|"highest_occupancy"|"all")` to select a
373
+ different policy.
290
374
  - `read_pdb` returns a single model (`model=1` by default); use `read_pdb_models`
291
375
  for the whole ensemble.
376
+ - SDF/MOL V2000 bond blocks, formal charges, and PDB `CONECT` records are
377
+ preserved. PDB output writes explicit bonds back as `CONECT` records.
292
378
  - Bond inference uses a `scipy.spatial.cKDTree` when available; without scipy it
293
379
  falls back to a dense `O(n^2)` search that is refused above ~8000 atoms.
294
- - Optional extras: `pip install "molscope[fast]"` (scipy, faster bonds/contacts)
295
- and `"molscope[viz]"` (py3Dmol, for `Molecule.view`).
380
+ - Optional extras: `pip install "molscope[fast]"` (scipy, faster bonds/contacts),
381
+ `"molscope[viz]"` (py3Dmol, for `Molecule.view`), `"molscope[graph]"`
382
+ (NetworkX), `"molscope[chem]"` (RDKit), `"molscope[cif]"` (Gemmi),
383
+ `"molscope[pyg]"`, `"molscope[dgl]"`, or `"molscope[gnn]"`. For custom CUDA,
384
+ ROCm, Apple Silicon, or cluster builds, install the matching PyTorch stack
385
+ first.
296
386
 
297
387
  ## Tests and linting
298
388
 
@@ -7,8 +7,10 @@ beads, and visualise everything in 3D.
7
7
 
8
8
  What it does
9
9
  ------------
10
- - **Read and write** XYZ, PDB, mmCIF and SDF; fetch by id from RCSB; load
11
- multi-model NMR ensembles (:func:`read`, :func:`fetch`, :func:`read_pdb_models`).
10
+ - **Read and write** XYZ, PDB, mmCIF and SDF; preserve SDF/PDB explicit bonds
11
+ and SDF formal charges; fetch by id from RCSB; load multi-model NMR ensembles;
12
+ validate CIF/mmCIF with optional Gemmi support
13
+ (:func:`read`, :func:`fetch`, :func:`read_pdb_models`, :func:`validate_cif`).
12
14
  - **Select and measure** by chain, element or residue; distances, angles,
13
15
  dihedrals and Kabsch-aligned RMSD (:class:`Molecule`).
14
16
  - **Analyse** centroids, radius of gyration, inertia tensor, bonds and contacts.
@@ -17,6 +19,8 @@ What it does
17
19
  (:mod:`molscope.ensemble`, :func:`cluster`, :func:`rmsd_matrix`).
18
20
  - **Export for ML**: structural descriptors and molecular graphs for NetworkX,
19
21
  PyTorch Geometric and DGL (:func:`descriptors`, :class:`MolecularGraph`).
22
+ - **Chemical perception**: optional RDKit-backed valence, aromaticity, charge
23
+ features and descriptors (:func:`chemical_features`, :func:`rdkit_descriptors`).
20
24
  - **Coarse-grain** onto residue, Martini-style or custom bead mappings
21
25
  (:mod:`molscope.coarsegrain`).
22
26
  - **Visualise** with 3D matplotlib plots, an interactive py3Dmol viewer, and
@@ -37,13 +41,16 @@ Examples
37
41
  See https://github.com/roshan2004/molscope for the full documentation.
38
42
  """
39
43
 
40
- from . import coarsegrain, ensemble
44
+ from . import coarsegrain, dssp, ensemble
45
+ from .chem import ChemicalFeatures, chemical_features, rdkit_descriptors, to_rdkit
46
+ from .cif import CifValidationReport, validate_cif
41
47
  from .coarsegrain import BeadMapping, BondMapping, CoarseGrainReport, DroppedAtom
42
48
  from .contactmap import ContactMap
43
- from .descriptors import descriptors, featurize_many
49
+ from .descriptors import descriptor_feature_names, descriptors, featurize_many
50
+ from .dssp import SecondaryStructure
44
51
  from .ensemble import Clustering, cluster, rmsd_matrix
45
52
  from .ensemble import contact_frequency as ensemble_contact_frequency
46
- from .graph import MolecularGraph
53
+ from .graph import MolecularGraph, edge_feature_names, node_feature_names
47
54
  from .io import (
48
55
  fetch,
49
56
  read,
@@ -61,6 +68,8 @@ from .plotting import plot_rmsd_heatmap
61
68
 
62
69
  __all__ = [
63
70
  "Clustering",
71
+ "ChemicalFeatures",
72
+ "CifValidationReport",
64
73
  "BeadMapping",
65
74
  "BondMapping",
66
75
  "CoarseGrainReport",
@@ -68,11 +77,16 @@ __all__ = [
68
77
  "DroppedAtom",
69
78
  "Molecule",
70
79
  "MolecularGraph",
80
+ "SecondaryStructure",
71
81
  "cluster",
82
+ "chemical_features",
72
83
  "coarsegrain",
84
+ "descriptor_feature_names",
73
85
  "descriptors",
86
+ "dssp",
74
87
  "ensemble",
75
88
  "ensemble_contact_frequency",
89
+ "edge_feature_names",
76
90
  "featurize_many",
77
91
  "fetch",
78
92
  "plot_rmsd_heatmap",
@@ -83,8 +97,12 @@ __all__ = [
83
97
  "read_sdf",
84
98
  "read_xyz",
85
99
  "read_xyz_frames",
100
+ "rdkit_descriptors",
86
101
  "rmsd_matrix",
102
+ "node_feature_names",
103
+ "to_rdkit",
104
+ "validate_cif",
87
105
  "write_pdb",
88
106
  "write_xyz",
89
107
  ]
90
- __version__ = "0.6.2"
108
+ __version__ = "0.8.0"