proteintensor 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {proteintensor-0.2.0 → proteintensor-0.3.0}/PKG-INFO +126 -61
  2. {proteintensor-0.2.0 → proteintensor-0.3.0}/README.md +123 -60
  3. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/__init__.py +5 -1
  4. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/cli.py +140 -2
  5. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/mmcif.py +15 -4
  6. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/dataset.py +25 -13
  7. proteintensor-0.3.0/proteintensor/ligands.py +216 -0
  8. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/msa.py +38 -23
  9. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/reader.py +2 -0
  10. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/schema.py +28 -1
  11. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/writer.py +4 -0
  12. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/PKG-INFO +126 -61
  13. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/SOURCES.txt +3 -0
  14. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/requires.txt +3 -0
  15. {proteintensor-0.2.0 → proteintensor-0.3.0}/pyproject.toml +6 -5
  16. proteintensor-0.3.0/tests/test_convert_dir.py +89 -0
  17. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_dataset.py +59 -0
  18. proteintensor-0.3.0/tests/test_ligands.py +123 -0
  19. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_msa.py +54 -0
  20. {proteintensor-0.2.0 → proteintensor-0.3.0}/LICENSE +0 -0
  21. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/adapters/__init__.py +0 -0
  22. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/adapters/boltz.py +0 -0
  23. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/bonds.py +0 -0
  24. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/__init__.py +0 -0
  25. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/sequence.py +0 -0
  26. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/embeddings.py +0 -0
  27. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/pairs.py +0 -0
  28. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/remote.py +0 -0
  29. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/dependency_links.txt +0 -0
  30. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/entry_points.txt +0 -0
  31. {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/top_level.txt +0 -0
  32. {proteintensor-0.2.0 → proteintensor-0.3.0}/setup.cfg +0 -0
  33. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_adapters.py +0 -0
  34. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_embeddings.py +0 -0
  35. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_pairs.py +0 -0
  36. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_remote.py +0 -0
  37. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_roundtrip.py +0 -0
  38. {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_sequence.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proteintensor
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: AI-native biomolecular tensor format for structural biology ML
5
5
  Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
6
6
  License-Expression: MIT
@@ -34,6 +34,8 @@ Provides-Extra: cloud
34
34
  Requires-Dist: fsspec>=2023.1; extra == "cloud"
35
35
  Requires-Dist: s3fs>=2023.1; extra == "cloud"
36
36
  Requires-Dist: gcsfs>=2023.1; extra == "cloud"
37
+ Provides-Extra: ligands
38
+ Requires-Dist: rdkit>=2023.3; extra == "ligands"
37
39
  Provides-Extra: dev
38
40
  Requires-Dist: pytest>=7; extra == "dev"
39
41
  Requires-Dist: pytest-benchmark; extra == "dev"
@@ -41,7 +43,9 @@ Requires-Dist: pytest-cov; extra == "dev"
41
43
  Requires-Dist: fsspec>=2023.1; extra == "dev"
42
44
  Dynamic: license-file
43
45
 
44
- # HelixDB / ProteinTensor
46
+ ![ProteinTensor - AI-native protein data format: convert structure or sequence into cached tensors](assets/banner.png)
47
+
48
+ # ProteinTensor Introduction
45
49
 
46
50
  **ProteinTensor** is an AI-native biomolecular storage format designed to eliminate
47
51
  the preprocessing bottleneck in modern structural biology machine learning pipelines.
@@ -108,20 +112,21 @@ performance format that turns a recurring computational tax into a one-time cost
108
112
 
109
113
  ## Benchmark: Traditional Pipeline vs ProteinTensor
110
114
 
111
- All timings are median over 30 rounds on an NVIDIA RTX 5080, CUDA 12.8, Python 3.11.
112
- Proteins span the full range from a 76-residue domain to a 3,525-residue CRISPR enzyme.
113
- Run `python boltz_benchmark.py` to reproduce.
115
+ All timings are medians over 30 rounds on a Windows workstation (RTX 5080, Python
116
+ 3.11.9). mmCIF parsing and `.ptt` reads are CPU-bound, so these numbers reflect CPU
117
+ performance, not GPU performance. Proteins span the full range from a 76-residue
118
+ domain to a 3,525-residue CRISPR enzyme. Run `python boltz_benchmark.py` to reproduce.
114
119
 
115
120
  ### Per-structure load times
116
121
 
117
122
  | Structure | Method | Res | MSA seqs | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
118
123
  |---|---|---|---|---|---|---|---|---|---|
119
- | 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.2 ms | 2.8 ms | 1.2 ms | 0.7 ms | 1.6 ms | 0.8 ms |
120
- | 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 29.6 ms | 2.9 ms | 1.2 ms | 0.7 ms | 5.1 ms | 2.0 ms |
121
- | 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 55.3 ms | 2.9 ms | 1.2 ms | 0.7 ms | 11.3 ms | 3.5 ms |
122
- | 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 74.7 ms | 2.9 ms | 1.2 ms | 0.7 ms | 14.7 ms | 6.4 ms |
123
- | 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.4 ms | 3.3 ms | 1.3 ms | 0.9 ms | 208.3 ms | 71.1 ms |
124
- | 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 352.4 ms | 3.3 ms | 1.2 ms | 1.0 ms | 240.7 ms | 104.5 ms |
124
+ | 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.4 ms | 3.2 ms | 1.3 ms | 0.8 ms | 1.8 ms | 0.8 ms |
125
+ | 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 28.7 ms | 3.3 ms | 1.3 ms | 0.8 ms | 5.2 ms | 1.9 ms |
126
+ | 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 54.1 ms | 3.3 ms | 1.3 ms | 0.8 ms | 11.5 ms | 3.6 ms |
127
+ | 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 73.2 ms | 3.3 ms | 1.4 ms | 0.8 ms | 15.3 ms | 6.9 ms |
128
+ | 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.9 ms | 3.7 ms | 1.4 ms | 1.0 ms | 213.7 ms | 74.7 ms |
129
+ | 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 346.5 ms | 3.7 ms | 1.3 ms | 1.0 ms | 243.9 ms | 107.3 ms |
125
130
 
126
131
  **Column definitions**
127
132
  - `ptt: full` - `read()` - all atoms, backbone, bonds, metadata
@@ -134,32 +139,42 @@ Run `python boltz_benchmark.py` to reproduce.
134
139
 
135
140
  | Structure | Res | full | backbone | bonds | MSA | dist mx |
136
141
  |---|---|---|---|---|---|---|
137
- | 1UBQ - Ubiquitin | 76 | 3x | 6x | 11x | 4x | 9x |
138
- | 6LU7 - SARS-CoV-2 Mpro | 312 | 10x | 24x | 43x | 6x | 15x |
139
- | 4HHB - Hemoglobin | 574 | 19x | 45x | 78x | 5x | 16x |
140
- | 6M0J - ACE2 + RBD | 791 | 26x | 61x | 102x | 5x | 12x |
141
- | 6VXX - Spike trimer | 2,916 | 87x | 223x | 308x | 1x* | 4x |
142
- | 6OHW - Cas12a | 3,525 | 108x | 284x | 370x | 1x* | 3x |
142
+ | 1UBQ - Ubiquitin | 76 | 2x | 6x | 10x | 4x | 9x |
143
+ | 6LU7 - SARS-CoV-2 Mpro | 312 | 9x | 21x | 38x | 5x | 15x |
144
+ | 4HHB - Hemoglobin | 574 | 17x | 40x | 70x | 5x | 15x |
145
+ | 6M0J - ACE2 + RBD | 791 | 22x | 54x | 92x | 5x | 11x |
146
+ | 6VXX - Spike trimer | 2,916 | 76x | 201x | 285x | 1x* | 4x |
147
+ | 6OHW - Cas12a | 3,525 | 95x | 257x | 343x | 1x* | 3x |
143
148
 
144
149
  *MSA speedup shown as 1x vs mmCIF parse because both are in the same time range for
145
150
  large proteins - the meaningful MSA comparisons are the measured cached-load vs A3M re-parse speedup and the projected JackHMMER generation cost (see below).
146
151
 
147
152
  ### Feature assembly: time to prepare all tensors for model.forward()
148
153
 
149
- Traditional = mmCIF parse + read MSA from A3M file. ProteinTensor = single .ptt read
150
- with all features pre-cached (sequence, backbone, bonds, MSA, distance matrix,
151
- ESM2 embedding).
152
-
153
- | Structure | Res | Traditional | ProteinTensor | Speedup |
154
- |---|---|---|---|---|
155
- | 1UBQ - Ubiquitin | 76 | 22.7 ms | 5.2 ms | 4x |
156
- | 6LU7 - SARS-CoV-2 Mpro | 312 | 157.3 ms | 9.9 ms | 16x |
157
- | 4HHB - Hemoglobin | 574 | 525.5 ms | 17.7 ms | 30x |
158
- | 6M0J - ACE2 + RBD | 791 | 722.7 ms | 23.9 ms | 30x |
159
- | 6VXX - Spike trimer | 2,916 | 9,838.5 ms | 282.7 ms | 35x |
160
- | 6OHW - Cas12a | 3,525 | 11,903.1 ms | 348.4 ms | **34x** |
161
-
162
- Average speedup across all six structures: **34x** for full feature assembly.
154
+ Traditional = mmCIF parse + A3M MSA parse + distance-matrix compute. ProteinTensor
155
+ = read the structure, MSA, distance matrix, and ESM2 embedding from a single
156
+ pre-cached `.ptt`. Reproduce with `python benchmarks/assembly_benchmark.py`
157
+ (MSA depth and embedding shape are realistic; numeric content is synthetic, so
158
+ timing reflects tensor dimensions, not values).
159
+
160
+ | Structure | Res | MSA depth | Traditional | ProteinTensor | Speedup |
161
+ |---|---|---|---|---|---|
162
+ | 1UBQ - Ubiquitin | 76 | 512 | 14.1 ms | 7.1 ms | 2.0x |
163
+ | 6LU7 - SARS-CoV-2 Mpro | 312 | 1,024 | 48.7 ms | 13.6 ms | 3.6x |
164
+ | 4HHB - Hemoglobin | 574 | 2,048 | 118.0 ms | 22.7 ms | 5.2x |
165
+ | 6M0J - ACE2 + RBD | 791 | 2,048 | 196.4 ms | 38.3 ms | 5.1x |
166
+ | 6VXX - Spike trimer | 2,916 | 8,192 | 1,395 ms | 309 ms | 4.5x |
167
+ | 6OHW - Cas12a | 3,525 | 8,192 | 1,462 ms | 381 ms | 3.8x |
168
+
169
+ Average speedup across all six structures: **4x** for full feature assembly
170
+ (measured on a Windows CPU box - see
171
+ [`benchmarks/ASSEMBLY_RESULTS.md`](benchmarks/ASSEMBLY_RESULTS.md)).
172
+
173
+ > **On an earlier 34x figure:** prior versions reported ~34x here. That number was
174
+ > measured against ProteinTensor's original scalar A3M parser, which dominated the
175
+ > traditional side (~11 s to parse an 8,192-deep MSA). Vectorizing that parser in
176
+ > v0.2.0 cut the traditional baseline ~8x, so the *fair* feature-assembly speedup
177
+ > is now ~4x. The `.ptt` read side was unchanged - only the baseline got faster.
163
178
 
164
179
  ### Drug target benchmark
165
180
 
@@ -169,21 +184,21 @@ IgG1 antibody. Numbers are consistent with the structural biology benchmark abov
169
184
 
170
185
  | Target | Res | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
171
186
  |---|---|---|---|---|---|---|---|
172
- | 6OIM - KRAS G12C + Sotorasib | 167 | 16.6 ms | 2.8 ms | 1.2 ms | 0.7 ms | 2.8 ms | 1.1 ms |
173
- | 3HTB - HIV-1 protease | 163 | 16.0 ms | 2.8 ms | 1.2 ms | 0.7 ms | 2.7 ms | 1.1 ms |
174
- | 5WT9 - PD-L1 checkpoint | 533 | 53.8 ms | 2.9 ms | 1.2 ms | 0.7 ms | 13.1 ms | 3.3 ms |
175
- | 1TUP - p53 tumor suppressor | 585 | 56.5 ms | 2.8 ms | 1.2 ms | 0.7 ms | 12.4 ms | 3.4 ms |
176
- | 2P4E - PCSK9 | 586 | 54.7 ms | 2.8 ms | 1.2 ms | 0.7 ms | 12.1 ms | 3.4 ms |
177
- | 1IGT - IgG1 antibody | 1,316 | 123.4 ms | 2.9 ms | 1.2 ms | 0.8 ms | 46.8 ms | 16.4 ms |
187
+ | 6OIM - KRAS G12C + Sotorasib | 167 | 17.1 ms | 3.4 ms | 1.3 ms | 0.8 ms | 3.0 ms | 1.3 ms |
188
+ | 3HTB - HIV-1 protease | 163 | 16.5 ms | 3.3 ms | 1.4 ms | 0.8 ms | 2.8 ms | 1.3 ms |
189
+ | 5WT9 - PD-L1 checkpoint | 533 | 54.8 ms | 3.8 ms | 1.4 ms | 0.8 ms | 11.9 ms | 3.8 ms |
190
+ | 1TUP - p53 tumor suppressor | 585 | 57.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 13.0 ms | 4.0 ms |
191
+ | 2P4E - PCSK9 | 586 | 55.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 12.8 ms | 4.1 ms |
192
+ | 1IGT - IgG1 antibody | 1,316 | 127.3 ms | 3.5 ms | 1.4 ms | 0.8 ms | 47.1 ms | 17.9 ms |
178
193
 
179
194
  | Target | Res | full | backbone | bonds | MSA | dist mx |
180
195
  |---|---|---|---|---|---|---|
181
- | 6OIM - KRAS G12C + Sotorasib | 167 | 6x | 14x | 24x | 6x | 15x |
182
- | 3HTB - HIV-1 protease | 163 | 6x | 14x | 23x | 6x | 14x |
183
- | 5WT9 - PD-L1 checkpoint | 533 | 19x | 44x | 77x | 4x | 16x |
184
- | 1TUP - p53 tumor suppressor | 585 | 20x | 47x | 80x | 5x | 17x |
185
- | 2P4E - PCSK9 | 586 | 19x | 46x | 77x | 5x | 16x |
186
- | 1IGT - IgG1 antibody | 1,316 | 42x | **100x** | **162x** | 3x | 8x |
196
+ | 6OIM - KRAS G12C + Sotorasib | 167 | 5x | 13x | 22x | 6x | 13x |
197
+ | 3HTB - HIV-1 protease | 163 | 5x | 12x | 21x | 6x | 13x |
198
+ | 5WT9 - PD-L1 checkpoint | 533 | 15x | 40x | 69x | 5x | 14x |
199
+ | 1TUP - p53 tumor suppressor | 585 | 17x | 42x | 71x | 4x | 14x |
200
+ | 2P4E - PCSK9 | 586 | 16x | 41x | 70x | 4x | 14x |
201
+ | 1IGT - IgG1 antibody | 1,316 | 37x | **92x** | **156x** | 3x | 7x |
187
202
 
188
203
  ### DataLoader batch throughput
189
204
 
@@ -192,26 +207,38 @@ padded batches ready for `model.forward()`. Single process, no prefetch workers.
192
207
 
193
208
  | Batch size | ms / batch | Structures / sec |
194
209
  |---|---|---|
195
- | 1 | 0.01 ms | 88,106 |
196
- | 4 | 0.04 ms | 108,696 |
197
- | 8 | 0.37 ms | 21,707 |
198
- | 16 | 0.95 ms | 16,783 |
199
- | 32 | 2.0 ms | **15,854** |
210
+ | 1 | 0.01 ms | 97,088 |
211
+ | 4 | 0.03 ms | 116,279 |
212
+ | 8 | 0.42 ms | 19,242 |
213
+ | 16 | 0.97 ms | 16,412 |
214
+ | 32 | 2.1 ms | **15,033** |
200
215
 
201
216
  ### Scale projection: 100,000 structures, one training epoch
202
217
 
218
+ These are **projections**, extrapolated from the measured per-structure timings
219
+ above - not end-to-end measurements at 100k scale.
220
+
203
221
  | Operation | Traditional pipeline | ProteinTensor | Speedup |
204
222
  |---|---|---|---|
205
- | Structure load (parse mmCIF each epoch) | 3.7 hours | 5 min | **45x** |
206
- | Backbone-only load (template search) | 3.7 hours | 2 min | **109x** |
207
- | Full feature assembly (seq + MSA + pairs + emb) | 4.5 days | 3.2 hours | **34x** |
208
- | MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.2 hours | **1,794x** |
223
+ | Structure load (parse mmCIF each epoch) | 3.8 hours | 6 min | **37x** |
224
+ | Backbone-only load (template search) | 3.8 hours | 2 min | **95x** |
225
+ | Full feature assembly (seq + MSA + pairs + emb) | 16 hours | 3.9 hours | **4x** |
226
+ | MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.7 hours | **1,477x** |
209
227
 
210
228
  > MSA generation assumes 2.4 min/protein on a 32-core server (PDB90 database, standard
211
229
  > AlphaFold settings). ProteinTensor generates MSAs once and loads from the `.ptt` cache
212
230
  > on every subsequent run. The 4,000-hour figure is the estimated cost AlphaFold2 and Boltz
213
231
  > users pay to build training datasets from scratch.
214
232
 
233
+ > **Measured vs projected - read this.** The **1,477x** above is MSA *generation*
234
+ > (building the alignment once with JackHMMER) and is a **literature-based
235
+ > projection**, not something benchmarked here. What *is* measured on hardware is
236
+ > the recurring per-epoch MSA **load** - reading a cached MSA from `.ptt` vs
237
+ > re-parsing A3M text each epoch (against a vectorized A3M parser baseline):
238
+ > **3.4x-5.9x**, growing with MSA depth. See
239
+ > [`benchmarks/MSA_RESULTS.md`](benchmarks/MSA_RESULTS.md). These are different
240
+ > quantities; do not read the 1,477x as a measured load speedup.
241
+
215
242
  ### Disk tradeoff
216
243
 
217
244
  A full-featured `.ptt` (8,192-sequence MSA + distance matrix + ESM2-650M embedding at
@@ -267,6 +294,18 @@ pt.write(data, "ubq.ptt")
267
294
  data = pt.from_fasta("complex.fasta")
268
295
  ```
269
296
 
297
+ ### Batch-convert a directory
298
+
299
+ Convert an entire directory of structures in parallel, with progress reporting.
300
+ Files that fail to parse are skipped and listed in the summary; inputs whose output
301
+ already exists are also skipped by default (pass `--overwrite` to rebuild them).
302
+
303
+ ```bash
304
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ # auto worker count
305
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ --workers 16 --recursive
306
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ --overwrite # rebuild existing
307
+ ```
308
+
270
309
  ### Benchmark against mmCIF
271
310
 
272
311
  ```bash
@@ -351,6 +390,20 @@ pt.add_pair_feature("1abc.ptt", my_array, name="template_pair",
351
390
  emb = pt.read_embedding("1abc.ptt", "esm2_t33_650M_UR50D")
352
391
  emb.data.shape # (N_res, 1280) float32 (upcast from float16 on load)
353
392
 
393
+ # ------ Ligands / small molecules ------
394
+ # Capture drugs, cofactors, and ions from a structure (opt-in)
395
+ data = pt.from_mmcif("6oim.cif", include_ligands=True)
396
+ [l.name for l in data.ligands] # ['MG', 'GDP', 'MOV'] (MOV = sotorasib)
397
+
398
+ ligs = pt.read_ligands("6oim.ptt")
399
+ ligs[0].elements # (N_atoms,) S2 element symbols
400
+ ligs[0].positions # (N_atoms, 3) float32
401
+ pt.list_ligands("6oim.ptt") # ['MG', 'GDP', 'MOV']
402
+
403
+ # Build a ligand from SMILES (needs `pip install "proteintensor[ligands]"`)
404
+ aspirin = pt.from_smiles("CC(=O)Oc1ccccc1C(=O)O", name="AIN")
405
+ pt.add_ligand("target.ptt", aspirin) # attach to an existing .ptt
406
+
354
407
  # ------ Lazy / zero-copy access ------
355
408
  positions = pt.mmap_positions("1abc.ptt") # zarr.Array - no full load
356
409
  backbone = pt.mmap_backbone("1abc.ptt") # [N_res, 4, 3]
@@ -392,6 +445,7 @@ data = pt.read(
392
445
  )
393
446
 
394
447
  # ------ Multi-structure dataset ------
448
+ # Structure .ptt files and sequence-only .ptt files can be mixed in one dataset.
395
449
  pt.create_dataset("training.ptt")
396
450
  for ptt_file in Path("ptt_files").glob("*.ptt"):
397
451
  pt.add_to_dataset("training.ptt", ptt_file)
@@ -407,8 +461,13 @@ loader = DataLoader(ds, batch_size=8, collate_fn=pt.ProteinDataset.collate)
407
461
  for batch in loader:
408
462
  coords = torch.from_numpy(batch["atom_positions"]) # (B, max_atoms, 3)
409
463
  pad = torch.from_numpy(batch["padding_mask"]) # (B, max_res) True=real
464
+ has_str = torch.from_numpy(batch["has_structure"]) # (B,) False = sequence-only
410
465
  ```
411
466
 
467
+ Sequence-only entries contribute zero atoms to the batch (`n_atoms == 0`,
468
+ `has_structure == False`), so sequence-driven and structure-based samples can be
469
+ loaded together in one `DataLoader`.
470
+
412
471
  ---
413
472
 
414
473
  ## .ptt file layout
@@ -445,10 +504,16 @@ structure.ptt/ Zarr directory store (v0.7)
445
504
  │ └── <name>/ one sub-group per named feature
446
505
  │ ├── .zattrs channels, symmetric, dtype, description
447
506
  │ └── data [N_res, N_res, C] any dtype, chunked 128x128xC
448
- └── embeddings/
449
- └── <model>/ one sub-group per PLM model
450
- ├── .zattrs model, layer, dim, dtype, seq SHA-256
451
- └── data [N_res, D] float32 or float16, chunked 256xD
507
+ ├── embeddings/
508
+ └── <model>/ one sub-group per PLM model
509
+ ├── .zattrs model, layer, dim, dtype, seq SHA-256
510
+ └── data [N_res, D] float32 or float16, chunked 256xD
511
+ └── ligands/
512
+ └── <index>/ one sub-group per non-polymer ligand
513
+ ├── .zattrs name (CCD), chain_id, res_num, smiles
514
+ ├── elements [N_atoms] S2 element symbols
515
+ ├── positions [N_atoms, 3] float32 Angstrom coordinates
516
+ └── b_factors [N_atoms] float32
452
517
  ```
453
518
 
454
519
  ### Multi-structure dataset layout
@@ -486,9 +551,9 @@ Each sub-group under `structures/` is identical to a standalone `.ptt` root, so
486
551
  pytest tests/ -v
487
552
  ```
488
553
 
489
- 106 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings,
490
- A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
491
- (memory:// fsspec - no real cloud account required).
554
+ 150 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings/ligands,
555
+ sequence conversion, A3M parsing, Boltz adapter, multi-structure dataset, and cloud
556
+ streaming (memory:// fsspec - no real cloud account required).
492
557
 
493
558
  ---
494
559
 
@@ -509,11 +574,11 @@ A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
509
574
  - [ ] Chai-1 adapter
510
575
 
511
576
  **Data pipeline**
512
- - [ ] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
577
+ - [x] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
513
578
  - [ ] Sequence-identity dataset splitting - MMseqs2-based cluster splits to prevent data leakage between train / val / test
514
579
 
515
580
  **Format extensions**
516
- - [ ] Ligand / small-molecule support - SMILES, CCD-based atom graphs, binding site annotations for drug-protein interaction models
581
+ - [x] Ligand / small-molecule support - CCD-based extraction from structures, SMILES input via RDKit, element/coordinate storage (bond graphs and binding-site annotations still to come)
517
582
  - [ ] MD trajectory storage - time axis `[N_frames, N_atoms, 3]` for conformational ensembles and AlphaFold 3 diffusion trajectories
518
583
 
519
584
  **Performance**
@@ -1,4 +1,6 @@
1
- # HelixDB / ProteinTensor
1
+ ![ProteinTensor - AI-native protein data format: convert structure or sequence into cached tensors](assets/banner.png)
2
+
3
+ # ProteinTensor Introduction
2
4
 
3
5
  **ProteinTensor** is an AI-native biomolecular storage format designed to eliminate
4
6
  the preprocessing bottleneck in modern structural biology machine learning pipelines.
@@ -65,20 +67,21 @@ performance format that turns a recurring computational tax into a one-time cost
65
67
 
66
68
  ## Benchmark: Traditional Pipeline vs ProteinTensor
67
69
 
68
- All timings are median over 30 rounds on an NVIDIA RTX 5080, CUDA 12.8, Python 3.11.
69
- Proteins span the full range from a 76-residue domain to a 3,525-residue CRISPR enzyme.
70
- Run `python boltz_benchmark.py` to reproduce.
70
+ All timings are medians over 30 rounds on a Windows workstation (RTX 5080, Python
71
+ 3.11.9). mmCIF parsing and `.ptt` reads are CPU-bound, so these numbers reflect CPU
72
+ performance, not GPU performance. Proteins span the full range from a 76-residue
73
+ domain to a 3,525-residue CRISPR enzyme. Run `python boltz_benchmark.py` to reproduce.
71
74
 
72
75
  ### Per-structure load times
73
76
 
74
77
  | Structure | Method | Res | MSA seqs | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
75
78
  |---|---|---|---|---|---|---|---|---|---|
76
- | 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.2 ms | 2.8 ms | 1.2 ms | 0.7 ms | 1.6 ms | 0.8 ms |
77
- | 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 29.6 ms | 2.9 ms | 1.2 ms | 0.7 ms | 5.1 ms | 2.0 ms |
78
- | 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 55.3 ms | 2.9 ms | 1.2 ms | 0.7 ms | 11.3 ms | 3.5 ms |
79
- | 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 74.7 ms | 2.9 ms | 1.2 ms | 0.7 ms | 14.7 ms | 6.4 ms |
80
- | 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.4 ms | 3.3 ms | 1.3 ms | 0.9 ms | 208.3 ms | 71.1 ms |
81
- | 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 352.4 ms | 3.3 ms | 1.2 ms | 1.0 ms | 240.7 ms | 104.5 ms |
79
+ | 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.4 ms | 3.2 ms | 1.3 ms | 0.8 ms | 1.8 ms | 0.8 ms |
80
+ | 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 28.7 ms | 3.3 ms | 1.3 ms | 0.8 ms | 5.2 ms | 1.9 ms |
81
+ | 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 54.1 ms | 3.3 ms | 1.3 ms | 0.8 ms | 11.5 ms | 3.6 ms |
82
+ | 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 73.2 ms | 3.3 ms | 1.4 ms | 0.8 ms | 15.3 ms | 6.9 ms |
83
+ | 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.9 ms | 3.7 ms | 1.4 ms | 1.0 ms | 213.7 ms | 74.7 ms |
84
+ | 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 346.5 ms | 3.7 ms | 1.3 ms | 1.0 ms | 243.9 ms | 107.3 ms |
82
85
 
83
86
  **Column definitions**
84
87
  - `ptt: full` - `read()` - all atoms, backbone, bonds, metadata
@@ -91,32 +94,42 @@ Run `python boltz_benchmark.py` to reproduce.
91
94
 
92
95
  | Structure | Res | full | backbone | bonds | MSA | dist mx |
93
96
  |---|---|---|---|---|---|---|
94
- | 1UBQ - Ubiquitin | 76 | 3x | 6x | 11x | 4x | 9x |
95
- | 6LU7 - SARS-CoV-2 Mpro | 312 | 10x | 24x | 43x | 6x | 15x |
96
- | 4HHB - Hemoglobin | 574 | 19x | 45x | 78x | 5x | 16x |
97
- | 6M0J - ACE2 + RBD | 791 | 26x | 61x | 102x | 5x | 12x |
98
- | 6VXX - Spike trimer | 2,916 | 87x | 223x | 308x | 1x* | 4x |
99
- | 6OHW - Cas12a | 3,525 | 108x | 284x | 370x | 1x* | 3x |
97
+ | 1UBQ - Ubiquitin | 76 | 2x | 6x | 10x | 4x | 9x |
98
+ | 6LU7 - SARS-CoV-2 Mpro | 312 | 9x | 21x | 38x | 5x | 15x |
99
+ | 4HHB - Hemoglobin | 574 | 17x | 40x | 70x | 5x | 15x |
100
+ | 6M0J - ACE2 + RBD | 791 | 22x | 54x | 92x | 5x | 11x |
101
+ | 6VXX - Spike trimer | 2,916 | 76x | 201x | 285x | 1x* | 4x |
102
+ | 6OHW - Cas12a | 3,525 | 95x | 257x | 343x | 1x* | 3x |
100
103
 
101
104
  *MSA speedup shown as 1x vs mmCIF parse because both are in the same time range for
102
105
  large proteins - the meaningful MSA comparisons are the measured cached-load vs A3M re-parse speedup and the projected JackHMMER generation cost (see below).
103
106
 
104
107
  ### Feature assembly: time to prepare all tensors for model.forward()
105
108
 
106
- Traditional = mmCIF parse + read MSA from A3M file. ProteinTensor = single .ptt read
107
- with all features pre-cached (sequence, backbone, bonds, MSA, distance matrix,
108
- ESM2 embedding).
109
-
110
- | Structure | Res | Traditional | ProteinTensor | Speedup |
111
- |---|---|---|---|---|
112
- | 1UBQ - Ubiquitin | 76 | 22.7 ms | 5.2 ms | 4x |
113
- | 6LU7 - SARS-CoV-2 Mpro | 312 | 157.3 ms | 9.9 ms | 16x |
114
- | 4HHB - Hemoglobin | 574 | 525.5 ms | 17.7 ms | 30x |
115
- | 6M0J - ACE2 + RBD | 791 | 722.7 ms | 23.9 ms | 30x |
116
- | 6VXX - Spike trimer | 2,916 | 9,838.5 ms | 282.7 ms | 35x |
117
- | 6OHW - Cas12a | 3,525 | 11,903.1 ms | 348.4 ms | **34x** |
118
-
119
- Average speedup across all six structures: **34x** for full feature assembly.
109
+ Traditional = mmCIF parse + A3M MSA parse + distance-matrix compute. ProteinTensor
110
+ = read the structure, MSA, distance matrix, and ESM2 embedding from a single
111
+ pre-cached `.ptt`. Reproduce with `python benchmarks/assembly_benchmark.py`
112
+ (MSA depth and embedding shape are realistic; numeric content is synthetic, so
113
+ timing reflects tensor dimensions, not values).
114
+
115
+ | Structure | Res | MSA depth | Traditional | ProteinTensor | Speedup |
116
+ |---|---|---|---|---|---|
117
+ | 1UBQ - Ubiquitin | 76 | 512 | 14.1 ms | 7.1 ms | 2.0x |
118
+ | 6LU7 - SARS-CoV-2 Mpro | 312 | 1,024 | 48.7 ms | 13.6 ms | 3.6x |
119
+ | 4HHB - Hemoglobin | 574 | 2,048 | 118.0 ms | 22.7 ms | 5.2x |
120
+ | 6M0J - ACE2 + RBD | 791 | 2,048 | 196.4 ms | 38.3 ms | 5.1x |
121
+ | 6VXX - Spike trimer | 2,916 | 8,192 | 1,395 ms | 309 ms | 4.5x |
122
+ | 6OHW - Cas12a | 3,525 | 8,192 | 1,462 ms | 381 ms | 3.8x |
123
+
124
+ Average speedup across all six structures: **4x** for full feature assembly
125
+ (measured on a Windows CPU box - see
126
+ [`benchmarks/ASSEMBLY_RESULTS.md`](benchmarks/ASSEMBLY_RESULTS.md)).
127
+
128
+ > **On an earlier 34x figure:** prior versions reported ~34x here. That number was
129
+ > measured against ProteinTensor's original scalar A3M parser, which dominated the
130
+ > traditional side (~11 s to parse an 8,192-deep MSA). Vectorizing that parser in
131
+ > v0.2.0 cut the traditional baseline ~8x, so the *fair* feature-assembly speedup
132
+ > is now ~4x. The `.ptt` read side was unchanged - only the baseline got faster.
120
133
 
121
134
  ### Drug target benchmark
122
135
 
@@ -126,21 +139,21 @@ IgG1 antibody. Numbers are consistent with the structural biology benchmark abov
126
139
 
127
140
  | Target | Res | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
128
141
  |---|---|---|---|---|---|---|---|
129
- | 6OIM - KRAS G12C + Sotorasib | 167 | 16.6 ms | 2.8 ms | 1.2 ms | 0.7 ms | 2.8 ms | 1.1 ms |
130
- | 3HTB - HIV-1 protease | 163 | 16.0 ms | 2.8 ms | 1.2 ms | 0.7 ms | 2.7 ms | 1.1 ms |
131
- | 5WT9 - PD-L1 checkpoint | 533 | 53.8 ms | 2.9 ms | 1.2 ms | 0.7 ms | 13.1 ms | 3.3 ms |
132
- | 1TUP - p53 tumor suppressor | 585 | 56.5 ms | 2.8 ms | 1.2 ms | 0.7 ms | 12.4 ms | 3.4 ms |
133
- | 2P4E - PCSK9 | 586 | 54.7 ms | 2.8 ms | 1.2 ms | 0.7 ms | 12.1 ms | 3.4 ms |
134
- | 1IGT - IgG1 antibody | 1,316 | 123.4 ms | 2.9 ms | 1.2 ms | 0.8 ms | 46.8 ms | 16.4 ms |
142
+ | 6OIM - KRAS G12C + Sotorasib | 167 | 17.1 ms | 3.4 ms | 1.3 ms | 0.8 ms | 3.0 ms | 1.3 ms |
143
+ | 3HTB - HIV-1 protease | 163 | 16.5 ms | 3.3 ms | 1.4 ms | 0.8 ms | 2.8 ms | 1.3 ms |
144
+ | 5WT9 - PD-L1 checkpoint | 533 | 54.8 ms | 3.8 ms | 1.4 ms | 0.8 ms | 11.9 ms | 3.8 ms |
145
+ | 1TUP - p53 tumor suppressor | 585 | 57.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 13.0 ms | 4.0 ms |
146
+ | 2P4E - PCSK9 | 586 | 55.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 12.8 ms | 4.1 ms |
147
+ | 1IGT - IgG1 antibody | 1,316 | 127.3 ms | 3.5 ms | 1.4 ms | 0.8 ms | 47.1 ms | 17.9 ms |
135
148
 
136
149
  | Target | Res | full | backbone | bonds | MSA | dist mx |
137
150
  |---|---|---|---|---|---|---|
138
- | 6OIM - KRAS G12C + Sotorasib | 167 | 6x | 14x | 24x | 6x | 15x |
139
- | 3HTB - HIV-1 protease | 163 | 6x | 14x | 23x | 6x | 14x |
140
- | 5WT9 - PD-L1 checkpoint | 533 | 19x | 44x | 77x | 4x | 16x |
141
- | 1TUP - p53 tumor suppressor | 585 | 20x | 47x | 80x | 5x | 17x |
142
- | 2P4E - PCSK9 | 586 | 19x | 46x | 77x | 5x | 16x |
143
- | 1IGT - IgG1 antibody | 1,316 | 42x | **100x** | **162x** | 3x | 8x |
151
+ | 6OIM - KRAS G12C + Sotorasib | 167 | 5x | 13x | 22x | 6x | 13x |
152
+ | 3HTB - HIV-1 protease | 163 | 5x | 12x | 21x | 6x | 13x |
153
+ | 5WT9 - PD-L1 checkpoint | 533 | 15x | 40x | 69x | 5x | 14x |
154
+ | 1TUP - p53 tumor suppressor | 585 | 17x | 42x | 71x | 4x | 14x |
155
+ | 2P4E - PCSK9 | 586 | 16x | 41x | 70x | 4x | 14x |
156
+ | 1IGT - IgG1 antibody | 1,316 | 37x | **92x** | **156x** | 3x | 7x |
144
157
 
145
158
  ### DataLoader batch throughput
146
159
 
@@ -149,26 +162,38 @@ padded batches ready for `model.forward()`. Single process, no prefetch workers.
149
162
 
150
163
  | Batch size | ms / batch | Structures / sec |
151
164
  |---|---|---|
152
- | 1 | 0.01 ms | 88,106 |
153
- | 4 | 0.04 ms | 108,696 |
154
- | 8 | 0.37 ms | 21,707 |
155
- | 16 | 0.95 ms | 16,783 |
156
- | 32 | 2.0 ms | **15,854** |
165
+ | 1 | 0.01 ms | 97,088 |
166
+ | 4 | 0.03 ms | 116,279 |
167
+ | 8 | 0.42 ms | 19,242 |
168
+ | 16 | 0.97 ms | 16,412 |
169
+ | 32 | 2.1 ms | **15,033** |
157
170
 
158
171
  ### Scale projection: 100,000 structures, one training epoch
159
172
 
173
+ These are **projections**, extrapolated from the measured per-structure timings
174
+ above - not end-to-end measurements at 100k scale.
175
+
160
176
  | Operation | Traditional pipeline | ProteinTensor | Speedup |
161
177
  |---|---|---|---|
162
- | Structure load (parse mmCIF each epoch) | 3.7 hours | 5 min | **45x** |
163
- | Backbone-only load (template search) | 3.7 hours | 2 min | **109x** |
164
- | Full feature assembly (seq + MSA + pairs + emb) | 4.5 days | 3.2 hours | **34x** |
165
- | MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.2 hours | **1,794x** |
178
+ | Structure load (parse mmCIF each epoch) | 3.8 hours | 6 min | **37x** |
179
+ | Backbone-only load (template search) | 3.8 hours | 2 min | **95x** |
180
+ | Full feature assembly (seq + MSA + pairs + emb) | 16 hours | 3.9 hours | **4x** |
181
+ | MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.7 hours | **1,477x** |
166
182
 
167
183
  > MSA generation assumes 2.4 min/protein on a 32-core server (PDB90 database, standard
168
184
  > AlphaFold settings). ProteinTensor generates MSAs once and loads from the `.ptt` cache
169
185
  > on every subsequent run. The 4,000-hour figure is the estimated cost AlphaFold2 and Boltz
170
186
  > users pay to build training datasets from scratch.
171
187
 
188
+ > **Measured vs projected - read this.** The **1,477x** above is MSA *generation*
189
+ > (building the alignment once with JackHMMER) and is a **literature-based
190
+ > projection**, not something benchmarked here. What *is* measured on hardware is
191
+ > the recurring per-epoch MSA **load** - reading a cached MSA from `.ptt` vs
192
+ > re-parsing A3M text each epoch (against a vectorized A3M parser baseline):
193
+ > **3.4x-5.9x**, growing with MSA depth. See
194
+ > [`benchmarks/MSA_RESULTS.md`](benchmarks/MSA_RESULTS.md). These are different
195
+ > quantities; do not read the 1,477x as a measured load speedup.
196
+
172
197
  ### Disk tradeoff
173
198
 
174
199
  A full-featured `.ptt` (8,192-sequence MSA + distance matrix + ESM2-650M embedding at
@@ -224,6 +249,18 @@ pt.write(data, "ubq.ptt")
224
249
  data = pt.from_fasta("complex.fasta")
225
250
  ```
226
251
 
252
+ ### Batch-convert a directory
253
+
254
+ Convert an entire directory of structures in parallel, with progress reporting.
255
+ Files that fail to parse are skipped and listed in the summary; already-converted
256
+ outputs are skipped by default.
257
+
258
+ ```bash
259
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ # auto worker count
260
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ --workers 16 --recursive
261
+ proteintensor convert-dir ./pdb_files/ ./ptt_files/ --overwrite # rebuild existing
262
+ ```
263
+
227
264
  ### Benchmark against mmCIF
228
265
 
229
266
  ```bash
@@ -308,6 +345,20 @@ pt.add_pair_feature("1abc.ptt", my_array, name="template_pair",
308
345
  emb = pt.read_embedding("1abc.ptt", "esm2_t33_650M_UR50D")
309
346
  emb.data.shape # (N_res, 1280) float32 (upcast from float16 on load)
310
347
 
348
+ # ------ Ligands / small molecules ------
349
+ # Capture drugs, cofactors, and ions from a structure (opt-in)
350
+ data = pt.from_mmcif("6oim.cif", include_ligands=True)
351
+ [l.name for l in data.ligands] # ['MG', 'GDP', 'MOV'] (MOV = sotorasib)
352
+
353
+ ligs = pt.read_ligands("6oim.ptt")
354
+ ligs[0].elements # (N_atoms,) S2 element symbols
355
+ ligs[0].positions # (N_atoms, 3) float32
356
+ pt.list_ligands("6oim.ptt") # ['MG', 'GDP', 'MOV']
357
+
358
+ # Build a ligand from SMILES (needs `pip install "proteintensor[ligands]"`)
359
+ aspirin = pt.from_smiles("CC(=O)Oc1ccccc1C(=O)O", name="AIN")
360
+ pt.add_ligand("target.ptt", aspirin) # attach to an existing .ptt
361
+
311
362
  # ------ Lazy / zero-copy access ------
312
363
  positions = pt.mmap_positions("1abc.ptt") # zarr.Array - no full load
313
364
  backbone = pt.mmap_backbone("1abc.ptt") # [N_res, 4, 3]
@@ -349,6 +400,7 @@ data = pt.read(
349
400
  )
350
401
 
351
402
  # ------ Multi-structure dataset ------
403
+ # Structure .ptt files and sequence-only .ptt files can be mixed in one dataset.
352
404
  pt.create_dataset("training.ptt")
353
405
  for ptt_file in Path("ptt_files").glob("*.ptt"):
354
406
  pt.add_to_dataset("training.ptt", ptt_file)
@@ -364,8 +416,13 @@ loader = DataLoader(ds, batch_size=8, collate_fn=pt.ProteinDataset.collate)
364
416
  for batch in loader:
365
417
  coords = torch.from_numpy(batch["atom_positions"]) # (B, max_atoms, 3)
366
418
  pad = torch.from_numpy(batch["padding_mask"]) # (B, max_res) True=real
419
+ has_str = torch.from_numpy(batch["has_structure"]) # (B,) False = sequence-only
367
420
  ```
368
421
 
422
+ Sequence-only entries contribute zero atoms to the batch (`n_atoms == 0`,
423
+ `has_structure == False`), so sequence-only and structure-based samples can be
424
+ loaded together in one `DataLoader`.
425
+
369
426
  ---
370
427
 
371
428
  ## .ptt file layout
@@ -402,10 +459,16 @@ structure.ptt/ Zarr directory store (v0.7)
402
459
  │ └── <name>/ one sub-group per named feature
403
460
  │ ├── .zattrs channels, symmetric, dtype, description
404
461
  │ └── data [N_res, N_res, C] any dtype, chunked 128x128xC
405
- └── embeddings/
406
- └── <model>/ one sub-group per PLM model
407
- ├── .zattrs model, layer, dim, dtype, seq SHA-256
408
- └── data [N_res, D] float32 or float16, chunked 256xD
462
+ ├── embeddings/
463
+ │ └── <model>/ one sub-group per PLM model
464
+ │ ├── .zattrs model, layer, dim, dtype, seq SHA-256
465
+ │ └── data [N_res, D] float32 or float16, chunked 256xD
466
+ └── ligands/
467
+ └── <index>/ one sub-group per non-polymer ligand
468
+ ├── .zattrs name (CCD), chain_id, res_num, smiles
469
+ ├── elements [N_atoms] S2 element symbols
470
+ ├── positions [N_atoms, 3] float32 Angstrom coordinates
471
+ └── b_factors [N_atoms] float32
409
472
  ```
410
473
 
411
474
  ### Multi-structure dataset layout
@@ -443,9 +506,9 @@ Each sub-group under `structures/` is identical to a standalone `.ptt` root, so
443
506
  pytest tests/ -v
444
507
  ```
445
508
 
446
- 106 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings,
447
- A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
448
- (memory:// fsspec - no real cloud account required).
509
+ 150 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings/ligands,
510
+ sequence conversion, A3M parsing, Boltz adapter, multi-structure dataset, and cloud
511
+ streaming (memory:// fsspec - no real cloud account required).
449
512
 
450
513
  ---
451
514
 
@@ -466,11 +529,11 @@ A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
466
529
  - [ ] Chai-1 adapter
467
530
 
468
531
  **Data pipeline**
469
- - [ ] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
532
+ - [x] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
470
533
  - [ ] Sequence-identity dataset splitting - MMseqs2-based cluster splits to prevent data leakage between train / val / test
471
534
 
472
535
  **Format extensions**
473
- - [ ] Ligand / small-molecule support - SMILES, CCD-based atom graphs, binding site annotations for drug-protein interaction models
536
+ - [x] Ligand / small-molecule support - CCD-based extraction from structures, SMILES input via RDKit, element/coordinate storage (bond graphs and binding-site annotations still to come)
474
537
  - [ ] MD trajectory storage - time axis `[N_frames, N_atoms, 3]` for conformational ensembles and AlphaFold 3 diffusion trajectories
475
538
 
476
539
  **Performance**
@@ -34,8 +34,10 @@ from .bonds import (
34
34
  from .dataset import ProteinDataset, create_dataset, add_to_dataset
35
35
  from .remote import consolidate
36
36
  from .converters import from_mmcif, from_sequence, from_fasta, parse_fasta
37
+ from .ligands import read_ligands, list_ligands, add_ligand, from_smiles
38
+ from .schema import LigandData
37
39
 
38
- __version__ = "0.2.0"
40
+ __version__ = "0.3.0"
39
41
 
40
42
  __all__ = [
41
43
  # Converters - input
@@ -51,6 +53,8 @@ __all__ = [
51
53
  "compute_and_store_distances", "compute_and_store_contacts",
52
54
  # I/O - embeddings
53
55
  "read_embedding", "add_embedding", "list_embeddings", "mmap_embedding",
56
+ # Ligands / small molecules
57
+ "read_ligands", "list_ligands", "add_ligand", "from_smiles", "LigandData",
54
58
  # Data containers
55
59
  "ProteinTensorData", "BackboneData", "BondData", "MsaData", "PairFeature", "EmbeddingData",
56
60
  # MSA utilities