proteintensor 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proteintensor-0.2.0 → proteintensor-0.3.0}/PKG-INFO +126 -61
- {proteintensor-0.2.0 → proteintensor-0.3.0}/README.md +123 -60
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/__init__.py +5 -1
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/cli.py +140 -2
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/mmcif.py +15 -4
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/dataset.py +25 -13
- proteintensor-0.3.0/proteintensor/ligands.py +216 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/msa.py +38 -23
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/reader.py +2 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/schema.py +28 -1
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/writer.py +4 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/PKG-INFO +126 -61
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/SOURCES.txt +3 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/requires.txt +3 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/pyproject.toml +6 -5
- proteintensor-0.3.0/tests/test_convert_dir.py +89 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_dataset.py +59 -0
- proteintensor-0.3.0/tests/test_ligands.py +123 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_msa.py +54 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/LICENSE +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/adapters/__init__.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/adapters/boltz.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/bonds.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/__init__.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/converters/sequence.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/embeddings.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/pairs.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor/remote.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/dependency_links.txt +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/entry_points.txt +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/proteintensor.egg-info/top_level.txt +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/setup.cfg +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_adapters.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_embeddings.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_pairs.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_remote.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_roundtrip.py +0 -0
- {proteintensor-0.2.0 → proteintensor-0.3.0}/tests/test_sequence.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proteintensor
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: AI-native biomolecular tensor format for structural biology ML
|
|
5
5
|
Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,6 +34,8 @@ Provides-Extra: cloud
|
|
|
34
34
|
Requires-Dist: fsspec>=2023.1; extra == "cloud"
|
|
35
35
|
Requires-Dist: s3fs>=2023.1; extra == "cloud"
|
|
36
36
|
Requires-Dist: gcsfs>=2023.1; extra == "cloud"
|
|
37
|
+
Provides-Extra: ligands
|
|
38
|
+
Requires-Dist: rdkit>=2023.3; extra == "ligands"
|
|
37
39
|
Provides-Extra: dev
|
|
38
40
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
39
41
|
Requires-Dist: pytest-benchmark; extra == "dev"
|
|
@@ -41,7 +43,9 @@ Requires-Dist: pytest-cov; extra == "dev"
|
|
|
41
43
|
Requires-Dist: fsspec>=2023.1; extra == "dev"
|
|
42
44
|
Dynamic: license-file
|
|
43
45
|
|
|
44
|
-
|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
# ProteinTensor Introduction
|
|
45
49
|
|
|
46
50
|
**ProteinTensor** is an AI-native biomolecular storage format designed to eliminate
|
|
47
51
|
the preprocessing bottleneck in modern structural biology machine learning pipelines.
|
|
@@ -108,20 +112,21 @@ performance format that turns a recurring computational tax into a one-time cost
|
|
|
108
112
|
|
|
109
113
|
## Benchmark: Traditional Pipeline vs ProteinTensor
|
|
110
114
|
|
|
111
|
-
All timings are median over 30 rounds on
|
|
112
|
-
|
|
113
|
-
|
|
115
|
+
All timings are median over 30 rounds on a Windows workstation (RTX 5080, Python
|
|
116
|
+
3.11.9); mmCIF parsing and `.ptt` reads are CPU-bound, so these reflect CPU
|
|
117
|
+
performance. Proteins span the full range from a 76-residue domain to a
|
|
118
|
+
3,525-residue CRISPR enzyme. Run `python boltz_benchmark.py` to reproduce.
|
|
114
119
|
|
|
115
120
|
### Per-structure load times
|
|
116
121
|
|
|
117
122
|
| Structure | Method | Res | MSA seqs | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
|
|
118
123
|
|---|---|---|---|---|---|---|---|---|---|
|
|
119
|
-
| 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.
|
|
120
|
-
| 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 |
|
|
121
|
-
| 4HHB - Hemoglobin | X-ray | 574 | 2,048 |
|
|
122
|
-
| 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 |
|
|
123
|
-
| 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.
|
|
124
|
-
| 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 |
|
|
124
|
+
| 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.4 ms | 3.2 ms | 1.3 ms | 0.8 ms | 1.8 ms | 0.8 ms |
|
|
125
|
+
| 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 28.7 ms | 3.3 ms | 1.3 ms | 0.8 ms | 5.2 ms | 1.9 ms |
|
|
126
|
+
| 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 54.1 ms | 3.3 ms | 1.3 ms | 0.8 ms | 11.5 ms | 3.6 ms |
|
|
127
|
+
| 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 73.2 ms | 3.3 ms | 1.4 ms | 0.8 ms | 15.3 ms | 6.9 ms |
|
|
128
|
+
| 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.9 ms | 3.7 ms | 1.4 ms | 1.0 ms | 213.7 ms | 74.7 ms |
|
|
129
|
+
| 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 346.5 ms | 3.7 ms | 1.3 ms | 1.0 ms | 243.9 ms | 107.3 ms |
|
|
125
130
|
|
|
126
131
|
**Column definitions**
|
|
127
132
|
- `ptt: full` - `read()` - all atoms, backbone, bonds, metadata
|
|
@@ -134,32 +139,42 @@ Run `python boltz_benchmark.py` to reproduce.
|
|
|
134
139
|
|
|
135
140
|
| Structure | Res | full | backbone | bonds | MSA | dist mx |
|
|
136
141
|
|---|---|---|---|---|---|---|
|
|
137
|
-
| 1UBQ - Ubiquitin | 76 |
|
|
138
|
-
| 6LU7 - SARS-CoV-2 Mpro | 312 |
|
|
139
|
-
| 4HHB - Hemoglobin | 574 |
|
|
140
|
-
| 6M0J - ACE2 + RBD | 791 |
|
|
141
|
-
| 6VXX - Spike trimer | 2,916 |
|
|
142
|
-
| 6OHW - Cas12a | 3,525 |
|
|
142
|
+
| 1UBQ - Ubiquitin | 76 | 2x | 6x | 10x | 4x | 9x |
|
|
143
|
+
| 6LU7 - SARS-CoV-2 Mpro | 312 | 9x | 21x | 38x | 5x | 15x |
|
|
144
|
+
| 4HHB - Hemoglobin | 574 | 17x | 40x | 70x | 5x | 15x |
|
|
145
|
+
| 6M0J - ACE2 + RBD | 791 | 22x | 54x | 92x | 5x | 11x |
|
|
146
|
+
| 6VXX - Spike trimer | 2,916 | 76x | 201x | 285x | 1x* | 4x |
|
|
147
|
+
| 6OHW - Cas12a | 3,525 | 95x | 257x | 343x | 1x* | 3x |
|
|
143
148
|
|
|
144
149
|
*MSA speedup shown as 1x vs mmCIF parse because both are in the same time range for
|
|
145
150
|
large proteins - the real MSA comparison is vs JackHMMER generation (see below).
|
|
146
151
|
|
|
147
152
|
### Feature assembly: time to prepare all tensors for model.forward()
|
|
148
153
|
|
|
149
|
-
Traditional = mmCIF parse +
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
|
156
|
-
|
|
157
|
-
|
|
|
158
|
-
|
|
|
159
|
-
|
|
|
160
|
-
|
|
|
161
|
-
|
|
162
|
-
|
|
154
|
+
Traditional = mmCIF parse + A3M MSA parse + distance-matrix compute. ProteinTensor
|
|
155
|
+
= read the structure, MSA, distance matrix, and ESM2 embedding from a single
|
|
156
|
+
pre-cached `.ptt`. Reproduce with `python benchmarks/assembly_benchmark.py`
|
|
157
|
+
(MSA depth and embedding shape are realistic; numeric content is synthetic, so
|
|
158
|
+
timing reflects tensor dimensions, not values).
|
|
159
|
+
|
|
160
|
+
| Structure | Res | MSA depth | Traditional | ProteinTensor | Speedup |
|
|
161
|
+
|---|---|---|---|---|---|
|
|
162
|
+
| 1UBQ - Ubiquitin | 76 | 512 | 14.1 ms | 7.1 ms | 2.0x |
|
|
163
|
+
| 6LU7 - SARS-CoV-2 Mpro | 312 | 1,024 | 48.7 ms | 13.6 ms | 3.6x |
|
|
164
|
+
| 4HHB - Hemoglobin | 574 | 2,048 | 118.0 ms | 22.7 ms | 5.2x |
|
|
165
|
+
| 6M0J - ACE2 + RBD | 791 | 2,048 | 196.4 ms | 38.3 ms | 5.1x |
|
|
166
|
+
| 6VXX - Spike trimer | 2,916 | 8,192 | 1,395 ms | 309 ms | 4.5x |
|
|
167
|
+
| 6OHW - Cas12a | 3,525 | 8,192 | 1,462 ms | 381 ms | 3.8x |
|
|
168
|
+
|
|
169
|
+
Average speedup across all six structures: **4x** for full feature assembly
|
|
170
|
+
(measured on a Windows CPU box - see
|
|
171
|
+
[`benchmarks/ASSEMBLY_RESULTS.md`](benchmarks/ASSEMBLY_RESULTS.md)).
|
|
172
|
+
|
|
173
|
+
> **On an earlier 34x figure:** prior versions reported ~34x here. That number was
|
|
174
|
+
> measured against ProteinTensor's original scalar A3M parser, which dominated the
|
|
175
|
+
> traditional side (~11 s to parse an 8,192-deep MSA). Vectorizing that parser in
|
|
176
|
+
> v0.2.0 cut the traditional baseline ~8x, so the *fair* feature-assembly speedup
|
|
177
|
+
> is now ~4x. The `.ptt` read side was unchanged - only the baseline got faster.
|
|
163
178
|
|
|
164
179
|
### Drug target benchmark
|
|
165
180
|
|
|
@@ -169,21 +184,21 @@ IgG1 antibody. Numbers are consistent with the structural biology benchmark abov
|
|
|
169
184
|
|
|
170
185
|
| Target | Res | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
|
|
171
186
|
|---|---|---|---|---|---|---|---|
|
|
172
|
-
| 6OIM - KRAS G12C + Sotorasib | 167 |
|
|
173
|
-
| 3HTB - HIV-1 protease | 163 | 16.
|
|
174
|
-
| 5WT9 - PD-L1 checkpoint | 533 |
|
|
175
|
-
| 1TUP - p53 tumor suppressor | 585 |
|
|
176
|
-
| 2P4E - PCSK9 | 586 |
|
|
177
|
-
| 1IGT - IgG1 antibody | 1,316 |
|
|
187
|
+
| 6OIM - KRAS G12C + Sotorasib | 167 | 17.1 ms | 3.4 ms | 1.3 ms | 0.8 ms | 3.0 ms | 1.3 ms |
|
|
188
|
+
| 3HTB - HIV-1 protease | 163 | 16.5 ms | 3.3 ms | 1.4 ms | 0.8 ms | 2.8 ms | 1.3 ms |
|
|
189
|
+
| 5WT9 - PD-L1 checkpoint | 533 | 54.8 ms | 3.8 ms | 1.4 ms | 0.8 ms | 11.9 ms | 3.8 ms |
|
|
190
|
+
| 1TUP - p53 tumor suppressor | 585 | 57.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 13.0 ms | 4.0 ms |
|
|
191
|
+
| 2P4E - PCSK9 | 586 | 55.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 12.8 ms | 4.1 ms |
|
|
192
|
+
| 1IGT - IgG1 antibody | 1,316 | 127.3 ms | 3.5 ms | 1.4 ms | 0.8 ms | 47.1 ms | 17.9 ms |
|
|
178
193
|
|
|
179
194
|
| Target | Res | full | backbone | bonds | MSA | dist mx |
|
|
180
195
|
|---|---|---|---|---|---|---|
|
|
181
|
-
| 6OIM - KRAS G12C + Sotorasib | 167 |
|
|
182
|
-
| 3HTB - HIV-1 protease | 163 |
|
|
183
|
-
| 5WT9 - PD-L1 checkpoint | 533 |
|
|
184
|
-
| 1TUP - p53 tumor suppressor | 585 |
|
|
185
|
-
| 2P4E - PCSK9 | 586 |
|
|
186
|
-
| 1IGT - IgG1 antibody | 1,316 |
|
|
196
|
+
| 6OIM - KRAS G12C + Sotorasib | 167 | 5x | 13x | 22x | 6x | 13x |
|
|
197
|
+
| 3HTB - HIV-1 protease | 163 | 5x | 12x | 21x | 6x | 13x |
|
|
198
|
+
| 5WT9 - PD-L1 checkpoint | 533 | 15x | 40x | 69x | 5x | 14x |
|
|
199
|
+
| 1TUP - p53 tumor suppressor | 585 | 17x | 42x | 71x | 4x | 14x |
|
|
200
|
+
| 2P4E - PCSK9 | 586 | 16x | 41x | 70x | 4x | 14x |
|
|
201
|
+
| 1IGT - IgG1 antibody | 1,316 | 37x | **92x** | **156x** | 3x | 7x |
|
|
187
202
|
|
|
188
203
|
### DataLoader batch throughput
|
|
189
204
|
|
|
@@ -192,26 +207,38 @@ padded batches ready for `model.forward()`. Single process, no prefetch workers.
|
|
|
192
207
|
|
|
193
208
|
| Batch size | ms / batch | Structures / sec |
|
|
194
209
|
|---|---|---|
|
|
195
|
-
| 1 | 0.01 ms |
|
|
196
|
-
| 4 | 0.
|
|
197
|
-
| 8 | 0.
|
|
198
|
-
| 16 | 0.
|
|
199
|
-
| 32 | 2.
|
|
210
|
+
| 1 | 0.01 ms | 97,088 |
|
|
211
|
+
| 4 | 0.03 ms | 116,279 |
|
|
212
|
+
| 8 | 0.42 ms | 19,242 |
|
|
213
|
+
| 16 | 0.97 ms | 16,412 |
|
|
214
|
+
| 32 | 2.1 ms | **15,033** |
|
|
200
215
|
|
|
201
216
|
### Scale projection: 100,000 structures, one training epoch
|
|
202
217
|
|
|
218
|
+
These are **projections**, extrapolated from the measured per-structure timings
|
|
219
|
+
above - not end-to-end measurements at 100k scale.
|
|
220
|
+
|
|
203
221
|
| Operation | Traditional pipeline | ProteinTensor | Speedup |
|
|
204
222
|
|---|---|---|---|
|
|
205
|
-
| Structure load (parse mmCIF each epoch) | 3.
|
|
206
|
-
| Backbone-only load (template search) | 3.
|
|
207
|
-
| Full feature assembly (seq + MSA + pairs + emb) |
|
|
208
|
-
| MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.
|
|
223
|
+
| Structure load (parse mmCIF each epoch) | 3.8 hours | 6 min | **37x** |
|
|
224
|
+
| Backbone-only load (template search) | 3.8 hours | 2 min | **95x** |
|
|
225
|
+
| Full feature assembly (seq + MSA + pairs + emb) | 16 hours | 3.9 hours | **4x** |
|
|
226
|
+
| MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.7 hours | **1,477x** |
|
|
209
227
|
|
|
210
228
|
> MSA generation assumes 2.4 min/protein on a 32-core server (PDB90 database, standard
|
|
211
229
|
> AlphaFold settings). ProteinTensor generates MSAs once and loads from the `.ptt` cache
|
|
212
230
|
> on every subsequent run. The 4,000-hour figure is the real cost AlphaFold2 and Boltz
|
|
213
231
|
> users pay to build training datasets from scratch.
|
|
214
232
|
|
|
233
|
+
> **Measured vs projected - read this.** The **1,477x** above is MSA *generation*
|
|
234
|
+
> (building the alignment once with JackHMMER) and is a **literature-based
|
|
235
|
+
> projection**, not something benchmarked here. What *is* measured on hardware is
|
|
236
|
+
> the recurring per-epoch MSA **load** - reading a cached MSA from `.ptt` vs
|
|
237
|
+
> re-parsing A3M text each epoch (against a vectorized A3M parser baseline):
|
|
238
|
+
> **3.4x-5.9x**, growing with MSA depth. See
|
|
239
|
+
> [`benchmarks/MSA_RESULTS.md`](benchmarks/MSA_RESULTS.md). These are different
|
|
240
|
+
> quantities; do not read the 1,477x as a measured load speedup.
|
|
241
|
+
|
|
215
242
|
### Disk tradeoff
|
|
216
243
|
|
|
217
244
|
A full-featured `.ptt` (8,192-sequence MSA + distance matrix + ESM2-650M embedding at
|
|
@@ -267,6 +294,18 @@ pt.write(data, "ubq.ptt")
|
|
|
267
294
|
data = pt.from_fasta("complex.fasta")
|
|
268
295
|
```
|
|
269
296
|
|
|
297
|
+
### Batch-convert a directory
|
|
298
|
+
|
|
299
|
+
Convert an entire directory of structures in parallel, with progress reporting.
|
|
300
|
+
Files that fail to parse are skipped and listed in the summary; already-converted
|
|
301
|
+
outputs are skipped by default.
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ # auto worker count
|
|
305
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ --workers 16 --recursive
|
|
306
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ --overwrite # rebuild existing
|
|
307
|
+
```
|
|
308
|
+
|
|
270
309
|
### Benchmark against mmCIF
|
|
271
310
|
|
|
272
311
|
```bash
|
|
@@ -351,6 +390,20 @@ pt.add_pair_feature("1abc.ptt", my_array, name="template_pair",
|
|
|
351
390
|
emb = pt.read_embedding("1abc.ptt", "esm2_t33_650M_UR50D")
|
|
352
391
|
emb.data.shape # (N_res, 1280) float32 (upcast from float16 on load)
|
|
353
392
|
|
|
393
|
+
# ------ Ligands / small molecules ------
|
|
394
|
+
# Capture drugs, cofactors, and ions from a structure (opt-in)
|
|
395
|
+
data = pt.from_mmcif("6oim.cif", include_ligands=True)
|
|
396
|
+
[l.name for l in data.ligands] # ['MG', 'GDP', 'MOV'] (MOV = sotorasib)
|
|
397
|
+
|
|
398
|
+
ligs = pt.read_ligands("6oim.ptt")
|
|
399
|
+
ligs[0].elements # (N_atoms,) S2 element symbols
|
|
400
|
+
ligs[0].positions # (N_atoms, 3) float32
|
|
401
|
+
pt.list_ligands("6oim.ptt") # ['MG', 'GDP', 'MOV']
|
|
402
|
+
|
|
403
|
+
# Build a ligand from SMILES (needs `pip install "proteintensor[ligands]"`)
|
|
404
|
+
aspirin = pt.from_smiles("CC(=O)Oc1ccccc1C(=O)O", name="AIN")
|
|
405
|
+
pt.add_ligand("target.ptt", aspirin) # attach to an existing .ptt
|
|
406
|
+
|
|
354
407
|
# ------ Lazy / zero-copy access ------
|
|
355
408
|
positions = pt.mmap_positions("1abc.ptt") # zarr.Array - no full load
|
|
356
409
|
backbone = pt.mmap_backbone("1abc.ptt") # [N_res, 4, 3]
|
|
@@ -392,6 +445,7 @@ data = pt.read(
|
|
|
392
445
|
)
|
|
393
446
|
|
|
394
447
|
# ------ Multi-structure dataset ------
|
|
448
|
+
# Structure .ptt files and sequence-only .ptt files can be mixed in one dataset.
|
|
395
449
|
pt.create_dataset("training.ptt")
|
|
396
450
|
for ptt_file in Path("ptt_files").glob("*.ptt"):
|
|
397
451
|
pt.add_to_dataset("training.ptt", ptt_file)
|
|
@@ -407,8 +461,13 @@ loader = DataLoader(ds, batch_size=8, collate_fn=pt.ProteinDataset.collate)
|
|
|
407
461
|
for batch in loader:
|
|
408
462
|
coords = torch.from_numpy(batch["atom_positions"]) # (B, max_atoms, 3)
|
|
409
463
|
pad = torch.from_numpy(batch["padding_mask"]) # (B, max_res) True=real
|
|
464
|
+
has_str = torch.from_numpy(batch["has_structure"]) # (B,) False = sequence-only
|
|
410
465
|
```
|
|
411
466
|
|
|
467
|
+
Sequence-only entries contribute zero atoms to the batch (`n_atoms == 0`,
|
|
468
|
+
`has_structure == False`), so sequence-driven and structure-based samples can be
|
|
469
|
+
loaded together in one `DataLoader`.
|
|
470
|
+
|
|
412
471
|
---
|
|
413
472
|
|
|
414
473
|
## .ptt file layout
|
|
@@ -445,10 +504,16 @@ structure.ptt/ Zarr directory store (v0.7)
|
|
|
445
504
|
│ └── <name>/ one sub-group per named feature
|
|
446
505
|
│ ├── .zattrs channels, symmetric, dtype, description
|
|
447
506
|
│ └── data [N_res, N_res, C] any dtype, chunked 128x128xC
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
507
|
+
├── embeddings/
|
|
508
|
+
│ └── <model>/ one sub-group per PLM model
|
|
509
|
+
│ ├── .zattrs model, layer, dim, dtype, seq SHA-256
|
|
510
|
+
│ └── data [N_res, D] float32 or float16, chunked 256xD
|
|
511
|
+
└── ligands/
|
|
512
|
+
└── <index>/ one sub-group per non-polymer ligand
|
|
513
|
+
├── .zattrs name (CCD), chain_id, res_num, smiles
|
|
514
|
+
├── elements [N_atoms] S2 element symbols
|
|
515
|
+
├── positions [N_atoms, 3] float32 Angstrom coordinates
|
|
516
|
+
└── b_factors [N_atoms] float32
|
|
452
517
|
```
|
|
453
518
|
|
|
454
519
|
### Multi-structure dataset layout
|
|
@@ -486,9 +551,9 @@ Each sub-group under `structures/` is identical to a standalone `.ptt` root, so
|
|
|
486
551
|
pytest tests/ -v
|
|
487
552
|
```
|
|
488
553
|
|
|
489
|
-
|
|
490
|
-
A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
|
|
491
|
-
(memory:// fsspec - no real cloud account required).
|
|
554
|
+
150 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings/ligands,
|
|
555
|
+
sequence conversion, A3M parsing, Boltz adapter, multi-structure dataset, and cloud
|
|
556
|
+
streaming (memory:// fsspec - no real cloud account required).
|
|
492
557
|
|
|
493
558
|
---
|
|
494
559
|
|
|
@@ -509,11 +574,11 @@ A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
|
|
|
509
574
|
- [ ] Chai-1 adapter
|
|
510
575
|
|
|
511
576
|
**Data pipeline**
|
|
512
|
-
- [
|
|
577
|
+
- [x] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
|
|
513
578
|
- [ ] Sequence-identity dataset splitting - MMseqs2-based cluster splits to prevent data leakage between train / val / test
|
|
514
579
|
|
|
515
580
|
**Format extensions**
|
|
516
|
-
- [
|
|
581
|
+
- [x] Ligand / small-molecule support - CCD-based extraction from structures, SMILES input via RDKit, element/coordinate storage (bond graphs and binding-site annotations still to come)
|
|
517
582
|
- [ ] MD trajectory storage - time axis `[N_frames, N_atoms, 3]` for conformational ensembles and AlphaFold 3 diffusion trajectories
|
|
518
583
|
|
|
519
584
|
**Performance**
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# ProteinTensor Introduction
|
|
2
4
|
|
|
3
5
|
**ProteinTensor** is an AI-native biomolecular storage format designed to eliminate
|
|
4
6
|
the preprocessing bottleneck in modern structural biology machine learning pipelines.
|
|
@@ -65,20 +67,21 @@ performance format that turns a recurring computational tax into a one-time cost
|
|
|
65
67
|
|
|
66
68
|
## Benchmark: Traditional Pipeline vs ProteinTensor
|
|
67
69
|
|
|
68
|
-
All timings are median over 30 rounds on
|
|
69
|
-
|
|
70
|
-
|
|
70
|
+
All timings are median over 30 rounds on a Windows workstation (RTX 5080, Python
|
|
71
|
+
3.11.9); mmCIF parsing and `.ptt` reads are CPU-bound, so these reflect CPU
|
|
72
|
+
performance. Proteins span the full range from a 76-residue domain to a
|
|
73
|
+
3,525-residue CRISPR enzyme. Run `python boltz_benchmark.py` to reproduce.
|
|
71
74
|
|
|
72
75
|
### Per-structure load times
|
|
73
76
|
|
|
74
77
|
| Structure | Method | Res | MSA seqs | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
|
|
75
78
|
|---|---|---|---|---|---|---|---|---|---|
|
|
76
|
-
| 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.
|
|
77
|
-
| 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 |
|
|
78
|
-
| 4HHB - Hemoglobin | X-ray | 574 | 2,048 |
|
|
79
|
-
| 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 |
|
|
80
|
-
| 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.
|
|
81
|
-
| 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 |
|
|
79
|
+
| 1UBQ - Ubiquitin | X-ray | 76 | 512 | 7.4 ms | 3.2 ms | 1.3 ms | 0.8 ms | 1.8 ms | 0.8 ms |
|
|
80
|
+
| 6LU7 - SARS-CoV-2 Mpro | X-ray | 312 | 1,024 | 28.7 ms | 3.3 ms | 1.3 ms | 0.8 ms | 5.2 ms | 1.9 ms |
|
|
81
|
+
| 4HHB - Hemoglobin | X-ray | 574 | 2,048 | 54.1 ms | 3.3 ms | 1.3 ms | 0.8 ms | 11.5 ms | 3.6 ms |
|
|
82
|
+
| 6M0J - ACE2 + RBD | Cryo-EM | 791 | 2,048 | 73.2 ms | 3.3 ms | 1.4 ms | 0.8 ms | 15.3 ms | 6.9 ms |
|
|
83
|
+
| 6VXX - Spike trimer | Cryo-EM | 2,916 | 8,192 | 283.9 ms | 3.7 ms | 1.4 ms | 1.0 ms | 213.7 ms | 74.7 ms |
|
|
84
|
+
| 6OHW - Cas12a | Cryo-EM | 3,525 | 8,192 | 346.5 ms | 3.7 ms | 1.3 ms | 1.0 ms | 243.9 ms | 107.3 ms |
|
|
82
85
|
|
|
83
86
|
**Column definitions**
|
|
84
87
|
- `ptt: full` - `read()` - all atoms, backbone, bonds, metadata
|
|
@@ -91,32 +94,42 @@ Run `python boltz_benchmark.py` to reproduce.
|
|
|
91
94
|
|
|
92
95
|
| Structure | Res | full | backbone | bonds | MSA | dist mx |
|
|
93
96
|
|---|---|---|---|---|---|---|
|
|
94
|
-
| 1UBQ - Ubiquitin | 76 |
|
|
95
|
-
| 6LU7 - SARS-CoV-2 Mpro | 312 |
|
|
96
|
-
| 4HHB - Hemoglobin | 574 |
|
|
97
|
-
| 6M0J - ACE2 + RBD | 791 |
|
|
98
|
-
| 6VXX - Spike trimer | 2,916 |
|
|
99
|
-
| 6OHW - Cas12a | 3,525 |
|
|
97
|
+
| 1UBQ - Ubiquitin | 76 | 2x | 6x | 10x | 4x | 9x |
|
|
98
|
+
| 6LU7 - SARS-CoV-2 Mpro | 312 | 9x | 21x | 38x | 5x | 15x |
|
|
99
|
+
| 4HHB - Hemoglobin | 574 | 17x | 40x | 70x | 5x | 15x |
|
|
100
|
+
| 6M0J - ACE2 + RBD | 791 | 22x | 54x | 92x | 5x | 11x |
|
|
101
|
+
| 6VXX - Spike trimer | 2,916 | 76x | 201x | 285x | 1x* | 4x |
|
|
102
|
+
| 6OHW - Cas12a | 3,525 | 95x | 257x | 343x | 1x* | 3x |
|
|
100
103
|
|
|
101
104
|
*MSA speedup shown as 1x vs mmCIF parse because both are in the same time range for
|
|
102
105
|
large proteins - the real MSA comparison is vs JackHMMER generation (see below).
|
|
103
106
|
|
|
104
107
|
### Feature assembly: time to prepare all tensors for model.forward()
|
|
105
108
|
|
|
106
|
-
Traditional = mmCIF parse +
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
|
113
|
-
|
|
114
|
-
|
|
|
115
|
-
|
|
|
116
|
-
|
|
|
117
|
-
|
|
|
118
|
-
|
|
119
|
-
|
|
109
|
+
Traditional = mmCIF parse + A3M MSA parse + distance-matrix compute. ProteinTensor
|
|
110
|
+
= read the structure, MSA, distance matrix, and ESM2 embedding from a single
|
|
111
|
+
pre-cached `.ptt`. Reproduce with `python benchmarks/assembly_benchmark.py`
|
|
112
|
+
(MSA depth and embedding shape are realistic; numeric content is synthetic, so
|
|
113
|
+
timing reflects tensor dimensions, not values).
|
|
114
|
+
|
|
115
|
+
| Structure | Res | MSA depth | Traditional | ProteinTensor | Speedup |
|
|
116
|
+
|---|---|---|---|---|---|
|
|
117
|
+
| 1UBQ - Ubiquitin | 76 | 512 | 14.1 ms | 7.1 ms | 2.0x |
|
|
118
|
+
| 6LU7 - SARS-CoV-2 Mpro | 312 | 1,024 | 48.7 ms | 13.6 ms | 3.6x |
|
|
119
|
+
| 4HHB - Hemoglobin | 574 | 2,048 | 118.0 ms | 22.7 ms | 5.2x |
|
|
120
|
+
| 6M0J - ACE2 + RBD | 791 | 2,048 | 196.4 ms | 38.3 ms | 5.1x |
|
|
121
|
+
| 6VXX - Spike trimer | 2,916 | 8,192 | 1,395 ms | 309 ms | 4.5x |
|
|
122
|
+
| 6OHW - Cas12a | 3,525 | 8,192 | 1,462 ms | 381 ms | 3.8x |
|
|
123
|
+
|
|
124
|
+
Average speedup across all six structures: **4x** for full feature assembly
|
|
125
|
+
(measured on a Windows CPU box - see
|
|
126
|
+
[`benchmarks/ASSEMBLY_RESULTS.md`](benchmarks/ASSEMBLY_RESULTS.md)).
|
|
127
|
+
|
|
128
|
+
> **On an earlier 34x figure:** prior versions reported ~34x here. That number was
|
|
129
|
+
> measured against ProteinTensor's original scalar A3M parser, which dominated the
|
|
130
|
+
> traditional side (~11 s to parse an 8,192-deep MSA). Vectorizing that parser in
|
|
131
|
+
> v0.2.0 cut the traditional baseline ~8x, so the *fair* feature-assembly speedup
|
|
132
|
+
> is now ~4x. The `.ptt` read side was unchanged - only the baseline got faster.
|
|
120
133
|
|
|
121
134
|
### Drug target benchmark
|
|
122
135
|
|
|
@@ -126,21 +139,21 @@ IgG1 antibody. Numbers are consistent with the structural biology benchmark abov
|
|
|
126
139
|
|
|
127
140
|
| Target | Res | mmCIF parse | ptt: full | ptt: backbone | ptt: bonds | ptt: MSA | ptt: dist mx |
|
|
128
141
|
|---|---|---|---|---|---|---|---|
|
|
129
|
-
| 6OIM - KRAS G12C + Sotorasib | 167 |
|
|
130
|
-
| 3HTB - HIV-1 protease | 163 | 16.
|
|
131
|
-
| 5WT9 - PD-L1 checkpoint | 533 |
|
|
132
|
-
| 1TUP - p53 tumor suppressor | 585 |
|
|
133
|
-
| 2P4E - PCSK9 | 586 |
|
|
134
|
-
| 1IGT - IgG1 antibody | 1,316 |
|
|
142
|
+
| 6OIM - KRAS G12C + Sotorasib | 167 | 17.1 ms | 3.4 ms | 1.3 ms | 0.8 ms | 3.0 ms | 1.3 ms |
|
|
143
|
+
| 3HTB - HIV-1 protease | 163 | 16.5 ms | 3.3 ms | 1.4 ms | 0.8 ms | 2.8 ms | 1.3 ms |
|
|
144
|
+
| 5WT9 - PD-L1 checkpoint | 533 | 54.8 ms | 3.8 ms | 1.4 ms | 0.8 ms | 11.9 ms | 3.8 ms |
|
|
145
|
+
| 1TUP - p53 tumor suppressor | 585 | 57.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 13.0 ms | 4.0 ms |
|
|
146
|
+
| 2P4E - PCSK9 | 586 | 55.4 ms | 3.4 ms | 1.4 ms | 0.8 ms | 12.8 ms | 4.1 ms |
|
|
147
|
+
| 1IGT - IgG1 antibody | 1,316 | 127.3 ms | 3.5 ms | 1.4 ms | 0.8 ms | 47.1 ms | 17.9 ms |
|
|
135
148
|
|
|
136
149
|
| Target | Res | full | backbone | bonds | MSA | dist mx |
|
|
137
150
|
|---|---|---|---|---|---|---|
|
|
138
|
-
| 6OIM - KRAS G12C + Sotorasib | 167 |
|
|
139
|
-
| 3HTB - HIV-1 protease | 163 |
|
|
140
|
-
| 5WT9 - PD-L1 checkpoint | 533 |
|
|
141
|
-
| 1TUP - p53 tumor suppressor | 585 |
|
|
142
|
-
| 2P4E - PCSK9 | 586 |
|
|
143
|
-
| 1IGT - IgG1 antibody | 1,316 |
|
|
151
|
+
| 6OIM - KRAS G12C + Sotorasib | 167 | 5x | 13x | 22x | 6x | 13x |
|
|
152
|
+
| 3HTB - HIV-1 protease | 163 | 5x | 12x | 21x | 6x | 13x |
|
|
153
|
+
| 5WT9 - PD-L1 checkpoint | 533 | 15x | 40x | 69x | 5x | 14x |
|
|
154
|
+
| 1TUP - p53 tumor suppressor | 585 | 17x | 42x | 71x | 4x | 14x |
|
|
155
|
+
| 2P4E - PCSK9 | 586 | 16x | 41x | 70x | 4x | 14x |
|
|
156
|
+
| 1IGT - IgG1 antibody | 1,316 | 37x | **92x** | **156x** | 3x | 7x |
|
|
144
157
|
|
|
145
158
|
### DataLoader batch throughput
|
|
146
159
|
|
|
@@ -149,26 +162,38 @@ padded batches ready for `model.forward()`. Single process, no prefetch workers.
|
|
|
149
162
|
|
|
150
163
|
| Batch size | ms / batch | Structures / sec |
|
|
151
164
|
|---|---|---|
|
|
152
|
-
| 1 | 0.01 ms |
|
|
153
|
-
| 4 | 0.
|
|
154
|
-
| 8 | 0.
|
|
155
|
-
| 16 | 0.
|
|
156
|
-
| 32 | 2.
|
|
165
|
+
| 1 | 0.01 ms | 97,088 |
|
|
166
|
+
| 4 | 0.03 ms | 116,279 |
|
|
167
|
+
| 8 | 0.42 ms | 19,242 |
|
|
168
|
+
| 16 | 0.97 ms | 16,412 |
|
|
169
|
+
| 32 | 2.1 ms | **15,033** |
|
|
157
170
|
|
|
158
171
|
### Scale projection: 100,000 structures, one training epoch
|
|
159
172
|
|
|
173
|
+
These are **projections**, extrapolated from the measured per-structure timings
|
|
174
|
+
above - not end-to-end measurements at 100k scale.
|
|
175
|
+
|
|
160
176
|
| Operation | Traditional pipeline | ProteinTensor | Speedup |
|
|
161
177
|
|---|---|---|---|
|
|
162
|
-
| Structure load (parse mmCIF each epoch) | 3.
|
|
163
|
-
| Backbone-only load (template search) | 3.
|
|
164
|
-
| Full feature assembly (seq + MSA + pairs + emb) |
|
|
165
|
-
| MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.
|
|
178
|
+
| Structure load (parse mmCIF each epoch) | 3.8 hours | 6 min | **37x** |
|
|
179
|
+
| Backbone-only load (template search) | 3.8 hours | 2 min | **95x** |
|
|
180
|
+
| Full feature assembly (seq + MSA + pairs + emb) | 16 hours | 3.9 hours | **4x** |
|
|
181
|
+
| MSA generation (JackHMMER, 32-core CPU, once) | 4,000 hours | 2.7 hours | **1,477x** |
|
|
166
182
|
|
|
167
183
|
> MSA generation assumes 2.4 min/protein on a 32-core server (PDB90 database, standard
|
|
168
184
|
> AlphaFold settings). ProteinTensor generates MSAs once and loads from the `.ptt` cache
|
|
169
185
|
> on every subsequent run. The 4,000-hour figure is the real cost AlphaFold2 and Boltz
|
|
170
186
|
> users pay to build training datasets from scratch.
|
|
171
187
|
|
|
188
|
+
> **Measured vs projected - read this.** The **1,477x** above is MSA *generation*
|
|
189
|
+
> (building the alignment once with JackHMMER) and is a **literature-based
|
|
190
|
+
> projection**, not something benchmarked here. What *is* measured on hardware is
|
|
191
|
+
> the recurring per-epoch MSA **load** - reading a cached MSA from `.ptt` vs
|
|
192
|
+
> re-parsing A3M text each epoch (against a vectorized A3M parser baseline):
|
|
193
|
+
> **3.4x-5.9x**, growing with MSA depth. See
|
|
194
|
+
> [`benchmarks/MSA_RESULTS.md`](benchmarks/MSA_RESULTS.md). These are different
|
|
195
|
+
> quantities; do not read the 1,477x as a measured load speedup.
|
|
196
|
+
|
|
172
197
|
### Disk tradeoff
|
|
173
198
|
|
|
174
199
|
A full-featured `.ptt` (8,192-sequence MSA + distance matrix + ESM2-650M embedding at
|
|
@@ -224,6 +249,18 @@ pt.write(data, "ubq.ptt")
|
|
|
224
249
|
data = pt.from_fasta("complex.fasta")
|
|
225
250
|
```
|
|
226
251
|
|
|
252
|
+
### Batch-convert a directory
|
|
253
|
+
|
|
254
|
+
Convert an entire directory of structures in parallel, with progress reporting.
|
|
255
|
+
Files that fail to parse are skipped and listed in the summary; already-converted
|
|
256
|
+
outputs are skipped by default.
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ # auto worker count
|
|
260
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ --workers 16 --recursive
|
|
261
|
+
proteintensor convert-dir ./pdb_files/ ./ptt_files/ --overwrite # rebuild existing
|
|
262
|
+
```
|
|
263
|
+
|
|
227
264
|
### Benchmark against mmCIF
|
|
228
265
|
|
|
229
266
|
```bash
|
|
@@ -308,6 +345,20 @@ pt.add_pair_feature("1abc.ptt", my_array, name="template_pair",
|
|
|
308
345
|
emb = pt.read_embedding("1abc.ptt", "esm2_t33_650M_UR50D")
|
|
309
346
|
emb.data.shape # (N_res, 1280) float32 (upcast from float16 on load)
|
|
310
347
|
|
|
348
|
+
# ------ Ligands / small molecules ------
|
|
349
|
+
# Capture drugs, cofactors, and ions from a structure (opt-in)
|
|
350
|
+
data = pt.from_mmcif("6oim.cif", include_ligands=True)
|
|
351
|
+
[l.name for l in data.ligands] # ['MG', 'GDP', 'MOV'] (MOV = sotorasib)
|
|
352
|
+
|
|
353
|
+
ligs = pt.read_ligands("6oim.ptt")
|
|
354
|
+
ligs[0].elements # (N_atoms,) S2 element symbols
|
|
355
|
+
ligs[0].positions # (N_atoms, 3) float32
|
|
356
|
+
pt.list_ligands("6oim.ptt") # ['MG', 'GDP', 'MOV']
|
|
357
|
+
|
|
358
|
+
# Build a ligand from SMILES (needs `pip install "proteintensor[ligands]"`)
|
|
359
|
+
aspirin = pt.from_smiles("CC(=O)Oc1ccccc1C(=O)O", name="AIN")
|
|
360
|
+
pt.add_ligand("target.ptt", aspirin) # attach to an existing .ptt
|
|
361
|
+
|
|
311
362
|
# ------ Lazy / zero-copy access ------
|
|
312
363
|
positions = pt.mmap_positions("1abc.ptt") # zarr.Array - no full load
|
|
313
364
|
backbone = pt.mmap_backbone("1abc.ptt") # [N_res, 4, 3]
|
|
@@ -349,6 +400,7 @@ data = pt.read(
|
|
|
349
400
|
)
|
|
350
401
|
|
|
351
402
|
# ------ Multi-structure dataset ------
|
|
403
|
+
# Structure .ptt files and sequence-only .ptt files can be mixed in one dataset.
|
|
352
404
|
pt.create_dataset("training.ptt")
|
|
353
405
|
for ptt_file in Path("ptt_files").glob("*.ptt"):
|
|
354
406
|
pt.add_to_dataset("training.ptt", ptt_file)
|
|
@@ -364,8 +416,13 @@ loader = DataLoader(ds, batch_size=8, collate_fn=pt.ProteinDataset.collate)
|
|
|
364
416
|
for batch in loader:
|
|
365
417
|
coords = torch.from_numpy(batch["atom_positions"]) # (B, max_atoms, 3)
|
|
366
418
|
pad = torch.from_numpy(batch["padding_mask"]) # (B, max_res) True=real
|
|
419
|
+
has_str = torch.from_numpy(batch["has_structure"]) # (B,) False = sequence-only
|
|
367
420
|
```
|
|
368
421
|
|
|
422
|
+
Sequence-only entries contribute zero atoms to the batch (`n_atoms == 0`,
|
|
423
|
+
`has_structure == False`), so sequence-driven and structure-based samples can be
|
|
424
|
+
loaded together in one `DataLoader`.
|
|
425
|
+
|
|
369
426
|
---
|
|
370
427
|
|
|
371
428
|
## .ptt file layout
|
|
@@ -402,10 +459,16 @@ structure.ptt/ Zarr directory store (v0.7)
|
|
|
402
459
|
│ └── <name>/ one sub-group per named feature
|
|
403
460
|
│ ├── .zattrs channels, symmetric, dtype, description
|
|
404
461
|
│ └── data [N_res, N_res, C] any dtype, chunked 128x128xC
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
462
|
+
├── embeddings/
|
|
463
|
+
│ └── <model>/ one sub-group per PLM model
|
|
464
|
+
│ ├── .zattrs model, layer, dim, dtype, seq SHA-256
|
|
465
|
+
│ └── data [N_res, D] float32 or float16, chunked 256xD
|
|
466
|
+
└── ligands/
|
|
467
|
+
└── <index>/ one sub-group per non-polymer ligand
|
|
468
|
+
├── .zattrs name (CCD), chain_id, res_num, smiles
|
|
469
|
+
├── elements [N_atoms] S2 element symbols
|
|
470
|
+
├── positions [N_atoms, 3] float32 Angstrom coordinates
|
|
471
|
+
└── b_factors [N_atoms] float32
|
|
409
472
|
```
|
|
410
473
|
|
|
411
474
|
### Multi-structure dataset layout
|
|
@@ -443,9 +506,9 @@ Each sub-group under `structures/` is identical to a standalone `.ptt` root, so
|
|
|
443
506
|
pytest tests/ -v
|
|
444
507
|
```
|
|
445
508
|
|
|
446
|
-
|
|
447
|
-
A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
|
|
448
|
-
(memory:// fsspec - no real cloud account required).
|
|
509
|
+
150 tests across structure roundtrip, backbone/bonds/MSA/pairs/embeddings/ligands,
|
|
510
|
+
sequence conversion, A3M parsing, Boltz adapter, multi-structure dataset, and cloud
|
|
511
|
+
streaming (memory:// fsspec - no real cloud account required).
|
|
449
512
|
|
|
450
513
|
---
|
|
451
514
|
|
|
@@ -466,11 +529,11 @@ A3M parsing, Boltz adapter, multi-structure dataset, and cloud streaming
|
|
|
466
529
|
- [ ] Chai-1 adapter
|
|
467
530
|
|
|
468
531
|
**Data pipeline**
|
|
469
|
-
- [ ] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
|
|
532
|
+
- [x] Batch convert CLI - convert entire PDB directories in parallel with progress reporting
|
|
470
533
|
- [ ] Sequence-identity dataset splitting - MMseqs2-based cluster splits to prevent data leakage between train / val / test
|
|
471
534
|
|
|
472
535
|
**Format extensions**
|
|
473
|
-
- [
|
|
536
|
+
- [x] Ligand / small-molecule support - CCD-based extraction from structures, SMILES input via RDKit, element/coordinate storage (bond graphs and binding-site annotations still to come)
|
|
474
537
|
- [ ] MD trajectory storage - time axis `[N_frames, N_atoms, 3]` for conformational ensembles and AlphaFold 3 diffusion trajectories
|
|
475
538
|
|
|
476
539
|
**Performance**
|
|
@@ -34,8 +34,10 @@ from .bonds import (
|
|
|
34
34
|
from .dataset import ProteinDataset, create_dataset, add_to_dataset
|
|
35
35
|
from .remote import consolidate
|
|
36
36
|
from .converters import from_mmcif, from_sequence, from_fasta, parse_fasta
|
|
37
|
+
from .ligands import read_ligands, list_ligands, add_ligand, from_smiles
|
|
38
|
+
from .schema import LigandData
|
|
37
39
|
|
|
38
|
-
__version__ = "0.2.0"
|
|
40
|
+
__version__ = "0.3.0"
|
|
39
41
|
|
|
40
42
|
__all__ = [
|
|
41
43
|
# Converters - input
|
|
@@ -51,6 +53,8 @@ __all__ = [
|
|
|
51
53
|
"compute_and_store_distances", "compute_and_store_contacts",
|
|
52
54
|
# I/O - embeddings
|
|
53
55
|
"read_embedding", "add_embedding", "list_embeddings", "mmap_embedding",
|
|
56
|
+
# Ligands / small molecules
|
|
57
|
+
"read_ligands", "list_ligands", "add_ligand", "from_smiles", "LigandData",
|
|
54
58
|
# Data containers
|
|
55
59
|
"ProteinTensorData", "BackboneData", "BondData", "MsaData", "PairFeature", "EmbeddingData",
|
|
56
60
|
# MSA utilities
|