proteintensor 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proteintensor-0.1.3 → proteintensor-0.2.0}/PKG-INFO +25 -1
- {proteintensor-0.1.3 → proteintensor-0.2.0}/README.md +24 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/__init__.py +4 -1
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/cli.py +56 -0
- proteintensor-0.2.0/proteintensor/converters/__init__.py +4 -0
- proteintensor-0.2.0/proteintensor/converters/sequence.py +103 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/reader.py +7 -5
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/schema.py +38 -10
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/writer.py +11 -8
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/PKG-INFO +25 -1
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/SOURCES.txt +3 -1
- {proteintensor-0.1.3 → proteintensor-0.2.0}/pyproject.toml +1 -1
- proteintensor-0.2.0/tests/test_sequence.py +150 -0
- proteintensor-0.1.3/proteintensor/converters/__init__.py +0 -3
- {proteintensor-0.1.3 → proteintensor-0.2.0}/LICENSE +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/__init__.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/boltz.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/bonds.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/converters/mmcif.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/dataset.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/embeddings.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/msa.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/pairs.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/remote.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/dependency_links.txt +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/entry_points.txt +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/requires.txt +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/top_level.txt +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/setup.cfg +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_adapters.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_dataset.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_embeddings.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_msa.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_pairs.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_remote.py +0 -0
- {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_roundtrip.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proteintensor
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: AI-native biomolecular tensor format for structural biology ML
|
|
5
5
|
Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
|
|
|
243
243
|
proteintensor info 1abc.ptt
|
|
244
244
|
```
|
|
245
245
|
|
|
246
|
+
### Convert a sequence (no structure required)
|
|
247
|
+
|
|
248
|
+
For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
|
|
249
|
+
sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
|
|
250
|
+
coordinates) directly from a raw string or a FASTA file:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
|
|
254
|
+
proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
import proteintensor as pt
|
|
259
|
+
|
|
260
|
+
data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
|
|
261
|
+
data.has_structure # False - sequence-only entry
|
|
262
|
+
data.sequence_tokens # (N_res,) int32
|
|
263
|
+
|
|
264
|
+
pt.write(data, "ubq.ptt")
|
|
265
|
+
|
|
266
|
+
# FASTA: a single record -> one chain; multiple records -> multi-chain complex
|
|
267
|
+
data = pt.from_fasta("complex.fasta")
|
|
268
|
+
```
|
|
269
|
+
|
|
246
270
|
### Benchmark against mmCIF
|
|
247
271
|
|
|
248
272
|
```bash
|
|
@@ -200,6 +200,30 @@ proteintensor convert 1abc.cif 1abc.ptt
|
|
|
200
200
|
proteintensor info 1abc.ptt
|
|
201
201
|
```
|
|
202
202
|
|
|
203
|
+
### Convert a sequence (no structure required)
|
|
204
|
+
|
|
205
|
+
For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
|
|
206
|
+
sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
|
|
207
|
+
coordinates) directly from a raw string or a FASTA file:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
|
|
211
|
+
proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
import proteintensor as pt
|
|
216
|
+
|
|
217
|
+
data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
|
|
218
|
+
data.has_structure # False - sequence-only entry
|
|
219
|
+
data.sequence_tokens # (N_res,) int32
|
|
220
|
+
|
|
221
|
+
pt.write(data, "ubq.ptt")
|
|
222
|
+
|
|
223
|
+
# FASTA: a single record -> one chain; multiple records -> multi-chain complex
|
|
224
|
+
data = pt.from_fasta("complex.fasta")
|
|
225
|
+
```
|
|
226
|
+
|
|
203
227
|
### Benchmark against mmCIF
|
|
204
228
|
|
|
205
229
|
```bash
|
|
@@ -33,10 +33,13 @@ from .bonds import (
|
|
|
33
33
|
)
|
|
34
34
|
from .dataset import ProteinDataset, create_dataset, add_to_dataset
|
|
35
35
|
from .remote import consolidate
|
|
36
|
+
from .converters import from_mmcif, from_sequence, from_fasta, parse_fasta
|
|
36
37
|
|
|
37
|
-
__version__ = "0.
|
|
38
|
+
__version__ = "0.2.0"
|
|
38
39
|
|
|
39
40
|
__all__ = [
|
|
41
|
+
# Converters - input
|
|
42
|
+
"from_mmcif", "from_sequence", "from_fasta", "parse_fasta",
|
|
40
43
|
# I/O - structure
|
|
41
44
|
"read", "write",
|
|
42
45
|
"read_backbone", "read_bonds",
|
|
@@ -73,6 +73,62 @@ def convert(input_path: Path, output_path: Path, compression: str, pdb_id: str):
|
|
|
73
73
|
console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
|
|
74
74
|
|
|
75
75
|
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# convert-seq
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
@main.command("convert-seq")
@click.argument("sequence_or_fasta", type=str)
@click.argument("output_path", type=click.Path(path_type=Path))
@click.option("--compression", default="blosc", show_default=True,
              type=click.Choice(["blosc", "none"]),
              help="Compression codec for the Zarr store.")
@click.option("--pdb-id", default="", help="Identifier stored in metadata (e.g. UniProt accession).")
@click.option("--chain", default="A", show_default=True,
              help="Chain label applied to a raw sequence input.")
def convert_seq(sequence_or_fasta: str, output_path: Path, compression: str,
                pdb_id: str, chain: str):
    """Convert a protein sequence to ProteinTensor (.ptt) format.

    SEQUENCE_OR_FASTA may be a path to a FASTA file or a literal 1-letter
    amino-acid string. The result is a sequence-only .ptt (no coordinates) -
    the primary input form for AlphaFold- and Boltz-style predictors.
    """
    # Imported lazily, inside the command, exactly as the other names this
    # command needs from the package.
    from .converters.sequence import from_sequence, from_fasta
    from .writer import write

    src = Path(sequence_or_fasta)
    # The argument is either a path or the sequence itself. A literal sequence
    # is often longer than the filesystem's maximum file-name length, in which
    # case stat() fails with ENAMETOOLONG - an error pathlib re-raises on some
    # Python versions. Any failure to stat the argument simply means "this is
    # not a file", so fall through to the literal-sequence branch.
    try:
        is_file = src.is_file()
    except (OSError, ValueError):
        is_file = False

    # Time building the in-memory data separately from writing the store.
    t0 = time.perf_counter()
    if is_file:
        data = from_fasta(src, pdb_id=pdb_id)
        source_desc = src.name
    else:
        data = from_sequence(sequence_or_fasta, pdb_id=pdb_id, chain_id=chain)
        source_desc = f"<literal sequence: {data.num_residues} aa>"
    build_ms = (time.perf_counter() - t0) * 1000

    t0 = time.perf_counter()
    write(data, output_path, compression=compression)
    write_ms = (time.perf_counter() - t0) * 1000

    # The output is a directory tree; its size is the sum of all regular files.
    dst_bytes = sum(f.stat().st_size for f in Path(output_path).rglob("*") if f.is_file())

    tbl = Table(show_header=False, box=None, padding=(0, 2))
    tbl.add_row("PDB ID", data.pdb_id or "(unknown)")
    tbl.add_row("Chains", _chain_summary(data.chain_id))
    tbl.add_row("Residues", f"{data.num_residues:,}")
    tbl.add_row("Structure", "no (sequence-only)")
    tbl.add_row("")
    tbl.add_row("Source", source_desc)
    tbl.add_row("Build time", f"{build_ms:.1f} ms")
    tbl.add_row("Write time", f"{write_ms:.1f} ms")
    tbl.add_row("Output", _fmt_bytes(dst_bytes))

    console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
|
|
130
|
+
|
|
131
|
+
|
|
76
132
|
# ---------------------------------------------------------------------------
|
|
77
133
|
# info
|
|
78
134
|
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import numpy as np
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ..schema import ProteinTensorData, sequence_to_tokens
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def from_sequence(
    sequence: str,
    *,
    pdb_id: str = "",
    chain_id: str = "A",
    residue_start: int = 1,
) -> ProteinTensorData:
    """Create a sequence-only ProteinTensorData from a 1-letter amino-acid string.

    Only the sequence-level arrays are filled in; no atom or backbone arrays
    are passed to ``ProteinTensorData``. The entry is tagged with
    ``method="sequence"``.

    Parameters
    ----------
    sequence        1-letter amino-acid string, tokenised by ``sequence_to_tokens``.
    pdb_id          Identifier stored on the result (e.g. a UniProt accession).
    chain_id        Chain label; only its first character is used, and an empty
                    string falls back to "A". Applied to every residue.
    residue_start   Residue number given to the first residue (default 1);
                    later residues count up from it.

    Raises
    ------
    ValueError      If tokenising ``sequence`` yields no residues.
    """
    tokens = sequence_to_tokens(sequence)
    n_res = tokens.shape[0]
    if n_res == 0:
        raise ValueError("Empty sequence: no amino-acid residues to encode.")

    # Chain labels are stored as single bytes ("S1").
    first_char = chain_id[0] if chain_id else "A"
    label_bytes = first_char.encode()

    numbering = np.arange(residue_start, residue_start + n_res, dtype=np.int32)
    chain_column = np.full(n_res, label_bytes, dtype="S1")
    return ProteinTensorData(
        sequence_tokens=tokens,
        residue_index=numbering,
        chain_id=chain_column,
        pdb_id=pdb_id,
        method="sequence",
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse_fasta(text: str) -> list[tuple[str, str]]:
    """Parse FASTA text into a list of ``(header, sequence)`` tuples.

    A record starts at a line beginning with ``>``; the header is the rest of
    that line with surrounding whitespace removed. Every following non-blank
    line, stripped, is concatenated into the record's sequence until the next
    header. Blank lines are ignored. A header with no sequence lines yields a
    record whose sequence is the empty string.

    A leading UTF-8 byte-order mark is removed first, so the first header of a
    BOM-prefixed file is still recognised.

    Text that appears before the first header belongs to no record and is not
    returned.
    """
    # str.strip() does not treat U+FEFF as whitespace, so without this the
    # first line would be "\ufeff>..." and would not be seen as a header,
    # silently losing the first record.
    if text.startswith("\ufeff"):
        text = text[1:]

    records: list[tuple[str, str]] = []
    header: str | None = None
    chunks: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            # A new header closes the record currently being collected.
            if header is not None:
                records.append((header, "".join(chunks)))
            header = line[1:].strip()
            chunks = []
        else:
            chunks.append(line)
    # Flush the final record, which has no following header to close it.
    if header is not None:
        records.append((header, "".join(chunks)))
    return records
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def from_fasta(path: str | Path, *, pdb_id: str = "") -> ProteinTensorData:
    """Build a sequence-only ProteinTensorData from a FASTA file.

    A single record produces a single-chain entry labelled "A". Multiple
    records are treated as a multi-chain complex: the chains are concatenated
    in file order, labelled A, B, C, ..., and residue numbering restarts at 1
    for each chain.

    Parameters
    ----------
    path      FASTA file to read. It is decoded as UTF-8, and a leading
              byte-order mark is ignored.
    pdb_id    Identifier stored on the result. When empty, it is derived from
              the file name: the upper-cased stem up to the first underscore.

    Raises
    ------
    ValueError    If the file contains no FASTA records, or if any record has
                  a header but no sequence.
    """
    path = Path(path)
    # Decode explicitly rather than with the platform default, so the same file
    # parses the same way everywhere. "utf-8-sig" also drops a leading BOM,
    # which would otherwise hide the first ">" header from the parser.
    records = parse_fasta(path.read_text(encoding="utf-8-sig"))
    if not records:
        raise ValueError(f"No FASTA records found in '{path}'.")

    # Reject empty records here so the error names the offending record.
    for number, (header, seq) in enumerate(records, start=1):
        if not seq:
            raise ValueError(
                f"FASTA record {number} ('{header}') in '{path}' has no sequence."
            )

    if not pdb_id:
        pdb_id = path.stem.upper().split("_")[0]

    if len(records) == 1:
        return from_sequence(records[0][1], pdb_id=pdb_id, chain_id="A")

    # Build each chain independently (numbering restarts per chain), then
    # stitch the per-chain arrays together in file order.
    chains = [
        from_sequence(seq, chain_id=_chain_label(i))
        for i, (_, seq) in enumerate(records)
    ]
    return ProteinTensorData(
        sequence_tokens=np.concatenate([c.sequence_tokens for c in chains]),
        residue_index=np.concatenate([c.residue_index for c in chains]),
        chain_id=np.concatenate([c.chain_id for c in chains]),
        pdb_id=pdb_id,
        method="sequence",
    )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _chain_label(index: int) -> str:
    """Map a 0-based chain index to a single-character chain label.

    Indices 0-25 give "A".."Z" and 26-51 give "a".."z". Any index past the end
    of that alphabet falls back to "X".

    Raises
    ------
    ValueError    If ``index`` is negative. Without this check a negative
                  index would wrap around through Python's negative indexing
                  (``-1`` -> "z") instead of being reported.
    """
    if index < 0:
        raise ValueError(f"Chain index must be non-negative, got {index}.")
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    return alphabet[index] if index < len(alphabet) else "X"
|
|
@@ -24,6 +24,8 @@ def read(
|
|
|
24
24
|
store = open_store(path, storage_options=storage_options)
|
|
25
25
|
attrs = dict(store.attrs)
|
|
26
26
|
|
|
27
|
+
has_atoms = "atoms" in store
|
|
28
|
+
has_struct = "structure" in store
|
|
27
29
|
bb_positions = store["backbone/positions"][:] if "backbone" in store else None
|
|
28
30
|
bb_mask = store["backbone/mask"][:] if "backbone" in store else None
|
|
29
31
|
bond_edge_idx = store["bonds/edge_index"][:] if "bonds" in store else None
|
|
@@ -33,11 +35,11 @@ def read(
|
|
|
33
35
|
sequence_tokens=store["sequence/tokens"][:],
|
|
34
36
|
residue_index=store["sequence/residue_index"][:],
|
|
35
37
|
chain_id=store["sequence/chain_id"][:],
|
|
36
|
-
atom_positions=store["atoms/positions"][:],
|
|
37
|
-
atom_mask=store["atoms/mask"][:],
|
|
38
|
-
b_factors=store["atoms/b_factors"][:],
|
|
39
|
-
residue_atom_start=store["structure/residue_atom_start"][:],
|
|
40
|
-
residue_atom_count=store["structure/residue_atom_count"][:],
|
|
38
|
+
atom_positions=store["atoms/positions"][:] if has_atoms else None,
|
|
39
|
+
atom_mask=store["atoms/mask"][:] if has_atoms else None,
|
|
40
|
+
b_factors=store["atoms/b_factors"][:] if has_atoms else None,
|
|
41
|
+
residue_atom_start=store["structure/residue_atom_start"][:] if has_struct else None,
|
|
42
|
+
residue_atom_count=store["structure/residue_atom_count"][:] if has_struct else None,
|
|
41
43
|
backbone_positions=bb_positions,
|
|
42
44
|
backbone_mask=bb_mask,
|
|
43
45
|
bond_edge_index=bond_edge_idx,
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
|
-
FORMAT_VERSION = "0.
|
|
5
|
+
FORMAT_VERSION = "0.7"
|
|
6
6
|
|
|
7
7
|
AA_VOCAB: dict[str, int] = {
|
|
8
8
|
"ALA": 0, "ARG": 1, "ASN": 2, "ASP": 3, "CYS": 4,
|
|
@@ -14,8 +14,26 @@ AA_VOCAB: dict[str, int] = {
|
|
|
14
14
|
AA_UNK = 20
|
|
15
15
|
AA_VOCAB_SIZE = 21
|
|
16
16
|
|
|
17
|
-
# Single-letter
|
|
18
|
-
|
|
17
|
+
# Single-letter codes indexed by token: position i is the 1-letter code for token i.
|
|
18
|
+
# Tokens 0-19 are the standard amino acids in AA_VOCAB order; token 20 (UNK) -> "X".
|
|
19
|
+
AA_1LETTER = "ARNDCQEGHILKMFPSTWYVX"
|
|
20
|
+
|
|
21
|
+
# Inverse map for sequence input. Any character absent here resolves to AA_UNK,
|
|
22
|
+
# which also covers ambiguity codes (B, Z, J, O) and gaps (-, .).
|
|
23
|
+
ONE_LETTER_TO_TOKEN: dict[str, int] = {c: i for i, c in enumerate(AA_1LETTER)}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def sequence_to_tokens(sequence: str) -> np.ndarray:
    """Encode a 1-letter amino-acid string as an int32 token array.

    All whitespace is removed and the remainder is upper-cased before lookup.
    Each character is looked up in ``ONE_LETTER_TO_TOKEN``; characters not
    found there become ``AA_UNK``.
    """
    residues = "".join(sequence.split()).upper()
    token_ids = [ONE_LETTER_TO_TOKEN.get(letter, AA_UNK) for letter in residues]
    return np.array(token_ids, dtype=np.int32)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def tokens_to_sequence(tokens: np.ndarray) -> str:
    """Decode a token array into a 1-letter amino-acid string.

    Each token is converted to ``int`` and used as an index into
    ``AA_1LETTER``. A token outside ``0 <= t < AA_VOCAB_SIZE`` is written
    as "X".
    """
    letters = []
    for token in tokens:
        idx = int(token)
        if 0 <= idx < AA_VOCAB_SIZE:
            letters.append(AA_1LETTER[idx])
        else:
            letters.append("X")
    return "".join(letters)
|
|
19
37
|
|
|
20
38
|
# Canonical backbone atom order (AlphaFold / OpenFold convention)
|
|
21
39
|
BACKBONE_ATOMS = ["N", "CA", "C", "O"]
|
|
@@ -47,14 +65,15 @@ class ProteinTensorData:
|
|
|
47
65
|
residue_index: np.ndarray # int32 PDB sequence numbers
|
|
48
66
|
chain_id: np.ndarray # S1 single-char chain labels
|
|
49
67
|
|
|
50
|
-
# Atom-level - shapes [N_atoms] or [N_atoms, 3]
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
68
|
+
# Atom-level - shapes [N_atoms] or [N_atoms, 3].
|
|
69
|
+
# None for sequence-only entries (from_sequence / from_fasta) that carry no structure.
|
|
70
|
+
atom_positions: np.ndarray | None = None # float32 [N_atoms, 3] Angstroms
|
|
71
|
+
atom_mask: np.ndarray | None = None # bool [N_atoms]
|
|
72
|
+
b_factors: np.ndarray | None = None # float32 [N_atoms] B-factor / pLDDT
|
|
54
73
|
|
|
55
|
-
# Residue->atom mapping - shape [N_res]
|
|
56
|
-
residue_atom_start: np.ndarray # int32 first atom index for each residue
|
|
57
|
-
residue_atom_count: np.ndarray # int32 number of atoms per residue
|
|
74
|
+
# Residue->atom mapping - shape [N_res] (None for sequence-only entries)
|
|
75
|
+
residue_atom_start: np.ndarray | None = None # int32 first atom index for each residue
|
|
76
|
+
residue_atom_count: np.ndarray | None = None # int32 number of atoms per residue
|
|
58
77
|
|
|
59
78
|
# Backbone dense layout - shapes [N_res, 4, 3] and [N_res, 4]
|
|
60
79
|
# Atom order: N=0, CA=1, C=2, O=3 (missing atoms have mask=False, coords=0)
|
|
@@ -70,3 +89,12 @@ class ProteinTensorData:
|
|
|
70
89
|
resolution: float = float("nan")
|
|
71
90
|
method: str = ""
|
|
72
91
|
deposition_date: str = ""
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def has_structure(self) -> bool:
|
|
95
|
+
"""True if 3D coordinates are present; False for sequence-only entries."""
|
|
96
|
+
return self.atom_positions is not None
|
|
97
|
+
|
|
98
|
+
@property
|
|
99
|
+
def num_residues(self) -> int:
|
|
100
|
+
return int(self.sequence_tokens.shape[0])
|
|
@@ -22,7 +22,8 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
|
|
|
22
22
|
"deposition_date": data.deposition_date,
|
|
23
23
|
"created_at": time.time(),
|
|
24
24
|
"num_residues": int(data.sequence_tokens.shape[0]),
|
|
25
|
-
"num_atoms": int(data.atom_positions.shape[0]),
|
|
25
|
+
"num_atoms": int(data.atom_positions.shape[0]) if data.has_structure else 0,
|
|
26
|
+
"has_structure": data.has_structure,
|
|
26
27
|
})
|
|
27
28
|
|
|
28
29
|
seq = store.require_group("sequence")
|
|
@@ -30,14 +31,16 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
|
|
|
30
31
|
_arr(seq, "residue_index", data.residue_index, "int32", compressor)
|
|
31
32
|
_arr(seq, "chain_id", data.chain_id, "S1", compressor)
|
|
32
33
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
# Atom-level and residue->atom mapping are omitted for sequence-only entries.
|
|
35
|
+
if data.has_structure:
|
|
36
|
+
atoms = store.require_group("atoms")
|
|
37
|
+
_arr(atoms, "positions", data.atom_positions, "float32", compressor)
|
|
38
|
+
_arr(atoms, "mask", data.atom_mask, "bool", compressor)
|
|
39
|
+
_arr(atoms, "b_factors", data.b_factors, "float32", compressor)
|
|
37
40
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
+
struct = store.require_group("structure")
|
|
42
|
+
_arr(struct, "residue_atom_start", data.residue_atom_start, "int32", compressor)
|
|
43
|
+
_arr(struct, "residue_atom_count", data.residue_atom_count, "int32", compressor)
|
|
41
44
|
|
|
42
45
|
if data.backbone_positions is not None and data.backbone_mask is not None:
|
|
43
46
|
bb = store.require_group("backbone")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proteintensor
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: AI-native biomolecular tensor format for structural biology ML
|
|
5
5
|
Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
|
|
|
243
243
|
proteintensor info 1abc.ptt
|
|
244
244
|
```
|
|
245
245
|
|
|
246
|
+
### Convert a sequence (no structure required)
|
|
247
|
+
|
|
248
|
+
For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
|
|
249
|
+
sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
|
|
250
|
+
coordinates) directly from a raw string or a FASTA file:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
|
|
254
|
+
proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
import proteintensor as pt
|
|
259
|
+
|
|
260
|
+
data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
|
|
261
|
+
data.has_structure # False - sequence-only entry
|
|
262
|
+
data.sequence_tokens # (N_res,) int32
|
|
263
|
+
|
|
264
|
+
pt.write(data, "ubq.ptt")
|
|
265
|
+
|
|
266
|
+
# FASTA: a single record -> one chain; multiple records -> multi-chain complex
|
|
267
|
+
data = pt.from_fasta("complex.fasta")
|
|
268
|
+
```
|
|
269
|
+
|
|
246
270
|
### Benchmark against mmCIF
|
|
247
271
|
|
|
248
272
|
```bash
|
|
@@ -22,10 +22,12 @@ proteintensor/adapters/__init__.py
|
|
|
22
22
|
proteintensor/adapters/boltz.py
|
|
23
23
|
proteintensor/converters/__init__.py
|
|
24
24
|
proteintensor/converters/mmcif.py
|
|
25
|
+
proteintensor/converters/sequence.py
|
|
25
26
|
tests/test_adapters.py
|
|
26
27
|
tests/test_dataset.py
|
|
27
28
|
tests/test_embeddings.py
|
|
28
29
|
tests/test_msa.py
|
|
29
30
|
tests/test_pairs.py
|
|
30
31
|
tests/test_remote.py
|
|
31
|
-
tests/test_roundtrip.py
|
|
32
|
+
tests/test_roundtrip.py
|
|
33
|
+
tests/test_sequence.py
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
import proteintensor as pt
|
|
5
|
+
from proteintensor.schema import (
|
|
6
|
+
AA_VOCAB, AA_UNK, AA_1LETTER, ONE_LETTER_TO_TOKEN,
|
|
7
|
+
sequence_to_tokens, tokens_to_sequence,
|
|
8
|
+
)
|
|
9
|
+
from proteintensor.converters.sequence import from_sequence, from_fasta, parse_fasta
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
UBIQUITIN = (
|
|
13
|
+
"MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# --------------------------------------------------------------------------
|
|
18
|
+
# vocab consistency (guards the AA_1LETTER bug that was fixed)
|
|
19
|
+
# --------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
def test_aa_1letter_matches_vocab_order():
    """AA_1LETTER[token] is the 1-letter code of the residue with that token."""
    one_letter_for = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    }
    for residue_name, token in AA_VOCAB.items():
        # An "UNK" vocabulary entry, if present, must decode to "X".
        expected = "X" if residue_name == "UNK" else one_letter_for[residue_name]
        assert AA_1LETTER[token] == expected
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_token_roundtrip_is_identity():
    """Encoding UBIQUITIN to tokens and decoding it again gives the same string."""
    decoded = tokens_to_sequence(sequence_to_tokens(UBIQUITIN))
    assert decoded == UBIQUITIN
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# --------------------------------------------------------------------------
|
|
42
|
+
# from_sequence
|
|
43
|
+
# --------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
def test_from_sequence_basic():
    """from_sequence yields a sequence-only entry numbered 1..N on chain A."""
    expected_len = len(UBIQUITIN)
    entry = from_sequence(UBIQUITIN, pdb_id="UBQ", chain_id="A")

    assert entry.num_residues == expected_len
    # No coordinates of any kind are attached.
    assert entry.has_structure is False
    assert entry.atom_positions is None
    assert entry.backbone_positions is None
    assert entry.sequence_tokens.dtype == np.int32
    # Default numbering starts at 1 and runs to the sequence length.
    assert entry.residue_index[0] == 1
    assert entry.residue_index[-1] == expected_len
    assert set(entry.chain_id.tolist()) == {b"A"}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_from_sequence_whitespace_ignored():
    """Spaces and newlines inside the input do not change the tokens."""
    with_whitespace = from_sequence("MKT AYI\nAKQR")
    compact = from_sequence("MKTAYIAKQR")
    np.testing.assert_array_equal(
        with_whitespace.sequence_tokens, compact.sequence_tokens
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_from_sequence_unknown_chars_map_to_unk():
    """Ambiguity codes, X and gap characters all encode as AA_UNK.

    The input covers every class the comment names: B, Z, J, O, X and both
    gap characters ("-" and "."). The original input left out "O" and ".".
    """
    # B, Z, J, O, X, and gap chars are not standard residues -> UNK
    text = "ABXZ-JO."
    data = from_sequence(text)
    # Unknown characters are kept as UNK positions, not dropped.
    assert data.num_residues == len(text)
    assert data.sequence_tokens[0] == AA_VOCAB["ALA"]
    assert (data.sequence_tokens[1:] == AA_UNK).all()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_from_sequence_empty_raises():
    """A whitespace-only input has no residues and is rejected with ValueError."""
    blank = "  \n "
    with pytest.raises(ValueError):
        from_sequence(blank)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_from_sequence_residue_start():
    """residue_start sets the first residue number; the rest count up from it."""
    entry = from_sequence("MKT", residue_start=100)
    expected_numbering = [100, 101, 102]
    np.testing.assert_array_equal(entry.residue_index, expected_numbering)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# --------------------------------------------------------------------------
|
|
81
|
+
# round-trip through .ptt (write -> read) for a sequence-only entry
|
|
82
|
+
# --------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
def test_sequence_only_roundtrip(tmp_path):
    """A sequence-only entry written with pt.write reads back unchanged."""
    original = from_sequence(UBIQUITIN, pdb_id="UBQ")
    store_path = str(tmp_path / "ubq_seq.ptt")
    pt.write(original, store_path)

    restored = pt.read(store_path)
    # Structure-level fields stay absent after the round trip.
    assert restored.has_structure is False
    assert restored.atom_positions is None
    assert restored.residue_atom_start is None
    # Sequence-level arrays and the identifier survive intact.
    np.testing.assert_array_equal(restored.sequence_tokens, original.sequence_tokens)
    np.testing.assert_array_equal(restored.residue_index, original.residue_index)
    assert restored.pdb_id == "UBQ"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_sequence_only_has_structure_flag_in_store(tmp_path):
    """The on-disk store of a sequence-only entry records that it has no structure."""
    import zarr

    store_path = str(tmp_path / "seq.ptt")
    entry = from_sequence("MKTAYIAKQR")
    pt.write(entry, store_path)

    root = zarr.open(store_path, mode="r")
    # Metadata attributes.
    assert root.attrs["has_structure"] is False
    assert root.attrs["num_atoms"] == 0
    # Only the sequence group is written; atom-level groups are absent.
    assert "atoms" not in root
    assert "structure" not in root
    assert "sequence" in root
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_read_backbone_on_sequence_only_raises(tmp_path):
    """read_backbone on a sequence-only store raises KeyError."""
    store_path = str(tmp_path / "seq.ptt")
    entry = from_sequence("MKTAYIAKQR")
    pt.write(entry, store_path)
    with pytest.raises(KeyError):
        pt.read_backbone(store_path)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# --------------------------------------------------------------------------
|
|
118
|
+
# FASTA parsing
|
|
119
|
+
# --------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def test_parse_fasta_multi():
    """Multi-line sequences are joined and records are returned in file order."""
    fasta_text = ">chainA\nMKTA\nYIAK\n>chainB\nQRLL\n"
    expected = [("chainA", "MKTAYIAK"), ("chainB", "QRLL")]
    assert parse_fasta(fasta_text) == expected
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_from_fasta_single(tmp_path):
    """A one-record FASTA file becomes a single chain "A" named after the file."""
    fasta_path = tmp_path / "ubq.fasta"
    fasta_path.write_text(f">UBQ\n{UBIQUITIN}\n")

    entry = from_fasta(fasta_path)
    assert entry.num_residues == len(UBIQUITIN)
    assert entry.pdb_id == "UBQ"
    assert set(entry.chain_id.tolist()) == {b"A"}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_from_fasta_multichain(tmp_path):
    """A two-record FASTA file becomes chains A then B, each numbered from 1.

    The chain labels are compared residue by residue. The original set
    comparison would also pass if the labels were in the wrong order or
    attached to the wrong residues.
    """
    fasta = tmp_path / "complex.fasta"
    fasta.write_text(">a\nMKTAY\n>b\nQRLLG\n")
    data = from_fasta(fasta)
    assert data.num_residues == 10
    # two chains, A and B, in file order; residue numbering restarts per chain
    assert data.chain_id.tolist() == [b"A"] * 5 + [b"B"] * 5
    np.testing.assert_array_equal(data.residue_index, [1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_from_fasta_empty_raises(tmp_path):
    """A FASTA file containing only blank lines is rejected with ValueError."""
    blank_file = tmp_path / "empty.fasta"
    blank_file.write_text("\n\n")
    with pytest.raises(ValueError):
        from_fasta(blank_file)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|