proteintensor 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {proteintensor-0.1.3 → proteintensor-0.2.0}/PKG-INFO +25 -1
  2. {proteintensor-0.1.3 → proteintensor-0.2.0}/README.md +24 -0
  3. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/__init__.py +4 -1
  4. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/cli.py +56 -0
  5. proteintensor-0.2.0/proteintensor/converters/__init__.py +4 -0
  6. proteintensor-0.2.0/proteintensor/converters/sequence.py +103 -0
  7. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/reader.py +7 -5
  8. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/schema.py +38 -10
  9. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/writer.py +11 -8
  10. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/PKG-INFO +25 -1
  11. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/SOURCES.txt +3 -1
  12. {proteintensor-0.1.3 → proteintensor-0.2.0}/pyproject.toml +1 -1
  13. proteintensor-0.2.0/tests/test_sequence.py +150 -0
  14. proteintensor-0.1.3/proteintensor/converters/__init__.py +0 -3
  15. {proteintensor-0.1.3 → proteintensor-0.2.0}/LICENSE +0 -0
  16. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/__init__.py +0 -0
  17. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/boltz.py +0 -0
  18. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/bonds.py +0 -0
  19. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/converters/mmcif.py +0 -0
  20. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/dataset.py +0 -0
  21. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/embeddings.py +0 -0
  22. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/msa.py +0 -0
  23. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/pairs.py +0 -0
  24. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/remote.py +0 -0
  25. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/dependency_links.txt +0 -0
  26. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/entry_points.txt +0 -0
  27. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/requires.txt +0 -0
  28. {proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/top_level.txt +0 -0
  29. {proteintensor-0.1.3 → proteintensor-0.2.0}/setup.cfg +0 -0
  30. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_adapters.py +0 -0
  31. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_dataset.py +0 -0
  32. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_embeddings.py +0 -0
  33. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_msa.py +0 -0
  34. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_pairs.py +0 -0
  35. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_remote.py +0 -0
  36. {proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_roundtrip.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proteintensor
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: AI-native biomolecular tensor format for structural biology ML
5
5
  Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
6
6
  License-Expression: MIT
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
243
243
  proteintensor info 1abc.ptt
244
244
  ```
245
245
 
246
+ ### Convert a sequence (no structure required)
247
+
248
+ For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
249
+ sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
250
+ coordinates) directly from a raw string or a FASTA file:
251
+
252
+ ```bash
253
+ proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
254
+ proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
255
+ ```
256
+
257
+ ```python
258
+ import proteintensor as pt
259
+
260
+ data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
261
+ data.has_structure # False - sequence-only entry
262
+ data.sequence_tokens # (N_res,) int32
263
+
264
+ pt.write(data, "ubq.ptt")
265
+
266
+ # FASTA: a single record -> one chain; multiple records -> multi-chain complex
267
+ data = pt.from_fasta("complex.fasta")
268
+ ```
269
+
246
270
  ### Benchmark against mmCIF
247
271
 
248
272
  ```bash
@@ -200,6 +200,30 @@ proteintensor convert 1abc.cif 1abc.ptt
200
200
  proteintensor info 1abc.ptt
201
201
  ```
202
202
 
203
+ ### Convert a sequence (no structure required)
204
+
205
+ For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
206
+ sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
207
+ coordinates) directly from a raw string or a FASTA file:
208
+
209
+ ```bash
210
+ proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
211
+ proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
212
+ ```
213
+
214
+ ```python
215
+ import proteintensor as pt
216
+
217
+ data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
218
+ data.has_structure # False - sequence-only entry
219
+ data.sequence_tokens # (N_res,) int32
220
+
221
+ pt.write(data, "ubq.ptt")
222
+
223
+ # FASTA: a single record -> one chain; multiple records -> multi-chain complex
224
+ data = pt.from_fasta("complex.fasta")
225
+ ```
226
+
203
227
  ### Benchmark against mmCIF
204
228
 
205
229
  ```bash
@@ -33,10 +33,13 @@ from .bonds import (
33
33
  )
34
34
  from .dataset import ProteinDataset, create_dataset, add_to_dataset
35
35
  from .remote import consolidate
36
+ from .converters import from_mmcif, from_sequence, from_fasta, parse_fasta
36
37
 
37
- __version__ = "0.1.0"
38
+ __version__ = "0.2.0"
38
39
 
39
40
  __all__ = [
41
+ # Converters - input
42
+ "from_mmcif", "from_sequence", "from_fasta", "parse_fasta",
40
43
  # I/O - structure
41
44
  "read", "write",
42
45
  "read_backbone", "read_bonds",
@@ -73,6 +73,62 @@ def convert(input_path: Path, output_path: Path, compression: str, pdb_id: str):
73
73
  console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
74
74
 
75
75
 
76
+ # ---------------------------------------------------------------------------
77
+ # convert-seq
78
+ # ---------------------------------------------------------------------------
79
+
80
+ @main.command("convert-seq")
81
+ @click.argument("sequence_or_fasta", type=str)
82
+ @click.argument("output_path", type=click.Path(path_type=Path))
83
+ @click.option("--compression", default="blosc", show_default=True,
84
+ type=click.Choice(["blosc", "none"]),
85
+ help="Compression codec for the Zarr store.")
86
+ @click.option("--pdb-id", default="", help="Identifier stored in metadata (e.g. UniProt accession).")
87
+ @click.option("--chain", default="A", show_default=True,
88
+ help="Chain label applied to a raw sequence input.")
89
+ def convert_seq(sequence_or_fasta: str, output_path: Path, compression: str,
90
+ pdb_id: str, chain: str):
91
+ """Convert a protein sequence to ProteinTensor (.ptt) format.
92
+
93
+ SEQUENCE_OR_FASTA may be a path to a FASTA file or a literal 1-letter
94
+ amino-acid string. The result is a sequence-only .ptt (no coordinates) -
95
+ the primary input form for AlphaFold- and Boltz-style predictors.
96
+ """
97
+ from .converters.sequence import from_sequence, from_fasta
98
+ from .writer import write
99
+
100
+ src = Path(sequence_or_fasta)
101
+ is_file = src.exists() and src.is_file()
102
+
103
+ t0 = time.perf_counter()
104
+ if is_file:
105
+ data = from_fasta(src, pdb_id=pdb_id)
106
+ source_desc = src.name
107
+ else:
108
+ data = from_sequence(sequence_or_fasta, pdb_id=pdb_id, chain_id=chain)
109
+ source_desc = f"<literal sequence: {data.num_residues} aa>"
110
+ build_ms = (time.perf_counter() - t0) * 1000
111
+
112
+ t0 = time.perf_counter()
113
+ write(data, output_path, compression=compression)
114
+ write_ms = (time.perf_counter() - t0) * 1000
115
+
116
+ dst_bytes = sum(f.stat().st_size for f in Path(output_path).rglob("*") if f.is_file())
117
+
118
+ tbl = Table(show_header=False, box=None, padding=(0, 2))
119
+ tbl.add_row("PDB ID", data.pdb_id or "(unknown)")
120
+ tbl.add_row("Chains", _chain_summary(data.chain_id))
121
+ tbl.add_row("Residues", f"{data.num_residues:,}")
122
+ tbl.add_row("Structure", "no (sequence-only)")
123
+ tbl.add_row("")
124
+ tbl.add_row("Source", source_desc)
125
+ tbl.add_row("Build time", f"{build_ms:.1f} ms")
126
+ tbl.add_row("Write time", f"{write_ms:.1f} ms")
127
+ tbl.add_row("Output", _fmt_bytes(dst_bytes))
128
+
129
+ console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
130
+
131
+
76
132
  # ---------------------------------------------------------------------------
77
133
  # info
78
134
  # ---------------------------------------------------------------------------
@@ -0,0 +1,4 @@
1
+ from .mmcif import from_mmcif
2
+ from .sequence import from_sequence, from_fasta, parse_fasta
3
+
4
+ __all__ = ["from_mmcif", "from_sequence", "from_fasta", "parse_fasta"]
@@ -0,0 +1,103 @@
1
+ from __future__ import annotations
2
+ import numpy as np
3
+ from pathlib import Path
4
+
5
+ from ..schema import ProteinTensorData, sequence_to_tokens
6
+
7
+
8
+ def from_sequence(
9
+ sequence: str,
10
+ *,
11
+ pdb_id: str = "",
12
+ chain_id: str = "A",
13
+ residue_start: int = 1,
14
+ ) -> ProteinTensorData:
15
+ """Build a sequence-only ProteinTensorData from a 1-letter amino-acid string.
16
+
17
+ No 3D coordinates are produced - the result has ``has_structure == False`` and
18
+ is the primary input form for sequence-driven predictors such as AlphaFold and
19
+ Boltz. Unknown / ambiguity characters (B, Z, J, O, X, gaps) map to UNK.
20
+
21
+ Parameters
22
+ ----------
23
+ sequence 1-letter amino-acid string. Whitespace is ignored.
24
+ pdb_id Identifier stored in metadata (e.g. a UniProt accession).
25
+ chain_id Single-character chain label applied to every residue.
26
+ residue_start PDB residue number assigned to the first residue (default 1).
27
+ """
28
+ tokens = sequence_to_tokens(sequence)
29
+ if tokens.shape[0] == 0:
30
+ raise ValueError("Empty sequence: no amino-acid residues to encode.")
31
+
32
+ n = tokens.shape[0]
33
+ chain_label = (chain_id[0] if chain_id else "A").encode()
34
+ return ProteinTensorData(
35
+ sequence_tokens=tokens,
36
+ residue_index=np.arange(residue_start, residue_start + n, dtype=np.int32),
37
+ chain_id=np.full(n, chain_label, dtype="S1"),
38
+ pdb_id=pdb_id,
39
+ method="sequence",
40
+ )
41
+
42
+
43
+ def parse_fasta(text: str) -> list[tuple[str, str]]:
44
+ """Parse FASTA text into a list of (header, sequence) tuples."""
45
+ records: list[tuple[str, str]] = []
46
+ header: str | None = None
47
+ chunks: list[str] = []
48
+ for line in text.splitlines():
49
+ line = line.strip()
50
+ if not line:
51
+ continue
52
+ if line.startswith(">"):
53
+ if header is not None:
54
+ records.append((header, "".join(chunks)))
55
+ header = line[1:].strip()
56
+ chunks = []
57
+ else:
58
+ chunks.append(line)
59
+ if header is not None:
60
+ records.append((header, "".join(chunks)))
61
+ return records
62
+
63
+
64
+ def from_fasta(path: str | Path, *, pdb_id: str = "") -> ProteinTensorData:
65
+ """Build a ProteinTensorData from a FASTA file.
66
+
67
+ A single record produces a single-chain sequence-only entry. Multiple records
68
+ are treated as a multi-chain complex: chains are concatenated and labelled
69
+ A, B, C, ... in file order, with residue numbering restarting per chain. This
70
+ matches the multi-chain input that AlphaFold-Multimer and Boltz consume.
71
+ """
72
+ path = Path(path)
73
+ records = parse_fasta(path.read_text())
74
+ if not records:
75
+ raise ValueError(f"No FASTA records found in '{path}'.")
76
+ if not pdb_id:
77
+ pdb_id = path.stem.upper().split("_")[0]
78
+
79
+ if len(records) == 1:
80
+ return from_sequence(records[0][1], pdb_id=pdb_id, chain_id="A")
81
+
82
+ all_tokens: list[np.ndarray] = []
83
+ all_res_idx: list[np.ndarray] = []
84
+ all_chain: list[np.ndarray] = []
85
+ for i, (_, seq) in enumerate(records):
86
+ sub = from_sequence(seq, chain_id=_chain_label(i))
87
+ all_tokens.append(sub.sequence_tokens)
88
+ all_res_idx.append(sub.residue_index)
89
+ all_chain.append(sub.chain_id)
90
+
91
+ return ProteinTensorData(
92
+ sequence_tokens=np.concatenate(all_tokens),
93
+ residue_index=np.concatenate(all_res_idx),
94
+ chain_id=np.concatenate(all_chain),
95
+ pdb_id=pdb_id,
96
+ method="sequence",
97
+ )
98
+
99
+
100
+ def _chain_label(index: int) -> str:
101
+ """Map a 0-based chain index to a label: 0->A .. 25->Z, then a, b, ..."""
102
+ alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
103
+ return alphabet[index] if index < len(alphabet) else "X"
@@ -24,6 +24,8 @@ def read(
24
24
  store = open_store(path, storage_options=storage_options)
25
25
  attrs = dict(store.attrs)
26
26
 
27
+ has_atoms = "atoms" in store
28
+ has_struct = "structure" in store
27
29
  bb_positions = store["backbone/positions"][:] if "backbone" in store else None
28
30
  bb_mask = store["backbone/mask"][:] if "backbone" in store else None
29
31
  bond_edge_idx = store["bonds/edge_index"][:] if "bonds" in store else None
@@ -33,11 +35,11 @@ def read(
33
35
  sequence_tokens=store["sequence/tokens"][:],
34
36
  residue_index=store["sequence/residue_index"][:],
35
37
  chain_id=store["sequence/chain_id"][:],
36
- atom_positions=store["atoms/positions"][:],
37
- atom_mask=store["atoms/mask"][:],
38
- b_factors=store["atoms/b_factors"][:],
39
- residue_atom_start=store["structure/residue_atom_start"][:],
40
- residue_atom_count=store["structure/residue_atom_count"][:],
38
+ atom_positions=store["atoms/positions"][:] if has_atoms else None,
39
+ atom_mask=store["atoms/mask"][:] if has_atoms else None,
40
+ b_factors=store["atoms/b_factors"][:] if has_atoms else None,
41
+ residue_atom_start=store["structure/residue_atom_start"][:] if has_struct else None,
42
+ residue_atom_count=store["structure/residue_atom_count"][:] if has_struct else None,
41
43
  backbone_positions=bb_positions,
42
44
  backbone_mask=bb_mask,
43
45
  bond_edge_index=bond_edge_idx,
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
  from dataclasses import dataclass
3
3
  import numpy as np
4
4
 
5
- FORMAT_VERSION = "0.6"
5
+ FORMAT_VERSION = "0.7"
6
6
 
7
7
  AA_VOCAB: dict[str, int] = {
8
8
  "ALA": 0, "ARG": 1, "ASN": 2, "ASP": 3, "CYS": 4,
@@ -14,8 +14,26 @@ AA_VOCAB: dict[str, int] = {
14
14
  AA_UNK = 20
15
15
  AA_VOCAB_SIZE = 21
16
16
 
17
- # Single-letter equivalents for display
18
- AA_1LETTER = "ARNDCQEGHILKMFPSTWYXU"
17
+ # Single-letter codes indexed by token: position i is the 1-letter code for token i.
18
+ # Tokens 0-19 are the standard amino acids in AA_VOCAB order; token 20 (UNK) -> "X".
19
+ AA_1LETTER = "ARNDCQEGHILKMFPSTWYVX"
20
+
21
+ # Inverse map for sequence input. Any character absent here resolves to AA_UNK,
22
+ # which also covers ambiguity codes (B, Z, J, O) and gaps (-, .).
23
+ ONE_LETTER_TO_TOKEN: dict[str, int] = {c: i for i, c in enumerate(AA_1LETTER)}
24
+
25
+
26
+ def sequence_to_tokens(sequence: str) -> np.ndarray:
27
+ """Map a 1-letter amino-acid string to an int32 token array (unknown -> UNK)."""
28
+ cleaned = "".join(sequence.split()).upper()
29
+ return np.array(
30
+ [ONE_LETTER_TO_TOKEN.get(c, AA_UNK) for c in cleaned], dtype=np.int32
31
+ )
32
+
33
+
34
+ def tokens_to_sequence(tokens: np.ndarray) -> str:
35
+ """Map an int32 token array back to a 1-letter amino-acid string."""
36
+ return "".join(AA_1LETTER[int(t)] if 0 <= int(t) < AA_VOCAB_SIZE else "X" for t in tokens)
19
37
 
20
38
  # Canonical backbone atom order (AlphaFold / OpenFold convention)
21
39
  BACKBONE_ATOMS = ["N", "CA", "C", "O"]
@@ -47,14 +65,15 @@ class ProteinTensorData:
47
65
  residue_index: np.ndarray # int32 PDB sequence numbers
48
66
  chain_id: np.ndarray # S1 single-char chain labels
49
67
 
50
- # Atom-level - shapes [N_atoms] or [N_atoms, 3]
51
- atom_positions: np.ndarray # float32 [N_atoms, 3] Angstroms
52
- atom_mask: np.ndarray # bool [N_atoms]
53
- b_factors: np.ndarray # float32 [N_atoms] B-factor / pLDDT
68
+ # Atom-level - shapes [N_atoms] or [N_atoms, 3].
69
+ # None for sequence-only entries (from_sequence / from_fasta) that carry no structure.
70
+ atom_positions: np.ndarray | None = None # float32 [N_atoms, 3] Angstroms
71
+ atom_mask: np.ndarray | None = None # bool [N_atoms]
72
+ b_factors: np.ndarray | None = None # float32 [N_atoms] B-factor / pLDDT
54
73
 
55
- # Residue->atom mapping - shape [N_res]
56
- residue_atom_start: np.ndarray # int32 first atom index for each residue
57
- residue_atom_count: np.ndarray # int32 number of atoms per residue
74
+ # Residue->atom mapping - shape [N_res] (None for sequence-only entries)
75
+ residue_atom_start: np.ndarray | None = None # int32 first atom index for each residue
76
+ residue_atom_count: np.ndarray | None = None # int32 number of atoms per residue
58
77
 
59
78
  # Backbone dense layout - shapes [N_res, 4, 3] and [N_res, 4]
60
79
  # Atom order: N=0, CA=1, C=2, O=3 (missing atoms have mask=False, coords=0)
@@ -70,3 +89,12 @@ class ProteinTensorData:
70
89
  resolution: float = float("nan")
71
90
  method: str = ""
72
91
  deposition_date: str = ""
92
+
93
+ @property
94
+ def has_structure(self) -> bool:
95
+ """True if 3D coordinates are present; False for sequence-only entries."""
96
+ return self.atom_positions is not None
97
+
98
+ @property
99
+ def num_residues(self) -> int:
100
+ return int(self.sequence_tokens.shape[0])
@@ -22,7 +22,8 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
22
22
  "deposition_date": data.deposition_date,
23
23
  "created_at": time.time(),
24
24
  "num_residues": int(data.sequence_tokens.shape[0]),
25
- "num_atoms": int(data.atom_positions.shape[0]),
25
+ "num_atoms": int(data.atom_positions.shape[0]) if data.has_structure else 0,
26
+ "has_structure": data.has_structure,
26
27
  })
27
28
 
28
29
  seq = store.require_group("sequence")
@@ -30,14 +31,16 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
30
31
  _arr(seq, "residue_index", data.residue_index, "int32", compressor)
31
32
  _arr(seq, "chain_id", data.chain_id, "S1", compressor)
32
33
 
33
- atoms = store.require_group("atoms")
34
- _arr(atoms, "positions", data.atom_positions, "float32", compressor)
35
- _arr(atoms, "mask", data.atom_mask, "bool", compressor)
36
- _arr(atoms, "b_factors", data.b_factors, "float32", compressor)
34
+ # Atom-level and residue->atom mapping are omitted for sequence-only entries.
35
+ if data.has_structure:
36
+ atoms = store.require_group("atoms")
37
+ _arr(atoms, "positions", data.atom_positions, "float32", compressor)
38
+ _arr(atoms, "mask", data.atom_mask, "bool", compressor)
39
+ _arr(atoms, "b_factors", data.b_factors, "float32", compressor)
37
40
 
38
- struct = store.require_group("structure")
39
- _arr(struct, "residue_atom_start", data.residue_atom_start, "int32", compressor)
40
- _arr(struct, "residue_atom_count", data.residue_atom_count, "int32", compressor)
41
+ struct = store.require_group("structure")
42
+ _arr(struct, "residue_atom_start", data.residue_atom_start, "int32", compressor)
43
+ _arr(struct, "residue_atom_count", data.residue_atom_count, "int32", compressor)
41
44
 
42
45
  if data.backbone_positions is not None and data.backbone_mask is not None:
43
46
  bb = store.require_group("backbone")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proteintensor
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: AI-native biomolecular tensor format for structural biology ML
5
5
  Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
6
6
  License-Expression: MIT
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
243
243
  proteintensor info 1abc.ptt
244
244
  ```
245
245
 
246
+ ### Convert a sequence (no structure required)
247
+
248
+ For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
249
+ sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
250
+ coordinates) directly from a raw string or a FASTA file:
251
+
252
+ ```bash
253
+ proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
254
+ proteintensor convert-seq complex.fasta complex.ptt # multi-record FASTA -> multi-chain
255
+ ```
256
+
257
+ ```python
258
+ import proteintensor as pt
259
+
260
+ data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
261
+ data.has_structure # False - sequence-only entry
262
+ data.sequence_tokens # (N_res,) int32
263
+
264
+ pt.write(data, "ubq.ptt")
265
+
266
+ # FASTA: a single record -> one chain; multiple records -> multi-chain complex
267
+ data = pt.from_fasta("complex.fasta")
268
+ ```
269
+
246
270
  ### Benchmark against mmCIF
247
271
 
248
272
  ```bash
@@ -22,10 +22,12 @@ proteintensor/adapters/__init__.py
22
22
  proteintensor/adapters/boltz.py
23
23
  proteintensor/converters/__init__.py
24
24
  proteintensor/converters/mmcif.py
25
+ proteintensor/converters/sequence.py
25
26
  tests/test_adapters.py
26
27
  tests/test_dataset.py
27
28
  tests/test_embeddings.py
28
29
  tests/test_msa.py
29
30
  tests/test_pairs.py
30
31
  tests/test_remote.py
31
- tests/test_roundtrip.py
32
+ tests/test_roundtrip.py
33
+ tests/test_sequence.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proteintensor"
7
- version = "0.1.3"
7
+ version = "0.2.0"
8
8
  description = "AI-native biomolecular tensor format for structural biology ML"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -0,0 +1,150 @@
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import proteintensor as pt
5
+ from proteintensor.schema import (
6
+ AA_VOCAB, AA_UNK, AA_1LETTER, ONE_LETTER_TO_TOKEN,
7
+ sequence_to_tokens, tokens_to_sequence,
8
+ )
9
+ from proteintensor.converters.sequence import from_sequence, from_fasta, parse_fasta
10
+
11
+
12
+ UBIQUITIN = (
13
+ "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
14
+ )
15
+
16
+
17
+ # --------------------------------------------------------------------------
18
+ # vocab consistency (guards the AA_1LETTER bug that was fixed)
19
+ # --------------------------------------------------------------------------
20
+
21
+ def test_aa_1letter_matches_vocab_order():
22
+ # AA_1LETTER[token] must be the 1-letter code for that token.
23
+ three_to_one = {
24
+ "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
25
+ "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
26
+ "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
27
+ "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
28
+ }
29
+ for three, token in AA_VOCAB.items():
30
+ if three == "UNK":
31
+ assert AA_1LETTER[token] == "X"
32
+ else:
33
+ assert AA_1LETTER[token] == three_to_one[three]
34
+
35
+
36
+ def test_token_roundtrip_is_identity():
37
+ toks = sequence_to_tokens(UBIQUITIN)
38
+ assert tokens_to_sequence(toks) == UBIQUITIN
39
+
40
+
41
+ # --------------------------------------------------------------------------
42
+ # from_sequence
43
+ # --------------------------------------------------------------------------
44
+
45
+ def test_from_sequence_basic():
46
+ data = from_sequence(UBIQUITIN, pdb_id="UBQ", chain_id="A")
47
+ assert data.num_residues == len(UBIQUITIN)
48
+ assert data.has_structure is False
49
+ assert data.atom_positions is None
50
+ assert data.backbone_positions is None
51
+ assert data.sequence_tokens.dtype == np.int32
52
+ assert data.residue_index[0] == 1
53
+ assert data.residue_index[-1] == len(UBIQUITIN)
54
+ assert set(data.chain_id.tolist()) == {b"A"}
55
+
56
+
57
+ def test_from_sequence_whitespace_ignored():
58
+ a = from_sequence("MKT AYI\nAKQR")
59
+ b = from_sequence("MKTAYIAKQR")
60
+ np.testing.assert_array_equal(a.sequence_tokens, b.sequence_tokens)
61
+
62
+
63
+ def test_from_sequence_unknown_chars_map_to_unk():
64
+ # B, Z, J, O, X, and gap chars are not standard residues -> UNK
65
+ data = from_sequence("ABXZ-J")
66
+ assert data.sequence_tokens[0] == AA_VOCAB["ALA"]
67
+ assert (data.sequence_tokens[1:] == AA_UNK).all()
68
+
69
+
70
+ def test_from_sequence_empty_raises():
71
+ with pytest.raises(ValueError):
72
+ from_sequence(" \n ")
73
+
74
+
75
+ def test_from_sequence_residue_start():
76
+ data = from_sequence("MKT", residue_start=100)
77
+ np.testing.assert_array_equal(data.residue_index, [100, 101, 102])
78
+
79
+
80
+ # --------------------------------------------------------------------------
81
+ # round-trip through .ptt (write -> read) for a sequence-only entry
82
+ # --------------------------------------------------------------------------
83
+
84
+ def test_sequence_only_roundtrip(tmp_path):
85
+ data = from_sequence(UBIQUITIN, pdb_id="UBQ")
86
+ ptt = tmp_path / "ubq_seq.ptt"
87
+ pt.write(data, str(ptt))
88
+
89
+ loaded = pt.read(str(ptt))
90
+ assert loaded.has_structure is False
91
+ assert loaded.atom_positions is None
92
+ assert loaded.residue_atom_start is None
93
+ np.testing.assert_array_equal(loaded.sequence_tokens, data.sequence_tokens)
94
+ np.testing.assert_array_equal(loaded.residue_index, data.residue_index)
95
+ assert loaded.pdb_id == "UBQ"
96
+
97
+
98
+ def test_sequence_only_has_structure_flag_in_store(tmp_path):
99
+ import zarr
100
+ ptt = tmp_path / "seq.ptt"
101
+ pt.write(from_sequence("MKTAYIAKQR"), str(ptt))
102
+ store = zarr.open(str(ptt), mode="r")
103
+ assert store.attrs["has_structure"] is False
104
+ assert store.attrs["num_atoms"] == 0
105
+ assert "atoms" not in store
106
+ assert "structure" not in store
107
+ assert "sequence" in store
108
+
109
+
110
+ def test_read_backbone_on_sequence_only_raises(tmp_path):
111
+ ptt = tmp_path / "seq.ptt"
112
+ pt.write(from_sequence("MKTAYIAKQR"), str(ptt))
113
+ with pytest.raises(KeyError):
114
+ pt.read_backbone(str(ptt))
115
+
116
+
117
+ # --------------------------------------------------------------------------
118
+ # FASTA parsing
119
+ # --------------------------------------------------------------------------
120
+
121
+ def test_parse_fasta_multi():
122
+ text = ">chainA\nMKTA\nYIAK\n>chainB\nQRLL\n"
123
+ recs = parse_fasta(text)
124
+ assert recs == [("chainA", "MKTAYIAK"), ("chainB", "QRLL")]
125
+
126
+
127
+ def test_from_fasta_single(tmp_path):
128
+ fasta = tmp_path / "ubq.fasta"
129
+ fasta.write_text(f">UBQ\n{UBIQUITIN}\n")
130
+ data = from_fasta(fasta)
131
+ assert data.num_residues == len(UBIQUITIN)
132
+ assert data.pdb_id == "UBQ"
133
+ assert set(data.chain_id.tolist()) == {b"A"}
134
+
135
+
136
+ def test_from_fasta_multichain(tmp_path):
137
+ fasta = tmp_path / "complex.fasta"
138
+ fasta.write_text(">a\nMKTAY\n>b\nQRLLG\n")
139
+ data = from_fasta(fasta)
140
+ assert data.num_residues == 10
141
+ # two chains, A and B, residue numbering restarts per chain
142
+ assert set(data.chain_id.tolist()) == {b"A", b"B"}
143
+ np.testing.assert_array_equal(data.residue_index, [1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
144
+
145
+
146
+ def test_from_fasta_empty_raises(tmp_path):
147
+ fasta = tmp_path / "empty.fasta"
148
+ fasta.write_text("\n\n")
149
+ with pytest.raises(ValueError):
150
+ from_fasta(fasta)
@@ -1,3 +0,0 @@
1
- from .mmcif import from_mmcif
2
-
3
- __all__ = ["from_mmcif"]
File without changes
File without changes