PyPI - proteintensor - Versions diffs - 0.1.3__tar.gz → 0.2.0__tar.gz - Mend

proteintensor 0.1.3.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{proteintensor-0.1.3 → proteintensor-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: proteintensor
-Version: 0.1.3
+Version: 0.2.0
 Summary: AI-native biomolecular tensor format for structural biology ML
 Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
 License-Expression: MIT
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
 proteintensor info 1abc.ptt
 ```
+### Convert a sequence (no structure required)
+For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
+sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
+coordinates) directly from a raw string or a FASTA file:
+```bash
+proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
+proteintensor convert-seq complex.fasta complex.ptt   # multi-record FASTA -> multi-chain
+```
+```python
+import proteintensor as pt
+data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
+data.has_structure        # False - sequence-only entry
+data.sequence_tokens      # (N_res,)  int32
+pt.write(data, "ubq.ptt")
+# FASTA: a single record -> one chain; multiple records -> multi-chain complex
+data = pt.from_fasta("complex.fasta")
+```
 ### Benchmark against mmCIF
 ```bash

{proteintensor-0.1.3 → proteintensor-0.2.0}/README.md RENAMED Viewed

@@ -200,6 +200,30 @@ proteintensor convert 1abc.cif 1abc.ptt
 proteintensor info 1abc.ptt
 ```
+### Convert a sequence (no structure required)
+For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
+sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
+coordinates) directly from a raw string or a FASTA file:
+```bash
+proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
+proteintensor convert-seq complex.fasta complex.ptt   # multi-record FASTA -> multi-chain
+```
+```python
+import proteintensor as pt
+data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
+data.has_structure        # False - sequence-only entry
+data.sequence_tokens      # (N_res,)  int32
+pt.write(data, "ubq.ptt")
+# FASTA: a single record -> one chain; multiple records -> multi-chain complex
+data = pt.from_fasta("complex.fasta")
+```
 ### Benchmark against mmCIF
 ```bash

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/__init__.py RENAMED Viewed

@@ -33,10 +33,13 @@ from .bonds import (
 )
 from .dataset import ProteinDataset, create_dataset, add_to_dataset
 from .remote import consolidate
+from .converters import from_mmcif, from_sequence, from_fasta, parse_fasta
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 __all__ = [
+    # Converters - input
+    "from_mmcif", "from_sequence", "from_fasta", "parse_fasta",
     # I/O - structure
     "read", "write",
     "read_backbone", "read_bonds",

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/cli.py RENAMED Viewed

@@ -73,6 +73,62 @@ def convert(input_path: Path, output_path: Path, compression: str, pdb_id: str):
     console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
+# ---------------------------------------------------------------------------
+# convert-seq
+# ---------------------------------------------------------------------------
+@main.command("convert-seq")
+@click.argument("sequence_or_fasta", type=str)
+@click.argument("output_path", type=click.Path(path_type=Path))
+@click.option("--compression", default="blosc", show_default=True,
+              type=click.Choice(["blosc", "none"]),
+              help="Compression codec for the Zarr store.")
+@click.option("--pdb-id", default="", help="Identifier stored in metadata (e.g. UniProt accession).")
+@click.option("--chain", default="A", show_default=True,
+              help="Chain label applied to a raw sequence input.")
+def convert_seq(sequence_or_fasta: str, output_path: Path, compression: str,
+                pdb_id: str, chain: str):
+    """Convert a protein sequence to ProteinTensor (.ptt) format.
+    SEQUENCE_OR_FASTA may be a path to a FASTA file or a literal 1-letter
+    amino-acid string. The result is a sequence-only .ptt (no coordinates) -
+    the primary input form for AlphaFold- and Boltz-style predictors.
+    """
+    # NOTE: the docstring above doubles as the command's --help text.
+    import errno
+    from .converters.sequence import from_sequence, from_fasta
+    from .writer import write
+    # An argument that names an existing regular file is read as FASTA; anything
+    # else is taken as a literal sequence. is_file() already implies existence.
+    # A long literal sequence can exceed the OS file-name limit, in which case
+    # the underlying stat() fails with ENAMETOOLONG and pathlib may re-raise it;
+    # such an argument cannot be a file, so treat it as a literal. Any other
+    # OSError (e.g. a permission problem) is still reported to the user.
+    src = Path(sequence_or_fasta)
+    try:
+        is_file = src.is_file()
+    except OSError as exc:
+        if exc.errno != errno.ENAMETOOLONG:
+            raise
+        is_file = False
+    t0 = time.perf_counter()
+    if is_file:
+        # --chain is not used here: from_fasta assigns chain labels itself.
+        data = from_fasta(src, pdb_id=pdb_id)
+        source_desc = src.name
+    else:
+        data = from_sequence(sequence_or_fasta, pdb_id=pdb_id, chain_id=chain)
+        source_desc = f"<literal sequence: {data.num_residues} aa>"
+    build_ms = (time.perf_counter() - t0) * 1000
+    t0 = time.perf_counter()
+    write(data, output_path, compression=compression)
+    write_ms = (time.perf_counter() - t0) * 1000
+    # Total size of every regular file found under the output path.
+    dst_bytes = sum(f.stat().st_size for f in Path(output_path).rglob("*") if f.is_file())
+    # Summary panel printed after a successful conversion.
+    tbl = Table(show_header=False, box=None, padding=(0, 2))
+    tbl.add_row("PDB ID",    data.pdb_id or "(unknown)")
+    tbl.add_row("Chains",    _chain_summary(data.chain_id))
+    tbl.add_row("Residues",  f"{data.num_residues:,}")
+    tbl.add_row("Structure", "no (sequence-only)")
+    tbl.add_row("")
+    tbl.add_row("Source",     source_desc)
+    tbl.add_row("Build time", f"{build_ms:.1f} ms")
+    tbl.add_row("Write time", f"{write_ms:.1f} ms")
+    tbl.add_row("Output",     _fmt_bytes(dst_bytes))
+    console.print(Panel(tbl, title=f"[green]Converted -> {output_path}[/green]", expand=False))
 # ---------------------------------------------------------------------------
 # info
 # ---------------------------------------------------------------------------

proteintensor-0.2.0/proteintensor/converters/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .mmcif import from_mmcif
+from .sequence import from_sequence, from_fasta, parse_fasta
+__all__ = ["from_mmcif", "from_sequence", "from_fasta", "parse_fasta"]

proteintensor-0.2.0/proteintensor/converters/sequence.py ADDED Viewed

@@ -0,0 +1,103 @@
+from __future__ import annotations
+import numpy as np
+from pathlib import Path
+from ..schema import ProteinTensorData, sequence_to_tokens
+def from_sequence(
+    sequence: str,
+    *,
+    pdb_id: str = "",
+    chain_id: str = "A",
+    residue_start: int = 1,
+) -> ProteinTensorData:
+    """Encode a 1-letter amino-acid string as a sequence-only ProteinTensorData.
+    Only the sequence tokens, residue numbering, chain labels, ``pdb_id`` and
+    ``method="sequence"`` are set; every other field keeps its dataclass default,
+    so the result carries no coordinates.
+    Parameters
+    ----------
+    sequence       Residue string, tokenised by ``sequence_to_tokens``.
+    pdb_id         Identifier copied into the result unchanged.
+    chain_id       Chain label; only its first character is used, and an empty
+                   string falls back to "A".
+    residue_start  Number given to the first residue; later residues count up by one.
+    Raises
+    ------
+    ValueError     If tokenising the input yields no residues.
+    """
+    encoded = sequence_to_tokens(sequence)
+    length = encoded.shape[0]
+    if length == 0:
+        raise ValueError("Empty sequence: no amino-acid residues to encode.")
+    label = chain_id[0] if chain_id else "A"
+    numbering = np.arange(residue_start, residue_start + length, dtype=np.int32)
+    # The same one-byte label is repeated for every residue.
+    chains = np.full(length, label.encode(), dtype="S1")
+    return ProteinTensorData(
+        sequence_tokens=encoded,
+        residue_index=numbering,
+        chain_id=chains,
+        pdb_id=pdb_id,
+        method="sequence",
+    )
+def parse_fasta(text: str) -> list[tuple[str, str]]:
+    """Split FASTA text into (header, sequence) pairs, in file order.
+    Each line is stripped and blank lines are skipped. A line starting with
+    ">" opens a new record whose header is the rest of that line, stripped.
+    The lines that follow are joined, without separators, into its sequence.
+    Sequence lines that appear before the first header are discarded.
+    """
+    parsed: list[tuple[str, str]] = []
+    current_header: str | None = None
+    parts: list[str] = []
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        if stripped == "":
+            continue
+        if not stripped.startswith(">"):
+            parts.append(stripped)
+            continue
+        # New header: flush the record that was being collected, if any.
+        if current_header is not None:
+            parsed.append((current_header, "".join(parts)))
+        current_header = stripped[1:].strip()
+        parts = []
+    # Flush the final record.
+    if current_header is not None:
+        parsed.append((current_header, "".join(parts)))
+    return parsed
+def from_fasta(path: str | Path, *, pdb_id: str = "") -> ProteinTensorData:
+    """Read a FASTA file and build a sequence-only ProteinTensorData.
+    One record gives a single chain labelled "A". Several records are joined
+    into one multi-chain entry: record i gets the label ``_chain_label(i)`` and
+    its residue numbering starts again at 1. Record headers are not used.
+    When ``pdb_id`` is empty it is derived from the file name: the stem,
+    upper-cased, up to the first underscore.
+    Raises ValueError if the file holds no FASTA records.
+    """
+    fasta_path = Path(path)
+    entries = parse_fasta(fasta_path.read_text())
+    if not entries:
+        raise ValueError(f"No FASTA records found in '{fasta_path}'.")
+    if not pdb_id:
+        pdb_id = fasta_path.stem.upper().split("_")[0]
+    if len(entries) == 1:
+        (_, only_seq), = entries
+        return from_sequence(only_seq, pdb_id=pdb_id, chain_id="A")
+    # Encode every record as its own chain, then stitch the arrays together.
+    chains = [
+        from_sequence(seq, chain_id=_chain_label(pos))
+        for pos, (_, seq) in enumerate(entries)
+    ]
+    return ProteinTensorData(
+        sequence_tokens=np.concatenate([c.sequence_tokens for c in chains]),
+        residue_index=np.concatenate([c.residue_index for c in chains]),
+        chain_id=np.concatenate([c.chain_id for c in chains]),
+        pdb_id=pdb_id,
+        method="sequence",
+    )
+def _chain_label(index: int) -> str:
+    """Map a 0-based chain index to a unique single-character chain label.
+    Labels run A-Z (0-25), then a-z (26-51), then 0-9 (52-61).
+    Raises ValueError for a negative index or one past the last label.
+    """
+    alphabet = (
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        "abcdefghijklmnopqrstuvwxyz"
+        "0123456789"
+    )
+    # A shared fallback label would make distinct chains indistinguishable, and
+    # a negative index would wrap around, so both are rejected explicitly.
+    if not 0 <= index < len(alphabet):
+        raise ValueError(
+            f"Chain index {index} is out of range: only {len(alphabet)} "
+            "single-character chain labels are available."
+        )
+    return alphabet[index]

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/reader.py RENAMED Viewed

@@ -24,6 +24,8 @@ def read(
     store = open_store(path, storage_options=storage_options)
     attrs = dict(store.attrs)
+    has_atoms      = "atoms" in store
+    has_struct     = "structure" in store
     bb_positions   = store["backbone/positions"][:] if "backbone" in store else None
     bb_mask        = store["backbone/mask"][:]      if "backbone" in store else None
     bond_edge_idx  = store["bonds/edge_index"][:]   if "bonds"    in store else None
@@ -33,11 +35,11 @@ def read(
         sequence_tokens=store["sequence/tokens"][:],
         residue_index=store["sequence/residue_index"][:],
         chain_id=store["sequence/chain_id"][:],
-        atom_positions=store["atoms/positions"][:],
-        atom_mask=store["atoms/mask"][:],
-        b_factors=store["atoms/b_factors"][:],
-        residue_atom_start=store["structure/residue_atom_start"][:],
-        residue_atom_count=store["structure/residue_atom_count"][:],
+        atom_positions=store["atoms/positions"][:]  if has_atoms  else None,
+        atom_mask=store["atoms/mask"][:]            if has_atoms  else None,
+        b_factors=store["atoms/b_factors"][:]       if has_atoms  else None,
+        residue_atom_start=store["structure/residue_atom_start"][:] if has_struct else None,
+        residue_atom_count=store["structure/residue_atom_count"][:] if has_struct else None,
         backbone_positions=bb_positions,
         backbone_mask=bb_mask,
         bond_edge_index=bond_edge_idx,

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/schema.py RENAMED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 import numpy as np
-FORMAT_VERSION = "0.6"
+FORMAT_VERSION = "0.7"
 AA_VOCAB: dict[str, int] = {
     "ALA": 0, "ARG": 1, "ASN": 2, "ASP": 3, "CYS": 4,
@@ -14,8 +14,26 @@ AA_VOCAB: dict[str, int] = {
 AA_UNK = 20
 AA_VOCAB_SIZE = 21
-# Single-letter equivalents for display
-AA_1LETTER = "ARNDCQEGHILKMFPSTWYXU"
+# Single-letter codes indexed by token: position i is the 1-letter code for token i.
+# Tokens 0-19 are the standard amino acids in AA_VOCAB order; token 20 (UNK) -> "X".
+AA_1LETTER = "ARNDCQEGHILKMFPSTWYVX"
+# Inverse map for sequence input. Any character absent here resolves to AA_UNK,
+# which also covers ambiguity codes (B, Z, J, O) and gaps (-, .).
+ONE_LETTER_TO_TOKEN: dict[str, int] = {c: i for i, c in enumerate(AA_1LETTER)}
+def sequence_to_tokens(sequence: str) -> np.ndarray:
+    """Encode a 1-letter amino-acid string as an int32 token array.
+    All whitespace is removed and the text is upper-cased first. Any character
+    that has no entry in ONE_LETTER_TO_TOKEN becomes AA_UNK.
+    """
+    residues = "".join(sequence.split()).upper()
+    lookup = ONE_LETTER_TO_TOKEN.get
+    token_ids = [lookup(letter, AA_UNK) for letter in residues]
+    return np.array(token_ids, dtype=np.int32)
+def tokens_to_sequence(tokens: np.ndarray) -> str:
+    """Decode a token array into a 1-letter amino-acid string.
+    A token outside the range 0 .. AA_VOCAB_SIZE - 1 is rendered as "X".
+    """
+    letters = []
+    for token in tokens:
+        idx = int(token)
+        in_vocab = 0 <= idx < AA_VOCAB_SIZE
+        letters.append(AA_1LETTER[idx] if in_vocab else "X")
+    return "".join(letters)
 # Canonical backbone atom order (AlphaFold / OpenFold convention)
 BACKBONE_ATOMS = ["N", "CA", "C", "O"]
@@ -47,14 +65,15 @@ class ProteinTensorData:
     residue_index: np.ndarray        # int32   PDB sequence numbers
     chain_id: np.ndarray             # S1      single-char chain labels
-    # Atom-level - shapes [N_atoms] or [N_atoms, 3]
-    atom_positions: np.ndarray       # float32 [N_atoms, 3]  Angstroms
-    atom_mask: np.ndarray            # bool    [N_atoms]
-    b_factors: np.ndarray            # float32 [N_atoms]     B-factor / pLDDT
+    # Atom-level - shapes [N_atoms] or [N_atoms, 3].
+    # None for sequence-only entries (from_sequence / from_fasta) that carry no structure.
+    atom_positions: np.ndarray | None = None   # float32 [N_atoms, 3]  Angstroms
+    atom_mask: np.ndarray | None = None        # bool    [N_atoms]
+    b_factors: np.ndarray | None = None        # float32 [N_atoms]     B-factor / pLDDT
-    # Residue->atom mapping - shape [N_res]
-    residue_atom_start: np.ndarray   # int32   first atom index for each residue
-    residue_atom_count: np.ndarray   # int32   number of atoms per residue
+    # Residue->atom mapping - shape [N_res] (None for sequence-only entries)
+    residue_atom_start: np.ndarray | None = None   # int32   first atom index for each residue
+    residue_atom_count: np.ndarray | None = None   # int32   number of atoms per residue
     # Backbone dense layout - shapes [N_res, 4, 3] and [N_res, 4]
     # Atom order: N=0, CA=1, C=2, O=3  (missing atoms have mask=False, coords=0)
@@ -70,3 +89,12 @@ class ProteinTensorData:
     resolution: float = float("nan")
     method: str = ""
     deposition_date: str = ""
+    @property
+    def has_structure(self) -> bool:
+        """Whether this entry carries atom coordinates.
+        False when ``atom_positions`` is None (a sequence-only entry), True otherwise.
+        """
+        if self.atom_positions is None:
+            return False
+        return True
+    @property
+    def num_residues(self) -> int:
+        """Number of residues: the length of ``sequence_tokens`` along its first axis."""
+        first_axis = self.sequence_tokens.shape[0]
+        return int(first_axis)

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/writer.py RENAMED Viewed

@@ -22,7 +22,8 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
         "deposition_date": data.deposition_date,
         "created_at": time.time(),
         "num_residues": int(data.sequence_tokens.shape[0]),
-        "num_atoms": int(data.atom_positions.shape[0]),
+        "num_atoms": int(data.atom_positions.shape[0]) if data.has_structure else 0,
+        "has_structure": data.has_structure,
     })
     seq = store.require_group("sequence")
@@ -30,14 +31,16 @@ def write(data: ProteinTensorData, path: str | Path, compression: str = "blosc")
     _arr(seq, "residue_index", data.residue_index,      "int32",   compressor)
     _arr(seq, "chain_id",      data.chain_id,           "S1",      compressor)
-    atoms = store.require_group("atoms")
-    _arr(atoms, "positions",   data.atom_positions,     "float32", compressor)
-    _arr(atoms, "mask",        data.atom_mask,          "bool",    compressor)
-    _arr(atoms, "b_factors",   data.b_factors,          "float32", compressor)
+    # Atom-level and residue->atom mapping are omitted for sequence-only entries.
+    if data.has_structure:
+        atoms = store.require_group("atoms")
+        _arr(atoms, "positions",   data.atom_positions,     "float32", compressor)
+        _arr(atoms, "mask",        data.atom_mask,          "bool",    compressor)
+        _arr(atoms, "b_factors",   data.b_factors,          "float32", compressor)
-    struct = store.require_group("structure")
-    _arr(struct, "residue_atom_start", data.residue_atom_start, "int32", compressor)
-    _arr(struct, "residue_atom_count", data.residue_atom_count, "int32", compressor)
+        struct = store.require_group("structure")
+        _arr(struct, "residue_atom_start", data.residue_atom_start, "int32", compressor)
+        _arr(struct, "residue_atom_count", data.residue_atom_count, "int32", compressor)
     if data.backbone_positions is not None and data.backbone_mask is not None:
         bb = store.require_group("backbone")

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: proteintensor
-Version: 0.1.3
+Version: 0.2.0
 Summary: AI-native biomolecular tensor format for structural biology ML
 Author-email: Clayton Moore <claytonwaynemoore@gmail.com>
 License-Expression: MIT
@@ -243,6 +243,30 @@ proteintensor convert 1abc.cif 1abc.ptt
 proteintensor info 1abc.ptt
 ```
+### Convert a sequence (no structure required)
+For sequence-driven predictors like AlphaFold and Boltz, the primary input is a
+sequence, not a structure. ProteinTensor can build a sequence-only `.ptt` (no
+coordinates) directly from a raw string or a FASTA file:
+```bash
+proteintensor convert-seq MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDG ubq.ptt
+proteintensor convert-seq complex.fasta complex.ptt   # multi-record FASTA -> multi-chain
+```
+```python
+import proteintensor as pt
+data = pt.from_sequence("MQIFVKTLTGK...", pdb_id="UBQ", chain_id="A")
+data.has_structure        # False - sequence-only entry
+data.sequence_tokens      # (N_res,)  int32
+pt.write(data, "ubq.ptt")
+# FASTA: a single record -> one chain; multiple records -> multi-chain complex
+data = pt.from_fasta("complex.fasta")
+```
 ### Benchmark against mmCIF
 ```bash

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/SOURCES.txt RENAMED Viewed

@@ -22,10 +22,12 @@ proteintensor/adapters/__init__.py
 proteintensor/adapters/boltz.py
 proteintensor/converters/__init__.py
 proteintensor/converters/mmcif.py
+proteintensor/converters/sequence.py
 tests/test_adapters.py
 tests/test_dataset.py
 tests/test_embeddings.py
 tests/test_msa.py
 tests/test_pairs.py
 tests/test_remote.py
-tests/test_roundtrip.py
+tests/test_roundtrip.py
+tests/test_sequence.py

{proteintensor-0.1.3 → proteintensor-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "proteintensor"
-version = "0.1.3"
+version = "0.2.0"
 description = "AI-native biomolecular tensor format for structural biology ML"
 readme = "README.md"
 requires-python = ">=3.9"

proteintensor-0.2.0/tests/test_sequence.py ADDED Viewed

@@ -0,0 +1,150 @@
+import numpy as np
+import pytest
+import proteintensor as pt
+from proteintensor.schema import (
+    AA_VOCAB, AA_UNK, AA_1LETTER, ONE_LETTER_TO_TOKEN,
+    sequence_to_tokens, tokens_to_sequence,
+)
+from proteintensor.converters.sequence import from_sequence, from_fasta, parse_fasta
+UBIQUITIN = (
+    "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
+)
+# --------------------------------------------------------------------------
+# vocab consistency (guards the AA_1LETTER bug that was fixed)
+# --------------------------------------------------------------------------
+def test_aa_1letter_matches_vocab_order():
+    """AA_1LETTER must hold, at each token's index, that token's 1-letter code."""
+    one_letter_for = {
+        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
+        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
+        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
+        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
+    }
+    for name, tok in AA_VOCAB.items():
+        # An "UNK" vocabulary entry, if present, is displayed as "X".
+        expected = "X" if name == "UNK" else one_letter_for[name]
+        assert AA_1LETTER[tok] == expected
+def test_token_roundtrip_is_identity():
+    """Encoding ubiquitin to tokens and decoding again returns the same string."""
+    decoded = tokens_to_sequence(sequence_to_tokens(UBIQUITIN))
+    assert decoded == UBIQUITIN
+# --------------------------------------------------------------------------
+# from_sequence
+# --------------------------------------------------------------------------
+def test_from_sequence_basic():
+    """from_sequence yields a sequence-only entry numbered from 1 on chain A."""
+    expected_len = len(UBIQUITIN)
+    entry = from_sequence(UBIQUITIN, pdb_id="UBQ", chain_id="A")
+    assert entry.num_residues == expected_len
+    # No coordinates of any kind are attached.
+    assert entry.has_structure is False
+    assert entry.atom_positions is None
+    assert entry.backbone_positions is None
+    assert entry.sequence_tokens.dtype == np.int32
+    first, last = entry.residue_index[0], entry.residue_index[-1]
+    assert first == 1
+    assert last == expected_len
+    assert set(entry.chain_id.tolist()) == {b"A"}
+def test_from_sequence_whitespace_ignored():
+    """Spaces and newlines inside the input do not change the tokens."""
+    spaced = from_sequence("MKT AYI\nAKQR").sequence_tokens
+    compact = from_sequence("MKTAYIAKQR").sequence_tokens
+    np.testing.assert_array_equal(spaced, compact)
+def test_from_sequence_unknown_chars_map_to_unk():
+    """Only the leading "A" is a standard residue; B, X, Z, "-" and J become UNK."""
+    tokens = from_sequence("ABXZ-J").sequence_tokens
+    head, tail = tokens[0], tokens[1:]
+    assert head == AA_VOCAB["ALA"]
+    assert (tail == AA_UNK).all()
+def test_from_sequence_empty_raises():
+    """Input made only of whitespace has no residues and is rejected."""
+    blank = "   \n  "
+    with pytest.raises(ValueError):
+        from_sequence(blank)
+def test_from_sequence_residue_start():
+    """residue_start sets the first residue number; the rest count up from it."""
+    numbering = from_sequence("MKT", residue_start=100).residue_index
+    np.testing.assert_array_equal(numbering, [100, 101, 102])
+# --------------------------------------------------------------------------
+# round-trip through .ptt (write -> read) for a sequence-only entry
+# --------------------------------------------------------------------------
+def test_sequence_only_roundtrip(tmp_path):
+    """A sequence-only entry survives write -> read with no structure fields."""
+    original = from_sequence(UBIQUITIN, pdb_id="UBQ")
+    target = str(tmp_path / "ubq_seq.ptt")
+    pt.write(original, target)
+    restored = pt.read(target)
+    # Structure fields come back as None.
+    assert restored.has_structure is False
+    assert restored.atom_positions is None
+    assert restored.residue_atom_start is None
+    # Sequence data and metadata are preserved.
+    np.testing.assert_array_equal(restored.sequence_tokens, original.sequence_tokens)
+    np.testing.assert_array_equal(restored.residue_index, original.residue_index)
+    assert restored.pdb_id == "UBQ"
+def test_sequence_only_has_structure_flag_in_store(tmp_path):
+    """The written store flags the entry as structure-less and has no atom groups."""
+    import zarr
+    location = str(tmp_path / "seq.ptt")
+    pt.write(from_sequence("MKTAYIAKQR"), location)
+    root = zarr.open(location, mode="r")
+    attrs = root.attrs
+    assert attrs["has_structure"] is False
+    assert attrs["num_atoms"] == 0
+    for absent in ("atoms", "structure"):
+        assert absent not in root
+    assert "sequence" in root
+def test_read_backbone_on_sequence_only_raises(tmp_path):
+    """read_backbone on a sequence-only store raises KeyError."""
+    location = str(tmp_path / "seq.ptt")
+    pt.write(from_sequence("MKTAYIAKQR"), location)
+    with pytest.raises(KeyError):
+        pt.read_backbone(location)
+# --------------------------------------------------------------------------
+# FASTA parsing
+# --------------------------------------------------------------------------
+def test_parse_fasta_multi():
+    """Wrapped sequence lines are joined per record; headers lose their ">"."""
+    parsed = parse_fasta(">chainA\nMKTA\nYIAK\n>chainB\nQRLL\n")
+    expected = [("chainA", "MKTAYIAK"), ("chainB", "QRLL")]
+    assert parsed == expected
+def test_from_fasta_single(tmp_path):
+    """A one-record FASTA becomes chain A; pdb_id defaults to the upper-cased stem."""
+    fasta_file = tmp_path / "ubq.fasta"
+    fasta_file.write_text(f">UBQ\n{UBIQUITIN}\n")
+    entry = from_fasta(fasta_file)
+    assert entry.num_residues == len(UBIQUITIN)
+    assert entry.pdb_id == "UBQ"
+    assert set(entry.chain_id.tolist()) == {b"A"}
+def test_from_fasta_multichain(tmp_path):
+    """Two records become chains A and B, each numbered from 1."""
+    fasta_file = tmp_path / "complex.fasta"
+    fasta_file.write_text(">a\nMKTAY\n>b\nQRLLG\n")
+    entry = from_fasta(fasta_file)
+    assert entry.num_residues == 10
+    assert set(entry.chain_id.tolist()) == {b"A", b"B"}
+    # Numbering restarts for the second chain.
+    per_chain = [1, 2, 3, 4, 5]
+    np.testing.assert_array_equal(entry.residue_index, per_chain + per_chain)
+def test_from_fasta_empty_raises(tmp_path):
+    """A file with only blank lines holds no records and is rejected."""
+    blank_file = tmp_path / "empty.fasta"
+    blank_file.write_text("\n\n")
+    with pytest.raises(ValueError):
+        from_fasta(blank_file)

proteintensor-0.1.3/proteintensor/converters/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from .mmcif import from_mmcif
-__all__ = ["from_mmcif"]

{proteintensor-0.1.3 → proteintensor-0.2.0}/LICENSE RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/__init__.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/adapters/boltz.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/bonds.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/converters/mmcif.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/dataset.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/embeddings.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/msa.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/pairs.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor/remote.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/entry_points.txt RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/requires.txt RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/proteintensor.egg-info/top_level.txt RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/setup.cfg RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_adapters.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_dataset.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_embeddings.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_msa.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_pairs.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_remote.py RENAMED Viewed

File without changes

{proteintensor-0.1.3 → proteintensor-0.2.0}/tests/test_roundtrip.py RENAMED Viewed

File without changes

proteintensor 0.1.3__tar.gz → 0.2.0__tar.gz

proteintensor 0.1.3.tar.gz → 0.2.0.tar.gz