tpixel 0.1.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tpixel/__init__.py +24 -0
- tpixel/_version.py +34 -0
- tpixel/cli.py +137 -0
- tpixel/fasta.py +109 -0
- tpixel/hiv.py +190 -0
- tpixel/hxb2.py +187 -0
- tpixel/models.py +138 -0
- tpixel/pngs.py +172 -0
- tpixel/renderer.py +368 -0
- tpixel-0.1.1.dev0.dist-info/METADATA +77 -0
- tpixel-0.1.1.dev0.dist-info/RECORD +15 -0
- tpixel-0.1.1.dev0.dist-info/WHEEL +5 -0
- tpixel-0.1.1.dev0.dist-info/entry_points.txt +2 -0
- tpixel-0.1.1.dev0.dist-info/licenses/LICENSE +21 -0
- tpixel-0.1.1.dev0.dist-info/top_level.txt +1 -0
tpixel/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""tpixel — Pixel-block alignment viewer for hundreds of sequences."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from tpixel.fasta import fasta_panel, read_fasta
|
|
6
|
+
from tpixel.hiv import hiv_panel
|
|
7
|
+
from tpixel.models import Marker, Panel, Region, SeqGroup
|
|
8
|
+
from tpixel.renderer import render_panels
|
|
9
|
+
|
|
10
|
+
# Resolve the package version from the installed distribution's metadata.
try:
    __version__ = version("tpixel")
except PackageNotFoundError:
    # No metadata for a "tpixel" distribution could be found, so fall back
    # to a placeholder version string instead of failing at import time.
    __version__ = "0.0.0"

# Names re-exported from the submodules as the package's public API.
__all__ = [
    "Marker",
    "Panel",
    "Region",
    "SeqGroup",
    "fasta_panel",
    "hiv_panel",
    "read_fasta",
    "render_panels",
]
|
tpixel/_version.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
# Public names of this generated module: each value is exposed under both a
# dunder name and a plain alias.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Always False at runtime, so the ``typing`` imports below are never executed
# when this module is imported.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders so the annotations below still resolve.
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Version string and its tuple form.
__version__ = version = '0.1.1.dev0'
__version_tuple__ = version_tuple = (0, 1, 1, 'dev0')

# No commit id was recorded for this build.
__commit_id__ = commit_id = None
|
tpixel/cli.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Click CLI for tpixel."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from tpixel.fasta import fasta_panel, read_fasta
|
|
8
|
+
from tpixel.renderer import render_panels
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _expand_stdin(paths: list[str]) -> list[str]:
|
|
12
|
+
"""If paths is ``['-']``, read file paths from stdin (one per line).
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
paths: List of file path strings. A single ``'-'`` triggers stdin reading.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
Expanded list of file paths.
|
|
19
|
+
|
|
20
|
+
Examples:
|
|
21
|
+
>>> _expand_stdin(["file1.fasta", "file2.fasta"])
|
|
22
|
+
['file1.fasta', 'file2.fasta']
|
|
23
|
+
>>> _expand_stdin([])
|
|
24
|
+
[]
|
|
25
|
+
"""
|
|
26
|
+
if paths and len(paths) == 1 and paths[0] == "-":
|
|
27
|
+
return [line.strip() for line in sys.stdin if line.strip()]
|
|
28
|
+
return list(paths)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _auto_detect_hiv(fasta_path: str) -> bool:
    """Check if alignment contains HxB2 and a ``*_ref`` sequence.

    Only the first whitespace-delimited token of each FASTA header is
    considered. Records whose header is empty (a bare ``>`` line) are
    ignored rather than raising ``IndexError``.

    Args:
        fasta_path: Path to the aligned FASTA file.

    Returns:
        ``True`` if both HxB2 and a ``*_ref`` sequence are present.
    """
    first_tokens: set[str] = set()
    for header, _ in read_fasta(fasta_path):
        tokens = header.split()
        if tokens:  # an empty header has no identifier to inspect
            first_tokens.add(tokens[0])

    has_hxb2 = "HxB2" in first_tokens
    has_ref = any(token.endswith("_ref") for token in first_tokens)
    return has_hxb2 and has_ref
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@click.command(
    context_settings={"help_option_names": ["-h", "--help"]},
    epilog="Use '-' to read file paths from stdin, e.g.:\n\n"
    " find . -name '*.fasta' | tpixel --fasta - -o out.png",
)
@click.option(
    "--fasta",
    multiple=True,
    help="Aligned FASTA file(s) — each becomes a panel. Use '-' for stdin.",
)
@click.option(
    "--columns", help="Column range for FASTA, 1-based inclusive (e.g. 1-120)."
)
@click.option(
    "-o",
    "--output",
    default="pixel.png",
    show_default=True,
    help="Output image path.",
)
@click.option(
    "--dpi", type=int, default=300, show_default=True, help="Image resolution."
)
@click.option(
    "--cell", type=float, default=None, help="Cell size in inches (default: 0.03)."
)
@click.option(
    "--hiv/--no-hiv",
    default=None,
    help="Force HIV mode (HxB2 regions, PNGS, animal grouping). Auto-detected if omitted.",
)
@click.option(
    "--nt/--aa",
    default=None,
    help="Force nucleotide or amino-acid mode. Auto-detected if omitted.",
)
@click.option(
    "--ref-pos",
    default="1,2",
    show_default=True,
    help="Comma-separated 1-based positions of reference sequences. "
    "Last position is the primary reference; earlier ones are extra reference rows.",
)
@click.option(
    "--title",
    default=None,
    help="Title displayed above the plot.",
)
def main(fasta, columns, output, dpi, cell, hiv, nt, ref_pos, title):
    """Pixel-block alignment viewer for hundreds of sequences.

    Renders Roark-style PIXEL plots: grey=match, red=substitution, black=gap.
    Each sequence is a thin row of colored blocks — no text in cells.

    HIV mode is auto-detected when the alignment contains HxB2 and a *_ref
    sequence. Force with --hiv or --no-hiv.
    """
    # NOTE: the docstring above is the --help text shown by click; keep it
    # user-facing and put implementation notes in comments instead.
    fasta_paths = _expand_stdin(list(fasta))

    if not fasta_paths:
        raise click.UsageError("Provide --fasta")

    # Malformed --ref-pos values are reported as usage errors rather than
    # escaping as a ValueError traceback.
    try:
        ref_positions = [int(x) for x in ref_pos.split(",")]
    except ValueError:
        raise click.BadParameter(
            f"expected comma-separated integers, got {ref_pos!r}",
            param_hint="--ref-pos",
        ) from None
    # Positions are 1-based; 0 or a negative value would otherwise be turned
    # into a negative list index and silently select from the end.
    if any(pos < 1 for pos in ref_positions):
        raise click.BadParameter(
            f"positions are 1-based and must be >= 1, got {ref_pos!r}",
            param_hint="--ref-pos",
        )

    col_start, col_end = None, None
    if columns:
        # Commas are removed before splitting on '-', so "1,000-2,000" parses.
        parts = columns.replace(",", "").split("-")
        try:
            col_start = int(parts[0])
            col_end = int(parts[1]) if len(parts) > 1 else None
        except ValueError:
            raise click.BadParameter(
                f"expected START or START-END, got {columns!r}",
                param_hint="--columns",
            ) from None

    # --nt/--aa is a tri-state flag: None means "auto-detect". The value is
    # the same for every input file, so compute it once.
    seq_type = None
    if nt is True:
        seq_type = "NT"
    elif nt is False:
        seq_type = "AA"

    panels = []
    for fasta_path in fasta_paths:
        use_hiv = hiv if hiv is not None else _auto_detect_hiv(fasta_path)

        if use_hiv:
            # Imported here so the HIV module is only loaded when needed.
            from tpixel.hiv import hiv_panel

            panel = hiv_panel(fasta_path, ref_positions=ref_positions, seq_type=seq_type)
        else:
            panel = fasta_panel(fasta_path, col_start, col_end, ref_positions=ref_positions)

        if title:
            panel.title = title
        panels.append(panel)

    render_panels(panels, output, dpi=dpi, cell=cell)
|
tpixel/fasta.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""FASTA parsing and panel construction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from tpixel.models import Panel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def read_fasta(path: str | Path) -> list[tuple[str, str]]:
    """Parse a FASTA file into a list of (name, sequence) tuples.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is skipped. Without that, the BOM would hide the first ``>`` and the
    first record would be silently dropped.

    Args:
        path: Path to the FASTA file.

    Returns:
        List of (header_name, concatenated_sequence) tuples in file order.
        The header is the text after ``>`` with surrounding whitespace
        removed. Sequence lines are stripped and joined without separators.
        Text before the first header is ignored.
    """
    records: list[tuple[str, str]] = []
    header: str | None = None
    chunks: list[str] = []
    # "utf-8-sig" behaves like "utf-8" but drops an initial BOM.
    with open(path, encoding="utf-8-sig") as fh:
        for line in fh:
            if line.startswith(">"):
                if header is not None:
                    records.append((header, "".join(chunks)))
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
    if header is not None:
        records.append((header, "".join(chunks)))
    return records
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def fasta_panel(
    path: str | Path,
    col_start: int | None = None,
    col_end: int | None = None,
    ref_positions: list[int] | None = None,
) -> Panel:
    """Build a Panel from an aligned FASTA.

    Args:
        path: Path to the aligned FASTA file.
        col_start: 1-based inclusive start column for slicing the alignment.
        col_end: 1-based inclusive end column for slicing the alignment.
        ref_positions: 1-based positions of reference sequences. Last is
            the primary reference; earlier ones become extra reference rows.
            Defaults to [1].

    Returns:
        A Panel with reference row, sequence rows, and column labels.

    Raises:
        ValueError: If the FASTA file contains no sequences, or if
            ``ref_positions`` is empty or names a position outside
            ``1..number_of_sequences``.
    """
    if ref_positions is None:
        ref_positions = [1]

    seqs = read_fasta(path)
    if not seqs:
        raise ValueError(f"No sequences in {path}")

    # Validate before indexing: position 0 or a negative value would become
    # a negative list index and silently select a sequence from the end,
    # and a too-large one would raise an opaque IndexError.
    if not ref_positions:
        raise ValueError("ref_positions must contain at least one position")
    out_of_range = [pos for pos in ref_positions if not 1 <= pos <= len(seqs)]
    if out_of_range:
        raise ValueError(
            f"Reference position(s) {out_of_range} out of range for {path}: "
            f"expected 1-{len(seqs)}"
        )

    # Primary reference is the last position in ref_positions
    primary_idx = ref_positions[-1] - 1
    ref_seq = seqs[primary_idx][1]

    # Slice columns if requested (1-based inclusive)
    if col_start is not None or col_end is not None:
        cs = (col_start or 1) - 1
        ce = col_end or len(ref_seq)
        ref_seq = ref_seq[cs:ce]
        seqs = [(n, s[cs:ce]) for n, s in seqs]

    aln_len = len(ref_seq)
    ref_row = list(ref_seq.upper())

    def _as_row(seq: str) -> list[str]:
        # Upper-case, truncate to the reference length, and right-pad with
        # gap characters so every row has exactly aln_len cells.
        row = list(seq.upper()[:aln_len])
        row += ["-"] * (aln_len - len(row))
        return row

    # Extra reference rows (all ref positions except the last)
    extra_ref_rows: list[tuple[str, list[str]]] = []
    for pos in ref_positions[:-1]:
        name, seq = seqs[pos - 1]
        extra_ref_rows.append((name, _as_row(seq)))

    # Sample sequences: everything not in ref_positions
    ref_indices = {pos - 1 for pos in ref_positions}
    seq_rows: list[tuple[str, list[str]]] = [
        (name, _as_row(seq))
        for i, (name, seq) in enumerate(seqs)
        if i not in ref_indices
    ]

    # Column labels: 1-based position in the reference (skip gap columns),
    # emitted at position 1 and at every multiple of 10.
    col_labels: list[tuple[int, str]] = []
    ref_pos = 0
    for i, base in enumerate(ref_row):
        if base != "-":
            ref_pos += 1
            if ref_pos == 1 or ref_pos % 10 == 0:
                col_labels.append((i, str(ref_pos)))

    label = Path(path).stem
    return Panel(label, ref_row, seq_rows, aln_len, col_labels,
                 extra_ref_rows=extra_ref_rows or None)
|
tpixel/hiv.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""HIV-aware panel builder for PIXEL plots.
|
|
2
|
+
|
|
3
|
+
Handles HxB2 coordinate mapping, Env region annotations, PNGS markers,
|
|
4
|
+
and animal-based sequence grouping from SHIV/HIV aligned FASTA files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from tpixel.fasta import read_fasta
|
|
13
|
+
from tpixel.hxb2 import _is_nucleotide, build_hxb2_map, hxb2_col_labels, hxb2_regions
|
|
14
|
+
from tpixel.models import Marker, Panel, SeqGroup
|
|
15
|
+
from tpixel.pngs import find_pngs_markers, find_pngs_markers_nt
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _find_ref_id(names: list[str]) -> str | None:
|
|
19
|
+
"""Find the parental reference (name ending with ``'_ref'``).
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
names: Sequence IDs from the alignment.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
First name ending with ``'_ref'``, or ``None``.
|
|
26
|
+
|
|
27
|
+
Examples:
|
|
28
|
+
>>> _find_ref_id(["HxB2", "animal1_ref", "animal1_s1"])
|
|
29
|
+
'animal1_ref'
|
|
30
|
+
>>> _find_ref_id(["HxB2", "s1", "s2"]) is None
|
|
31
|
+
True
|
|
32
|
+
"""
|
|
33
|
+
for name in names:
|
|
34
|
+
if name.endswith("_ref"):
|
|
35
|
+
return name
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _extract_animal(seq_id: str) -> str:
|
|
40
|
+
"""Extract animal name from sequence ID (prefix before first ``'_'``).
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
seq_id: Full sequence identifier string.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
The portion of *seq_id* before the first underscore.
|
|
47
|
+
|
|
48
|
+
Examples:
|
|
49
|
+
>>> _extract_animal("animal1_s1")
|
|
50
|
+
'animal1'
|
|
51
|
+
>>> _extract_animal("RM5695_env_s3")
|
|
52
|
+
'RM5695'
|
|
53
|
+
>>> _extract_animal("nounderscore")
|
|
54
|
+
'nounderscore'
|
|
55
|
+
"""
|
|
56
|
+
parts = seq_id.split("_")
|
|
57
|
+
return parts[0]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _sort_animal_groups(animal_names: list[str], lineage: str) -> list[str]:
|
|
61
|
+
"""Sort: lineage self first, recombinants, then alphabetical.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
animal_names: Unique animal/group names to sort.
|
|
65
|
+
lineage: The lineage name to place first.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
Sorted list: lineage first, then recombinants, then others alphabetically.
|
|
69
|
+
|
|
70
|
+
Examples:
|
|
71
|
+
>>> _sort_animal_groups(["B", "rec1", "A", "lin1"], "lin1")
|
|
72
|
+
['lin1', 'rec1', 'A', 'B']
|
|
73
|
+
>>> _sort_animal_groups(["X", "Y"], "Z")
|
|
74
|
+
['X', 'Y']
|
|
75
|
+
"""
|
|
76
|
+
self_group = []
|
|
77
|
+
rec_group = []
|
|
78
|
+
other_group = []
|
|
79
|
+
for name in animal_names:
|
|
80
|
+
if name == lineage:
|
|
81
|
+
self_group.append(name)
|
|
82
|
+
elif name.lower().startswith("rec"):
|
|
83
|
+
rec_group.append(name)
|
|
84
|
+
else:
|
|
85
|
+
other_group.append(name)
|
|
86
|
+
return self_group + sorted(rec_group) + sorted(other_group)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def hiv_panel(
    path: str | Path,
    hxb2_id: str = "HxB2",
    ref_id: str | None = None,
    tick_step: int = 50,
    ref_positions: list[int] | None = None,
    seq_type: str | None = None,
) -> Panel:
    """Build a full Roark-style Panel from an HIV Env alignment.

    Args:
        path: Path to aligned FASTA containing HxB2 and a *_ref sequence.
            Accepts both amino-acid and nucleotide alignments.
        hxb2_id: ID of the HxB2 coordinate reference in the alignment.
        ref_id: Parental reference ID. Auto-detected (*_ref) if None.
            Ignored when ref_positions is provided.
        tick_step: HxB2 AA position interval for x-axis ticks.
        ref_positions: 1-based positions of reference sequences. Last is
            the primary reference; earlier ones become extra reference rows.
            When None, the primary reference is located by name (``ref_id``
            or the first ``*_ref`` sequence) and the first sequence of the
            alignment becomes the extra reference row. For the usual layout
            (first row, then the reference) this equals ``[1, 2]``.
        seq_type: ``"NT"`` or ``"AA"``. Auto-detected from the reference
            sequence when *None*.

    Returns:
        Panel with regions, PNGS markers, grouped sequences, and HxB2 ticks.

    Raises:
        ValueError: If the file has no sequences, the reference cannot be
            found, or ``ref_positions`` is empty or out of range.
    """
    seqs = read_fasta(path)
    if not seqs:
        raise ValueError(f"No sequences in {path}")

    names = [n for n, _ in seqs]

    if ref_positions is not None:
        # Position-based: last position is primary reference. Validate first
        # so 0/negative values cannot wrap around to the end of the list.
        if not ref_positions:
            raise ValueError("ref_positions must contain at least one position")
        out_of_range = [pos for pos in ref_positions if not 1 <= pos <= len(seqs)]
        if out_of_range:
            raise ValueError(
                f"Reference position(s) {out_of_range} out of range for {path}: "
                f"expected 1-{len(seqs)}"
            )
        primary_idx = ref_positions[-1] - 1
        ref_id = names[primary_idx]
    else:
        # Name-based auto-detection
        if ref_id is None:
            ref_id = _find_ref_id(names)
        if ref_id is None:
            raise ValueError("No *_ref sequence found. Specify ref_id explicitly.")
        if ref_id not in names:
            raise ValueError(f"Reference '{ref_id}' not in alignment")
        # Use the reference's real position instead of assuming row 2;
        # otherwise the reference would be drawn as a sample and an
        # unrelated row would be dropped.
        primary_idx = names.index(ref_id)
        ref_positions = [1, primary_idx + 1] if primary_idx != 0 else [1]

    # Index by position, not by name, so duplicate headers cannot redirect
    # the lookup to a different row.
    ref_seq = seqs[primary_idx][1]
    aln_len = len(ref_seq)
    ref_row = list(ref_seq.upper())

    # Auto-detect sequence type from reference when not specified
    if seq_type is None:
        seq_type = "NT" if _is_nucleotide(ref_seq) else "AA"

    hxb2_map = build_hxb2_map(seqs, hxb2_id, seq_type=seq_type)
    regions = hxb2_regions(hxb2_map)
    col_labels = hxb2_col_labels(hxb2_map, step=tick_step)

    if seq_type == "NT":
        markers = find_pngs_markers_nt(ref_seq, hxb2_map)
    else:
        markers = find_pngs_markers(ref_seq, hxb2_map)

    # Strip only the trailing "_ref"; str.replace would also remove any
    # earlier occurrence inside the ID.
    lineage = ref_id[: -len("_ref")] if ref_id.endswith("_ref") else ref_id

    def _as_row(seq: str) -> list[str]:
        # Upper-case, truncate to the reference length, and right-pad with
        # gap characters so every row has exactly aln_len cells.
        row = list(seq.upper()[:aln_len])
        row += ["-"] * (aln_len - len(row))
        return row

    # Extra reference rows: all ref positions except the last
    extra_ref_rows: list[tuple[str, list[str]]] = []
    for pos in ref_positions[:-1]:
        name, seq = seqs[pos - 1]
        extra_ref_rows.append((name, _as_row(seq)))

    # Group sample sequences by animal, skipping every reference row
    ref_indices = {pos - 1 for pos in ref_positions}
    animal_seqs: dict[str, list[tuple[str, list[str]]]] = defaultdict(list)
    for idx, (name, seq) in enumerate(seqs):
        if idx in ref_indices:
            continue
        animal_seqs[_extract_animal(name)].append((name, _as_row(seq)))

    sorted_animals = _sort_animal_groups(list(animal_seqs.keys()), lineage)
    groups = [SeqGroup(name=a, seqs=animal_seqs[a]) for a in sorted_animals]

    return Panel(
        label=ref_id,
        ref_row=ref_row,
        seq_rows=[],
        total_cols=aln_len,
        col_labels=col_labels,
        regions=regions,
        markers=markers,
        marker_color="#4CAF50",
        groups=groups,
        extra_ref_rows=extra_ref_rows,
    )
|
tpixel/hxb2.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""HxB2 coordinate mapping for HIV Env gp160 protein alignments.
|
|
2
|
+
|
|
3
|
+
Maps alignment columns to HxB2 amino acid positions and Env structural
|
|
4
|
+
regions using the LANL convention boundaries.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
from tpixel.models import Region
|
|
12
|
+
|
|
13
|
+
# Env regions as (name, first HxB2 AA position, last HxB2 AA position).
# Both bounds are inclusive (see the lookup construction below).
ENV_REGIONS: list[tuple[str, int, int]] = [
    ("SP", 1, 30),
    ("C1", 31, 130),
    ("V1", 131, 157),
    ("V2", 158, 196),
    ("C2", 197, 295),
    ("V3", 296, 331),
    ("C3", 332, 384),
    ("V4", 385, 418),
    ("C4", 419, 459),
    ("V5", 460, 469),
    ("C5", 470, 511),
    ("gp41", 512, 856),
]

# Hex colour assigned to each region name.
REGION_COLORS: dict[str, str] = {
    "SP": "#FFF9C4",
    "C1": "#EEEEEE",
    "V1": "#BBDEFB",
    "V2": "#BBDEFB",
    "C2": "#EEEEEE",
    "V3": "#BBDEFB",
    "C3": "#EEEEEE",
    "V4": "#BBDEFB",
    "C4": "#EEEEEE",
    "V5": "#BBDEFB",
    "C5": "#EEEEEE",
    "gp41": "#F8BBD0",
}

# Position -> region name table, expanded once at import time so that
# get_env_region() is a single dict lookup.
_REGION_LOOKUP: dict[int, str] = {}
for _name, _start, _end in ENV_REGIONS:
    for _pos in range(_start, _end + 1):
        _REGION_LOOKUP[_pos] = _name
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_env_region(hxb2_aa_pos: int) -> str | None:
    """Look up the Env region that contains an HxB2 amino acid position.

    Args:
        hxb2_aa_pos: 1-based HxB2 amino acid position.

    Returns:
        The region name (e.g. ``'V3'``), or ``None`` when the position is
        not covered by any entry of the region table.

    Examples:
        >>> get_env_region(1)
        'SP'
        >>> get_env_region(131)
        'V1'
        >>> get_env_region(296)
        'V3'
        >>> get_env_region(900) is None
        True
    """
    try:
        return _REGION_LOOKUP[hxb2_aa_pos]
    except KeyError:
        # Position lies outside every known region.
        return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class HxB2Position:
    """A single alignment column mapped to HxB2 coordinates.

    Attributes:
        alignment_col: 0-based alignment column index.
        hxb2_aa_pos: 1-based HxB2 amino acid position, or ``None`` for gaps.
        region: Env region name (e.g. ``'V3'``), or ``None``.
        hxb2_residue: The residue character at this column in the HxB2 sequence.
    """

    # Field order is part of the interface: instances are built positionally.
    alignment_col: int
    hxb2_aa_pos: int | None
    region: str | None
    hxb2_residue: str
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _is_nucleotide(seq: str) -> bool:
|
|
89
|
+
"""Return True if *seq* looks like a nucleotide sequence.
|
|
90
|
+
|
|
91
|
+
Examples:
|
|
92
|
+
>>> _is_nucleotide("ACGTACGT")
|
|
93
|
+
True
|
|
94
|
+
>>> _is_nucleotide("MWLK")
|
|
95
|
+
False
|
|
96
|
+
>>> _is_nucleotide("ACG-T.NU")
|
|
97
|
+
True
|
|
98
|
+
>>> _is_nucleotide("")
|
|
99
|
+
True
|
|
100
|
+
"""
|
|
101
|
+
nt_chars = set("ACGTUNacgtun-.")
|
|
102
|
+
return all(c in nt_chars for c in seq)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def build_hxb2_map(
    aligned_seqs: list[tuple[str, str]],
    hxb2_id: str = "HxB2",
    seq_type: str | None = None,
) -> list[HxB2Position]:
    """Walk the HxB2 row and map every alignment column to HxB2 coordinates.

    The HxB2 row is the first sequence whose full name, or first
    whitespace-delimited token, equals *hxb2_id*. Sequences with an empty
    name are skipped instead of raising ``IndexError``.

    Args:
        aligned_seqs: List of (name, sequence) from read_fasta.
        hxb2_id: Sequence ID of HxB2 in the alignment.
        seq_type: ``"NT"`` or ``"AA"``. Auto-detected from the HxB2
            sequence when *None*.

    Returns:
        One HxB2Position per alignment column. Gap columns (``-`` or ``.``)
        carry ``None`` for both position and region.

    Raises:
        ValueError: If no sequence matches *hxb2_id*.
    """
    hxb2_seq = None
    for name, seq in aligned_seqs:
        tokens = name.split()
        # ``tokens`` is empty for a blank header; guard before indexing.
        if name == hxb2_id or (tokens and tokens[0] == hxb2_id):
            hxb2_seq = seq
            break

    if hxb2_seq is None:
        raise ValueError(f"HxB2 sequence '{hxb2_id}' not found in alignment")

    if seq_type is None:
        seq_type = "NT" if _is_nucleotide(hxb2_seq) else "AA"

    is_nt = seq_type == "NT"

    positions: list[HxB2Position] = []
    residue_count = 0  # non-gap HxB2 residues seen so far

    for col_idx, residue in enumerate(hxb2_seq):
        if residue in ("-", "."):
            positions.append(HxB2Position(col_idx, None, None, residue))
            continue

        residue_count += 1
        # For nucleotide input three consecutive residues share one amino
        # acid position; for amino-acid input the count is the position.
        aa_pos = (residue_count - 1) // 3 + 1 if is_nt else residue_count
        positions.append(HxB2Position(col_idx, aa_pos, get_env_region(aa_pos), residue))

    return positions
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def hxb2_col_labels(hxb2_map: list[HxB2Position], step: int = 50) -> list[tuple[int, str]]:
    """Build x-axis tick labels at regular HxB2 AA intervals.

    Args:
        hxb2_map: Per-column HxB2 mapping; entries whose ``hxb2_aa_pos`` is
            ``None`` are ignored.
        step: Interval between labelled HxB2 amino acid positions.

    Returns:
        ``(alignment_col, label)`` pairs for every multiple of *step* that
        occurs in the map, in increasing position order. When several
        columns share a position, the first such column is used.
    """
    # One pass to record the first column of each position; this avoids
    # rescanning the whole map for every tick.
    first_col: dict[int, int] = {}
    for p in hxb2_map:
        if p.hxb2_aa_pos is not None:
            first_col.setdefault(p.hxb2_aa_pos, p.alignment_col)

    max_pos = max(first_col, default=0)
    return [
        (first_col[target], str(target))
        for target in range(step, max_pos + 1, step)
        if target in first_col
    ]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def hxb2_regions(hxb2_map: list[HxB2Position]) -> list[Region]:
    """Turn runs of equal region names in the HxB2 map into Region objects.

    Args:
        hxb2_map: Per-column HxB2 mapping, in alignment-column order.

    Returns:
        One Region per maximal run of consecutive columns sharing the same
        non-``None`` region name. Each spans from the run's first column up
        to the column where the name changes, or to ``len(hxb2_map)`` for
        the final run. Names missing from ``REGION_COLORS`` are coloured
        ``"#EEEEEE"``.
    """
    annotations: list[Region] = []
    active: str | None = None
    opened_at = 0

    for entry in hxb2_map:
        if entry.region == active:
            continue
        # The region name changed here: close the run that was open (if
        # any) and start tracking the new one.
        if active is not None:
            annotations.append(
                Region(active, opened_at, entry.alignment_col, REGION_COLORS.get(active, "#EEEEEE"))
            )
        active = entry.region
        opened_at = entry.alignment_col

    # A run still open at the end of the map extends to its last column.
    if active is not None:
        annotations.append(
            Region(active, opened_at, len(hxb2_map), REGION_COLORS.get(active, "#EEEEEE"))
        )

    return annotations
|