foldreport 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foldreport/__init__.py +16 -0
- foldreport/cli.py +78 -0
- foldreport/figures.py +125 -0
- foldreport/metrics.py +89 -0
- foldreport/models.py +75 -0
- foldreport/parsers/__init__.py +65 -0
- foldreport/parsers/af3_server.py +129 -0
- foldreport/parsers/alphafold_db.py +144 -0
- foldreport/parsers/base.py +137 -0
- foldreport/parsers/boltz.py +181 -0
- foldreport/parsers/colabfold.py +151 -0
- foldreport/parsers/openfold3.py +171 -0
- foldreport/report/3Dmol-min.js +2 -0
- foldreport/report/__init__.py +5 -0
- foldreport/report/builder.py +277 -0
- foldreport/report/template.html +299 -0
- foldreport-0.1.0.dist-info/METADATA +154 -0
- foldreport-0.1.0.dist-info/RECORD +22 -0
- foldreport-0.1.0.dist-info/WHEEL +5 -0
- foldreport-0.1.0.dist-info/entry_points.txt +2 -0
- foldreport-0.1.0.dist-info/licenses/LICENSE +21 -0
- foldreport-0.1.0.dist-info/top_level.txt +1 -0
foldreport/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""FoldReport — unify structure-prediction outputs into a single HTML report."""
|
|
2
|
+
|
|
3
|
+
from foldreport.models import (
|
|
4
|
+
Chain,
|
|
5
|
+
Prediction,
|
|
6
|
+
PredictionMetrics,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Package version string; the CLI passes it to its ``--version`` option.
__version__ = "0.1.0"

# Names re-exported as the package's public API.
__all__ = ["Chain", "Prediction", "PredictionMetrics", "__version__"]
|
foldreport/cli.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Command-line interface: ``foldreport <folder> [...] -o report.html``.
|
|
2
|
+
|
|
3
|
+
Point it at one or more folders of predictions. Each folder's format is autodetected;
|
|
4
|
+
all predictions are pooled, ranked by confidence, and written to a single HTML file.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
|
|
14
|
+
from foldreport import __version__
|
|
15
|
+
from foldreport.metrics import ranked_dataframe
|
|
16
|
+
from foldreport.parsers import detect_parser, parse_folder
|
|
17
|
+
from foldreport.report import build_report
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument(
    "folders",
    nargs=-1,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
@click.option(
    "-o",
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    default=Path("foldreport.html"),
    show_default=True,
    help="Path of the self-contained HTML report to write.",
)
@click.option(
    "-t",
    "--title",
    default="FoldReport",
    show_default=True,
    help="Title shown at the top of the report.",
)
@click.option(
    "--csv",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Also write the ranked metrics table to this CSV path.",
)
@click.version_option(__version__, "-V", "--version", prog_name="foldreport")
def main(folders: tuple[Path, ...], output: Path, title: str, csv: Path | None) -> None:
    """Build a single HTML report from prediction FOLDERS.

    Supported tools (autodetected): ColabFold, AlphaFold 3 Server, Boltz, OpenFold3,
    and AlphaFold DB downloads.
    """
    # NOTE: the docstring above is what click prints for --help, so it is kept
    # user-facing; implementation notes live in comments instead.
    predictions = []
    for folder in folders:
        parser = detect_parser(folder)
        if parser is None:
            # An unrecognized folder is reported on stderr and skipped; the run
            # continues with the remaining folders.
            click.echo(f"  ! Skipping {folder}: no supported format detected.", err=True)
            continue
        # Reuse the parser that was just detected instead of calling
        # parse_folder(), which would run the whole autodetection a second time.
        found = parser.parse(folder)
        click.echo(f"  + {folder}: {len(found)} prediction(s) via '{parser.name}'.")
        predictions.extend(found)

    if not predictions:
        # Nothing to report: exit with a non-zero status so scripts can notice.
        click.echo("No predictions found in the given folder(s).", err=True)
        sys.exit(1)

    if csv is not None:
        ranked_dataframe(predictions).to_csv(csv, index=False)
        click.echo(f"  > Metrics table: {csv}")

    out_path = build_report(predictions, output, title=title)
    click.echo(f"  > Report ({len(predictions)} predictions): {out_path}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if __name__ == "__main__":
|
|
78
|
+
main()
|
foldreport/figures.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Publication-quality static figures: PAE and per-residue pLDDT.
|
|
2
|
+
|
|
3
|
+
Figures are rendered with a non-interactive matplotlib backend and returned as base64
|
|
4
|
+
PNG data URIs so the report can embed them with zero external files. Predictions that
|
|
5
|
+
lack a given metric simply produce no figure (the caller renders "N/A").
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import io
|
|
12
|
+
|
|
13
|
+
import matplotlib
|
|
14
|
+
|
|
15
|
+
matplotlib.use("Agg") # Headless: never requires a display server.
|
|
16
|
+
|
|
17
|
+
import matplotlib.pyplot as plt
|
|
18
|
+
import numpy as np
|
|
19
|
+
from matplotlib.figure import Figure
|
|
20
|
+
|
|
21
|
+
from foldreport.models import Prediction
|
|
22
|
+
|
|
23
|
+
# pLDDT confidence bands used by the AlphaFold family (0-100 scale).
|
|
24
|
+
_PLDDT_BANDS = [
|
|
25
|
+
(90, 100, "#0053D6", "Very high (90-100)"),
|
|
26
|
+
(70, 90, "#65CBF3", "Confident (70-90)"),
|
|
27
|
+
(50, 70, "#FFDB13", "Low (50-70)"),
|
|
28
|
+
(0, 50, "#FF7D45", "Very low (0-50)"),
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _fig_to_data_uri(fig: Figure, dpi: int = 150) -> str:
    """Render ``fig`` to PNG and return it as a ``data:image/png;base64,...`` URI.

    Args:
        fig: The matplotlib figure to serialize. It is closed before returning.
        dpi: Resolution passed to ``savefig``.

    Returns:
        The base64-encoded PNG wrapped in a data URI.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight")
    finally:
        # Close even when rendering fails, so a bad figure does not stay
        # registered with pyplot for the rest of the run.
        plt.close(fig)
    encoded = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:image/png;base64,{encoded}"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def plddt_figure(pred: Prediction) -> str | None:
    """Plot per-residue pLDDT over shaded confidence bands.

    Returns a PNG data URI, or ``None`` when the prediction has no pLDDT values.
    """
    if not pred.plddt:
        return None

    scores = np.asarray(pred.plddt, dtype=float)
    n_points = len(scores)
    # Residues are numbered from 1 on the x axis.
    positions = np.arange(1, n_points + 1)

    fig, ax = plt.subplots(figsize=(7, 2.6))

    # Shaded background bands first, then the score trace on top.
    for lower, upper, shade, _ in _PLDDT_BANDS:
        ax.axhspan(lower, upper, color=shade, alpha=0.12, linewidth=0)
    ax.plot(positions, scores, color="#1a1a1a", linewidth=1.2)

    _draw_chain_boundaries(ax, pred)

    ax.set_xlim(1, n_points)
    ax.set_ylim(0, 100)
    ax.set_xlabel("Residue")
    ax.set_ylabel("pLDDT")
    ax.set_title("Per-residue pLDDT")
    ax.grid(True, axis="y", alpha=0.2)

    return _fig_to_data_uri(fig)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def pae_figure(pred: Prediction) -> str | None:
    """Render the PAE (Predicted Aligned Error) matrix as a heatmap.

    Returns:
        A PNG data URI, or ``None`` when the prediction has no PAE, the PAE is
        not two-dimensional, or the matrix is empty.
    """
    if pred.pae is None:
        return None
    pae = np.asarray(pred.pae, dtype=float)
    # An empty matrix has nothing to draw, and ``pae.max()`` below would raise
    # on a zero-size array, so treat it like a missing metric.
    if pae.ndim != 2 or pae.size == 0:
        return None

    fig, ax = plt.subplots(figsize=(4.2, 3.6))
    # The colour scale tops out at the largest error, but never below 1.0.
    im = ax.imshow(pae, cmap="Greens_r", vmin=0, vmax=max(float(pae.max()), 1.0), origin="upper")
    ax.set_xlabel("Scored residue")
    ax.set_ylabel("Aligned residue")
    ax.set_title("Predicted Aligned Error")
    cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label("Expected position error (Å)")

    _draw_pae_chain_lines(ax, pred)
    return _fig_to_data_uri(fig)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _chain_offsets(pred: Prediction) -> list[int]:
|
|
84
|
+
"""Cumulative residue offsets at chain boundaries (excluding the final end)."""
|
|
85
|
+
offsets: list[int] = []
|
|
86
|
+
cumulative = 0
|
|
87
|
+
for chain in pred.chains[:-1]:
|
|
88
|
+
cumulative += chain.n_residues
|
|
89
|
+
offsets.append(cumulative)
|
|
90
|
+
return offsets
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _draw_chain_boundaries(ax, pred: Prediction) -> None:
    """Mark each internal chain boundary with a dashed vertical line on ``ax``."""
    line_style = {"color": "#888888", "linestyle": "--", "linewidth": 0.8}
    for offset in _chain_offsets(pred):
        # +0.5 places the line half a unit past the cumulative residue count.
        ax.axvline(offset + 0.5, **line_style)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _draw_pae_chain_lines(ax, pred: Prediction) -> None:
    """Draw a horizontal and a vertical line at each internal chain boundary."""
    for offset in _chain_offsets(pred):
        # Both lines sit half a unit before the cumulative residue count.
        edge = offset - 0.5
        ax.axhline(edge, color="#444444", linewidth=0.6)
        ax.axvline(edge, color="#444444", linewidth=0.6)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def pae_data_for_js(pred: Prediction) -> dict | None:
    """Return PAE matrix data for the interactive JS heatmap, or None.

    Returns:
        ``None`` when the prediction has no PAE, the PAE is not two-dimensional,
        or the matrix is empty. Otherwise a dict with:

        * ``matrix``: the PAE values rounded to 2 decimals, as nested lists.
        * ``size``: the number of rows.
        * ``max_val``: the largest PAE value, rounded to 2 decimals.
        * ``chain_boundaries``: cumulative residue offsets between chains.
    """
    if pred.pae is None:
        return None
    pae = np.asarray(pred.pae, dtype=float)
    # ``pae.max()`` raises on a zero-size array; an empty matrix carries no
    # data, so report it the same way as a missing one.
    if pae.ndim != 2 or pae.size == 0:
        return None
    return {
        "matrix": np.round(pae, 2).tolist(),
        "size": int(pae.shape[0]),
        "max_val": round(float(pae.max()), 2),
        "chain_boundaries": _chain_offsets(pred),
    }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def make_figures(pred: Prediction) -> dict[str, str | dict | None]:
    """Build every figure payload for one prediction.

    Returns:
        A dict with three keys, each ``None`` when the underlying helper
        produced nothing:

        * ``"plddt"``: result of ``plddt_figure`` (PNG data URI).
        * ``"pae"``: result of ``pae_figure`` (PNG data URI).
        * ``"pae_interactive"``: result of ``pae_data_for_js`` (a dict, not a
          string).
    """
    return {
        "plddt": plddt_figure(pred),
        "pae": pae_figure(pred),
        "pae_interactive": pae_data_for_js(pred),
    }
|
foldreport/metrics.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Normalize a list of predictions into a sortable/filterable metrics table.
|
|
2
|
+
|
|
3
|
+
The table has exactly one row per prediction. Missing metrics stay as ``None`` (which
|
|
4
|
+
pandas renders as ``NaN``); nothing here invents values. The default ranking key is a
|
|
5
|
+
confidence score that gracefully falls back when a tool omits a metric.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from foldreport.models import Prediction
|
|
15
|
+
|
|
16
|
+
# Columns in display order. Keep human-readable; the report renders these as headers.
COLUMNS = [
    # identity
    "name", "source_tool", "rank",
    # confidence metrics
    "mean_plddt", "ptm", "iptm", "mpdockq", "ranking_score",
    # size
    "n_chains", "n_residues",
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def metrics_dataframe(predictions: Sequence[Prediction]) -> pd.DataFrame:
    """Build a DataFrame with one row of normalized metrics per prediction.

    Columns follow ``COLUMNS``; rows keep the order of ``predictions``.
    """
    records = [
        {
            "name": prediction.name,
            "source_tool": prediction.source_tool,
            "rank": prediction.rank,
            "mean_plddt": prediction.metrics.mean_plddt,
            "ptm": prediction.metrics.ptm,
            "iptm": prediction.metrics.iptm,
            "mpdockq": prediction.metrics.mpdockq,
            "ranking_score": prediction.metrics.ranking_score,
            "n_chains": prediction.metrics.n_chains,
            "n_residues": prediction.metrics.n_residues,
        }
        for prediction in predictions
    ]
    # Passing ``columns`` fixes the column order and keeps the headers present
    # even when there are no rows.
    return pd.DataFrame(records, columns=COLUMNS)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def confidence_score(pred: Prediction) -> float:
    """A single comparable confidence value used to rank predictions.

    Preference order, using whatever the tool provided:
      1. ipTM (complexes) blended with pTM: 0.8*ipTM + 0.2*pTM
      2. ipTM alone
      3. pTM alone
      4. mean pLDDT scaled to 0-1
      5. the tool's own ranking_score
      6. 0.0 as a last resort (keeps it sortable, sorts last)

    A metric that is ``None`` or NaN counts as missing. Letting a NaN through
    would make it the sort key, and comparisons against NaN are always false,
    so the resulting ranking order would be undefined.
    """
    m = pred.metrics
    # ``pd.notna`` is False for both None and NaN.
    has_iptm = pd.notna(m.iptm)
    has_ptm = pd.notna(m.ptm)
    if has_iptm and has_ptm:
        return 0.8 * m.iptm + 0.2 * m.ptm
    if has_iptm:
        return m.iptm
    if has_ptm:
        return m.ptm
    if pd.notna(m.mean_plddt):
        return m.mean_plddt / 100.0
    if pd.notna(m.ranking_score):
        return m.ranking_score
    return 0.0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def rank_predictions(predictions: Sequence[Prediction]) -> list[Prediction]:
    """Return a new list of the predictions ordered best-first by confidence.

    The input sequence is left untouched.
    """
    ordered = list(predictions)
    ordered.sort(key=confidence_score, reverse=True)
    return ordered
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def ranked_dataframe(predictions: Sequence[Prediction]) -> pd.DataFrame:
    """Return the metrics table sorted best-first.

    Adds a leading 1-based ``overall_rank`` column and a trailing ``confidence``
    column holding the ranking score rounded to 4 decimals.
    """
    best_first = rank_predictions(predictions)
    table = metrics_dataframe(best_first)
    table.insert(0, "overall_rank", range(1, len(table) + 1))
    scores = [round(confidence_score(item), 4) for item in best_first]
    table["confidence"] = scores
    return table
|
foldreport/models.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Internal representation shared by every parser.
|
|
2
|
+
|
|
3
|
+
This is the core abstraction of FoldReport: each parser converts the on-disk output
|
|
4
|
+
of a prediction tool into ``list[Prediction]`` objects. Nothing downstream (metrics,
|
|
5
|
+
figures, report) knows the original format. Adding a new tool means writing a parser
|
|
6
|
+
that produces these dataclasses, not touching the rest of the codebase.
|
|
7
|
+
|
|
8
|
+
Missing values are always ``None`` — never invented. The report renders them as "N/A".
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class Chain:
    """Metadata for one polymer chain of a predicted structure."""

    # Chain identifier.
    chain_id: str
    # Number of residues in this chain.
    n_residues: int
    # One-letter sequence, if known.
    sequence: str | None = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class PredictionMetrics:
    """Scalar confidence metrics for one prediction, normalized across tools.

    A metric the source tool does not report is left as ``None``. Consumers must
    cope with the gaps and must not make up a replacement value.
    """

    mean_plddt: float | None = None
    ptm: float | None = None
    iptm: float | None = None
    mpdockq: float | None = None
    n_chains: int = 0
    n_residues: int = 0
    # Optional ranking score reported by some tools (e.g. AF3 Server "ranking_score").
    ranking_score: float | None = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class Prediction:
    """A single predicted model, normalized across tools.

    Attributes:
        name: Human-readable identifier (usually derived from the file name).
        source_tool: One of "colabfold", "af3_server", "boltz", ...
        structure_path: Path to the ``.cif``/``.pdb`` holding coordinates.
        chains: Per-chain metadata in canonical order.
        plddt: Per-residue pLDDT in canonical residue order (0-100). Empty if unknown.
        pae: Predicted Aligned Error matrix (N_tokens x N_tokens) or ``None``.
            Always stored as a float ndarray when present.
        metrics: Normalized scalar metrics.
        rank: Tool-reported rank (1 = best) when available, else ``None``.
        raw_files: Provenance — maps a logical role to the file it came from.
    """

    name: str
    source_tool: str
    structure_path: Path
    chains: list[Chain] = field(default_factory=list)
    plddt: list[float] = field(default_factory=list)
    pae: np.ndarray | None = None
    metrics: PredictionMetrics = field(default_factory=PredictionMetrics)
    rank: int | None = None
    raw_files: dict[str, Path] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Keep the PAE as a float ndarray for consistent downstream handling.
        # The coercion is unconditional so that an integer-typed ndarray is
        # converted too; ``np.asarray`` hands back an ndarray that already has
        # the requested dtype without copying it.
        if self.pae is not None:
            self.pae = np.asarray(self.pae, dtype=float)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Parser registry and format autodetection.
|
|
2
|
+
|
|
3
|
+
Each tool-specific parser registers here. ``detect_parser`` asks every parser whether
|
|
4
|
+
it recognizes a folder, so the user never has to declare the format.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from foldreport.models import Prediction
|
|
12
|
+
from foldreport.parsers.af3_server import Af3ServerParser
|
|
13
|
+
from foldreport.parsers.alphafold_db import AlphaFoldDBParser
|
|
14
|
+
from foldreport.parsers.base import Parser
|
|
15
|
+
from foldreport.parsers.boltz import BoltzParser
|
|
16
|
+
from foldreport.parsers.colabfold import ColabFoldParser
|
|
17
|
+
from foldreport.parsers.openfold3 import OpenFold3Parser
|
|
18
|
+
|
|
19
|
+
# Registry consulted by ``detect_parser``, which returns the first match.
# Order matters only as a tie-breaker; can_handle checks should be mutually
# exclusive in practice. More specific layouts come first.
ALL_PARSERS: list[Parser] = [
    ColabFoldParser(),
    Af3ServerParser(),
    OpenFold3Parser(),
    AlphaFoldDBParser(),
    BoltzParser(),
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def detect_parser(path: Path) -> Parser | None:
    """Pick the first registered parser whose ``can_handle`` accepts ``path``.

    Parsers are tried in ``ALL_PARSERS`` order; ``None`` means none matched.
    """
    folder = Path(path)
    matches = (candidate for candidate in ALL_PARSERS if candidate.can_handle(folder))
    # The generator is lazy, so probing stops at the first parser that matches.
    return next(matches, None)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def parse_folder(path: Path) -> list[Prediction]:
    """Detect which tool produced ``path`` and return its parsed predictions.

    Raises:
        ValueError: if no registered parser recognizes the folder.
    """
    folder = Path(path)
    parser = detect_parser(folder)
    if parser is not None:
        return parser.parse(folder)

    supported = ", ".join(p.name for p in ALL_PARSERS)
    raise ValueError(
        f"No registered parser recognizes the folder: {folder}. "
        f"Supported tools: {supported}."
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Public API of the parsers package: the registry, the parser types, and the
# two autodetection helpers.
__all__ = [
    "ALL_PARSERS",
    "Parser", "ColabFoldParser", "Af3ServerParser", "BoltzParser",
    "OpenFold3Parser", "AlphaFoldDBParser",
    "detect_parser", "parse_folder",
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Parser for AlphaFold 3 Server output folders.
|
|
2
|
+
|
|
3
|
+
The AF3 Server download contains, per job, five ranked models plus JSON sidecars::
|
|
4
|
+
|
|
5
|
+
fold_<job>_model_0.cif ... fold_<job>_model_4.cif
|
|
6
|
+
fold_<job>_full_data_0.json -> atom_plddts, pae, token_chain_ids, ...
|
|
7
|
+
fold_<job>_summary_confidences_0.json -> ptm, iptm, ranking_score, ...
|
|
8
|
+
fold_<job>_job_request.json
|
|
9
|
+
terms_of_use.md
|
|
10
|
+
|
|
11
|
+
pLDDT is stored both per-atom in ``full_data`` and in the mmCIF B-factor column; we
|
|
12
|
+
read per-residue pLDDT from the B-factors for consistency with the other parsers.
|
|
13
|
+
The PAE matrix (per token) comes from ``full_data``.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
from foldreport.models import Prediction, PredictionMetrics
|
|
25
|
+
from foldreport.parsers import base
|
|
26
|
+
|
|
27
|
+
_MODEL_RE = re.compile(r"^(?P<job>.+)_model_(?P<idx>\d+)\.(?:cif|pdb)$")
|
|
28
|
+
_SUMMARY_RE = re.compile(r"^(?P<job>.+)_summary_confidences_(?P<idx>\d+)\.json$")
|
|
29
|
+
_FULLDATA_RE = re.compile(r"^(?P<job>.+)_full_data_(?P<idx>\d+)\.json$")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Af3ServerParser:
    """Parse a folder produced by the AlphaFold 3 Server."""

    name = "af3_server"

    def can_handle(self, path: Path) -> bool:
        """Return True when ``path`` is a directory containing at least one
        summary-confidences file and at least one full-data file."""
        path = Path(path)
        if not path.is_dir():
            return False
        has_summary = has_fulldata = False
        for entry in path.iterdir():
            if _SUMMARY_RE.match(entry.name):
                has_summary = True
            elif _FULLDATA_RE.match(entry.name):
                has_fulldata = True
            # Stop scanning as soon as both kinds have been seen.
            if has_summary and has_fulldata:
                return True
        return False

    def parse(self, path: Path) -> list[Prediction]:
        """Build one ``Prediction`` per structure file found in ``path``.

        Models are returned grouped by job name and, within a job, in ascending
        numeric model index. A model whose JSON sidecars are missing is still
        returned; the metrics that would come from them stay ``None``.
        """
        path = Path(path)
        models = self._index(path, _MODEL_RE)
        summaries = self._index(path, _SUMMARY_RE)
        fulldata = self._index(path, _FULLDATA_RE)

        predictions: list[Prediction] = []
        for key in sorted(models, key=self._model_order):
            job, idx = key
            struct_path = models[key]
            summary_path = summaries.get(key)
            fulldata_path = fulldata.get(key)

            summary = _load_json(summary_path)
            data = _load_json(fulldata_path)

            plddt = base.plddt_from_bfactors(struct_path)
            pae = _as_matrix(data.get("pae"))

            structure = base.read_structure(struct_path)
            chains = base.chains_from_structure(structure)
            n_residues = sum(c.n_residues for c in chains)

            metrics = PredictionMetrics(
                mean_plddt=float(np.mean(plddt)) if plddt else None,
                ptm=_as_opt_float(summary.get("ptm")),
                iptm=_as_opt_float(summary.get("iptm")),
                mpdockq=None,  # AF3 Server does not report mpDockQ.
                n_chains=len(chains),
                n_residues=n_residues,
                ranking_score=_as_opt_float(summary.get("ranking_score")),
            )

            # Provenance: record only the files that actually exist.
            raw_files: dict[str, Path] = {"structure": struct_path}
            if summary_path is not None:
                raw_files["summary"] = summary_path
            if fulldata_path is not None:
                raw_files["full_data"] = fulldata_path

            predictions.append(
                Prediction(
                    name=f"{_clean_job(job)}_model_{idx}",
                    source_tool=self.name,
                    structure_path=struct_path,
                    chains=chains,
                    plddt=plddt,
                    pae=pae,
                    metrics=metrics,
                    rank=int(idx) + 1,  # model_0 is the top-ranked model
                    raw_files=raw_files,
                )
            )
        return predictions

    @staticmethod
    def _model_order(key: tuple[str, str]) -> tuple[str, int]:
        """Sort key for ``(job, idx)`` pairs: by job, then by numeric index.

        The index is captured as text, so it has to be converted; sorting the
        raw strings would put "10" before "2". Including the job keeps the
        output order independent of directory listing order when a folder
        holds several jobs.
        """
        job, idx = key
        return job, int(idx)

    @staticmethod
    def _index(path: Path, pattern: re.Pattern) -> dict[tuple[str, str], Path]:
        """Map ``(job, idx)`` to the entry in ``path`` whose name matches ``pattern``.

        ``pattern`` must define the named groups ``job`` and ``idx``.
        """
        out: dict[tuple[str, str], Path] = {}
        for entry in path.iterdir():
            m = pattern.match(entry.name)
            if m:
                out[(m["job"], m["idx"])] = entry
        return out
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _clean_job(job: str) -> str:
|
|
111
|
+
return job[len("fold_"):] if job.startswith("fold_") else job
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _load_json(path: Path | None) -> dict:
|
|
115
|
+
if path is None:
|
|
116
|
+
return {}
|
|
117
|
+
with open(path, "r", encoding="utf-8") as fh:
|
|
118
|
+
return json.load(fh)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _as_opt_float(value) -> float | None:
|
|
122
|
+
return None if value is None else float(value)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _as_matrix(value) -> np.ndarray | None:
|
|
126
|
+
if value is None:
|
|
127
|
+
return None
|
|
128
|
+
arr = np.asarray(value, dtype=float)
|
|
129
|
+
return arr if arr.ndim == 2 else None
|