proteinmpnn-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proteinmpnn/__init__.py +30 -0
- proteinmpnn/cli/__init__.py +40 -0
- proteinmpnn/cli/compute_probs.py +107 -0
- proteinmpnn/cli/display.py +124 -0
- proteinmpnn/cli/output.py +76 -0
- proteinmpnn/cli/run_single.py +152 -0
- proteinmpnn/data/__init__.py +23 -0
- proteinmpnn/data/config.py +54 -0
- proteinmpnn/data/input.py +119 -0
- proteinmpnn/data/multi_state.py +517 -0
- proteinmpnn/data/single_state.py +307 -0
- proteinmpnn/data/structure/__init__.py +0 -0
- proteinmpnn/data/structure/dataset.py +176 -0
- proteinmpnn/inference/__init__.py +14 -0
- proteinmpnn/inference/results.py +215 -0
- proteinmpnn/inference/runner.py +708 -0
- proteinmpnn/inference/transform.py +313 -0
- proteinmpnn/model/__init__.py +49 -0
- proteinmpnn/model/featurize.py +419 -0
- proteinmpnn/model/layers.py +411 -0
- proteinmpnn/model/losses.py +87 -0
- proteinmpnn/model/protein/__init__.py +0 -0
- proteinmpnn/model/protein/features.py +0 -0
- proteinmpnn/model/proteinmpnn.py +1164 -0
- proteinmpnn/model/utils.py +98 -0
- proteinmpnn/py.typed +0 -0
- proteinmpnn/sample/__init__.py +0 -0
- proteinmpnn/sample/metropolis.py +788 -0
- proteinmpnn/utils/__init__.py +26 -0
- proteinmpnn/utils/constants.py +108 -0
- proteinmpnn/utils/logging.py +85 -0
- proteinmpnn/utils/pdb.py +294 -0
- proteinmpnn/utils/residue.py +125 -0
- proteinmpnn/utils/weights.py +287 -0
- proteinmpnn_cli-0.1.0.dist-info/METADATA +175 -0
- proteinmpnn_cli-0.1.0.dist-info/RECORD +39 -0
- proteinmpnn_cli-0.1.0.dist-info/WHEEL +4 -0
- proteinmpnn_cli-0.1.0.dist-info/entry_points.txt +2 -0
- proteinmpnn_cli-0.1.0.dist-info/licenses/LICENSE +22 -0
proteinmpnn/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""ProteinMPNN - Protein sequence design using message passing neural networks."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from importlib.metadata import PackageNotFoundError, version

# Prevent "No handler found" warnings for library users who don't configure logging
logging.getLogger("proteinmpnn").addHandler(logging.NullHandler())

# Look up the version of the installed "proteinmpnn-cli" distribution. When the
# distribution metadata is not available (for example the package is imported
# from a source tree that was never installed), fall back to a placeholder
# instead of making `import proteinmpnn` fail.
try:
    __version__ = version("proteinmpnn-cli")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"
|
|
10
|
+
|
|
11
|
+
# Re-export main classes for convenient imports
|
|
12
|
+
from proteinmpnn.inference import ( # noqa: E402
|
|
13
|
+
DesignResult,
|
|
14
|
+
InferenceRunner,
|
|
15
|
+
SequenceResult,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"__version__",
|
|
20
|
+
"InferenceRunner",
|
|
21
|
+
"DesignResult",
|
|
22
|
+
"SequenceResult",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main() -> None:
    """Launch the ProteinMPNN command-line interface.

    The Typer application object is imported inside the function body, so the
    CLI package is only loaded when this entry point is actually called.
    """
    from proteinmpnn.cli import app as cli_app

    cli_app()
|
|
proteinmpnn/cli/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""CLI for ProteinMPNN."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Annotated
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
|
|
8
|
+
from proteinmpnn.utils.logging import setup_logging
|
|
9
|
+
|
|
10
|
+
# Root Typer application. The command modules in this package import this
# object and register themselves on it with `@app.command()`.
# Running it with no arguments prints help; shell-completion options are off.
app = typer.Typer(add_completion=False, no_args_is_help=True)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@app.callback()
def main(
    log_level: Annotated[
        str,
        typer.Option(
            "--log-level",
            "-l",
            help="Logging level",
            case_sensitive=False,
        ),
    ] = "INFO",
) -> None:
    """ProteinMPNN: Structure-conditioned protein sequence design."""
    # NOTE: the docstring above is left untouched because Typer shows the
    # callback docstring as the application help text.
    #
    # Resolve the level name against the constants of the `logging` module.
    # Only integer attributes are accepted: `logging` also exposes upper-case
    # non-level attributes (e.g. the string `BASIC_FORMAT`) that a plain
    # getattr() would hand to setup_logging(). Unknown or non-level names keep
    # the original behaviour of falling back to INFO.
    level = getattr(logging, log_level.upper(), None)
    if not isinstance(level, int):
        level = logging.INFO
    setup_logging(level=level)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Import commands to register them
|
|
34
|
+
from proteinmpnn.cli import ( # noqa: F401, E402
|
|
35
|
+
compute_probs, # noqa: F401, E402
|
|
36
|
+
run_single, # noqa: F401, E402
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if __name__ == "__main__":
|
|
40
|
+
app()
|
|
proteinmpnn/cli/compute_probs.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Compute conditional probabilities CLI command."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path # noqa: TC003
|
|
6
|
+
from typing import Annotated, Literal
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from proteinmpnn.cli import app
|
|
11
|
+
from proteinmpnn.cli.display import display_probs_results
|
|
12
|
+
from proteinmpnn.cli.output import write_probs_csv, write_probs_npz
|
|
13
|
+
from proteinmpnn.inference import InferenceRunner
|
|
14
|
+
from proteinmpnn.utils.logging import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger("cli.compute_probs")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@app.command()
def compute_probs(
    pdb_path: Annotated[Path, typer.Argument(help="Path to the input PDB file")],
    model_name: Annotated[
        Literal[
            "v_48_002",
            "v_48_010",
            "v_48_020",
            "v_48_030",  # vanilla models
            "ca_48_002",
            "ca_48_010",
            "ca_48_020",  # CA models
            "s_48_002",
            "s_48_010",
            "s_48_020",
            "s_48_030",  # soluble models
        ],
        typer.Option("--model", "-m", help="ProteinMPNN model to use"),
    ] = "v_48_020",
    designable_residues: Annotated[
        str,
        typer.Option(
            "--design",
            "-d",
            help="Residues to compute conditional probs for (e.g., 'A1-A68')",
        ),
    ] = "",
    unconditional: Annotated[
        bool,
        typer.Option(
            "--unconditional",
            "-u",
            help="Compute unconditional probabilities (no sequence context)",
        ),
    ] = False,
    output: Annotated[
        Path | None,
        typer.Option(
            "--output",
            "-o",
            help="Output directory (defaults to PDB directory)",
        ),
    ] = None,
    seed: Annotated[
        int | None,
        typer.Option("--seed", help="Random seed for reproducibility"),
    ] = None,
) -> None:
    """Compute amino acid conditional (or unconditional) probabilities per residue.

    Outputs both CSV and NPZ files with log probabilities for each residue position.

    Example usage:

        proteinmpnn compute-probs 6MRR.pdb --design A1-A68

        proteinmpnn compute-probs 6MRR.pdb --unconditional

    """
    # Results go next to the input PDB unless an explicit directory was given.
    out_dir = pdb_path.parent if output is None else output
    out_dir.mkdir(parents=True, exist_ok=True)

    # Build the inference runner for the requested model.
    logger.info("Loading model %s...", model_name)
    runner = InferenceRunner(model_name=model_name)

    logger.info(
        "Computing %s probabilities for %s...",
        "unconditional" if unconditional else "conditional",
        pdb_path.name,
    )
    probs_result = runner.compute_probs(
        pdb_path=pdb_path,
        designable_res=designable_residues,
        unconditional=unconditional,
        seed=seed,
    )

    # Persist the result in both formats, CSV first and then NPZ.
    csv_file = out_dir / f"{pdb_path.stem}_probs.csv"
    write_probs_csv(probs_result, csv_file)
    logger.info("Wrote CSV to %s", csv_file)

    npz_file = out_dir / f"{pdb_path.stem}_probs.npz"
    write_probs_npz(probs_result, npz_file)
    logger.info("Wrote NPZ to %s", npz_file)

    # Finish with the rich console summary.
    display_probs_results(probs_result)
|
|
proteinmpnn/cli/display.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Rich display utilities for CLI output."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.panel import Panel
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from proteinmpnn.inference.results import ConditionalProbsResult, DesignResult
|
|
13
|
+
|
|
14
|
+
console = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def display_design_results(result: DesignResult) -> None:
    """Print a table of designed sequences followed by a summary panel.

    The table starts with the native sequence (labelled ``WT``) and then lists
    the designed sequences in ascending score order; the first designed row,
    i.e. the lowest score, is highlighted as the best one. The summary panel
    always reports the native score and the number of designed sequences, and
    adds the best/worst scores only when at least one sequence was designed.

    Args:
        result: Design result providing ``protein_name``, ``native`` and
            ``sequences``.
    """
    table = Table(title=f"Designed Sequences for {result.protein_name}")

    table.add_column("#", style="dim", width=4)
    table.add_column("Temperature", justify="right", style="cyan")
    table.add_column("Score", justify="right", style="green")
    table.add_column("Recovery", justify="right", style="yellow")
    table.add_column("Sequence", style="dim", overflow="fold", max_width=50)

    # Native sequence first; its recovery is shown as the fixed value 1.0000.
    table.add_row(
        "WT",
        "-",
        f"{result.native.score:.4f}",
        "1.0000",
        _truncate_seq(result.native.sequence),
        style="bold",
    )

    # Designed sequences, lowest score first.
    sorted_seqs = sorted(result.sequences, key=lambda s: s.score)
    for i, seq in enumerate(sorted_seqs, 1):
        table.add_row(
            str(i),
            f"{seq.temperature:.2f}",
            f"{seq.score:.4f}",
            f"{seq.seq_recovery:.4f}",
            _truncate_seq(seq.sequence),
            style="bold green" if i == 1 else None,
        )

    console.print()
    console.print(table)

    # Summary panel. Best/worst lines need at least one designed sequence;
    # indexing an empty list here used to raise IndexError.
    summary_lines = [f"[bold]Native score:[/bold] {result.native.score:.4f}"]
    if sorted_seqs:
        best = sorted_seqs[0]
        worst = sorted_seqs[-1]
        summary_lines.append(
            f"[bold green]Best score:[/bold green] {best.score:.4f} "
            f"(T={best.temperature}, recovery={best.seq_recovery:.2%})"
        )
        summary_lines.append(f"[bold]Worst score:[/bold] {worst.score:.4f}")
    summary_lines.append(f"[bold]Total sequences:[/bold] {len(result.sequences)}")

    console.print(
        Panel("\n".join(summary_lines), title="Summary", border_style="blue")
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def display_probs_results(result: ConditionalProbsResult) -> None:
    """Print a summary panel and a top-3 table for a probability result.

    The panel shows protein name, model name, mode, residue count and the
    shape of ``result.log_probs``. The table lists, for the first ten entries
    of ``result.residue_info``, the three highest-probability alphabet symbols
    with their probabilities (``exp`` of the stored log-probabilities). A
    trailing line reports how many residues were not shown.

    Args:
        result: Probability result to display.
    """
    import numpy as np

    from proteinmpnn.model.utils import ALPHABET

    # Summary panel; unconditional mode is highlighted in a different colour.
    mode_style = "yellow" if result.mode == "unconditional" else "cyan"
    summary = (
        f"[bold]Protein:[/bold] {result.protein_name}\n"
        f"[bold]Model:[/bold] {result.model_name}\n"
        f"[bold]Mode:[/bold] [{mode_style}]{result.mode}[/{mode_style}]\n"
        f"[bold]Residues:[/bold] {len(result.residue_info)}\n"
        f"[bold]Matrix shape:[/bold] {result.log_probs.shape}"
    )
    console.print()
    console.print(Panel(summary, title="Probability Computation", border_style="blue"))

    # Table layout: header text plus the keyword options for each column.
    table = Table(title="Top 3 Predictions per Position (first 10 residues)")
    column_specs = (
        ("Position", {"style": "dim"}),
        ("Chain", {"style": "cyan"}),
        ("1st", {"justify": "center"}),
        ("P(1st)", {"justify": "right", "style": "green"}),
        ("2nd", {"justify": "center"}),
        ("P(2nd)", {"justify": "right"}),
        ("3rd", {"justify": "center"}),
        ("P(3rd)", {"justify": "right", "style": "dim"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    # One row per residue, limited to the first ten.
    for row_idx, info in enumerate(result.residue_info[:10]):
        probs = np.exp(result.log_probs[row_idx])
        ranked = np.argsort(probs)[::-1][:3]

        cells = [str(info.residue_idx), info.chain]
        for place in range(3):
            aa_idx = ranked[place]
            cells.append(ALPHABET[aa_idx])
            cells.append(f"{probs[aa_idx]:.3f}")
        table.add_row(*cells)

    console.print(table)

    hidden = len(result.residue_info) - 10
    if hidden > 0:
        console.print(f"[dim]... and {hidden} more residues[/dim]")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _truncate_seq(seq: str, max_len: int = 40) -> str:
|
|
121
|
+
"""Truncate sequence for display."""
|
|
122
|
+
if len(seq) <= max_len:
|
|
123
|
+
return seq
|
|
124
|
+
return seq[: max_len - 3] + "..."
|
|
proteinmpnn/cli/output.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Output formatting utilities for CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from proteinmpnn.inference.results import ConditionalProbsResult, DesignResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def write_fasta(result: DesignResult, output_path: Path) -> None:
    """Save the FASTA rendering of a design result to disk.

    Missing parent directories of ``output_path`` are created first. The file
    receives the text returned by ``result.to_fasta()`` followed by a newline.

    Args:
        result: DesignResult from inference.
        output_path: Path to output FASTA file.
    """
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    fasta_text = result.to_fasta() + "\n"
    output_path.write_text(fasta_text)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_af2_csv(result: DesignResult, output_path: Path) -> None:
    """Save the AlphaFold2-style CSV rendering of a design result to disk.

    Missing parent directories of ``output_path`` are created first. The file
    receives the text returned by ``result.to_af2_csv()`` followed by a newline.

    Args:
        result: DesignResult from inference.
        output_path: Path to output CSV file.
    """
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    csv_text = result.to_af2_csv() + "\n"
    output_path.write_text(csv_text)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def write_probs(result: DesignResult, output_path: Path) -> None:
    """Store the probability matrix of a design result in an NPZ archive.

    The matrix is saved under the key ``probs``. Missing parent directories of
    ``output_path`` are created before writing.

    Args:
        result: DesignResult from inference (must have probs).
        output_path: Path to output NPZ file.

    Raises:
        ValueError: If result has no probability matrix.
    """
    matrix = result.probs
    if matrix is None:
        raise ValueError("DesignResult has no probability matrix")

    # numpy is imported only once there is something to save.
    import numpy as np

    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    np.savez(output_path, probs=matrix)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def write_probs_csv(result: ConditionalProbsResult, output_path: Path) -> None:
    """Save the CSV rendering of a probability result to disk.

    Missing parent directories of ``output_path`` are created first. The file
    receives the text returned by ``result.to_csv()`` followed by a newline.

    Args:
        result: ConditionalProbsResult from inference.
        output_path: Path to output CSV file.
    """
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    csv_text = result.to_csv() + "\n"
    output_path.write_text(csv_text)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def write_probs_npz(result: ConditionalProbsResult, output_path: Path) -> None:
    """Store a probability result in an NPZ archive.

    Every entry of the mapping returned by ``result.to_npz_dict()`` becomes one
    named array in the archive. Missing parent directories of ``output_path``
    are created before writing.

    Args:
        result: ConditionalProbsResult from inference.
        output_path: Path to output NPZ file.
    """
    import numpy as np

    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    arrays = result.to_npz_dict()
    np.savez(output_path, **arrays)
|
|
proteinmpnn/cli/run_single.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Run single protein design CLI command."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path # noqa: TC003
|
|
6
|
+
from typing import Annotated, Literal
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from proteinmpnn.cli import app
|
|
11
|
+
from proteinmpnn.cli.display import display_design_results
|
|
12
|
+
from proteinmpnn.cli.output import write_af2_csv, write_fasta
|
|
13
|
+
from proteinmpnn.inference import InferenceRunner
|
|
14
|
+
from proteinmpnn.utils.logging import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger("cli.run_single")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@app.command()
def run_single(
    pdb_path: Annotated[Path, typer.Argument(help="Path to the input PDB file")],
    model_name: Annotated[
        Literal[
            "v_48_002",
            "v_48_010",
            "v_48_020",
            "v_48_030",  # vanilla models
            "ca_48_002",
            "ca_48_010",
            "ca_48_020",  # CA models
            "s_48_002",
            "s_48_010",
            "s_48_020",
            "s_48_030",  # soluble models
        ],
        typer.Option("--model", "-m", help="ProteinMPNN model to use"),
    ] = "v_48_020",
    designable_residues: Annotated[
        str,
        typer.Option(
            "--design",
            "-d",
            help="Designable residues (e.g., 'A1-A68' or 'A10,A12-A15')",
        ),
    ] = "",
    symmetric_residues: Annotated[
        str,
        typer.Option(
            "--symmetric",
            "-s",
            help="Symmetric residue pairs (e.g., 'A10:B10,A11:B11')",
        ),
    ] = "",
    cluster_center: Annotated[
        str,
        typer.Option(
            "--cluster",
            "-c",
            help="Cluster center residue(s) for radius-based selection",
        ),
    ] = "",
    cluster_radius: Annotated[
        float,
        typer.Option("--radius", "-r", help="Cluster radius in Angstroms"),
    ] = 10.0,
    backbone_noise: Annotated[
        float,
        typer.Option("--noise", help="Backbone noise standard deviation"),
    ] = 0.0,
    num_seq_per_target: Annotated[
        int,
        typer.Option("-n", "--num-seqs", help="Number of sequences to generate"),
    ] = 5,
    batch_size: Annotated[
        int,
        typer.Option("--batch", "-b", help="Batch size for generation"),
    ] = 1,
    temperature: Annotated[
        str,
        typer.Option(
            "--temp",
            "-t",
            help="Sampling temperature(s), space-separated (e.g., '0.1 0.2')",
        ),
    ] = "0.1",
    output: Annotated[
        Path | None,
        typer.Option(
            "--output",
            "-o",
            help="Output directory (defaults to PDB directory)",
        ),
    ] = None,
    af2: Annotated[
        bool,
        typer.Option("--af2", help="Also output AlphaFold2-format CSV"),
    ] = False,
    seed: Annotated[
        int | None,
        typer.Option("--seed", help="Random seed for reproducibility"),
    ] = None,
) -> None:
    """Design sequences for a single protein structure.

    Example usage:

        proteinmpnn run-single 6MRR.pdb --design A1-A68 -n 10

        proteinmpnn run-single 4GYT.pdb --design A7-A183,B7-B183 \\
            --symmetric A7-A183:B7-B183

    """
    # NOTE: the docstring above is left untouched because Typer shows the
    # command docstring as its help text.

    # Parse the whitespace-separated temperature list. Report malformed or
    # empty input as a CLI usage error instead of a raw ValueError traceback
    # (or an empty list silently reaching the runner).
    try:
        temperatures = [float(t) for t in temperature.split()]
    except ValueError as err:
        raise typer.BadParameter(
            f"could not parse {temperature!r} as space-separated numbers",
            param_hint="--temp",
        ) from err
    if not temperatures:
        raise typer.BadParameter(
            "at least one temperature is required",
            param_hint="--temp",
        )

    # Results go next to the input PDB unless an explicit directory was given.
    if output is None:
        output = pdb_path.parent
    output.mkdir(parents=True, exist_ok=True)

    # Create runner and generate sequences
    logger.info("Loading model %s...", model_name)
    runner = InferenceRunner(
        model_name=model_name,
        backbone_noise=backbone_noise,
    )

    logger.info("Designing sequences for %s...", pdb_path.name)
    result = runner.design_single(
        pdb_path=pdb_path,
        designable_res=designable_residues,
        symmetric_res=symmetric_residues,
        cluster_center=cluster_center,
        cluster_radius=cluster_radius,
        num_sequences=num_seq_per_target,
        batch_size=batch_size,
        temperatures=temperatures,
        seed=seed,
    )

    # Write outputs
    fasta_path = output / f"{pdb_path.stem}.fasta"
    write_fasta(result, fasta_path)
    # NOTE(review): the "+ 1" presumably accounts for the native sequence being
    # written alongside the designs - confirm against DesignResult.to_fasta().
    logger.info("Wrote %d sequences to %s", len(result.sequences) + 1, fasta_path)

    if af2:
        csv_path = output / f"{pdb_path.stem}.csv"
        write_af2_csv(result, csv_path)
        logger.info("Wrote AlphaFold2 CSV to %s", csv_path)

    # Display rich summary
    display_design_results(result)
|
|
proteinmpnn/data/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Data processing modules for ProteinMPNN."""
|
|
2
|
+
|
|
3
|
+
from proteinmpnn.data.config import (
|
|
4
|
+
DesignableResidue,
|
|
5
|
+
MultiStateConfig,
|
|
6
|
+
SingleStateConfig,
|
|
7
|
+
)
|
|
8
|
+
from proteinmpnn.data.input import (
|
|
9
|
+
ProteinDesignInputFormatter,
|
|
10
|
+
create_design_input,
|
|
11
|
+
)
|
|
12
|
+
from proteinmpnn.data.multi_state import MultiStateDesignInput
|
|
13
|
+
from proteinmpnn.data.single_state import SingleStateDesignInput
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"create_design_input",
|
|
17
|
+
"DesignableResidue",
|
|
18
|
+
"MultiStateConfig",
|
|
19
|
+
"MultiStateDesignInput",
|
|
20
|
+
"ProteinDesignInputFormatter",
|
|
21
|
+
"SingleStateConfig",
|
|
22
|
+
"SingleStateDesignInput",
|
|
23
|
+
]
|
|
proteinmpnn/data/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Pydantic models for ProteinMPNN design output configurations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DesignableResidue(BaseModel):
    """A single residue that can be mutated."""

    # The class docstring is kept verbatim: pydantic uses it as the model
    # description in the generated JSON schema.

    # Position of the residue: chain plus index within that chain.
    chain: str = Field(description="Chain identifier")
    resid: int = Field(
        description="Residue index (1-based, sequential within chain)",
    )

    # Current residue identity and the substitutions permitted at this
    # position; "all" is the default when nothing more specific is given.
    WTAA: str = Field(description="Wild-type amino acid (1-letter code)")
    MutTo: str = Field(
        description="Allowed amino acids for mutation",
        default="all",
    )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SingleStateConfig(BaseModel):
    """Output config for single-state protein design."""

    # The class docstring is kept verbatim: pydantic uses it as the model
    # description in the generated JSON schema.

    # All three fields default to fresh empty containers per instance.
    sequence: dict[str, str] = Field(
        default_factory=dict, description="Chain ID -> amino acid sequence"
    )
    designable: list[DesignableResidue] = Field(
        default_factory=list, description="List of designable residues"
    )
    symmetric: list[list[str]] = Field(
        default_factory=list, description="List of tied position groups"
    )

    def to_json(self, path: str | Path, indent: int = 2) -> None:
        """Serialize the config to JSON and write it to ``path``.

        The file is always written as UTF-8 so the output does not depend on
        the platform's default text encoding.

        Args:
            path: Output file path.
            indent: JSON indentation level.
        """
        Path(path).write_text(
            self.model_dump_json(indent=indent), encoding="utf-8"
        )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MultiStateConfig(SingleStateConfig):
    """Output config for multi-state protein design.

    Extends SingleStateConfig with additional MSD-specific fields.
    """

    # The class docstring is kept verbatim: pydantic uses it as the model
    # description in the generated JSON schema.

    # Per-chain beta weights; empty by default.
    tied_betas: dict[str, float] = Field(
        description="Chain ID -> beta weight for MSD",
        default_factory=dict,
    )

    # Nested mapping from PDB name to its original-chain -> remapped-chain
    # table; empty by default.
    chain_key: dict[str, dict[str, str]] = Field(
        description="PDB name -> original chain -> remapped chain",
        default_factory=dict,
    )
|