PyPI - boltz-vsynthes - Versions diffs - 1.0.0__py3-none-any.whl - Mend

boltz-vsynthes 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112) hide show

boltz/__init__.py +7 -0
boltz/data/__init__.py +0 -0
boltz/data/const.py +1184 -0
boltz/data/crop/__init__.py +0 -0
boltz/data/crop/affinity.py +164 -0
boltz/data/crop/boltz.py +296 -0
boltz/data/crop/cropper.py +45 -0
boltz/data/feature/__init__.py +0 -0
boltz/data/feature/featurizer.py +1230 -0
boltz/data/feature/featurizerv2.py +2208 -0
boltz/data/feature/symmetry.py +602 -0
boltz/data/filter/__init__.py +0 -0
boltz/data/filter/dynamic/__init__.py +0 -0
boltz/data/filter/dynamic/date.py +76 -0
boltz/data/filter/dynamic/filter.py +24 -0
boltz/data/filter/dynamic/max_residues.py +37 -0
boltz/data/filter/dynamic/resolution.py +34 -0
boltz/data/filter/dynamic/size.py +38 -0
boltz/data/filter/dynamic/subset.py +42 -0
boltz/data/filter/static/__init__.py +0 -0
boltz/data/filter/static/filter.py +26 -0
boltz/data/filter/static/ligand.py +37 -0
boltz/data/filter/static/polymer.py +299 -0
boltz/data/module/__init__.py +0 -0
boltz/data/module/inference.py +307 -0
boltz/data/module/inferencev2.py +429 -0
boltz/data/module/training.py +684 -0
boltz/data/module/trainingv2.py +660 -0
boltz/data/mol.py +900 -0
boltz/data/msa/__init__.py +0 -0
boltz/data/msa/mmseqs2.py +235 -0
boltz/data/pad.py +84 -0
boltz/data/parse/__init__.py +0 -0
boltz/data/parse/a3m.py +134 -0
boltz/data/parse/csv.py +100 -0
boltz/data/parse/fasta.py +138 -0
boltz/data/parse/mmcif.py +1239 -0
boltz/data/parse/mmcif_with_constraints.py +1607 -0
boltz/data/parse/schema.py +1851 -0
boltz/data/parse/yaml.py +68 -0
boltz/data/sample/__init__.py +0 -0
boltz/data/sample/cluster.py +283 -0
boltz/data/sample/distillation.py +57 -0
boltz/data/sample/random.py +39 -0
boltz/data/sample/sampler.py +49 -0
boltz/data/tokenize/__init__.py +0 -0
boltz/data/tokenize/boltz.py +195 -0
boltz/data/tokenize/boltz2.py +396 -0
boltz/data/tokenize/tokenizer.py +24 -0
boltz/data/types.py +777 -0
boltz/data/write/__init__.py +0 -0
boltz/data/write/mmcif.py +305 -0
boltz/data/write/pdb.py +171 -0
boltz/data/write/utils.py +23 -0
boltz/data/write/writer.py +330 -0
boltz/main.py +1292 -0
boltz/model/__init__.py +0 -0
boltz/model/layers/__init__.py +0 -0
boltz/model/layers/attention.py +132 -0
boltz/model/layers/attentionv2.py +111 -0
boltz/model/layers/confidence_utils.py +231 -0
boltz/model/layers/dropout.py +34 -0
boltz/model/layers/initialize.py +100 -0
boltz/model/layers/outer_product_mean.py +98 -0
boltz/model/layers/pair_averaging.py +135 -0
boltz/model/layers/pairformer.py +337 -0
boltz/model/layers/relative.py +58 -0
boltz/model/layers/transition.py +78 -0
boltz/model/layers/triangular_attention/__init__.py +0 -0
boltz/model/layers/triangular_attention/attention.py +189 -0
boltz/model/layers/triangular_attention/primitives.py +409 -0
boltz/model/layers/triangular_attention/utils.py +380 -0
boltz/model/layers/triangular_mult.py +212 -0
boltz/model/loss/__init__.py +0 -0
boltz/model/loss/bfactor.py +49 -0
boltz/model/loss/confidence.py +590 -0
boltz/model/loss/confidencev2.py +621 -0
boltz/model/loss/diffusion.py +171 -0
boltz/model/loss/diffusionv2.py +134 -0
boltz/model/loss/distogram.py +48 -0
boltz/model/loss/distogramv2.py +105 -0
boltz/model/loss/validation.py +1025 -0
boltz/model/models/__init__.py +0 -0
boltz/model/models/boltz1.py +1286 -0
boltz/model/models/boltz2.py +1249 -0
boltz/model/modules/__init__.py +0 -0
boltz/model/modules/affinity.py +223 -0
boltz/model/modules/confidence.py +481 -0
boltz/model/modules/confidence_utils.py +181 -0
boltz/model/modules/confidencev2.py +495 -0
boltz/model/modules/diffusion.py +844 -0
boltz/model/modules/diffusion_conditioning.py +116 -0
boltz/model/modules/diffusionv2.py +677 -0
boltz/model/modules/encoders.py +639 -0
boltz/model/modules/encodersv2.py +565 -0
boltz/model/modules/transformers.py +322 -0
boltz/model/modules/transformersv2.py +261 -0
boltz/model/modules/trunk.py +688 -0
boltz/model/modules/trunkv2.py +828 -0
boltz/model/modules/utils.py +303 -0
boltz/model/optim/__init__.py +0 -0
boltz/model/optim/ema.py +389 -0
boltz/model/optim/scheduler.py +99 -0
boltz/model/potentials/__init__.py +0 -0
boltz/model/potentials/potentials.py +497 -0
boltz/model/potentials/schedules.py +32 -0
boltz_vsynthes-1.0.0.dist-info/METADATA +151 -0
boltz_vsynthes-1.0.0.dist-info/RECORD +112 -0
boltz_vsynthes-1.0.0.dist-info/WHEEL +5 -0
boltz_vsynthes-1.0.0.dist-info/entry_points.txt +2 -0
boltz_vsynthes-1.0.0.dist-info/licenses/LICENSE +21 -0
boltz_vsynthes-1.0.0.dist-info/top_level.txt +1 -0

boltz/main.py ADDED Viewed

@@ -0,0 +1,1292 @@
+import multiprocessing
+import os
+import pickle
+import platform
+import tarfile
+import urllib.request
+import warnings
+from dataclasses import asdict, dataclass
+from functools import partial
+from multiprocessing import Pool
+from pathlib import Path
+from typing import Literal, Optional
+import click
+import torch
+from pytorch_lightning import Trainer, seed_everything
+from pytorch_lightning.strategies import DDPStrategy
+from pytorch_lightning.utilities import rank_zero_only
+from rdkit import Chem
+from tqdm import tqdm
+from boltz.data import const
+from boltz.data.module.inference import BoltzInferenceDataModule
+from boltz.data.module.inferencev2 import Boltz2InferenceDataModule
+from boltz.data.mol import load_canonicals
+from boltz.data.msa.mmseqs2 import run_mmseqs2
+from boltz.data.parse.a3m import parse_a3m
+from boltz.data.parse.csv import parse_csv
+from boltz.data.parse.fasta import parse_fasta
+from boltz.data.parse.yaml import parse_yaml
+from boltz.data.types import MSA, Manifest, Record
+from boltz.data.write.writer import BoltzAffinityWriter, BoltzWriter
+from boltz.model.models.boltz1 import Boltz1
+from boltz.model.models.boltz2 import Boltz2
+# Location of the CCD pickle that download_boltz1 stores as "ccd.pkl".
+CCD_URL = "https://huggingface.co/boltz-community/boltz-1/resolve/main/ccd.pkl"
+# Location of the molecule tarball that download_boltz2 stores as "mols.tar"
+# and extracts into the cache directory.
+MOL_URL = "https://huggingface.co/boltz-community/boltz-2/resolve/main/mols.tar"
+# The *_WITH_FALLBACK lists are tried in order by the download functions: a
+# later entry is only used when every earlier one failed.
+# Boltz-1 model checkpoint.
+BOLTZ1_URL_WITH_FALLBACK = [
+    "https://model-gateway.boltz.bio/boltz1_conf.ckpt",
+    "https://huggingface.co/boltz-community/boltz-1/resolve/main/boltz1_conf.ckpt",
+]
+# Boltz-2 model checkpoint.
+BOLTZ2_URL_WITH_FALLBACK = [
+    "https://model-gateway.boltz.bio/boltz2_conf.ckpt",
+    "https://huggingface.co/boltz-community/boltz-2/resolve/main/boltz2_conf.ckpt",
+]
+# Boltz-2 affinity checkpoint.
+BOLTZ2_AFFINITY_URL_WITH_FALLBACK = [
+    "https://model-gateway.boltz.bio/boltz2_aff.ckpt",
+    "https://huggingface.co/boltz-community/boltz-2/resolve/main/boltz2_aff.ckpt",
+]
+@dataclass
+class BoltzProcessedInput:
+    """Manifest plus the directories holding the preprocessed inputs.
+    The manifest, targets directory and MSA directory are required; the
+    constraints, template and extra-molecule directories default to None.
+    """
+    # Manifest whose records describe the inputs.
+    manifest: Manifest
+    # NOTE(review): presumably the processed structures directory - confirm at the call site.
+    targets_dir: Path
+    # NOTE(review): presumably the processed MSA directory - confirm at the call site.
+    msa_dir: Path
+    # Optional directories; None when not provided.
+    constraints_dir: Optional[Path] = None
+    template_dir: Optional[Path] = None
+    extra_mols_dir: Optional[Path] = None
+@dataclass
+class PairformerArgs:
+    """Pairformer arguments.
+    Default bundle with 48 blocks, 16 heads, no dropout and the ``v2`` flag
+    disabled. It differs from PairformerArgsV2 only in ``num_blocks`` and ``v2``.
+    """
+    # NOTE(review): field semantics are defined by the consuming model code,
+    # which is not visible in this file.
+    num_blocks: int = 48
+    num_heads: int = 16
+    dropout: float = 0.0
+    activation_checkpointing: bool = False
+    offload_to_cpu: bool = False
+    v2: bool = False
+@dataclass
+class PairformerArgsV2:
+    """Pairformer arguments (v2 variant).
+    Default bundle with 64 blocks, 16 heads, no dropout and the ``v2`` flag
+    enabled. It differs from PairformerArgs only in ``num_blocks`` and ``v2``.
+    """
+    # NOTE(review): presumably the Boltz-2 counterpart of PairformerArgs -
+    # confirm where the instance is built.
+    num_blocks: int = 64
+    num_heads: int = 16
+    dropout: float = 0.0
+    activation_checkpointing: bool = False
+    offload_to_cpu: bool = False
+    v2: bool = True
+@dataclass
+class MSAModuleArgs:
+    """MSA module arguments.
+    Plain container of defaults; every field can be overridden at construction.
+    """
+    # NOTE(review): field semantics are defined by the consuming MSA module,
+    # which is not visible in this file.
+    msa_s: int = 64
+    msa_blocks: int = 4
+    msa_dropout: float = 0.0
+    z_dropout: float = 0.0
+    use_paired_feature: bool = True
+    pairwise_head_width: int = 32
+    pairwise_num_heads: int = 4
+    activation_checkpointing: bool = False
+    offload_to_cpu: bool = False
+    # MSA subsampling is off by default; num_subsampled_msa is presumably only
+    # read when subsample_msa is True - TODO confirm.
+    subsample_msa: bool = False
+    num_subsampled_msa: int = 1024
+@dataclass
+class BoltzDiffusionParams:
+    """Diffusion process parameters (Boltz-1 defaults).
+    The ``step_scale`` default of 1.638 is the value the ``--step_scale`` CLI
+    help text gives for Boltz-1.
+    """
+    gamma_0: float = 0.605
+    gamma_min: float = 1.107
+    noise_scale: float = 0.901
+    # Annotated as float but defaulted with an int literal.
+    rho: float = 8
+    # Per the CLI help: related to the sampling temperature; lower values give
+    # more diverse samples.
+    step_scale: float = 1.638
+    sigma_min: float = 0.0004
+    sigma_max: float = 160.0
+    sigma_data: float = 16.0
+    P_mean: float = -1.2
+    P_std: float = 1.5
+    coordinate_augmentation: bool = True
+    alignment_reverse_diff: bool = True
+    synchronize_sigmas: bool = True
+    # Not present on Boltz2DiffusionParams.
+    use_inference_model_cache: bool = True
+@dataclass
+class Boltz2DiffusionParams:
+    """Diffusion process parameters (Boltz-2 defaults).
+    The ``step_scale`` default of 1.5 is the value the ``--step_scale`` CLI
+    help text gives for Boltz-2.
+    """
+    gamma_0: float = 0.8
+    gamma_min: float = 1.0
+    noise_scale: float = 1.003
+    # Annotated as float but defaulted with an int literal.
+    rho: float = 7
+    # Per the CLI help: related to the sampling temperature; lower values give
+    # more diverse samples.
+    step_scale: float = 1.5
+    sigma_min: float = 0.0001
+    sigma_max: float = 160.0
+    sigma_data: float = 16.0
+    P_mean: float = -1.2
+    P_std: float = 1.5
+    coordinate_augmentation: bool = True
+    alignment_reverse_diff: bool = True
+    synchronize_sigmas: bool = True
+@dataclass
+class BoltzSteeringParams:
+    """Steering parameters.
+    Both ``fk_steering`` and ``guidance_update`` are enabled by default.
+    """
+    # NOTE(review): the fk_* fields presumably configure the steering that
+    # fk_steering toggles, and num_gd_steps the guidance update - confirm in
+    # the consuming model code.
+    fk_steering: bool = True
+    num_particles: int = 3
+    fk_lambda: float = 4.0
+    fk_resampling_interval: int = 3
+    guidance_update: bool = True
+    num_gd_steps: int = 20
+@rank_zero_only
+def download_boltz1(cache: Path) -> None:
+    """Download all the required data.
+    Parameters
+    ----------
+    cache : Path
+        The cache directory.
+    """
+    # Download CCD
+    ccd = cache / "ccd.pkl"
+    if not ccd.exists():
+        click.echo(
+            f"Downloading the CCD dictionary to {ccd}. You may "
+            "change the cache directory with the --cache flag."
+        )
+        urllib.request.urlretrieve(CCD_URL, str(ccd))  # noqa: S310
+    # Download model
+    model = cache / "boltz1_conf.ckpt"
+    if not model.exists():
+        click.echo(
+            f"Downloading the model weights to {model}. You may "
+            "change the cache directory with the --cache flag."
+        )
+        for i, url in enumerate(BOLTZ1_URL_WITH_FALLBACK):
+            try:
+                urllib.request.urlretrieve(url, str(model))  # noqa: S310
+                break
+            except Exception as e:  # noqa: BLE001
+                if i == len(BOLTZ1_URL_WITH_FALLBACK) - 1:
+                    msg = f"Failed to download model from all URLs. Last error: {e}"
+                    raise RuntimeError(msg) from e
+                continue
+@rank_zero_only
+def download_boltz2(cache: Path) -> None:
+    """Download all the required data.
+    Parameters
+    ----------
+    cache : Path
+        The cache directory.
+    """
+    # Download CCD
+    mols = cache / "mols"
+    tar_mols = cache / "mols.tar"
+    if not mols.exists():
+        click.echo(
+            f"Downloading and extracting the CCD data to {mols}. "
+            "This may take a bit of time. You may change the cache directory "
+            "with the --cache flag."
+        )
+        urllib.request.urlretrieve(MOL_URL, str(tar_mols))  # noqa: S310
+        with tarfile.open(str(tar_mols), "r") as tar:
+            tar.extractall(cache)  # noqa: S202
+    # Download model
+    model = cache / "boltz2_conf.ckpt"
+    if not model.exists():
+        click.echo(
+            f"Downloading the Boltz-2 weights to {model}. You may "
+            "change the cache directory with the --cache flag."
+        )
+        for i, url in enumerate(BOLTZ2_URL_WITH_FALLBACK):
+            try:
+                urllib.request.urlretrieve(url, str(model))  # noqa: S310
+                break
+            except Exception as e:  # noqa: BLE001
+                if i == len(BOLTZ2_URL_WITH_FALLBACK) - 1:
+                    msg = f"Failed to download model from all URLs. Last error: {e}"
+                    raise RuntimeError(msg) from e
+                continue
+    # Download affinity model
+    affinity_model = cache / "boltz2_aff.ckpt"
+    if not affinity_model.exists():
+        click.echo(
+            f"Downloading the Boltz-2 affinity weights to {affinity_model}. You may "
+            "change the cache directory with the --cache flag."
+        )
+        for i, url in enumerate(BOLTZ2_AFFINITY_URL_WITH_FALLBACK):
+            try:
+                urllib.request.urlretrieve(url, str(affinity_model))  # noqa: S310
+                break
+            except Exception as e:  # noqa: BLE001
+                if i == len(BOLTZ2_AFFINITY_URL_WITH_FALLBACK) - 1:
+                    msg = f"Failed to download model from all URLs. Last error: {e}"
+                    raise RuntimeError(msg) from e
+                continue
+def get_cache_path() -> str:
+    """Determine the cache path, prioritising the BOLTZ_CACHE environment variable.
+    Returns
+    -------
+    str: Path
+        Path to use for boltz cache location.
+    """
+    env_cache = os.environ.get("BOLTZ_CACHE")
+    if env_cache:
+        resolved_cache = Path(env_cache).expanduser().resolve()
+        if not resolved_cache.is_absolute():
+            msg = f"BOLTZ_CACHE must be an absolute path, got: {env_cache}"
+            raise ValueError(msg)
+        return str(resolved_cache)
+    return str(Path("~/.boltz").expanduser())
+def check_inputs(data: Path) -> list[Path]:
+    """Check the input data and output directory.
+    Parameters
+    ----------
+    data : Path
+        The input data.
+    Returns
+    -------
+    list[Path]
+        The list of input data.
+    """
+    click.echo("Checking input data.")
+    # Check if data is a directory
+    if data.is_dir():
+        # Get all files recursively
+        data: list[Path] = []
+        for ext in (".fa", ".fas", ".fasta", ".yml", ".yaml"):
+            data.extend(data.glob(f"**/*{ext}"))
+        if not data:
+            msg = f"No .fasta or .yaml files found in {data}"
+            raise RuntimeError(msg)
+        # Filter out directories and invalid file types
+        valid_data = []
+        for d in data:
+            if d.is_dir():
+                msg = f"Found directory {d} instead of .fasta or .yaml."
+                click.echo(f"Warning: {msg}")
+                continue
+            if d.suffix not in (".fa", ".fas", ".fasta", ".yml", ".yaml"):
+                msg = f"Warning: Skipping file with unsupported extension {d.suffix}"
+                click.echo(msg)
+                continue
+            valid_data.append(d)
+        if not valid_data:
+            msg = "No valid input files found after filtering."
+            raise RuntimeError(msg)
+        data = valid_data
+    else:
+        # Single file case
+        if data.suffix not in (".fa", ".fas", ".fasta", ".yml", ".yaml"):
+            msg = (
+                f"Unable to parse filetype {data.suffix}, "
+                "please provide a .fasta or .yaml file."
+            )
+            raise RuntimeError(msg)
+        data = [data]
+    click.echo(f"Found {len(data)} valid input files.")
+    return data
+def filter_inputs_structure(
+    manifest: Manifest,
+    outdir: Path,
+    override: bool = False,
+) -> Manifest:
+    """Filter the manifest to only include missing predictions.
+    Parameters
+    ----------
+    manifest : Manifest
+        The manifest of the input data.
+    outdir : Path
+        The output directory.
+    override: bool
+        Whether to override existing predictions.
+    Returns
+    -------
+    Manifest
+        The manifest of the filtered input data.
+    """
+    # Check if existing predictions are found
+    existing = (outdir / "predictions").rglob("*")
+    existing = {e.name for e in existing if e.is_dir()}
+    # Remove them from the input data
+    if existing and not override:
+        manifest = Manifest([r for r in manifest.records if r.id not in existing])
+        msg = (
+            f"Found some existing predictions ({len(existing)}), "
+            f"skipping and running only the missing ones, "
+            "if any. If you wish to override these existing "
+            "predictions, please set the --override flag."
+        )
+        click.echo(msg)
+    elif existing and override:
+        msg = f"Found {len(existing)} existing predictions, will override."
+        click.echo(msg)
+    return manifest
+def filter_inputs_affinity(
+    manifest: Manifest,
+    outdir: Path,
+    override: bool = False,
+) -> Manifest:
+    """Check the input data and output directory for affinity.
+    Parameters
+    ----------
+    manifest : Manifest
+        The manifest.
+    outdir : Path
+        The output directory.
+    override: bool
+        Whether to override existing predictions.
+    Returns
+    -------
+    Manifest
+        The manifest of the filtered input data.
+    """
+    click.echo("Checking input data for affinity.")
+    # Get all affinity targets
+    existing = {
+        r.id
+        for r in manifest.records
+        if r.affinity
+        and (outdir / "predictions" / r.id / f"affinity_{r.id}.json").exists()
+    }
+    # Remove them from the input data
+    if existing and not override:
+        num_skipped = len(existing)
+        msg = (
+            f"Found some existing affinity predictions ({num_skipped}), "
+            f"skipping and running only the missing ones, "
+            "if any. If you wish to override these existing "
+            "affinity predictions, please set the --override flag."
+        )
+        click.echo(msg)
+    elif existing and override:
+        msg = "Found existing affinity predictions, will override."
+        click.echo(msg)
+    return Manifest([r for r in manifest.records if r.id not in existing])
+def compute_msa(
+    data: dict[str, str],
+    target_id: str,
+    msa_dir: Path,
+    msa_server_url: str,
+    msa_pairing_strategy: str,
+) -> None:
+    """Compute the MSA for the input data.
+    Parameters
+    ----------
+    data : dict[str, str]
+        The input protein sequences.
+    target_id : str
+        The target id.
+    msa_dir : Path
+        The msa directory.
+    msa_server_url : str
+        The MSA server URL.
+    msa_pairing_strategy : str
+        The MSA pairing strategy.
+    """
+    if len(data) > 1:
+        paired_msas = run_mmseqs2(
+            list(data.values()),
+            msa_dir / f"{target_id}_paired_tmp",
+            use_env=True,
+            use_pairing=True,
+            host_url=msa_server_url,
+            pairing_strategy=msa_pairing_strategy,
+        )
+    else:
+        paired_msas = [""] * len(data)
+    unpaired_msa = run_mmseqs2(
+        list(data.values()),
+        msa_dir / f"{target_id}_unpaired_tmp",
+        use_env=True,
+        use_pairing=False,
+        host_url=msa_server_url,
+        pairing_strategy=msa_pairing_strategy,
+    )
+    for idx, name in enumerate(data):
+        # Get paired sequences
+        paired = paired_msas[idx].strip().splitlines()
+        paired = paired[1::2]  # ignore headers
+        paired = paired[: const.max_paired_seqs]
+        # Set key per row and remove empty sequences
+        keys = [idx for idx, s in enumerate(paired) if s != "-" * len(s)]
+        paired = [s for s in paired if s != "-" * len(s)]
+        # Combine paired-unpaired sequences
+        unpaired = unpaired_msa[idx].strip().splitlines()
+        unpaired = unpaired[1::2]
+        unpaired = unpaired[: (const.max_msa_seqs - len(paired))]
+        if paired:
+            unpaired = unpaired[1:]  # ignore query is already present
+        # Combine
+        seqs = paired + unpaired
+        keys = keys + [-1] * len(unpaired)
+        # Dump MSA
+        csv_str = ["key,sequence"] + [f"{key},{seq}" for key, seq in zip(keys, seqs)]
+        msa_path = msa_dir / f"{name}.csv"
+        with msa_path.open("w") as f:
+            f.write("\n".join(csv_str))
+def process_input(  # noqa: C901, PLR0912, PLR0915, D103
+    path: Path,
+    ccd: dict,
+    msa_dir: Path,
+    mol_dir: Path,
+    boltz2: bool,
+    use_msa_server: bool,
+    msa_server_url: str,
+    msa_pairing_strategy: str,
+    max_msa_seqs: int,
+    processed_msa_dir: Path,
+    processed_constraints_dir: Path,
+    processed_templates_dir: Path,
+    processed_mols_dir: Path,
+    structure_dir: Path,
+    records_dir: Path,
+) -> None:
+    try:
+        # Parse data
+        if path.suffix in (".fa", ".fas", ".fasta"):
+            target = parse_fasta(path, ccd, mol_dir, boltz2)
+        elif path.suffix in (".yml", ".yaml"):
+            target = parse_yaml(path, ccd, mol_dir, boltz2)
+        elif path.is_dir():
+            msg = f"Found directory {path} instead of .fasta or .yaml, skipping."
+            raise RuntimeError(msg)  # noqa: TRY301
+        else:
+            msg = (
+                f"Unable to parse filetype {path.suffix}, "
+                "please provide a .fasta or .yaml file."
+            )
+            raise RuntimeError(msg)  # noqa: TRY301
+        # Get target id
+        target_id = target.record.id
+        # Get all MSA ids and decide whether to generate MSA
+        to_generate = {}
+        prot_id = const.chain_type_ids["PROTEIN"]
+        for chain in target.record.chains:
+            # Add to generate list, assigning entity id
+            if (chain.mol_type == prot_id) and (chain.msa_id == 0):
+                entity_id = chain.entity_id
+                msa_id = f"{target_id}_{entity_id}"
+                to_generate[msa_id] = target.sequences[entity_id]
+                chain.msa_id = msa_dir / f"{msa_id}.csv"
+            # We do not support msa generation for non-protein chains
+            elif chain.msa_id == 0:
+                chain.msa_id = -1
+        # Generate MSA
+        if to_generate and not use_msa_server:
+            msg = "Missing MSA's in input and --use_msa_server flag not set."
+            raise RuntimeError(msg)  # noqa: TRY301
+        if to_generate:
+            msg = f"Generating MSA for {path} with {len(to_generate)} protein entities."
+            click.echo(msg)
+            compute_msa(
+                data=to_generate,
+                target_id=target_id,
+                msa_dir=msa_dir,
+                msa_server_url=msa_server_url,
+                msa_pairing_strategy=msa_pairing_strategy,
+            )
+        # Parse MSA data
+        msas = sorted({c.msa_id for c in target.record.chains if c.msa_id != -1})
+        msa_id_map = {}
+        for msa_idx, msa_id in enumerate(msas):
+            # Check that raw MSA exists
+            msa_path = Path(msa_id)
+            if not msa_path.exists():
+                msg = f"MSA file {msa_path} not found."
+                raise FileNotFoundError(msg)  # noqa: TRY301
+            # Dump processed MSA
+            processed = processed_msa_dir / f"{target_id}_{msa_idx}.npz"
+            msa_id_map[msa_id] = f"{target_id}_{msa_idx}"
+            if not processed.exists():
+                # Parse A3M
+                if msa_path.suffix == ".a3m":
+                    msa: MSA = parse_a3m(
+                        msa_path,
+                        taxonomy=None,
+                        max_seqs=max_msa_seqs,
+                    )
+                elif msa_path.suffix == ".csv":
+                    msa: MSA = parse_csv(msa_path, max_seqs=max_msa_seqs)
+                else:
+                    msg = f"MSA file {msa_path} not supported, only a3m or csv."
+                    raise RuntimeError(msg)  # noqa: TRY301
+                msa.dump(processed)
+        # Modify records to point to processed MSA
+        for c in target.record.chains:
+            if (c.msa_id != -1) and (c.msa_id in msa_id_map):
+                c.msa_id = msa_id_map[c.msa_id]
+        # Dump templates
+        for template_id, template in target.templates.items():
+            name = f"{target.record.id}_{template_id}.npz"
+            template_path = processed_templates_dir / name
+            template.dump(template_path)
+        # Dump constraints
+        constraints_path = processed_constraints_dir / f"{target.record.id}.npz"
+        target.residue_constraints.dump(constraints_path)
+        # Dump extra molecules
+        Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
+        with (processed_mols_dir / f"{target.record.id}.pkl").open("wb") as f:
+            pickle.dump(target.extra_mols, f)
+        # Dump structure
+        struct_path = structure_dir / f"{target.record.id}.npz"
+        target.structure.dump(struct_path)
+        # Dump record
+        record_path = records_dir / f"{target.record.id}.json"
+        target.record.dump(record_path)
+    except Exception as e:  # noqa: BLE001
+        import traceback
+        traceback.print_exc()
+        print(f"Failed to process {path}. Skipping. Error: {e}.")  # noqa: T201
+def process_inputs(
+    data: list[Path],
+    out_dir: Path,
+    ccd_path: Path,
+    mol_dir: Path,
+    msa_server_url: str,
+    msa_pairing_strategy: str,
+    max_msa_seqs: int = 8192,
+    use_msa_server: bool = False,
+    boltz2: bool = False,
+    preprocessing_threads: int = 1,
+) -> Manifest:
+    """Process the input data and output directory.
+    Parameters
+    ----------
+    data : list[Path]
+        The input data.
+    out_dir : Path
+        The output directory.
+    ccd_path : Path
+        The path to the CCD dictionary.
+    max_msa_seqs : int, optional
+        Max number of MSA sequences, by default 4096.
+    use_msa_server : bool, optional
+        Whether to use the MMSeqs2 server for MSA generation, by default False.
+    boltz2: bool, optional
+        Whether to use Boltz2, by default False.
+    preprocessing_threads: int, optional
+        The number of threads to use for preprocessing, by default 1.
+    Returns
+    -------
+    Manifest
+        The manifest of the processed input data.
+    """
+    all_records = []
+    # Process each input file in its own directory
+    for input_file in data:
+        # Create a subdirectory for this input file
+        file_out_dir = out_dir / input_file.stem
+        file_out_dir.mkdir(parents=True, exist_ok=True)
+        # Check if records exist at output path
+        records_dir = file_out_dir / "processed" / "records"
+        if records_dir.exists():
+            # Load existing records
+            existing = [Record.load(p) for p in records_dir.glob("*.json")]
+            processed_ids = {record.id for record in existing}
+            # Skip if already processed
+            if input_file.stem in processed_ids:
+                click.echo(f"Found existing processed input for {input_file.stem}, skipping.")
+                all_records.extend(existing)
+                continue
+        # Create output directories for this file
+        msa_dir = file_out_dir / "msa"
+        records_dir = file_out_dir / "processed" / "records"
+        structure_dir = file_out_dir / "processed" / "structures"
+        processed_msa_dir = file_out_dir / "processed" / "msa"
+        processed_constraints_dir = file_out_dir / "processed" / "constraints"
+        processed_templates_dir = file_out_dir / "processed" / "templates"
+        processed_mols_dir = file_out_dir / "processed" / "mols"
+        predictions_dir = file_out_dir / "predictions"
+        file_out_dir.mkdir(parents=True, exist_ok=True)
+        msa_dir.mkdir(parents=True, exist_ok=True)
+        records_dir.mkdir(parents=True, exist_ok=True)
+        structure_dir.mkdir(parents=True, exist_ok=True)
+        processed_msa_dir.mkdir(parents=True, exist_ok=True)
+        processed_constraints_dir.mkdir(parents=True, exist_ok=True)
+        processed_templates_dir.mkdir(parents=True, exist_ok=True)
+        processed_mols_dir.mkdir(parents=True, exist_ok=True)
+        predictions_dir.mkdir(parents=True, exist_ok=True)
+        # Load CCD
+        if boltz2:
+            ccd = load_canonicals(mol_dir)
+        else:
+            with ccd_path.open("rb") as file:
+                ccd = pickle.load(file)  # noqa: S301
+        # Create partial function
+        process_input_partial = partial(
+            process_input,
+            ccd=ccd,
+            msa_dir=msa_dir,
+            mol_dir=mol_dir,
+            boltz2=boltz2,
+            use_msa_server=use_msa_server,
+            msa_server_url=msa_server_url,
+            msa_pairing_strategy=msa_pairing_strategy,
+            max_msa_seqs=max_msa_seqs,
+            processed_msa_dir=processed_msa_dir,
+            processed_constraints_dir=processed_constraints_dir,
+            processed_templates_dir=processed_templates_dir,
+            processed_mols_dir=processed_mols_dir,
+            structure_dir=structure_dir,
+            records_dir=records_dir,
+        )
+        # Process this input file
+        click.echo(f"Processing {input_file.name}")
+        process_input_partial(input_file)
+        # Load records for this file
+        records = [Record.load(p) for p in records_dir.glob("*.json")]
+        all_records.extend(records)
+    # Create combined manifest
+    manifest = Manifest(all_records)
+    manifest.dump(out_dir / "processed" / "manifest.json")
+    return manifest
+@click.group()
+def cli() -> None:
+    """Boltz."""
+    return
+@cli.command()
+@click.argument("data", type=click.Path(exists=True))
+@click.option(
+    "--out_dir",
+    type=click.Path(exists=False),
+    help="The path where to save the predictions.",
+    default="./",
+)
+@click.option(
+    "--cache",
+    type=click.Path(exists=False),
+    help=(
+        "The directory where to download the data and model. "
+        "Default is ~/.boltz, or $BOLTZ_CACHE if set."
+    ),
+    default=get_cache_path,
+)
+@click.option(
+    "--checkpoint",
+    type=click.Path(exists=True),
+    help="An optional checkpoint, will use the provided Boltz-1 model by default.",
+    default=None,
+)
+@click.option(
+    "--devices",
+    type=int,
+    help="The number of devices to use for prediction. Default is 1.",
+    default=1,
+)
+@click.option(
+    "--accelerator",
+    type=click.Choice(["gpu", "cpu", "tpu"]),
+    help="The accelerator to use for prediction. Default is gpu.",
+    default="gpu",
+)
+@click.option(
+    "--recycling_steps",
+    type=int,
+    help="The number of recycling steps to use for prediction. Default is 3.",
+    default=3,
+)
+@click.option(
+    "--sampling_steps",
+    type=int,
+    help="The number of sampling steps to use for prediction. Default is 200.",
+    default=200,
+)
+@click.option(
+    "--diffusion_samples",
+    type=int,
+    help="The number of diffusion samples to use for prediction. Default is 1.",
+    default=1,
+)
+@click.option(
+    "--max_parallel_samples",
+    type=int,
+    help="The maximum number of samples to predict in parallel. Default is None.",
+    default=5,
+)
+@click.option(
+    "--step_scale",
+    type=float,
+    help=(
+        "The step size is related to the temperature at "
+        "which the diffusion process samples the distribution. "
+        "The lower the higher the diversity among samples "
+        "(recommended between 1 and 2). "
+        "Default is 1.638 for Boltz-1 and 1.5 for Boltz-2. "
+        "If not provided, the default step size will be used."
+    ),
+    default=None,
+)
+@click.option(
+    "--write_full_pae",
+    type=bool,
+    is_flag=True,
+    help="Whether to dump the pae into a npz file. Default is True.",
+)
+@click.option(
+    "--write_full_pde",
+    type=bool,
+    is_flag=True,
+    help="Whether to dump the pde into a npz file. Default is False.",
+)
+@click.option(
+    "--output_format",
+    type=click.Choice(["pdb", "mmcif"]),
+    help="The output format to use for the predictions. Default is mmcif.",
+    default="mmcif",
+)
+@click.option(
+    "--num_workers",
+    type=int,
+    help="The number of dataloader workers to use for prediction. Default is 2.",
+    default=2,
+)
+@click.option(
+    "--override",
+    is_flag=True,
+    help="Whether to override existing found predictions. Default is False.",
+)
+@click.option(
+    "--seed",
+    type=int,
+    help="Seed to use for random number generator. Default is None (no seeding).",
+    default=None,
+)
+@click.option(
+    "--use_msa_server",
+    is_flag=True,
+    help="Whether to use the MMSeqs2 server for MSA generation. Default is False.",
+)
+@click.option(
+    "--msa_server_url",
+    type=str,
+    help="MSA server url. Used only if --use_msa_server is set. ",
+    default="https://api.colabfold.com",
+)
+@click.option(
+    "--msa_pairing_strategy",
+    type=str,
+    help=(
+        "Pairing strategy to use. Used only if --use_msa_server is set. "
+        "Options are 'greedy' and 'complete'"
+    ),
+    default="greedy",
+)
+@click.option(
+    "--use_potentials",
+    is_flag=True,
+    help="Whether to not use potentials for steering. Default is False.",
+)
+@click.option(
+    "--model",
+    default="boltz2",
+    type=click.Choice(["boltz1", "boltz2"]),
+    help="The model to use for prediction. Default is boltz2.",
+)
+@click.option(
+    "--method",
+    type=str,
+    help="The method to use for prediction. Default is None.",
+    default=None,
+)
+@click.option(
+    "--preprocessing-threads",
+    type=int,
+    help="The number of threads to use for preprocessing. Default is 1.",
+    default=multiprocessing.cpu_count(),
+)
+@click.option(
+    "--affinity_mw_correction",
+    is_flag=True,
+    type=bool,
+    help="Whether to add the Molecular Weight correction to the affinity value head.",
+)
+@click.option(
+    "--sampling_steps_affinity",
+    type=int,
+    help="The number of sampling steps to use for affinity prediction. Default is 200.",
+    default=200,
+)
+@click.option(
+    "--diffusion_samples_affinity",
+    type=int,
+    help="The number of diffusion samples to use for affinity prediction. Default is 5.",
+    default=5,
+)
+@click.option(
+    "--affinity_checkpoint",
+    type=click.Path(exists=True),
+    help="An optional checkpoint, will use the provided Boltz-1 model by default.",
+    default=None,
+)
+@click.option(
+    "--max_msa_seqs",
+    type=int,
+    help="The maximum number of MSA sequences to use for prediction. Default is 8192.",
+    default=8192,
+)
+@click.option(
+    "--subsample_msa",
+    is_flag=True,
+    help="Whether to subsample the MSA. Default is True.",
+)
+@click.option(
+    "--num_subsampled_msa",
+    type=int,
+    help="The number of MSA sequences to subsample. Default is 1024.",
+    default=1024,
+)
+@click.option(
+    "--no_kernels",
+    is_flag=True,
+    help="Whether to disable the kernels. Default False",
+)
+@click.option(
+    "--skip_structure",
+    is_flag=True,
+    help="Skip structure prediction and go straight to affinity prediction. Default is False.",
+)
+def predict(  # noqa: C901, PLR0915, PLR0912
+    data: str,
+    out_dir: str,
+    cache: str = "~/.boltz",
+    checkpoint: Optional[str] = None,
+    affinity_checkpoint: Optional[str] = None,
+    devices: int = 1,
+    accelerator: str = "gpu",
+    recycling_steps: int = 3,
+    sampling_steps: int = 200,
+    diffusion_samples: int = 1,
+    sampling_steps_affinity: int = 200,
+    diffusion_samples_affinity: int = 3,
+    max_parallel_samples: Optional[int] = None,
+    step_scale: Optional[float] = None,
+    write_full_pae: bool = False,
+    write_full_pde: bool = False,
+    output_format: Literal["pdb", "mmcif"] = "mmcif",
+    num_workers: int = 2,
+    override: bool = False,
+    seed: Optional[int] = None,
+    use_msa_server: bool = False,
+    msa_server_url: str = "https://api.colabfold.com",
+    msa_pairing_strategy: str = "greedy",
+    use_potentials: bool = False,
+    model: Literal["boltz1", "boltz2"] = "boltz2",
+    method: Optional[str] = None,
+    affinity_mw_correction: Optional[bool] = False,
+    preprocessing_threads: int = 1,
+    max_msa_seqs: int = 8192,
+    subsample_msa: bool = True,
+    num_subsampled_msa: int = 1024,
+    no_kernels: bool = False,
+    skip_structure: bool = False,
+) -> None:
+    """Run predictions with Boltz."""
+    # If cpu, write a friendly warning
+    if accelerator == "cpu":
+        msg = "Running on CPU, this will be slow. Consider using a GPU."
+        click.echo(msg)
+    # Supress some lightning warnings
+    warnings.filterwarnings(
+        "ignore", ".*that has Tensor Cores. To properly utilize them.*"
+    )
+    # Set no grad
+    torch.set_grad_enabled(False)
+    # Ignore matmul precision warning
+    torch.set_float32_matmul_precision("highest")
+    # Set rdkit pickle logic
+    Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
+    # Set seed if desired
+    if seed is not None:
+        seed_everything(seed)
+    for key in ["CUEQ_DEFAULT_CONFIG", "CUEQ_DISABLE_AOT_TUNING"]:
+        # Disable kernel tuning by default,
+        # but do not modify envvar if already set by caller
+        os.environ[key] = os.environ.get(key, "1")
+    # Set cache path
+    cache = Path(cache).expanduser()
+    cache.mkdir(parents=True, exist_ok=True)
+    # Create output directories
+    data = Path(data).expanduser()
+    out_dir = Path(out_dir).expanduser()
+    out_dir = out_dir / f"boltz_results_{data.stem}"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    # Download necessary data and model
+    if model == "boltz1":
+        download_boltz1(cache)
+    elif model == "boltz2":
+        download_boltz2(cache)
+    else:
+        msg = f"Model {model} not supported. Supported: boltz1, boltz2."
+        raise ValueError(f"Model {model} not supported.")
+    # Validate inputs
+    data = check_inputs(data)
+    # Check method
+    if method is not None:
+        if model == "boltz1":
+            msg = "Method conditioning is not supported for Boltz-1."
+            raise ValueError(msg)
+        if method.lower() not in const.method_types_ids:
+            method_names = list(const.method_types_ids.keys())
+            msg = f"Method {method} not supported. Supported: {method_names}"
+            raise ValueError(msg)
+    # Process inputs
+    ccd_path = cache / "ccd.pkl"
+    mol_dir = cache / "mols"
+    manifest = process_inputs(
+        data=data,
+        out_dir=out_dir,
+        ccd_path=ccd_path,
+        mol_dir=mol_dir,
+        use_msa_server=use_msa_server,
+        msa_server_url=msa_server_url,
+        msa_pairing_strategy=msa_pairing_strategy,
+        boltz2=model == "boltz2",
+        preprocessing_threads=preprocessing_threads,
+        max_msa_seqs=max_msa_seqs,
+    )
+    # Filter out existing predictions
+    filtered_manifest = filter_inputs_structure(
+        manifest=manifest,
+        outdir=out_dir,
+        override=override,
+    )
+    # Load processed data
+    processed_dir = out_dir / "processed"
+    processed = BoltzProcessedInput(
+        manifest=filtered_manifest,
+        targets_dir=processed_dir / "structures",
+        msa_dir=processed_dir / "msa",
+        constraints_dir=(
+            (processed_dir / "constraints")
+            if (processed_dir / "constraints").exists()
+            else None
+        ),
+        template_dir=(
+            (processed_dir / "templates")
+            if (processed_dir / "templates").exists()
+            else None
+        ),
+        extra_mols_dir=(
+            (processed_dir / "mols") if (processed_dir / "mols").exists() else None
+        ),
+    )
+    # Set up trainer
+    strategy = "auto"
+    if (isinstance(devices, int) and devices > 1) or (
+        isinstance(devices, list) and len(devices) > 1
+    ):
+        start_method = "fork" if platform.system() != "win32" else "spawn"
+        strategy = DDPStrategy(start_method=start_method)
+        if len(filtered_manifest.records) < devices:
+            msg = (
+                "Number of requested devices is greater "
+                "than the number of predictions, taking the minimum."
+            )
+            click.echo(msg)
+            if isinstance(devices, list):
+                devices = devices[: max(1, len(filtered_manifest.records))]
+            else:
+                devices = max(1, min(len(filtered_manifest.records), devices))
+    # Set up model parameters
+    if model == "boltz2":
+        diffusion_params = Boltz2DiffusionParams()
+        step_scale = 1.5 if step_scale is None else step_scale
+        diffusion_params.step_scale = step_scale
+        pairformer_args = PairformerArgsV2()
+    else:
+        diffusion_params = BoltzDiffusionParams()
+        step_scale = 1.638 if step_scale is None else step_scale
+        diffusion_params.step_scale = step_scale
+        pairformer_args = PairformerArgs()
+    msa_args = MSAModuleArgs(
+        subsample_msa=subsample_msa,
+        num_subsampled_msa=num_subsampled_msa,
+        use_paired_feature=model == "boltz2",
+    )
+    # Create prediction writer
+    pred_writer = BoltzWriter(
+        data_dir=processed.targets_dir,
+        output_dir=out_dir / "predictions",
+        output_format=output_format,
+        boltz2=model == "boltz2",
+    )
+    # Set up trainer
+    trainer = Trainer(
+        default_root_dir=out_dir,
+        strategy=strategy,
+        callbacks=[pred_writer],
+        accelerator=accelerator,
+        devices=devices,
+        precision=32 if model == "boltz1" else "bf16-mixed",
+    )
+    if filtered_manifest.records and not skip_structure:
+        msg = f"Running structure prediction for {len(filtered_manifest.records)} input"
+        msg += "s." if len(filtered_manifest.records) > 1 else "."
+        click.echo(msg)
+        # Create data module
+        if model == "boltz2":
+            data_module = Boltz2InferenceDataModule(
+                manifest=processed.manifest,
+                target_dir=processed.targets_dir,
+                msa_dir=processed.msa_dir,
+                mol_dir=mol_dir,
+                num_workers=num_workers,
+                constraints_dir=processed.constraints_dir,
+                template_dir=processed.template_dir,
+                extra_mols_dir=processed.extra_mols_dir,
+                override_method=method,
+            )
+        else:
+            data_module = BoltzInferenceDataModule(
+                manifest=processed.manifest,
+                target_dir=processed.targets_dir,
+                msa_dir=processed.msa_dir,
+                num_workers=num_workers,
+                constraints_dir=processed.constraints_dir,
+            )
+        # Load model
+        if checkpoint is None:
+            if model == "boltz2":
+                checkpoint = cache / "boltz2_conf.ckpt"
+            else:
+                checkpoint = cache / "boltz1_conf.ckpt"
+        predict_args = {
+            "recycling_steps": recycling_steps,
+            "sampling_steps": sampling_steps,
+            "diffusion_samples": diffusion_samples,
+            "max_parallel_samples": max_parallel_samples,
+            "write_confidence_summary": True,
+            "write_full_pae": write_full_pae,
+            "write_full_pde": write_full_pde,
+        }
+        steering_args = BoltzSteeringParams()
+        steering_args.fk_steering = use_potentials
+        steering_args.guidance_update = use_potentials
+        model_cls = Boltz2 if model == "boltz2" else Boltz1
+        model_module = model_cls.load_from_checkpoint(
+            checkpoint,
+            strict=True,
+            predict_args=predict_args,
+            map_location="cpu",
+            diffusion_process_args=asdict(diffusion_params),
+            ema=False,
+            use_kernels=not no_kernels,
+            pairformer_args=asdict(pairformer_args),
+            msa_args=asdict(msa_args),
+            steering_args=asdict(steering_args),
+        )
+        model_module.eval()
+        # Compute structure predictions
+        trainer.predict(
+            model_module,
+            datamodule=data_module,
+            return_predictions=False,
+        )
+    # Check if affinity predictions are needed
+    if any(r.affinity for r in manifest.records):
+        # Print header
+        click.echo("\nPredicting property: affinity\n")
+        # Validate inputs
+        manifest_filtered = filter_inputs_affinity(
+            manifest=manifest,
+            outdir=out_dir,
+            override=override,
+        )
+        if not manifest_filtered.records:
+            click.echo("Found existing affinity predictions for all inputs, skipping.")
+            return
+        msg = f"Running affinity prediction for {len(manifest_filtered.records)} input"
+        msg += "s." if len(manifest_filtered.records) > 1 else "."
+        click.echo(msg)
+        pred_writer = BoltzAffinityWriter(
+            data_dir=processed.targets_dir,
+            output_dir=out_dir / "predictions",
+        )
+        data_module = Boltz2InferenceDataModule(
+            manifest=manifest_filtered,
+            target_dir=out_dir / "predictions",
+            msa_dir=processed.msa_dir,
+            mol_dir=mol_dir,
+            num_workers=num_workers,
+            constraints_dir=processed.constraints_dir,
+            template_dir=processed.template_dir,
+            extra_mols_dir=processed.extra_mols_dir,
+            override_method="other",
+            affinity=True,
+        )
+        predict_affinity_args = {
+            "recycling_steps": 5,
+            "sampling_steps": sampling_steps_affinity,
+            "diffusion_samples": diffusion_samples_affinity,
+            "max_parallel_samples": 1,
+            "write_confidence_summary": False,
+            "write_full_pae": False,
+            "write_full_pde": False,
+        }
+        # Load affinity model
+        if affinity_checkpoint is None:
+            affinity_checkpoint = cache / "boltz2_aff.ckpt"
+        model_module = Boltz2.load_from_checkpoint(
+            affinity_checkpoint,
+            strict=True,
+            predict_args=predict_affinity_args,
+            map_location="cpu",
+            diffusion_process_args=asdict(diffusion_params),
+            ema=False,
+            pairformer_args=asdict(pairformer_args),
+            msa_args=asdict(msa_args),
+            steering_args={"fk_steering": False, "guidance_update": False},
+            affinity_mw_correction=affinity_mw_correction,
+        )
+        model_module.eval()
+        trainer.callbacks[0] = pred_writer
+        trainer.predict(
+            model_module,
+            datamodule=data_module,
+            return_predictions=False,
+        )
+if __name__ == "__main__":  # invoke `cli` only when run as a script, not on import
+    cli()