PyPI - boltz-vsynthes - Versions diffs - 1.0.0__py3-none-any.whl - Mend

boltz-vsynthes 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112)

boltz/__init__.py +7 -0
boltz/data/__init__.py +0 -0
boltz/data/const.py +1184 -0
boltz/data/crop/__init__.py +0 -0
boltz/data/crop/affinity.py +164 -0
boltz/data/crop/boltz.py +296 -0
boltz/data/crop/cropper.py +45 -0
boltz/data/feature/__init__.py +0 -0
boltz/data/feature/featurizer.py +1230 -0
boltz/data/feature/featurizerv2.py +2208 -0
boltz/data/feature/symmetry.py +602 -0
boltz/data/filter/__init__.py +0 -0
boltz/data/filter/dynamic/__init__.py +0 -0
boltz/data/filter/dynamic/date.py +76 -0
boltz/data/filter/dynamic/filter.py +24 -0
boltz/data/filter/dynamic/max_residues.py +37 -0
boltz/data/filter/dynamic/resolution.py +34 -0
boltz/data/filter/dynamic/size.py +38 -0
boltz/data/filter/dynamic/subset.py +42 -0
boltz/data/filter/static/__init__.py +0 -0
boltz/data/filter/static/filter.py +26 -0
boltz/data/filter/static/ligand.py +37 -0
boltz/data/filter/static/polymer.py +299 -0
boltz/data/module/__init__.py +0 -0
boltz/data/module/inference.py +307 -0
boltz/data/module/inferencev2.py +429 -0
boltz/data/module/training.py +684 -0
boltz/data/module/trainingv2.py +660 -0
boltz/data/mol.py +900 -0
boltz/data/msa/__init__.py +0 -0
boltz/data/msa/mmseqs2.py +235 -0
boltz/data/pad.py +84 -0
boltz/data/parse/__init__.py +0 -0
boltz/data/parse/a3m.py +134 -0
boltz/data/parse/csv.py +100 -0
boltz/data/parse/fasta.py +138 -0
boltz/data/parse/mmcif.py +1239 -0
boltz/data/parse/mmcif_with_constraints.py +1607 -0
boltz/data/parse/schema.py +1851 -0
boltz/data/parse/yaml.py +68 -0
boltz/data/sample/__init__.py +0 -0
boltz/data/sample/cluster.py +283 -0
boltz/data/sample/distillation.py +57 -0
boltz/data/sample/random.py +39 -0
boltz/data/sample/sampler.py +49 -0
boltz/data/tokenize/__init__.py +0 -0
boltz/data/tokenize/boltz.py +195 -0
boltz/data/tokenize/boltz2.py +396 -0
boltz/data/tokenize/tokenizer.py +24 -0
boltz/data/types.py +777 -0
boltz/data/write/__init__.py +0 -0
boltz/data/write/mmcif.py +305 -0
boltz/data/write/pdb.py +171 -0
boltz/data/write/utils.py +23 -0
boltz/data/write/writer.py +330 -0
boltz/main.py +1292 -0
boltz/model/__init__.py +0 -0
boltz/model/layers/__init__.py +0 -0
boltz/model/layers/attention.py +132 -0
boltz/model/layers/attentionv2.py +111 -0
boltz/model/layers/confidence_utils.py +231 -0
boltz/model/layers/dropout.py +34 -0
boltz/model/layers/initialize.py +100 -0
boltz/model/layers/outer_product_mean.py +98 -0
boltz/model/layers/pair_averaging.py +135 -0
boltz/model/layers/pairformer.py +337 -0
boltz/model/layers/relative.py +58 -0
boltz/model/layers/transition.py +78 -0
boltz/model/layers/triangular_attention/__init__.py +0 -0
boltz/model/layers/triangular_attention/attention.py +189 -0
boltz/model/layers/triangular_attention/primitives.py +409 -0
boltz/model/layers/triangular_attention/utils.py +380 -0
boltz/model/layers/triangular_mult.py +212 -0
boltz/model/loss/__init__.py +0 -0
boltz/model/loss/bfactor.py +49 -0
boltz/model/loss/confidence.py +590 -0
boltz/model/loss/confidencev2.py +621 -0
boltz/model/loss/diffusion.py +171 -0
boltz/model/loss/diffusionv2.py +134 -0
boltz/model/loss/distogram.py +48 -0
boltz/model/loss/distogramv2.py +105 -0
boltz/model/loss/validation.py +1025 -0
boltz/model/models/__init__.py +0 -0
boltz/model/models/boltz1.py +1286 -0
boltz/model/models/boltz2.py +1249 -0
boltz/model/modules/__init__.py +0 -0
boltz/model/modules/affinity.py +223 -0
boltz/model/modules/confidence.py +481 -0
boltz/model/modules/confidence_utils.py +181 -0
boltz/model/modules/confidencev2.py +495 -0
boltz/model/modules/diffusion.py +844 -0
boltz/model/modules/diffusion_conditioning.py +116 -0
boltz/model/modules/diffusionv2.py +677 -0
boltz/model/modules/encoders.py +639 -0
boltz/model/modules/encodersv2.py +565 -0
boltz/model/modules/transformers.py +322 -0
boltz/model/modules/transformersv2.py +261 -0
boltz/model/modules/trunk.py +688 -0
boltz/model/modules/trunkv2.py +828 -0
boltz/model/modules/utils.py +303 -0
boltz/model/optim/__init__.py +0 -0
boltz/model/optim/ema.py +389 -0
boltz/model/optim/scheduler.py +99 -0
boltz/model/potentials/__init__.py +0 -0
boltz/model/potentials/potentials.py +497 -0
boltz/model/potentials/schedules.py +32 -0
boltz_vsynthes-1.0.0.dist-info/METADATA +151 -0
boltz_vsynthes-1.0.0.dist-info/RECORD +112 -0
boltz_vsynthes-1.0.0.dist-info/WHEEL +5 -0
boltz_vsynthes-1.0.0.dist-info/entry_points.txt +2 -0
boltz_vsynthes-1.0.0.dist-info/licenses/LICENSE +21 -0
boltz_vsynthes-1.0.0.dist-info/top_level.txt +1 -0

boltz/model/modules/diffusion.py ADDED Viewed

@@ -0,0 +1,844 @@
+# started from code from https://github.com/lucidrains/alphafold3-pytorch, MIT License, Copyright (c) 2024 Phil Wang
+from __future__ import annotations
+from math import sqrt
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+from torch.nn import Module
+import boltz.model.layers.initialize as init
+from boltz.data import const
+from boltz.model.loss.diffusion import (
+    smooth_lddt_loss,
+    weighted_rigid_align,
+)
+from boltz.model.modules.utils import center_random_augmentation
+from boltz.model.modules.encoders import (
+    AtomAttentionDecoder,
+    AtomAttentionEncoder,
+    FourierEmbedding,
+    PairwiseConditioning,
+    SingleConditioning,
+)
+from boltz.model.modules.transformers import (
+    ConditionedTransitionBlock,
+    DiffusionTransformer,
+)
+from boltz.model.modules.utils import (
+    LinearNoBias,
+    compute_random_augmentation,
+    center_random_augmentation,
+    default,
+    log,
+)
+from boltz.model.potentials.potentials import get_potentials
+class DiffusionModule(Module):
+    """Diffusion module"""
+    def __init__(
+        self,
+        token_s: int,
+        token_z: int,
+        atom_s: int,
+        atom_z: int,
+        atoms_per_window_queries: int = 32,
+        atoms_per_window_keys: int = 128,
+        sigma_data: int = 16,
+        dim_fourier: int = 256,
+        atom_encoder_depth: int = 3,
+        atom_encoder_heads: int = 4,
+        token_transformer_depth: int = 24,
+        token_transformer_heads: int = 8,
+        atom_decoder_depth: int = 3,
+        atom_decoder_heads: int = 4,
+        atom_feature_dim: int = 128,
+        conditioning_transition_layers: int = 2,
+        activation_checkpointing: bool = False,
+        offload_to_cpu: bool = False,
+        **kwargs,
+    ) -> None:
+        """Initialize the diffusion module.
+        Parameters
+        ----------
+        token_s : int
+            The single representation dimension.
+        token_z : int
+            The pair representation dimension.
+        atom_s : int
+            The atom single representation dimension.
+        atom_z : int
+            The atom pair representation dimension.
+        atoms_per_window_queries : int, optional
+            The number of atoms per window for queries, by default 32.
+        atoms_per_window_keys : int, optional
+            The number of atoms per window for keys, by default 128.
+        sigma_data : int, optional
+            The standard deviation of the data distribution, by default 16.
+        dim_fourier : int, optional
+            The dimension of the fourier embedding, by default 256.
+        atom_encoder_depth : int, optional
+            The depth of the atom encoder, by default 3.
+        atom_encoder_heads : int, optional
+            The number of heads in the atom encoder, by default 4.
+        token_transformer_depth : int, optional
+            The depth of the token transformer, by default 24.
+        token_transformer_heads : int, optional
+            The number of heads in the token transformer, by default 8.
+        atom_decoder_depth : int, optional
+            The depth of the atom decoder, by default 3.
+        atom_decoder_heads : int, optional
+            The number of heads in the atom decoder, by default 4.
+        atom_feature_dim : int, optional
+            The atom feature dimension, by default 128.
+        conditioning_transition_layers : int, optional
+            The number of transition layers for conditioning, by default 2.
+        activation_checkpointing : bool, optional
+            Whether to use activation checkpointing, by default False.
+        offload_to_cpu : bool, optional
+            Whether to offload the activations to CPU, by default False.
+        """
+        super().__init__()
+        self.atoms_per_window_queries = atoms_per_window_queries
+        self.atoms_per_window_keys = atoms_per_window_keys
+        self.sigma_data = sigma_data
+        self.single_conditioner = SingleConditioning(
+            sigma_data=sigma_data,
+            token_s=token_s,
+            dim_fourier=dim_fourier,
+            num_transitions=conditioning_transition_layers,
+        )
+        self.pairwise_conditioner = PairwiseConditioning(
+            token_z=token_z,
+            dim_token_rel_pos_feats=token_z,
+            num_transitions=conditioning_transition_layers,
+        )
+        self.atom_attention_encoder = AtomAttentionEncoder(
+            atom_s=atom_s,
+            atom_z=atom_z,
+            token_s=token_s,
+            token_z=token_z,
+            atoms_per_window_queries=atoms_per_window_queries,
+            atoms_per_window_keys=atoms_per_window_keys,
+            atom_feature_dim=atom_feature_dim,
+            atom_encoder_depth=atom_encoder_depth,
+            atom_encoder_heads=atom_encoder_heads,
+            structure_prediction=True,
+            activation_checkpointing=activation_checkpointing,
+        )
+        self.s_to_a_linear = nn.Sequential(
+            nn.LayerNorm(2 * token_s), LinearNoBias(2 * token_s, 2 * token_s)
+        )
+        init.final_init_(self.s_to_a_linear[1].weight)
+        self.token_transformer = DiffusionTransformer(
+            dim=2 * token_s,
+            dim_single_cond=2 * token_s,
+            dim_pairwise=token_z,
+            depth=token_transformer_depth,
+            heads=token_transformer_heads,
+            activation_checkpointing=activation_checkpointing,
+            offload_to_cpu=offload_to_cpu,
+        )
+        self.a_norm = nn.LayerNorm(2 * token_s)
+        self.atom_attention_decoder = AtomAttentionDecoder(
+            atom_s=atom_s,
+            atom_z=atom_z,
+            token_s=token_s,
+            attn_window_queries=atoms_per_window_queries,
+            attn_window_keys=atoms_per_window_keys,
+            atom_decoder_depth=atom_decoder_depth,
+            atom_decoder_heads=atom_decoder_heads,
+            activation_checkpointing=activation_checkpointing,
+        )
+    def forward(
+        self,
+        s_inputs,
+        s_trunk,
+        z_trunk,
+        r_noisy,
+        times,
+        relative_position_encoding,
+        feats,
+        multiplicity=1,
+        model_cache=None,
+    ):
+        s, normed_fourier = self.single_conditioner(
+            times=times,
+            s_trunk=s_trunk.repeat_interleave(multiplicity, 0),
+            s_inputs=s_inputs.repeat_interleave(multiplicity, 0),
+        )
+        if model_cache is None or len(model_cache) == 0:
+            z = self.pairwise_conditioner(
+                z_trunk=z_trunk, token_rel_pos_feats=relative_position_encoding
+            )
+        else:
+            z = None
+        # Compute Atom Attention Encoder and aggregation to coarse-grained tokens
+        a, q_skip, c_skip, p_skip, to_keys = self.atom_attention_encoder(
+            feats=feats,
+            s_trunk=s_trunk,
+            z=z,
+            r=r_noisy,
+            multiplicity=multiplicity,
+            model_cache=model_cache,
+        )
+        # Full self-attention on token level
+        a = a + self.s_to_a_linear(s)
+        mask = feats["token_pad_mask"].repeat_interleave(multiplicity, 0)
+        a = self.token_transformer(
+            a,
+            mask=mask.float(),
+            s=s,
+            z=z,  # note z is not expanded with multiplicity until after bias is computed
+            multiplicity=multiplicity,
+            model_cache=model_cache,
+        )
+        a = self.a_norm(a)
+        # Broadcast token activations to atoms and run Sequence-local Atom Attention
+        r_update = self.atom_attention_decoder(
+            a=a,
+            q=q_skip,
+            c=c_skip,
+            p=p_skip,
+            feats=feats,
+            multiplicity=multiplicity,
+            to_keys=to_keys,
+            model_cache=model_cache,
+        )
+        return {"r_update": r_update, "token_a": a.detach()}
+class OutTokenFeatUpdate(Module):
+    """Output token feature update"""
+    def __init__(
+        self,
+        sigma_data: float,
+        token_s=384,
+        dim_fourier=256,
+    ):
+        """Initialize the Output token feature update for confidence model.
+        Parameters
+        ----------
+        sigma_data : float
+            The standard deviation of the data distribution.
+        token_s : int, optional
+            The token dimension, by default 384.
+        dim_fourier : int, optional
+            The dimension of the fourier embedding, by default 256.
+        """
+        super().__init__()
+        self.sigma_data = sigma_data
+        self.norm_next = nn.LayerNorm(2 * token_s)
+        self.fourier_embed = FourierEmbedding(dim_fourier)
+        self.norm_fourier = nn.LayerNorm(dim_fourier)
+        self.transition_block = ConditionedTransitionBlock(
+            2 * token_s, 2 * token_s + dim_fourier
+        )
+    def forward(
+        self,
+        times,
+        acc_a,
+        next_a,
+    ):
+        next_a = self.norm_next(next_a)
+        fourier_embed = self.fourier_embed(times)
+        normed_fourier = (
+            self.norm_fourier(fourier_embed)
+            .unsqueeze(1)
+            .expand(-1, next_a.shape[1], -1)
+        )
+        cond_a = torch.cat((acc_a, normed_fourier), dim=-1)
+        acc_a = acc_a + self.transition_block(next_a, cond_a)
+        return acc_a
+class AtomDiffusion(Module):
+    """Atom diffusion module"""
+    def __init__(
+        self,
+        score_model_args,
+        num_sampling_steps=5,
+        sigma_min=0.0004,
+        sigma_max=160.0,
+        sigma_data=16.0,
+        rho=7,
+        P_mean=-1.2,
+        P_std=1.5,
+        gamma_0=0.8,
+        gamma_min=1.0,
+        noise_scale=1.003,
+        step_scale=1.5,
+        coordinate_augmentation=True,
+        compile_score=False,
+        alignment_reverse_diff=False,
+        synchronize_sigmas=False,
+        use_inference_model_cache=False,
+        accumulate_token_repr=False,
+        **kwargs,
+    ):
+        """Initialize the atom diffusion module.
+        Parameters
+        ----------
+        score_model_args : dict
+            The arguments for the score model.
+        num_sampling_steps : int, optional
+            The number of sampling steps, by default 5.
+        sigma_min : float, optional
+            The minimum sigma value, by default 0.0004.
+        sigma_max : float, optional
+            The maximum sigma value, by default 160.0.
+        sigma_data : float, optional
+            The standard deviation of the data distribution, by default 16.0.
+        rho : int, optional
+            The rho value, by default 7.
+        P_mean : float, optional
+            The mean value of P, by default -1.2.
+        P_std : float, optional
+            The standard deviation of P, by default 1.5.
+        gamma_0 : float, optional
+            The gamma value, by default 0.8.
+        gamma_min : float, optional
+            The minimum gamma value, by default 1.0.
+        noise_scale : float, optional
+            The noise scale, by default 1.003.
+        step_scale : float, optional
+            The step scale, by default 1.5.
+        coordinate_augmentation : bool, optional
+            Whether to use coordinate augmentation, by default True.
+        compile_score : bool, optional
+            Whether to compile the score model, by default False.
+        alignment_reverse_diff : bool, optional
+            Whether to use alignment reverse diff, by default False.
+        synchronize_sigmas : bool, optional
+            Whether to synchronize the sigmas, by default False.
+        use_inference_model_cache : bool, optional
+            Whether to use the inference model cache, by default False.
+        accumulate_token_repr : bool, optional
+            Whether to accumulate the token representation, by default False.
+        """
+        super().__init__()
+        self.score_model = DiffusionModule(
+            **score_model_args,
+        )
+        if compile_score:
+            self.score_model = torch.compile(
+                self.score_model, dynamic=False, fullgraph=False
+            )
+        # parameters
+        self.sigma_min = sigma_min
+        self.sigma_max = sigma_max
+        self.sigma_data = sigma_data
+        self.rho = rho
+        self.P_mean = P_mean
+        self.P_std = P_std
+        self.num_sampling_steps = num_sampling_steps
+        self.gamma_0 = gamma_0
+        self.gamma_min = gamma_min
+        self.noise_scale = noise_scale
+        self.step_scale = step_scale
+        self.coordinate_augmentation = coordinate_augmentation
+        self.alignment_reverse_diff = alignment_reverse_diff
+        self.synchronize_sigmas = synchronize_sigmas
+        self.use_inference_model_cache = use_inference_model_cache
+        self.accumulate_token_repr = accumulate_token_repr
+        self.token_s = score_model_args["token_s"]
+        if self.accumulate_token_repr:
+            self.out_token_feat_update = OutTokenFeatUpdate(
+                sigma_data=sigma_data,
+                token_s=score_model_args["token_s"],
+                dim_fourier=score_model_args["dim_fourier"],
+            )
+        self.register_buffer("zero", torch.tensor(0.0), persistent=False)
+    @property
+    def device(self):
+        return next(self.score_model.parameters()).device
+    def c_skip(self, sigma):
+        return (self.sigma_data**2) / (sigma**2 + self.sigma_data**2)
+    def c_out(self, sigma):
+        return sigma * self.sigma_data / torch.sqrt(self.sigma_data**2 + sigma**2)
+    def c_in(self, sigma):
+        return 1 / torch.sqrt(sigma**2 + self.sigma_data**2)
+    def c_noise(self, sigma):
+        return log(sigma / self.sigma_data) * 0.25
+    def preconditioned_network_forward(
+        self,
+        noised_atom_coords,
+        sigma,
+        network_condition_kwargs: dict,
+        training: bool = True,
+    ):
+        batch, device = noised_atom_coords.shape[0], noised_atom_coords.device
+        if isinstance(sigma, float):
+            sigma = torch.full((batch,), sigma, device=device)
+        padded_sigma = rearrange(sigma, "b -> b 1 1")
+        net_out = self.score_model(
+            r_noisy=self.c_in(padded_sigma) * noised_atom_coords,
+            times=self.c_noise(sigma),
+            **network_condition_kwargs,
+        )
+        denoised_coords = (
+            self.c_skip(padded_sigma) * noised_atom_coords
+            + self.c_out(padded_sigma) * net_out["r_update"]
+        )
+        return denoised_coords, net_out["token_a"]
+    def sample_schedule(self, num_sampling_steps=None):
+        num_sampling_steps = default(num_sampling_steps, self.num_sampling_steps)
+        inv_rho = 1 / self.rho
+        steps = torch.arange(
+            num_sampling_steps, device=self.device, dtype=torch.float32
+        )
+        sigmas = (
+            self.sigma_max**inv_rho
+            + steps
+            / (num_sampling_steps - 1)
+            * (self.sigma_min**inv_rho - self.sigma_max**inv_rho)
+        ) ** self.rho
+        sigmas = sigmas * self.sigma_data
+        sigmas = F.pad(sigmas, (0, 1), value=0.0)  # last step is sigma value of 0.
+        return sigmas
+    def sample(
+        self,
+        atom_mask,
+        num_sampling_steps=None,
+        multiplicity=1,
+        max_parallel_samples=None,
+        train_accumulate_token_repr=False,
+        steering_args=None,
+        **network_condition_kwargs,
+    ):
+        if steering_args is not None and (steering_args["fk_steering"] or steering_args["guidance_update"]):
+            potentials = get_potentials()
+        if steering_args is not None and steering_args["fk_steering"]:
+            multiplicity = multiplicity * steering_args["num_particles"]
+            energy_traj = torch.empty((multiplicity, 0), device=self.device)
+            resample_weights = torch.ones(multiplicity, device=self.device).reshape(
+                -1, steering_args["num_particles"]
+            )
+        if steering_args is not None and steering_args["guidance_update"]:
+            scaled_guidance_update = torch.zeros(
+                (multiplicity, *atom_mask.shape[1:], 3),
+                dtype=torch.float32,
+                device=self.device,
+            )
+        num_sampling_steps = default(num_sampling_steps, self.num_sampling_steps)
+        atom_mask = atom_mask.repeat_interleave(multiplicity, 0)
+        shape = (*atom_mask.shape, 3)
+        token_repr_shape = (multiplicity, network_condition_kwargs['feats']['token_index'].shape[1], 2 * self.token_s)
+        # get the schedule, which is returned as (sigma, gamma) tuple, and pair up with the next sigma and gamma
+        sigmas = self.sample_schedule(num_sampling_steps)
+        gammas = torch.where(sigmas > self.gamma_min, self.gamma_0, 0.0)
+        sigmas_and_gammas = list(zip(sigmas[:-1], sigmas[1:], gammas[1:]))
+        # atom position is noise at the beginning
+        init_sigma = sigmas[0]
+        atom_coords = init_sigma * torch.randn(shape, device=self.device)
+        atom_coords_denoised = None
+        model_cache = {} if self.use_inference_model_cache else None
+        token_repr = None
+        token_a = None
+        # gradually denoise
+        for step_idx, (sigma_tm, sigma_t, gamma) in enumerate(sigmas_and_gammas):
+            random_R, random_tr = compute_random_augmentation(
+                multiplicity, device=atom_coords.device, dtype=atom_coords.dtype
+            )
+            atom_coords = atom_coords - atom_coords.mean(dim=-2, keepdims=True)
+            atom_coords = (
+                torch.einsum("bmd,bds->bms", atom_coords, random_R) + random_tr
+            )
+            if atom_coords_denoised is not None:
+                atom_coords_denoised -= atom_coords_denoised.mean(dim=-2, keepdims=True)
+                atom_coords_denoised = (
+                    torch.einsum("bmd,bds->bms", atom_coords_denoised, random_R)
+                    + random_tr
+                )
+            if steering_args is not None and steering_args["guidance_update"] and scaled_guidance_update is not None:
+                scaled_guidance_update = torch.einsum(
+                    "bmd,bds->bms", scaled_guidance_update, random_R
+                )
+            sigma_tm, sigma_t, gamma = sigma_tm.item(), sigma_t.item(), gamma.item()
+            t_hat = sigma_tm * (1 + gamma)
+            steering_t = 1.0 - (step_idx / num_sampling_steps)
+            noise_var = self.noise_scale**2 * (t_hat**2 - sigma_tm**2)
+            eps = sqrt(noise_var) * torch.randn(shape, device=self.device)
+            atom_coords_noisy = atom_coords + eps
+            with torch.no_grad():
+                atom_coords_denoised = torch.zeros_like(atom_coords_noisy)
+                token_a = torch.zeros(token_repr_shape).to(atom_coords_noisy)
+                sample_ids = torch.arange(multiplicity).to(atom_coords_noisy.device)
+                sample_ids_chunks = sample_ids.chunk(
+                    multiplicity % max_parallel_samples + 1
+                )
+                for sample_ids_chunk in sample_ids_chunks:
+                    atom_coords_denoised_chunk, token_a_chunk = \
+                          self.preconditioned_network_forward(
+                              atom_coords_noisy[sample_ids_chunk],
+                              t_hat,
+                              training=False,
+                              network_condition_kwargs=dict(
+                                multiplicity=sample_ids_chunk.numel(),
+                                model_cache=model_cache,
+                                **network_condition_kwargs,
+                            ),
+                        )
+                    atom_coords_denoised[sample_ids_chunk] = atom_coords_denoised_chunk
+                    token_a[sample_ids_chunk] = token_a_chunk
+                if steering_args is not None and steering_args["fk_steering"] and (
+                    (
+                        step_idx % steering_args["fk_resampling_interval"] == 0
+                        and noise_var > 0
+                    )
+                    or step_idx == num_sampling_steps - 1
+                ):
+                    # Compute energy of x_0 prediction
+                    energy = torch.zeros(multiplicity, device=self.device)
+                    for potential in potentials:
+                        parameters = potential.compute_parameters(steering_t)
+                        if parameters["resampling_weight"] > 0:
+                            component_energy = potential.compute(
+                                atom_coords_denoised,
+                                network_condition_kwargs["feats"],
+                                parameters,
+                            )
+                            energy += parameters["resampling_weight"] * component_energy
+                    energy_traj = torch.cat((energy_traj, energy.unsqueeze(1)), dim=1)
+                    # Compute log G values
+                    if step_idx == 0:
+                        log_G = -1 * energy
+                    else:
+                        log_G = energy_traj[:, -2] - energy_traj[:, -1]
+                    # Compute ll difference between guided and unguided transition distribution
+                    if steering_args["guidance_update"] and noise_var > 0:
+                        ll_difference = (
+                            eps**2 - (eps + scaled_guidance_update) ** 2
+                        ).sum(dim=(-1, -2)) / (2 * noise_var)
+                    else:
+                        ll_difference = torch.zeros_like(energy)
+                    # Compute resampling weights
+                    resample_weights = F.softmax(
+                        (ll_difference + steering_args["fk_lambda"] * log_G).reshape(
+                            -1, steering_args["num_particles"]
+                        ),
+                        dim=1,
+                    )
+                # Compute guidance update to x_0 prediction
+                if (
+                    steering_args is not None and
+                    steering_args["guidance_update"]
+                    and step_idx < num_sampling_steps - 1
+                ):
+                    guidance_update = torch.zeros_like(atom_coords_denoised)
+                    for guidance_step in range(steering_args["num_gd_steps"]):
+                        energy_gradient = torch.zeros_like(atom_coords_denoised)
+                        for potential in potentials:
+                            parameters = potential.compute_parameters(steering_t)
+                            if (
+                                parameters["guidance_weight"] > 0
+                                and (guidance_step) % parameters["guidance_interval"]
+                                == 0
+                            ):
+                                energy_gradient += parameters[
+                                    "guidance_weight"
+                                ] * potential.compute_gradient(
+                                    atom_coords_denoised + guidance_update,
+                                    network_condition_kwargs["feats"],
+                                    parameters,
+                                )
+                        guidance_update -= energy_gradient
+                    atom_coords_denoised += guidance_update
+                    scaled_guidance_update = (
+                        guidance_update
+                        * -1
+                        * self.step_scale
+                        * (sigma_t - t_hat)
+                        / t_hat
+                    )
+                if steering_args is not None and steering_args["fk_steering"] and (
+                    (
+                        step_idx % steering_args["fk_resampling_interval"] == 0
+                        and noise_var > 0
+                    )
+                    or step_idx == num_sampling_steps - 1
+                ):
+                    resample_indices = (
+                        torch.multinomial(
+                            resample_weights,
+                            resample_weights.shape[1]
+                            if step_idx < num_sampling_steps - 1
+                            else 1,
+                            replacement=True,
+                        )
+                        + resample_weights.shape[1]
+                        * torch.arange(
+                            resample_weights.shape[0], device=resample_weights.device
+                        ).unsqueeze(-1)
+                    ).flatten()
+                    atom_coords = atom_coords[resample_indices]
+                    atom_coords_noisy = atom_coords_noisy[resample_indices]
+                    atom_mask = atom_mask[resample_indices]
+                    if atom_coords_denoised is not None:
+                        atom_coords_denoised = atom_coords_denoised[resample_indices]
+                    energy_traj = energy_traj[resample_indices]
+                    if steering_args["guidance_update"]:
+                        scaled_guidance_update = scaled_guidance_update[
+                            resample_indices
+                        ]
+                    if token_repr is not None:
+                        token_repr = token_repr[resample_indices]
+                    if token_a is not None:
+                        token_a = token_a[resample_indices]
+            if self.accumulate_token_repr:
+                if token_repr is None:
+                    token_repr = torch.zeros_like(token_a)
+                with torch.set_grad_enabled(train_accumulate_token_repr):
+                    sigma = torch.full(
+                        (atom_coords_denoised.shape[0],),
+                        t_hat,
+                        device=atom_coords_denoised.device,
+                    )
+                    token_repr = self.out_token_feat_update(
+                        times=self.c_noise(sigma), acc_a=token_repr, next_a=token_a
+                    )
+            if self.alignment_reverse_diff:
+                with torch.autocast("cuda", enabled=False):
+                    atom_coords_noisy = weighted_rigid_align(
+                        atom_coords_noisy.float(),
+                        atom_coords_denoised.float(),
+                        atom_mask.float(),
+                        atom_mask.float(),
+                    )
+                atom_coords_noisy = atom_coords_noisy.to(atom_coords_denoised)
+            denoised_over_sigma = (atom_coords_noisy - atom_coords_denoised) / t_hat
+            atom_coords_next = (
+                atom_coords_noisy
+                + self.step_scale * (sigma_t - t_hat) * denoised_over_sigma
+            )
+            atom_coords = atom_coords_next
+        return dict(sample_atom_coords=atom_coords, diff_token_repr=token_repr)
+    def loss_weight(self, sigma):
+        return (sigma**2 + self.sigma_data**2) / ((sigma * self.sigma_data) ** 2)
+    def noise_distribution(self, batch_size):
+        return (
+            self.sigma_data
+            * (
+                self.P_mean
+                + self.P_std * torch.randn((batch_size,), device=self.device)
+            ).exp()
+        )
+    def forward(
+        self,
+        s_inputs,
+        s_trunk,
+        z_trunk,
+        relative_position_encoding,
+        feats,
+        multiplicity=1,
+    ):
+        # training diffusion step
+        batch_size = feats["coords"].shape[0]
+        if self.synchronize_sigmas:
+            sigmas = self.noise_distribution(batch_size).repeat_interleave(
+                multiplicity, 0
+            )
+        else:
+            sigmas = self.noise_distribution(batch_size * multiplicity)
+        padded_sigmas = rearrange(sigmas, "b -> b 1 1")
+        atom_coords = feats["coords"]
+        B, N, L = atom_coords.shape[0:3]
+        atom_coords = atom_coords.reshape(B * N, L, 3)
+        atom_coords = atom_coords.repeat_interleave(multiplicity // N, 0)
+        feats["coords"] = atom_coords
+        atom_mask = feats["atom_pad_mask"]
+        atom_mask = atom_mask.repeat_interleave(multiplicity, 0)
+        atom_coords = center_random_augmentation(
+            atom_coords, atom_mask, augmentation=self.coordinate_augmentation
+        )
+        noise = torch.randn_like(atom_coords)
+        noised_atom_coords = atom_coords + padded_sigmas * noise
+        denoised_atom_coords, _ = self.preconditioned_network_forward(
+            noised_atom_coords,
+            sigmas,
+            training=True,
+            network_condition_kwargs=dict(
+                s_inputs=s_inputs,
+                s_trunk=s_trunk,
+                z_trunk=z_trunk,
+                relative_position_encoding=relative_position_encoding,
+                feats=feats,
+                multiplicity=multiplicity,
+            ),
+        )
+        return dict(
+            noised_atom_coords=noised_atom_coords,
+            denoised_atom_coords=denoised_atom_coords,
+            sigmas=sigmas,
+            aligned_true_atom_coords=atom_coords,
+        )
+    def compute_loss(
+        self,
+        feats,
+        out_dict,
+        add_smooth_lddt_loss=True,
+        nucleotide_loss_weight=5.0,
+        ligand_loss_weight=10.0,
+        multiplicity=1,
+    ):
+        denoised_atom_coords = out_dict["denoised_atom_coords"]
+        noised_atom_coords = out_dict["noised_atom_coords"]
+        sigmas = out_dict["sigmas"]
+        resolved_atom_mask = feats["atom_resolved_mask"]
+        resolved_atom_mask = resolved_atom_mask.repeat_interleave(multiplicity, 0)
+        align_weights = noised_atom_coords.new_ones(noised_atom_coords.shape[:2])
+        atom_type = (
+            torch.bmm(
+                feats["atom_to_token"].float(), feats["mol_type"].unsqueeze(-1).float()
+            )
+            .squeeze(-1)
+            .long()
+        )
+        atom_type_mult = atom_type.repeat_interleave(multiplicity, 0)
+        align_weights = align_weights * (
+            1
+            + nucleotide_loss_weight
+            * (
+                torch.eq(atom_type_mult, const.chain_type_ids["DNA"]).float()
+                + torch.eq(atom_type_mult, const.chain_type_ids["RNA"]).float()
+            )
+            + ligand_loss_weight
+            * torch.eq(atom_type_mult, const.chain_type_ids["NONPOLYMER"]).float()
+        )
+        with torch.no_grad(), torch.autocast("cuda", enabled=False):
+            atom_coords = out_dict["aligned_true_atom_coords"]
+            atom_coords_aligned_ground_truth = weighted_rigid_align(
+                atom_coords.detach().float(),
+                denoised_atom_coords.detach().float(),
+                align_weights.detach().float(),
+                mask=resolved_atom_mask.detach().float(),
+            )
+        # Cast back
+        atom_coords_aligned_ground_truth = atom_coords_aligned_ground_truth.to(
+            denoised_atom_coords
+        )
+        # weighted MSE loss of denoised atom positions
+        mse_loss = ((denoised_atom_coords - atom_coords_aligned_ground_truth) ** 2).sum(
+            dim=-1
+        )
+        mse_loss = torch.sum(
+            mse_loss * align_weights * resolved_atom_mask, dim=-1
+        ) / torch.sum(3 * align_weights * resolved_atom_mask, dim=-1)
+        # weight by sigma factor
+        loss_weights = self.loss_weight(sigmas)
+        mse_loss = (mse_loss * loss_weights).mean()
+        total_loss = mse_loss
+        # proposed auxiliary smooth lddt loss
+        lddt_loss = self.zero
+        if add_smooth_lddt_loss:
+            lddt_loss = smooth_lddt_loss(
+                denoised_atom_coords,
+                feats["coords"],
+                torch.eq(atom_type, const.chain_type_ids["DNA"]).float()
+                + torch.eq(atom_type, const.chain_type_ids["RNA"]).float(),
+                coords_mask=feats["atom_resolved_mask"],
+                multiplicity=multiplicity,
+            )
+            total_loss = total_loss + lddt_loss
+        loss_breakdown = dict(
+            mse_loss=mse_loss,
+            smooth_lddt_loss=lddt_loss,
+        )
+        return dict(loss=total_loss, loss_breakdown=loss_breakdown)