rc-foundry 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. foundry/__init__.py +57 -0
  2. foundry/callbacks/__init__.py +5 -0
  3. foundry/callbacks/callback.py +116 -0
  4. foundry/callbacks/health_logging.py +419 -0
  5. foundry/callbacks/metrics_logging.py +211 -0
  6. foundry/callbacks/timing_logging.py +67 -0
  7. foundry/callbacks/train_logging.py +278 -0
  8. foundry/common.py +108 -0
  9. foundry/constants.py +28 -0
  10. foundry/hydra/resolvers.py +77 -0
  11. foundry/inference_engines/base.py +235 -0
  12. foundry/inference_engines/checkpoint_registry.py +66 -0
  13. foundry/metrics/__init__.py +12 -0
  14. foundry/metrics/losses.py +30 -0
  15. foundry/metrics/metric.py +319 -0
  16. foundry/model/layers/blocks.py +47 -0
  17. foundry/testing/__init__.py +6 -0
  18. foundry/testing/fixtures.py +19 -0
  19. foundry/testing/pytest_hooks.py +15 -0
  20. foundry/trainers/fabric.py +923 -0
  21. foundry/training/EMA.py +67 -0
  22. foundry/training/checkpoint.py +61 -0
  23. foundry/training/schedulers.py +91 -0
  24. foundry/utils/alignment.py +86 -0
  25. foundry/utils/components.py +415 -0
  26. foundry/utils/datasets.py +405 -0
  27. foundry/utils/ddp.py +103 -0
  28. foundry/utils/instantiators.py +72 -0
  29. foundry/utils/logging.py +279 -0
  30. foundry/utils/rigid.py +1460 -0
  31. foundry/utils/rotation_augmentation.py +65 -0
  32. foundry/utils/squashfs.py +172 -0
  33. foundry/utils/torch.py +317 -0
  34. foundry/utils/weights.py +271 -0
  35. foundry/version.py +34 -0
  36. foundry_cli/__init__.py +3 -0
  37. foundry_cli/download_checkpoints.py +281 -0
  38. mpnn/__init__.py +1 -0
  39. mpnn/collate/feature_collator.py +265 -0
  40. mpnn/inference.py +53 -0
  41. mpnn/inference_engines/mpnn.py +549 -0
  42. mpnn/loss/nll_loss.py +122 -0
  43. mpnn/metrics/nll.py +369 -0
  44. mpnn/metrics/sequence_recovery.py +440 -0
  45. mpnn/model/layers/graph_embeddings.py +2372 -0
  46. mpnn/model/layers/message_passing.py +332 -0
  47. mpnn/model/layers/position_wise_feed_forward.py +44 -0
  48. mpnn/model/layers/positional_encoding.py +98 -0
  49. mpnn/model/mpnn.py +2632 -0
  50. mpnn/pipelines/mpnn.py +162 -0
  51. mpnn/samplers/samplers.py +167 -0
  52. mpnn/train.py +341 -0
  53. mpnn/trainers/mpnn.py +193 -0
  54. mpnn/transforms/feature_aggregation/mpnn.py +184 -0
  55. mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
  56. mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
  57. mpnn/transforms/feature_aggregation/user_settings.py +347 -0
  58. mpnn/transforms/polymer_ligand_interface.py +164 -0
  59. mpnn/utils/inference.py +2397 -0
  60. mpnn/utils/probability.py +37 -0
  61. mpnn/utils/weights.py +309 -0
  62. rc_foundry-0.1.1.dist-info/METADATA +239 -0
  63. rc_foundry-0.1.1.dist-info/RECORD +180 -0
  64. rc_foundry-0.1.1.dist-info/WHEEL +4 -0
  65. rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
  66. rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
  67. rf3/__init__.py +3 -0
  68. rf3/_version.py +33 -0
  69. rf3/alignment.py +79 -0
  70. rf3/callbacks/dump_validation_structures.py +101 -0
  71. rf3/callbacks/metrics_logging.py +324 -0
  72. rf3/chemical.py +1529 -0
  73. rf3/cli.py +77 -0
  74. rf3/data/cyclic_transform.py +78 -0
  75. rf3/data/extra_xforms.py +36 -0
  76. rf3/data/ground_truth_template.py +463 -0
  77. rf3/data/paired_msa.py +206 -0
  78. rf3/data/pipeline_utils.py +128 -0
  79. rf3/data/pipelines.py +558 -0
  80. rf3/diffusion_samplers/inference_sampler.py +222 -0
  81. rf3/inference.py +65 -0
  82. rf3/inference_engines/__init__.py +5 -0
  83. rf3/inference_engines/rf3.py +735 -0
  84. rf3/kinematics.py +354 -0
  85. rf3/loss/af3_confidence_loss.py +515 -0
  86. rf3/loss/af3_losses.py +655 -0
  87. rf3/loss/loss.py +179 -0
  88. rf3/metrics/chiral.py +179 -0
  89. rf3/metrics/clashing_chains.py +68 -0
  90. rf3/metrics/distogram.py +421 -0
  91. rf3/metrics/lddt.py +523 -0
  92. rf3/metrics/metadata.py +43 -0
  93. rf3/metrics/metric_utils.py +192 -0
  94. rf3/metrics/predicted_error.py +134 -0
  95. rf3/metrics/rasa.py +108 -0
  96. rf3/metrics/selected_distances.py +91 -0
  97. rf3/model/RF3.py +527 -0
  98. rf3/model/RF3_blocks.py +92 -0
  99. rf3/model/RF3_structure.py +303 -0
  100. rf3/model/layers/af3_auxiliary_heads.py +255 -0
  101. rf3/model/layers/af3_diffusion_transformer.py +544 -0
  102. rf3/model/layers/attention.py +313 -0
  103. rf3/model/layers/layer_utils.py +127 -0
  104. rf3/model/layers/mlff.py +118 -0
  105. rf3/model/layers/outer_product.py +59 -0
  106. rf3/model/layers/pairformer_layers.py +783 -0
  107. rf3/model/layers/structure_bias.py +56 -0
  108. rf3/scoring.py +1787 -0
  109. rf3/symmetry/resolve.py +284 -0
  110. rf3/train.py +194 -0
  111. rf3/trainers/rf3.py +570 -0
  112. rf3/util_module.py +47 -0
  113. rf3/utils/frames.py +109 -0
  114. rf3/utils/inference.py +665 -0
  115. rf3/utils/io.py +198 -0
  116. rf3/utils/loss.py +72 -0
  117. rf3/utils/predict_and_score.py +165 -0
  118. rf3/utils/predicted_error.py +673 -0
  119. rf3/utils/recycling.py +42 -0
  120. rf3/validate.py +140 -0
  121. rfd3/.gitignore +7 -0
  122. rfd3/Makefile +76 -0
  123. rfd3/__init__.py +12 -0
  124. rfd3/callbacks.py +66 -0
  125. rfd3/cli.py +41 -0
  126. rfd3/constants.py +212 -0
  127. rfd3/engine.py +543 -0
  128. rfd3/inference/datasets.py +193 -0
  129. rfd3/inference/input_parsing.py +1123 -0
  130. rfd3/inference/legacy_input_parsing.py +717 -0
  131. rfd3/inference/parsing.py +165 -0
  132. rfd3/inference/symmetry/atom_array.py +298 -0
  133. rfd3/inference/symmetry/checks.py +241 -0
  134. rfd3/inference/symmetry/contigs.py +63 -0
  135. rfd3/inference/symmetry/frames.py +355 -0
  136. rfd3/inference/symmetry/symmetry_utils.py +398 -0
  137. rfd3/metrics/design_metrics.py +465 -0
  138. rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
  139. rfd3/metrics/hbonds_metrics.py +389 -0
  140. rfd3/metrics/losses.py +325 -0
  141. rfd3/metrics/metrics_utils.py +118 -0
  142. rfd3/metrics/sidechain_metrics.py +349 -0
  143. rfd3/model/RFD3.py +105 -0
  144. rfd3/model/RFD3_diffusion_module.py +387 -0
  145. rfd3/model/cfg_utils.py +81 -0
  146. rfd3/model/inference_sampler.py +635 -0
  147. rfd3/model/layers/attention.py +577 -0
  148. rfd3/model/layers/block_utils.py +580 -0
  149. rfd3/model/layers/blocks.py +777 -0
  150. rfd3/model/layers/chunked_pairwise.py +377 -0
  151. rfd3/model/layers/encoders.py +417 -0
  152. rfd3/model/layers/layer_utils.py +197 -0
  153. rfd3/model/layers/pairformer_layers.py +128 -0
  154. rfd3/run_inference.py +45 -0
  155. rfd3/testing/debug.py +139 -0
  156. rfd3/testing/debug_utils.py +73 -0
  157. rfd3/testing/testing_utils.py +356 -0
  158. rfd3/train.py +194 -0
  159. rfd3/trainer/dump_validation_structures.py +154 -0
  160. rfd3/trainer/fabric_trainer.py +923 -0
  161. rfd3/trainer/recycling.py +42 -0
  162. rfd3/trainer/rfd3.py +485 -0
  163. rfd3/trainer/trainer_utils.py +502 -0
  164. rfd3/transforms/conditioning_base.py +508 -0
  165. rfd3/transforms/conditioning_utils.py +200 -0
  166. rfd3/transforms/design_transforms.py +807 -0
  167. rfd3/transforms/dna_crop.py +523 -0
  168. rfd3/transforms/hbonds.py +407 -0
  169. rfd3/transforms/hbonds_hbplus.py +246 -0
  170. rfd3/transforms/ncaa_transforms.py +153 -0
  171. rfd3/transforms/pipelines.py +632 -0
  172. rfd3/transforms/ppi_transforms.py +541 -0
  173. rfd3/transforms/rasa.py +116 -0
  174. rfd3/transforms/symmetry.py +76 -0
  175. rfd3/transforms/training_conditions.py +552 -0
  176. rfd3/transforms/util_transforms.py +498 -0
  177. rfd3/transforms/virtual_atoms.py +305 -0
  178. rfd3/utils/inference.py +648 -0
  179. rfd3/utils/io.py +245 -0
  180. rfd3/utils/vizualize.py +276 -0
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ import torch
3
+ from atomworks.ml.transforms.base import Transform
4
+ from rfd3.inference.symmetry.frames import (
5
+ framecoords_to_RTs,
6
+ unpack_vector,
7
+ )
8
+
9
+
10
class AddSymmetryFeats(Transform):
    """
    Add atom-array symmetry features to the data features.

    Copies the per-atom symmetry annotations listed in ``symmetry_features``
    into ``data["feats"]`` and stores a dictionary of symmetry transforms
    (rotation/translation pairs keyed by transform id) under
    ``data["feats"]["sym_transform"]``.

    Arguments:
        symmetry_features: Names of the atom-array annotations to copy into
            the data features.

    Returns:
        data: The data with the atom-array symmetry features added to the
            data features.
    """

    def __init__(
        self,
        # NOTE: immutable tuple default avoids the shared mutable
        # default-argument pitfall; callers may still pass a list.
        symmetry_features=(
            "sym_transform_id",
            "sym_entity_id",
            "is_sym_asu",
        ),
    ):
        self.symmetry_feats = symmetry_features

    def forward(self, data):
        atom_array = data["atom_array"]
        # Get frames from atom_array
        transforms_dict = self.make_transforms_dict(atom_array)
        data["feats"]["sym_transform"] = transforms_dict  # {str(id): tuple (R,T)}
        # Add the requested symmetry features atom-wise
        for feature_name in self.symmetry_feats:
            feature_array = atom_array.get_annotation(feature_name)
            data["feats"][feature_name] = feature_array
        return data

    @staticmethod
    def _unpack_frame_annotation(atom_array, annotation_name):
        """Unpack one vector-packed frame annotation into an (n_atoms, 3) tensor."""
        return torch.tensor(
            [
                np.asarray(unpack_vector(vec)).tolist()
                for vec in atom_array.get_annotation(annotation_name)
            ]
        )

    def make_transforms_dict(self, atom_array):
        """
        Build ``{str(transform_id): (R, T)}`` from the packed frame annotations.

        Consecutive duplicate rows are collapsed with ``torch.unique_consecutive``
        so that atoms sharing a transform contribute a single frame; transform
        id -1 is treated as "no transform" and skipped.
        """
        transforms_dict = {}
        # get decomposed frames from atom array (unpacking the vectorized frames)
        Oris = self._unpack_frame_annotation(atom_array, "sym_transform_Ori")
        Xs = self._unpack_frame_annotation(atom_array, "sym_transform_X")
        Ys = self._unpack_frame_annotation(atom_array, "sym_transform_Y")
        TIDs = torch.from_numpy(atom_array.get_annotation("sym_transform_id"))

        Oris = torch.unique_consecutive(Oris, dim=0)
        Xs = torch.unique_consecutive(Xs, dim=0)
        Ys = torch.unique_consecutive(Ys, dim=0)
        TIDs = torch.unique_consecutive(TIDs, dim=0)
        # the case in which there is only rotation (no translation), Ori = [0,0,0]
        if len(Oris) == 1 and (Oris == 0).all():
            Oris = Oris.repeat(len(Xs), 1)
        Rs, Ts = framecoords_to_RTs(Oris, Xs, Ys)

        for R, T, transform_id in zip(Rs, Ts, TIDs):
            if transform_id.item() == -1:
                continue
            transforms_dict[str(transform_id.item())] = (R, T)
        return transforms_dict
@@ -0,0 +1,552 @@
1
+ """
2
+ Class-based motif masking system
3
+ """
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+
8
+ import networkx as nx
9
+ import numpy as np
10
+ from atomworks.ml.utils.token import (
11
+ apply_token_wise,
12
+ get_token_starts,
13
+ spread_token_wise,
14
+ )
15
+ from biotite.structure import AtomArray, get_residue_starts
16
+ from rfd3.transforms.conditioning_utils import (
17
+ random_condition,
18
+ sample_island_tokens,
19
+ sample_subgraph_atoms,
20
+ )
21
+
22
+ nx.from_numpy_matrix = nx.from_numpy_array
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ #################################################################################
27
+ # Transform for creating training conditions
28
+ #################################################################################
29
+
30
+
31
class TrainingCondition(ABC):
    """Abstract base for training-time conditioning strategies.

    A subclass decides whether it applies to a given example and, when it
    does, samples which atoms/tokens become conditioning ("motif") input.
    """

    # Optional human-readable identifier; subclasses may override.
    name = None

    def __init__(self, frequency):
        # How often this condition should be applied during training.
        self.frequency = frequency

    @abstractmethod
    def is_valid_for_example(self, data) -> bool:
        """
        Returns true whether this mask can be applied to the data instance

        E.g. only use this transform if data metadata contains key or if data contains type
        """

    @abstractmethod
    def sample(self, data) -> AtomArray:
        """
        Set which atoms should be made into tokens
        """
+ """
54
+
55
+
56
class IslandCondition(TrainingCondition):
    """
    Select "islands" of protein tokens as motif and assign conditioning strategies.
    """

    def __init__(
        self,
        *,
        name,
        frequency,
        island_sampling_kwargs,
        p_diffuse_motif_sidechains,
        p_diffuse_subgraph_atoms,
        subgraph_sampling_kwargs,
        p_fix_motif_coordinates,
        p_fix_motif_sequence,
        p_unindex_motif_tokens,
    ):
        self.name = name
        self.frequency = frequency

        # Token selection
        self.island_sampling_kwargs = island_sampling_kwargs

        # Atom selection
        self.p_diffuse_motif_sidechains = p_diffuse_motif_sidechains
        self.p_include_oxygen_in_backbone_mask = 0.95
        self.p_diffuse_subgraph_atoms = p_diffuse_subgraph_atoms
        self.subgraph_sampling_kwargs = subgraph_sampling_kwargs

        # Additional conditioning selection
        self.p_fix_motif_coordinates = p_fix_motif_coordinates
        self.p_fix_motif_sequence = p_fix_motif_sequence
        self.p_unindex_motif_tokens = p_unindex_motif_tokens

    def is_valid_for_example(self, data) -> bool:
        # Island sampling only makes sense with at least one protein atom.
        return bool(np.any(data["atom_array"].is_protein))

    def sample_motif_tokens(self, atom_array):
        """
        Sample which tokens are considered motif (result is spread atom-wise).
        """
        token_level_array = atom_array[get_token_starts(atom_array)]
        protein_token_mask = token_level_array.is_protein

        # Start with every non-protein token as motif, then sample islands
        # among the protein tokens only.
        is_motif_token = np.asarray(~protein_token_mask, dtype=bool).copy()
        is_motif_token[protein_token_mask] = sample_island_tokens(
            np.sum(protein_token_mask),
            **self.island_sampling_kwargs,
        )

        # TODO: Atoms with covalent bonds should be motif, needs FlagAndReassignCovalentModifications transform prior to this
        # atom_with_coval_bond = token_level_array.covale # (n_atoms, )
        # is_motif_token[atom_with_coval_bond] = True

        return spread_token_wise(atom_array, is_motif_token)

    def sample_motif_atoms(self, atom_array):
        """
        Sample which atoms inside motif tokens remain motif, e.g. allowing a
        residue's sidechain to be diffused while its backbone stays fixed.

        Argument attrs:
            - is_motif_token
        """
        is_motif_atom = np.asarray(atom_array.is_motif_token, dtype=bool).copy()

        # NOTE: the two random_condition draws below are mutually exclusive
        # (elif) — call order must be preserved for RNG reproducibility.
        if random_condition(self.p_diffuse_motif_sidechains):
            backbone = ["N", "C", "CA"]
            if random_condition(self.p_include_oxygen_in_backbone_mask):
                backbone.append("O")
            is_motif_atom &= np.isin(atom_array.atom_name, backbone)
        elif random_condition(self.p_diffuse_subgraph_atoms):
            is_motif_atom = sample_motif_subgraphs(
                atom_array=atom_array,
                **self.subgraph_sampling_kwargs,
            )

        # We also only want resolved atoms to be motif
        return is_motif_atom & (atom_array.occupancy > 0.0)

    def sample(self, data):
        atom_array = data["atom_array"]

        atom_array.set_annotation(
            "is_motif_token", self.sample_motif_tokens(atom_array)
        )
        atom_array.set_annotation("is_motif_atom", self.sample_motif_atoms(atom_array))

        # After selecting the motif, decide which conditioning strategy to apply.
        atom_array = sample_conditioning_strategy(
            atom_array,
            p_fix_motif_sequence=self.p_fix_motif_sequence,
            p_fix_motif_coordinates=self.p_fix_motif_coordinates,
            p_unindex_motif_tokens=self.p_unindex_motif_tokens,
        )

        conditions = data["conditions"]
        atom_array.set_annotation(
            "is_motif_atom_unindexed_motif_breakpoint",
            sample_unindexed_breaks(
                atom_array,
                remove_random_break=conditions["unindex_remove_random_break"],
                insert_random_break=conditions["unindex_insert_random_break"],
                leak_global_index=conditions["unindex_leak_global_index"],
            ),
        )

        return atom_array
174
+
175
+
176
class PPICondition(TrainingCondition):
    """Get condition indicating what is motif and what is to be diffused for protein-protein interaction training."""

    name = "ppi"

    def is_valid_for_example(self, data):
        """Accept only binary protein-protein interfaces and pick a binder side.

        NOTE(review): when the example is valid this method also mutates
        ``data`` (stores ``binder_pn_unit`` and sets the ``is_binder_pn_unit``
        annotation) — downstream ``sample`` relies on that.
        """
        # Extract relevant data
        atom_array = data["atom_array"]
        self.query_pn_unit_iids = data.get("query_pn_unit_iids")

        # Compute pn_units whose atoms are all protein (excludes chimeric ligands)
        protein_pn_unit_iids = []
        for pn_unit_iid in np.unique(atom_array.pn_unit_iid):
            pn_unit_is_protein = np.unique(
                atom_array[atom_array.pn_unit_iid == pn_unit_iid].is_protein
            )
            if all(pn_unit_is_protein):
                protein_pn_unit_iids.append(pn_unit_iid)

        # This mask is intended to operate on binary protein-protein interfaces
        queries = self.query_pn_unit_iids
        if queries is None or len(queries) != 2 or len(np.unique(queries)) != 2:
            return False
        if not all(pn_unit in protein_pn_unit_iids for pn_unit in queries):
            return False

        # Randomly select one of the two query pn_unit_iids to be the binder
        # NOTE: Could also do this based on if only one will work uncropped, but since that
        # strategy will not always be applied, enforcing it here would bias the training data.
        binder_pn_unit = np.random.choice(queries)
        data["binder_pn_unit"] = binder_pn_unit
        atom_array.set_annotation(
            "is_binder_pn_unit", atom_array.pn_unit_iid == binder_pn_unit
        )
        return True

    # TODO: If I want to have multiple possible strategies for motif assignment (e.g. motif scaffolding for the binder)
    # should probably just have this function sample between them with a set of probabilities specified in the config.
    # Anything that makes it this far will have to be a valid PPI example with an assigned binder chain.
    def sample(self, data):
        atom_array = data["atom_array"]

        # Set `is_motif_token`: everything that is not the binder (the target).
        # NOTE: In the future, we may want to diffuse part of the target or fix part of the binder
        is_motif_token = atom_array.pn_unit_iid != data["binder_pn_unit"]
        atom_array.set_annotation("is_motif_token", is_motif_token)

        # We fix the target sequence in binder design.
        atom_array.set_annotation(
            "is_motif_atom_with_fixed_seq", is_motif_token.copy()
        )

        # The PPI mask should apply to all or no atoms of a token.
        atom_array.set_annotation("is_motif_atom", is_motif_token.copy())

        # We fully fix the target atom positions (at least for now).
        atom_array.set_annotation(
            "is_motif_atom_with_fixed_coord", is_motif_token.copy()
        )

        # We want fixed indices for the target: nothing unindexed, no breaks.
        atom_array.set_annotation(
            "is_motif_atom_unindexed", np.zeros_like(is_motif_token)
        )
        atom_array.set_annotation(
            "is_motif_atom_unindexed_motif_breakpoint",
            np.zeros_like(is_motif_token),
        )
        return atom_array
265
+
266
+
267
+ ##############################################################################################
268
+ # Additional conditioning classes
269
+ ##############################################################################################
270
+
271
+
272
class SubtypeCondition(TrainingCondition):
    """
    Selects specific subtypes of atoms (e.g. is_dna) as motif and assigns
    conditioning strategies.
    """

    name = "subtype"

    def __init__(self, frequency: float, subtype: list[str], fix_pos: bool = False):
        self.frequency = frequency
        self.subtype = subtype
        self.fix_pos = fix_pos

    def is_valid_for_example(self, data):
        """
        For subtype conditioning, the example must contain the specified subtype
        """
        counts = [
            data["atom_array"].get_annotation(subtype).sum() for subtype in self.subtype
        ]
        return bool(np.any(counts))

    def sample(self, data):
        atom_array = data["atom_array"]
        n_atoms = len(atom_array)

        # Motif = requested subtypes, restricted to fully-resolved residues.
        is_motif = prune_unresolved_motif(
            atom_array, generate_subtype_mask(atom_array, self.subtype)
        )
        atom_array.set_annotation("is_motif_token", is_motif)
        atom_array.set_annotation("is_motif_atom", is_motif)
        atom_array.set_annotation("is_motif_atom_with_fixed_seq", is_motif)

        # Coordinates are fixed only when requested.
        if self.fix_pos:
            atom_array.set_annotation("is_motif_atom_with_fixed_coord", is_motif)
        else:
            atom_array.set_annotation(
                "is_motif_atom_with_fixed_coord", np.zeros(n_atoms, dtype=bool)
            )

        # Indices always stay fixed: nothing unindexed, no breakpoints.
        atom_array.set_annotation(
            "is_motif_atom_unindexed", np.zeros(n_atoms, dtype=bool)
        )
        atom_array.set_annotation(
            "is_motif_atom_unindexed_motif_breakpoint",
            np.zeros(n_atoms, dtype=bool),
        )

        return atom_array
319
+
320
+
321
################# need mask -> condition refactor
def prune_unresolved_motif(atom_array, mask):
    """
    Prune the mask to only include resolved atoms, at whole-token granularity.

    An atom is resolved when its occupancy is > 0. Any token that contains at
    least one unresolved atom (or an atom excluded by ``mask``) is removed
    from the mask entirely.

    Args:
        atom_array: AtomArray with ``occupancy`` and ``token_id`` annotations.
        mask: Boolean array of shape (n_atoms,).

    Returns:
        Boolean array of shape (n_atoms,): True only for atoms whose token is
        fully resolved and fully selected by ``mask``.
    """
    # Restrict to resolved atoms.
    combined_mask = mask & (atom_array.occupancy > 0.0)

    # Zero out every token that has any excluded/unresolved atom.
    # Vectorized per-token AND (replaces a quadratic per-token Python loop).
    _, token_index = np.unique(atom_array.token_id, return_inverse=True)
    token_all_ok = np.ones(token_index.max() + 1 if len(token_index) else 0, dtype=bool)
    np.logical_and.at(token_all_ok, token_index, combined_mask)
    return token_all_ok[token_index]
343
+
344
+
345
def generate_subtype_mask(atom_array, subtypes):
    """
    Generate a mask for a specific subtype list of atoms.
    E.g. is_protein, is_ligand, is_dna etc.

    Args:
        atom_array: AtomArray carrying boolean subtype annotations.
        subtypes: Annotation names to OR together.

    Returns:
        Boolean array of shape (n_atoms,); an atom is selected when at least
        one requested annotation is truthy for it. An empty ``subtypes`` list
        yields an all-False mask.

    Raises:
        ValueError: If a requested subtype annotation does not exist.
    """
    # Hoist the category lookup out of the loop (it is loop-invariant).
    known_annotations = atom_array.get_annotation_categories()
    all_masks = []
    for subtype in subtypes:
        if subtype not in known_annotations:
            raise ValueError(f"Subtype {subtype} not found in atom array annotations.")
        all_masks.append(atom_array.get_annotation(subtype))
    # np.logical_or.reduce raises on an empty operand list; no subtypes
    # requested simply selects nothing.
    if not all_masks:
        return np.zeros(atom_array.array_length(), dtype=bool)
    # Combine all masks using logical OR
    return np.logical_or.reduce(all_masks)
359
+
360
+
361
+ ##############################################################################################
362
+ # Shared assignment functions
363
+ ##############################################################################################
364
+
365
+
366
def sample_motif_subgraphs(
    atom_array,
    residue_p_seed_furthest_from_o,
    residue_n_bond_expectation,
    hetatom_n_bond_expectation,
    residue_p_fix_all,
    hetatom_p_fix_all,
):
    """
    Returns a boolean mask over atoms, indicating which atoms are part of the sampled motif.
    Sampling is performed per residue, with sidechains optionally excluded based on bond-based neighborhood sampling.

    Handles both protein residues and heteroatoms (e.g., ligands).

    Args:
        atom_array: AtomArray with annotations is_motif_token, is_protein, occupancy, res_id.
        residue_p_seed_furthest_from_o: Seed-selection probability for protein residues.
        residue_n_bond_expectation: Expected bond-neighborhood size for protein residues.
        hetatom_n_bond_expectation: Expected bond-neighborhood size for heteroatom groups.
        residue_p_fix_all: Probability of keeping an entire protein residue.
        hetatom_p_fix_all: Probability of keeping an entire heteroatom group.

    Returns:
        is_motif_atom: np.ndarray of shape (n_atoms,) with True for sampled motif atoms.
    """
    is_motif_token = atom_array.is_motif_token.copy()
    is_motif_atom = is_motif_token.copy()
    starts = get_residue_starts(atom_array, add_exclusive_stop=True)

    for start, end in zip(starts[:-1], starts[1:]):
        # Only residues flagged as motif tokens are (re)sampled.
        if not is_motif_token[start]:
            continue

        # Slice the atoms of the current residue directly; the previous
        # np.isin(idxs, idxs[start:end]) scan was an accidental O(n) pass per
        # residue (O(n^2) overall) producing the same subset.
        atom_array_subset = atom_array[start:end]
        assert atom_array_subset.array_length() > 0

        args = {
            "p_seed_furthest_from_o": residue_p_seed_furthest_from_o,
            "n_bond_expectation": residue_n_bond_expectation,
            "p_fix_all": residue_p_fix_all,
        }
        if not atom_array_subset.is_protein.all():
            # Heteroatom groups (e.g. ligands) use their own parameters and
            # never seed from the backbone oxygen.
            args.update(
                {
                    "p_seed_furthest_from_o": 0.0,
                    "n_bond_expectation": hetatom_n_bond_expectation,
                    "p_fix_all": hetatom_p_fix_all,
                }
            )
        try:
            mask = sample_subgraph_atoms(atom_array_subset, **args)
        except Exception as e:
            # Best-effort fallback: keep the whole residue as motif.
            logger.warning(
                f"Failed to sample subgraph motif atoms for {atom_array_subset.res_name[0]}. Error: {e}"
            )
            mask = np.ones(atom_array_subset.array_length(), dtype=bool)

        is_motif_atom[start:end] = mask

    # We also only want resolved atoms to be motif
    is_motif_atom = is_motif_atom & (atom_array.occupancy > 0.0)

    return is_motif_atom
427
+
428
+
429
def sample_conditioning_strategy(
    atom_array,
    p_fix_motif_sequence,
    p_fix_motif_coordinates,
    p_unindex_motif_tokens,
):
    """Sample and attach the three motif-conditioning annotations.

    Sets ``is_motif_atom_with_fixed_seq``, ``is_motif_atom_with_fixed_coord``
    and ``is_motif_atom_unindexed`` on ``atom_array`` and returns it. The
    three samplers are called in this fixed order (each may consume RNG).
    """
    fixed_seq = sample_is_motif_atom_with_fixed_seq(
        atom_array, p_fix_motif_sequence=p_fix_motif_sequence
    )
    atom_array.set_annotation("is_motif_atom_with_fixed_seq", fixed_seq)

    fixed_coord = sample_fix_motif_coordinates(
        atom_array, p_fix_motif_coordinates=p_fix_motif_coordinates
    )
    atom_array.set_annotation("is_motif_atom_with_fixed_coord", fixed_coord)

    unindexed = sample_unindexed_atoms(
        atom_array, p_unindex_motif_tokens=p_unindex_motif_tokens
    )
    atom_array.set_annotation("is_motif_atom_unindexed", unindexed)

    return atom_array
457
+
458
+
459
def sample_is_motif_atom_with_fixed_seq(atom_array, p_fix_motif_sequence):
    """
    Samples what kind of conditioning to apply to motif tokens.

    With probability ``p_fix_motif_sequence`` the whole motif keeps its
    sequence; non-protein atoms always have their sequence revealed.

    Argument attrs:
        - is_motif_token
    """
    if random_condition(p_fix_motif_sequence):
        fixed_seq = atom_array.is_motif_token.copy()
    else:
        fixed_seq = np.zeros(atom_array.array_length(), dtype=bool)

    # By default reveal sequence for non-protein
    return fixed_seq | ~atom_array.is_protein
474
+
475
+
476
def sample_fix_motif_coordinates(atom_array, p_fix_motif_coordinates):
    """
    Universal function to decide if atoms' coords are fixed in the point cloud for conditioning.

    With probability ``p_fix_motif_coordinates`` all motif atoms keep their
    coordinates; otherwise none do.

    Argument attrs:
        - is_motif_atom
    """
    if not random_condition(p_fix_motif_coordinates):
        return np.zeros(atom_array.array_length(), dtype=bool)
    return atom_array.is_motif_atom.copy()
488
+
489
+
490
def sample_unindexed_atoms(atom_array, p_unindex_motif_tokens):
    """
    Samples which atoms in motif tokens should be flagged for unindexing.

    With probability ``p_unindex_motif_tokens`` all motif atoms are flagged;
    non-residue atoms are never flagged.

    Argument attrs:
        - is_motif_atom
        - is_residue
    """
    if random_condition(p_unindex_motif_tokens):
        unindexed = atom_array.is_motif_atom.copy()
    else:
        unindexed = np.zeros(atom_array.array_length(), dtype=bool)

    # ensure non-residue atoms are not already flagged
    return unindexed & atom_array.is_residue
508
+
509
+
510
def sample_unindexed_breaks(
    atom_array,
    remove_random_break=False,
    insert_random_break=False,
    leak_global_index=False,
):
    """
    Sample token-level breakpoints between unindexed motif regions.

    A "break" marks where one unindexed region ends and another begins, based
    on discontinuities in residue ids among unindexed tokens. Returns an
    atom-wise boolean array (token values spread to their atoms).

    Args:
        atom_array: AtomArray with an ``is_motif_atom_unindexed`` annotation.
        remove_random_break: Randomly merge two discontiguous regions by
            clearing one existing break.
        insert_random_break: Randomly split a region by adding one break.
        leak_global_index: Clear all breaks among unindexed tokens, leaking
            their global indexing.

    Returns:
        np.ndarray of shape (n_atoms,), True at unindexed-region breakpoints.
    """
    # Token is "unindexed" if any of its atoms is flagged unindexed.
    is_unindexed_token = apply_token_wise(
        atom_array,
        atom_array.is_motif_atom_unindexed.copy(),
        function=lambda x: np.any(x),
    )
    starts = get_token_starts(atom_array)
    token_idxs = np.arange(len(starts))
    breaks_all = np.zeros(len(starts), dtype=bool)

    if is_unindexed_token.sum() == 1:
        # A single unindexed token is its own (trivial) region/breakpoint.
        # NOTE(review): this branch ignores leak_global_index — confirm intended.
        breaks_all = is_unindexed_token
    elif np.any(is_unindexed_token):
        # ... Subset to unindexed tokens
        unindexed_token_starts = starts[is_unindexed_token]
        unindexed_token_resid = atom_array[unindexed_token_starts].res_id
        # Break wherever consecutive unindexed tokens are not sequential residues.
        breaks = np.diff(unindexed_token_resid) != 1  # (M-1,)

        # ... Connect discontiguous regions
        if remove_random_break and np.any(breaks):
            break_idx = np.random.choice(np.flatnonzero(breaks), size=1, replace=False)
            breaks[break_idx] = False

        # ... Disconnect contiguous regions
        if insert_random_break:
            break_idx = np.random.choice(np.arange(len(breaks)), size=1, replace=False)
            breaks[break_idx] = True

        # Force a break at the first diff position, then prepend False so the
        # array aligns with the M unindexed tokens (first token never breaks).
        # NOTE(review): net effect is an unconditional break at the second
        # unindexed token — confirm this is the intended alignment.
        breaks[0] = True
        breaks = np.concatenate([np.array([False], dtype=bool), breaks])

        # ... Remove all breaks to leak global indices:
        if leak_global_index:
            # Scalar False broadcasts over the assignment below.
            breaks = False

        breaks_all[token_idxs[is_unindexed_token]] = breaks

    return spread_token_wise(atom_array, breaks_all)