rc-foundry 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. foundry/__init__.py +57 -0
  2. foundry/callbacks/__init__.py +5 -0
  3. foundry/callbacks/callback.py +116 -0
  4. foundry/callbacks/health_logging.py +419 -0
  5. foundry/callbacks/metrics_logging.py +211 -0
  6. foundry/callbacks/timing_logging.py +67 -0
  7. foundry/callbacks/train_logging.py +278 -0
  8. foundry/common.py +108 -0
  9. foundry/constants.py +28 -0
  10. foundry/hydra/resolvers.py +77 -0
  11. foundry/inference_engines/base.py +235 -0
  12. foundry/inference_engines/checkpoint_registry.py +66 -0
  13. foundry/metrics/__init__.py +12 -0
  14. foundry/metrics/losses.py +30 -0
  15. foundry/metrics/metric.py +319 -0
  16. foundry/model/layers/blocks.py +47 -0
  17. foundry/testing/__init__.py +6 -0
  18. foundry/testing/fixtures.py +19 -0
  19. foundry/testing/pytest_hooks.py +15 -0
  20. foundry/trainers/fabric.py +923 -0
  21. foundry/training/EMA.py +67 -0
  22. foundry/training/checkpoint.py +61 -0
  23. foundry/training/schedulers.py +91 -0
  24. foundry/utils/alignment.py +86 -0
  25. foundry/utils/components.py +415 -0
  26. foundry/utils/datasets.py +405 -0
  27. foundry/utils/ddp.py +103 -0
  28. foundry/utils/instantiators.py +72 -0
  29. foundry/utils/logging.py +279 -0
  30. foundry/utils/rigid.py +1460 -0
  31. foundry/utils/rotation_augmentation.py +65 -0
  32. foundry/utils/squashfs.py +172 -0
  33. foundry/utils/torch.py +317 -0
  34. foundry/utils/weights.py +271 -0
  35. foundry/version.py +34 -0
  36. foundry_cli/__init__.py +3 -0
  37. foundry_cli/download_checkpoints.py +281 -0
  38. mpnn/__init__.py +1 -0
  39. mpnn/collate/feature_collator.py +265 -0
  40. mpnn/inference.py +53 -0
  41. mpnn/inference_engines/mpnn.py +549 -0
  42. mpnn/loss/nll_loss.py +122 -0
  43. mpnn/metrics/nll.py +369 -0
  44. mpnn/metrics/sequence_recovery.py +440 -0
  45. mpnn/model/layers/graph_embeddings.py +2372 -0
  46. mpnn/model/layers/message_passing.py +332 -0
  47. mpnn/model/layers/position_wise_feed_forward.py +44 -0
  48. mpnn/model/layers/positional_encoding.py +98 -0
  49. mpnn/model/mpnn.py +2632 -0
  50. mpnn/pipelines/mpnn.py +162 -0
  51. mpnn/samplers/samplers.py +167 -0
  52. mpnn/train.py +341 -0
  53. mpnn/trainers/mpnn.py +193 -0
  54. mpnn/transforms/feature_aggregation/mpnn.py +184 -0
  55. mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
  56. mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
  57. mpnn/transforms/feature_aggregation/user_settings.py +347 -0
  58. mpnn/transforms/polymer_ligand_interface.py +164 -0
  59. mpnn/utils/inference.py +2397 -0
  60. mpnn/utils/probability.py +37 -0
  61. mpnn/utils/weights.py +309 -0
  62. rc_foundry-0.1.1.dist-info/METADATA +239 -0
  63. rc_foundry-0.1.1.dist-info/RECORD +180 -0
  64. rc_foundry-0.1.1.dist-info/WHEEL +4 -0
  65. rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
  66. rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
  67. rf3/__init__.py +3 -0
  68. rf3/_version.py +33 -0
  69. rf3/alignment.py +79 -0
  70. rf3/callbacks/dump_validation_structures.py +101 -0
  71. rf3/callbacks/metrics_logging.py +324 -0
  72. rf3/chemical.py +1529 -0
  73. rf3/cli.py +77 -0
  74. rf3/data/cyclic_transform.py +78 -0
  75. rf3/data/extra_xforms.py +36 -0
  76. rf3/data/ground_truth_template.py +463 -0
  77. rf3/data/paired_msa.py +206 -0
  78. rf3/data/pipeline_utils.py +128 -0
  79. rf3/data/pipelines.py +558 -0
  80. rf3/diffusion_samplers/inference_sampler.py +222 -0
  81. rf3/inference.py +65 -0
  82. rf3/inference_engines/__init__.py +5 -0
  83. rf3/inference_engines/rf3.py +735 -0
  84. rf3/kinematics.py +354 -0
  85. rf3/loss/af3_confidence_loss.py +515 -0
  86. rf3/loss/af3_losses.py +655 -0
  87. rf3/loss/loss.py +179 -0
  88. rf3/metrics/chiral.py +179 -0
  89. rf3/metrics/clashing_chains.py +68 -0
  90. rf3/metrics/distogram.py +421 -0
  91. rf3/metrics/lddt.py +523 -0
  92. rf3/metrics/metadata.py +43 -0
  93. rf3/metrics/metric_utils.py +192 -0
  94. rf3/metrics/predicted_error.py +134 -0
  95. rf3/metrics/rasa.py +108 -0
  96. rf3/metrics/selected_distances.py +91 -0
  97. rf3/model/RF3.py +527 -0
  98. rf3/model/RF3_blocks.py +92 -0
  99. rf3/model/RF3_structure.py +303 -0
  100. rf3/model/layers/af3_auxiliary_heads.py +255 -0
  101. rf3/model/layers/af3_diffusion_transformer.py +544 -0
  102. rf3/model/layers/attention.py +313 -0
  103. rf3/model/layers/layer_utils.py +127 -0
  104. rf3/model/layers/mlff.py +118 -0
  105. rf3/model/layers/outer_product.py +59 -0
  106. rf3/model/layers/pairformer_layers.py +783 -0
  107. rf3/model/layers/structure_bias.py +56 -0
  108. rf3/scoring.py +1787 -0
  109. rf3/symmetry/resolve.py +284 -0
  110. rf3/train.py +194 -0
  111. rf3/trainers/rf3.py +570 -0
  112. rf3/util_module.py +47 -0
  113. rf3/utils/frames.py +109 -0
  114. rf3/utils/inference.py +665 -0
  115. rf3/utils/io.py +198 -0
  116. rf3/utils/loss.py +72 -0
  117. rf3/utils/predict_and_score.py +165 -0
  118. rf3/utils/predicted_error.py +673 -0
  119. rf3/utils/recycling.py +42 -0
  120. rf3/validate.py +140 -0
  121. rfd3/.gitignore +7 -0
  122. rfd3/Makefile +76 -0
  123. rfd3/__init__.py +12 -0
  124. rfd3/callbacks.py +66 -0
  125. rfd3/cli.py +41 -0
  126. rfd3/constants.py +212 -0
  127. rfd3/engine.py +543 -0
  128. rfd3/inference/datasets.py +193 -0
  129. rfd3/inference/input_parsing.py +1123 -0
  130. rfd3/inference/legacy_input_parsing.py +717 -0
  131. rfd3/inference/parsing.py +165 -0
  132. rfd3/inference/symmetry/atom_array.py +298 -0
  133. rfd3/inference/symmetry/checks.py +241 -0
  134. rfd3/inference/symmetry/contigs.py +63 -0
  135. rfd3/inference/symmetry/frames.py +355 -0
  136. rfd3/inference/symmetry/symmetry_utils.py +398 -0
  137. rfd3/metrics/design_metrics.py +465 -0
  138. rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
  139. rfd3/metrics/hbonds_metrics.py +389 -0
  140. rfd3/metrics/losses.py +325 -0
  141. rfd3/metrics/metrics_utils.py +118 -0
  142. rfd3/metrics/sidechain_metrics.py +349 -0
  143. rfd3/model/RFD3.py +105 -0
  144. rfd3/model/RFD3_diffusion_module.py +387 -0
  145. rfd3/model/cfg_utils.py +81 -0
  146. rfd3/model/inference_sampler.py +635 -0
  147. rfd3/model/layers/attention.py +577 -0
  148. rfd3/model/layers/block_utils.py +580 -0
  149. rfd3/model/layers/blocks.py +777 -0
  150. rfd3/model/layers/chunked_pairwise.py +377 -0
  151. rfd3/model/layers/encoders.py +417 -0
  152. rfd3/model/layers/layer_utils.py +197 -0
  153. rfd3/model/layers/pairformer_layers.py +128 -0
  154. rfd3/run_inference.py +45 -0
  155. rfd3/testing/debug.py +139 -0
  156. rfd3/testing/debug_utils.py +73 -0
  157. rfd3/testing/testing_utils.py +356 -0
  158. rfd3/train.py +194 -0
  159. rfd3/trainer/dump_validation_structures.py +154 -0
  160. rfd3/trainer/fabric_trainer.py +923 -0
  161. rfd3/trainer/recycling.py +42 -0
  162. rfd3/trainer/rfd3.py +485 -0
  163. rfd3/trainer/trainer_utils.py +502 -0
  164. rfd3/transforms/conditioning_base.py +508 -0
  165. rfd3/transforms/conditioning_utils.py +200 -0
  166. rfd3/transforms/design_transforms.py +807 -0
  167. rfd3/transforms/dna_crop.py +523 -0
  168. rfd3/transforms/hbonds.py +407 -0
  169. rfd3/transforms/hbonds_hbplus.py +246 -0
  170. rfd3/transforms/ncaa_transforms.py +153 -0
  171. rfd3/transforms/pipelines.py +632 -0
  172. rfd3/transforms/ppi_transforms.py +541 -0
  173. rfd3/transforms/rasa.py +116 -0
  174. rfd3/transforms/symmetry.py +76 -0
  175. rfd3/transforms/training_conditions.py +552 -0
  176. rfd3/transforms/util_transforms.py +498 -0
  177. rfd3/transforms/virtual_atoms.py +305 -0
  178. rfd3/utils/inference.py +648 -0
  179. rfd3/utils/io.py +245 -0
  180. rfd3/utils/vizualize.py +276 -0
@@ -0,0 +1,541 @@
1
+ # from atomworks.ml.utils.token import get_token_masks, get_token_starts
2
+ from typing import Any
3
+
4
+ import biotite.structure as struc
5
+ import numpy as np
6
+ from assertpy import assert_that
7
+ from atomworks.ml.preprocessing.utils.structure_utils import (
8
+ get_atom_mask_from_cell_list,
9
+ )
10
+ from atomworks.ml.transforms._checks import (
11
+ check_atom_array_annotation,
12
+ check_contains_keys,
13
+ check_is_instance,
14
+ )
15
+ from atomworks.ml.transforms.atom_array import atom_id_to_atom_idx, atom_id_to_token_idx
16
+ from atomworks.ml.transforms.base import Transform
17
+ from atomworks.ml.transforms.crop import (
18
+ get_spatial_crop_center,
19
+ get_token_count,
20
+ resize_crop_info_if_too_many_atoms,
21
+ )
22
+ from atomworks.ml.utils.token import (
23
+ get_af3_token_center_coords,
24
+ get_af3_token_center_masks,
25
+ get_token_starts,
26
+ spread_token_wise,
27
+ )
28
+ from biotite.structure import AtomArray
29
+ from rfd3.transforms.conditioning_utils import sample_island_tokens
30
+ from scipy.spatial import KDTree
31
+
32
+ from foundry.utils.ddp import RankedLogger
33
+
34
+ ranked_logger = RankedLogger(__name__, rank_zero_only=True)
35
+
36
+ # NOTE: This transform is based off of `rf_diffusion_aa.rf_diffusion.ppi.FindHotspotsTrainingTransform`
37
+ # However, this is progressing piecewise, and many features of that transform are not yet implemented.
38
+ # If this seems to be working, those should definitely be added in the future!
39
+
40
+ # NOTE: In contrast to RFD, we are providing hotspots at the atom level, not the residue level.
41
+ # Future hotspot subsampling schemes might want to avoid giving redundant information via (say) bonded atoms
42
+
43
+
44
def get_hotspot_atoms(atom_array, binder_pn_unit_iid, distance_cutoff=4.5):
    """Return an atom-level boolean mask marking hotspot atoms.

    Hotspots are atoms on non-binder chains that lie within ``distance_cutoff``
    of at least one atom of the binder chain (i.e. minimum pairwise atom
    distance to the binder is below the cutoff).

    Args:
        atom_array (AtomArray): The atom array containing the protein structure.
        binder_pn_unit_iid (str): The chain ID of the binder (diffused chain).
        distance_cutoff (float): The interchain distance cutoff that defines hotspot atoms.

    Returns:
        np.ndarray: Boolean mask over ``atom_array``; True marks hotspot atoms.
    """
    # Distance queries are only possible for atoms with finite coordinates.
    has_nan_coord = np.isnan(atom_array.coord).any(axis=1)
    resolved = atom_array[~has_nan_coord]

    is_binder = resolved.pn_unit_iid == binder_pn_unit_iid
    binder_coords = resolved[is_binder].coord

    # Neighbor search: which resolved atoms lie near any binder atom?
    neighbor_grid = struc.CellList(resolved, cell_size=distance_cutoff)
    pairwise_contacts = get_atom_mask_from_cell_list(
        binder_coords, neighbor_grid, len(resolved), distance_cutoff
    )  # (n_binder_atoms, n_resolved_atoms)
    near_binder = pairwise_contacts.any(axis=0)  # (n_resolved_atoms,)

    # A hotspot must be close to the binder but not part of it.
    resolved_hotspot_mask = near_binder & ~is_binder

    # Lift the mask over resolved atoms back onto the full atom array.
    hotspot_mask = np.zeros(len(atom_array), dtype=bool)
    hotspot_mask[~has_nan_coord] = resolved_hotspot_mask

    return hotspot_mask
81
+
82
+
83
def get_secondary_structure_types(atom_array: AtomArray) -> np.ndarray:
    """Get the secondary structure types for a given atom array.

    For now, only three categories are one-hot encoded: helix, sheet, and loop.

    Args:
        atom_array (AtomArray): Structure with ``token_id`` and ``chain_iid`` annotations.

    Returns:
        np.ndarray: Boolean array of shape (n_atoms, 3) with columns
            [is_helix, is_sheet, is_loop], spread atom-wise from token-level SSE.
    """
    ss_types = np.zeros((atom_array.array_length(), 3), dtype=bool)

    # HACK: Temporarily overwrite res_id with token_id so that the sse_array will
    # have length n_tokens.
    actual_res_id = atom_array.res_id.copy()
    try:
        atom_array.res_id = atom_array.token_id

        # annotate_sse detects chainbreaks based on res_id discontinuities, so we
        # shift each chain's res_ids by its chain index to create discontinuities.
        _, chain_offsets = np.unique(atom_array.chain_iid, return_inverse=True)
        atom_array.res_id += chain_offsets

        # Compute secondary structure information (token-level)
        sse_array = struc.annotate_sse(atom_array)
        assert len(sse_array) == len(
            np.unique(atom_array.token_id)
        ), "SSE array length does not match number of tokens."
    finally:
        # Bug fix: restore the original res_id even if annotate_sse (or the
        # assert) raises, so the temporary mutation never leaks to callers.
        atom_array.res_id = actual_res_id

    sse_array = spread_token_wise(atom_array, sse_array)
    ss_types[:, 0] = sse_array == "a"  # helix
    ss_types[:, 1] = sse_array == "b"  # sheet
    ss_types[:, 2] = sse_array == "c"  # loop/coil

    return ss_types
113
+
114
+
115
class AddGlobalIsNonLoopyFeature(Transform):
    """Add feature indicating whether the global loop content in the non-motif region is below 30%.

    For this initial implementation, only three categories will be one-hot encoded: helix, sheet, and loop.

    Sets three annotations on the atom array:
      - ``is_loop_gt``: ground-truth per-atom loop indicator.
      - ``is_non_loopy``: +1 if the non-motif loop fraction is < 0.3, else -1.
      - ``is_non_loopy_atom_level``: copy of ``is_non_loopy`` (HACK so the flag
        can also be consumed as an atom-level feature).
    """

    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(data, ["is_motif_token"])

    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        atom_array = data["atom_array"]

        # Compute all ground-truth secondary structure types for the binder chain.
        # For now boolean, later could include distances as in RFD. But maybe that's better as a 2D condition
        gt_secondary_structures = get_secondary_structure_types(atom_array)
        atom_array.set_annotation("is_loop_gt", gt_secondary_structures[:, 2])

        is_motif_atom = atom_array.is_motif_token
        non_motif_loop = atom_array.is_loop_gt[~is_motif_atom]
        # Robustness fix: np.mean of an empty selection is NaN (with a
        # RuntimeWarning), and NaN < 0.3 evaluated to False. Guard explicitly so
        # an all-motif array is "loopy" (-1) without the warning.
        is_non_loopy = non_motif_loop.size > 0 and non_motif_loop.mean() < 0.3
        is_non_loopy_annot = np.full(
            atom_array.array_length(), 1 if is_non_loopy else -1, dtype=int
        )

        atom_array.set_annotation("is_non_loopy", is_non_loopy_annot)

        # HACK: Enables adding as atom-level features as well
        atom_array.set_annotation("is_non_loopy_atom_level", is_non_loopy_annot)

        return data
146
+
147
+
148
class Add1DSSFeature(Transform):
    """Add secondary structure features to training examples.

    For this initial implementation, only three categories will be one-hot encoded: helix, sheet, and loop.

    Ground-truth annotations (``is_helix_gt`` / ``is_sheet_gt`` / ``is_loop_gt``)
    are always written. The ``*_conditioning`` annotations are only populated when
    ``data["conditions"]["add_1d_ss_features"]`` is truthy, and then only on a
    randomly sampled set of contiguous token "islands" in the non-motif protein
    region.
    """

    def __init__(
        self,
        max_secondary_structure_frac_to_provide: float = 0.4,
        min_ss_island_len: int = 1,
        max_ss_island_len: int = 10,  # Might want to expand later, this is only average. Done for now to avoid over-conditioning.
        n_islands_max: int = 3,
    ):
        # max_secondary_structure_frac_to_provide: cap on the fraction of
        # SS-annotated positions that may receive conditioning.
        self.max_secondary_structure_frac_to_provide = (
            max_secondary_structure_frac_to_provide
        )
        # Island length bounds (in tokens) passed to sample_island_tokens.
        self.min_ss_island_len = min_ss_island_len
        self.max_ss_island_len = max_ss_island_len
        # Maximum number of islands to sample per example.
        self.n_islands_max = n_islands_max

    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(data, ["is_motif_token"])

    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        atom_array = data["atom_array"]

        # Compute all ground-truth secondary structure types for the binder chain.
        gt_secondary_structures = get_secondary_structure_types(atom_array)
        atom_array.set_annotation("is_helix_gt", gt_secondary_structures[:, 0])
        atom_array.set_annotation("is_sheet_gt", gt_secondary_structures[:, 1])
        atom_array.set_annotation("is_loop_gt", gt_secondary_structures[:, 2])

        # Ground truth is always annotated; conditioning only when requested.
        if not data["conditions"]["add_1d_ss_features"]:
            return data

        # Always add the secondary structure type annotation, even if all zeros
        atom_array.set_annotation(
            "is_helix_conditioning", np.zeros(atom_array.array_length(), dtype=bool)
        )
        atom_array.set_annotation(
            "is_sheet_conditioning", np.zeros(atom_array.array_length(), dtype=bool)
        )
        atom_array.set_annotation(
            "is_loop_conditioning", np.zeros(atom_array.array_length(), dtype=bool)
        )

        # Uniformly sample a number of tokens to receive secondary structure conditioning, up to the given maximum fraction
        # NOTE(review): gt_secondary_structures.sum() counts atom-level one-hot
        # entries, not residues/tokens as the variable name suggests — confirm
        # this is the intended cap.
        max_residues_with_ss_conditioning = int(
            np.ceil(
                gt_secondary_structures.sum()
                * self.max_secondary_structure_frac_to_provide
            )
        )

        # Compute islands within the subset that is diffused and has secondary structure types.
        token_level_array = atom_array[get_token_starts(atom_array)]
        is_motif_token = token_level_array.is_motif_token
        eligible_for_ss_info_mask = (
            ~is_motif_token
            & token_level_array.is_protein
            & (  # Protein atoms with NaN coordinates would have no secondary structure annotation
                token_level_array.is_helix_gt
                | token_level_array.is_sheet_gt
                | token_level_array.is_loop_gt
            )
        )
        # Sample a boolean mask over the eligible tokens only (length = #eligible).
        where_to_show_ss = sample_island_tokens(
            eligible_for_ss_info_mask.sum(),
            island_len_min=self.min_ss_island_len,
            island_len_max=self.max_ss_island_len,
            n_islands_min=1,
            n_islands_max=self.n_islands_max,
            max_length=max_residues_with_ss_conditioning,
        )

        # Convert this to a mask over the entire token-level atom array
        token_level_ss_mask = np.zeros(token_level_array.array_length(), dtype=bool)
        token_level_ss_mask[eligible_for_ss_info_mask] = where_to_show_ss
        ss_mask = spread_token_wise(atom_array, token_level_ss_mask)

        # Copy ground truth into the conditioning channels on the sampled islands.
        atom_array.is_helix_conditioning[ss_mask] = atom_array.is_helix_gt[ss_mask]
        atom_array.is_sheet_conditioning[ss_mask] = atom_array.is_sheet_gt[ss_mask]
        atom_array.is_loop_conditioning[ss_mask] = atom_array.is_loop_gt[ss_mask]

        return data
236
+
237
+
238
class AddPPIHotspotFeature(Transform):
    """Add hotspot features to PPI training examples.

    Writes two annotations:
      - ``is_hotspot_gt``: all ground-truth hotspot atoms (see ``get_hotspot_atoms``).
      - ``is_atom_level_hotspot``: the randomly subsampled subset actually
        provided to the model as conditioning.
    """

    def __init__(
        self,
        max_hotspots_frac_to_provide: float = 0.2,
        hotspot_max_distance: float = 7.0,
    ):
        """
        Args:
            max_hotspots_frac_to_provide (float): Maximum fraction of ground-truth hotspots to add to the training example.
                The actual number added will be sampled uniformly from 0 to the number dictated by this parameter.
            hotspot_max_distance (float): Maximum distance to the binder for an atom to be considered a hotspot.
        """
        self.max_hotspots_frac_to_provide = max_hotspots_frac_to_provide
        self.hotspot_max_distance = hotspot_max_distance

    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(data, ["is_motif_token"])

    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        atom_array = data["atom_array"]

        # Always add the hotspot annotation, even if all zeros
        atom_array.set_annotation(
            "is_atom_level_hotspot", np.zeros(atom_array.array_length(), dtype=bool)
        )

        # Compute all ground-truth hotspots for the binder chain.
        # For now boolean, later could include distances as in RFD. But maybe that's better as a 2D condition
        is_hotspot_atom_mask = get_hotspot_atoms(
            atom_array,
            binder_pn_unit_iid=data["binder_pn_unit"],
            distance_cutoff=self.hotspot_max_distance,
        )
        atom_array.set_annotation("is_hotspot_gt", is_hotspot_atom_mask)

        # Uniformly sample a number of hotspots to include, up to the given maximum fraction.
        # Bug fix: the cap was previously recomputed inline inside randint, and
        # because np.random.randint's upper bound is exclusive the documented
        # maximum could never be drawn (with cap == 1 the transform always
        # provided 0 hotspots). Reuse the cap and make the range inclusive.
        max_hotspots_to_keep = int(
            np.ceil(int(is_hotspot_atom_mask.sum()) * self.max_hotspots_frac_to_provide)
        )
        if max_hotspots_to_keep == 0:
            ranked_logger.warning("No hotspots found in the input data")
            return data
        num_hotspots_to_keep = np.random.randint(0, max_hotspots_to_keep + 1)

        # Subsample hotspots to add.
        # For now random, later could add speckle_or_region from RFD
        true_hotspot_indices = np.where(is_hotspot_atom_mask)[0]
        hotspots_to_provide = np.random.choice(
            true_hotspot_indices, size=num_hotspots_to_keep, replace=False
        )
        atom_array.is_atom_level_hotspot[hotspots_to_provide] = True

        return data
303
+
304
+
305
class PPIFullBinderCropSpatial(Transform):
    """Crop which keeps the entire binder chain, then crops spatially around the given interface.

    Args:
        crop_size (int): The maximum number of tokens to crop. Must be greater than 0.
        jitter_scale (float, optional): The scale of the jitter to apply to the crop center.
            This is to break ties between atoms with the same spatial distance. Defaults to 1e-3.
        crop_center_cutoff_distance (float, optional): The cutoff distance to consider for
            selecting crop centers. Measured in Angstroms. Defaults to 15.0.
        keep_uncropped_atom_array (bool, optional): Whether to keep the uncropped atom array in the data.
            If `True`, the uncropped atom array will be stored in the `crop_info` dictionary
            under the key `"atom_array"`. Defaults to `False`.
        force_crop (bool, optional): Whether to force crop even if the atom array is already small enough.
            Defaults to `False`.
        max_atoms_in_crop (int, optional): Maximum number of atoms allowed in a crop. If None, no resizing is performed.
            Defaults to None.
    """

    def __init__(
        self,
        crop_size: int,
        jitter_scale: float = 1e-3,
        crop_center_cutoff_distance: float = 15.0,
        keep_uncropped_atom_array: bool = False,
        force_crop: bool = False,
        max_atoms_in_crop: int | None = None,
    ):
        # Token budget and spatial-selection parameters.
        self.crop_size = crop_size
        self.jitter_scale = jitter_scale
        self.crop_center_cutoff_distance = crop_center_cutoff_distance
        self.max_atoms_in_crop = max_atoms_in_crop
        # Bookkeeping flags.
        self.keep_uncropped_atom_array = keep_uncropped_atom_array
        self.force_crop = force_crop

    def check_input(self, data: dict):
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(data, ["pn_unit_iid", "atomize", "atom_id"])

    def forward(self, data: dict) -> dict:
        atom_array = data["atom_array"]

        # Prefer caller-supplied query units; otherwise consider every pn_unit.
        supplied_query_units = data.get("query_pn_unit_iids")
        if supplied_query_units:
            query_pn_units = supplied_query_units
        else:
            query_pn_units = np.unique(atom_array.pn_unit_iid)
            ranked_logger.info(
                f"No query PN unit(s) provided for spatial crop. Randomly selecting from {query_pn_units}."
            )

        if "binder_pn_unit" not in data:
            raise ValueError("Data dict must contain 'binder_pn_unit' key.")

        # Build the crop (binder kept in full), then shrink it if it exceeds
        # the atom budget.
        crop_info = crop_spatial_keep_full_binder(
            atom_array=atom_array,
            query_pn_unit_iids=query_pn_units,
            binder_pn_unit_iid=data["binder_pn_unit"],
            crop_size=self.crop_size,
            jitter_scale=self.jitter_scale,
            crop_center_cutoff_distance=self.crop_center_cutoff_distance,
            force_crop=self.force_crop,
        )
        crop_info = resize_crop_info_if_too_many_atoms(
            crop_info=crop_info,
            atom_array=atom_array,
            max_atoms=self.max_atoms_in_crop,
        )

        data["crop_info"] = {"type": self.__class__.__name__} | crop_info

        if self.keep_uncropped_atom_array:
            data["crop_info"]["atom_array"] = atom_array

        # Replace the atom array with the cropped selection.
        data["atom_array"] = atom_array[crop_info["crop_atom_idxs"]]

        return data
381
+
382
+
383
def crop_spatial_keep_full_binder(
    atom_array: AtomArray,
    query_pn_unit_iids: list[str],
    binder_pn_unit_iid: str,
    crop_size: int,
    jitter_scale: float = 1e-3,
    crop_center_cutoff_distance: float = 15.0,
    force_crop: bool = False,
) -> dict:
    """
    Crop spatial tokens around a given `crop_center` by keeping the entire binder chain, then taking nearest
    neighbors (with jitter) until reaching the `crop_size`.

    Args:
        - atom_array (AtomArray): The atom array to crop.
        - query_pn_unit_iids (list[str]): List of query polymer/non-polymer unit instance IDs.
        - binder_pn_unit_iid (str): The polymer/non-polymer unit instance ID corresponding to the binder.
        - crop_size (int): The maximum number of tokens to crop.
        - jitter_scale (float, optional): Scale of jitter to apply when calculating distances.
          Defaults to 1e-3.
        - crop_center_cutoff_distance (float, optional): Maximum distance from query units to
          consider for crop center. Defaults to 15.0 Angstroms.
        - force_crop (bool, optional): Whether to force crop even if the atom array is already small enough.
          Defaults to False.

    Returns:
        dict: A dictionary containing crop information, including:
            - requires_crop (bool): Whether cropping was necessary.
            - crop_center_atom_id (int or np.nan): ID of the atom chosen as crop center.
            - crop_center_atom_idx (int or np.nan): Index of the atom chosen as crop center.
            - crop_center_token_idx (int or np.nan): Index of the token containing the crop center.
            - crop_token_idxs (np.ndarray): Indices of tokens included in the crop.
            - crop_atom_idxs (np.ndarray): Indices of atoms included in the crop.

    Raises:
        ValueError: If `binder_pn_unit_iid` is not among `query_pn_unit_iids`.

    Note:
        This function implements the spatial cropping procedure as described in AlphaFold 3 and AlphaFold 2 Multimer.

    References:
        - AF3 https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07487-w/MediaObjects/41586_2024_7487_MOESM1_ESM.pdf
        - AF2 Multimer https://www.biorxiv.org/content/10.1101/2021.10.04.463034v2.full.pdf
    """
    if binder_pn_unit_iid not in query_pn_unit_iids:
        raise ValueError(
            f"Binder polymer/non-polymer unit instance ID '{binder_pn_unit_iid}' "
            f"not found in query polymer/non-polymer unit instance IDs: {query_pn_unit_iids}"
        )
    n_tokens = get_token_count(atom_array)
    requires_crop = n_tokens > crop_size

    # ... get binder information (token-level and atom-level membership masks)
    binder_token_mask = (
        atom_array[get_af3_token_center_masks(atom_array)].pn_unit_iid
        == binder_pn_unit_iid
    )
    binder_atom_mask = atom_array.pn_unit_iid == binder_pn_unit_iid
    n_binder_tokens = get_token_count(atom_array[binder_atom_mask])

    if force_crop or requires_crop:
        # Get possible crop centers (near the query units, per cutoff distance)
        can_be_crop_center = get_spatial_crop_center(
            atom_array, query_pn_unit_iids, crop_center_cutoff_distance
        )

        # ... sample crop center atom
        crop_center_atom_id = np.random.choice(atom_array[can_be_crop_center].atom_id)
        crop_center_atom_idx = atom_id_to_atom_idx(atom_array, crop_center_atom_id)

        # ... sample crop, excluding the binder polymer/non-polymer unit
        # NOTE(review): if the binder alone has >= crop_size tokens, the
        # effective crop_size below is <= 0 and get_spatial_crop_excluding_mask
        # will assert — confirm upstream guarantees binder < crop_size.
        token_coords = get_af3_token_center_coords(atom_array)
        crop_center_token_idx = atom_id_to_token_idx(atom_array, crop_center_atom_id)
        is_token_in_crop = get_spatial_crop_excluding_mask(
            token_coords,
            crop_center_token_idx,
            crop_size=crop_size
            - n_binder_tokens,  # reserve space for the binder tokens
            mask_to_exclude=binder_token_mask,
            jitter_scale=jitter_scale,
        )
        # ... spread token-level crop mask to atom-level
        is_atom_in_crop = spread_token_wise(atom_array, is_token_in_crop)

        # ... add in binder tokens and atoms (binder is always kept in full)
        is_token_in_crop = is_token_in_crop | binder_token_mask
        is_atom_in_crop = is_atom_in_crop | binder_atom_mask
    else:
        # ... no need to crop since the atom array is already small enough
        crop_center_atom_id = np.nan
        crop_center_atom_idx = np.nan
        crop_center_token_idx = np.nan
        is_atom_in_crop = np.ones(len(atom_array), dtype=bool)
        is_token_in_crop = np.ones(n_tokens, dtype=bool)

    return {
        "requires_crop": requires_crop,  # whether cropping was necessary
        "crop_center_atom_id": crop_center_atom_id,  # atom_id of crop center
        "crop_center_atom_idx": crop_center_atom_idx,  # atom_idx of crop center
        "crop_center_token_idx": crop_center_token_idx,  # token_idx of crop center
        "crop_token_idxs": np.where(is_token_in_crop)[0],  # token_idxs in crop
        "crop_atom_idxs": np.where(is_atom_in_crop)[0],  # atom_idxs in crop
    }
483
+
484
+
485
def get_spatial_crop_excluding_mask(
    coord: np.ndarray,
    crop_center_idx: int,
    crop_size: int,
    mask_to_exclude: np.ndarray,
    jitter_scale: float = 1e-3,
) -> np.ndarray:
    """Select the `crop_size` tokens nearest a crop center, skipping excluded tokens.

    Crops spatial tokens around `crop_center_idx`, keeping nearest neighbors
    (with jitter to break distance ties) and excluding tokens flagged in
    `mask_to_exclude`, until `crop_size` neighbors are collected.

    Args:
        coord (np.ndarray): Token-level coordinates of shape (N, 3), in Angstroms.
        crop_center_idx (int): Index of the token used as the crop center.
        crop_size (int): Number of nearest neighbors to include in the crop.
        mask_to_exclude (np.ndarray): Boolean mask of tokens to leave out of the crop.
        jitter_scale (float): Scale of the Gaussian jitter added to the coordinates.

    Returns:
        np.ndarray: Boolean mask of shape (N,); True marks tokens inside the crop.
    """
    assert_that(coord.ndim).is_equal_to(2)
    assert_that(coord.shape[1]).is_equal_to(3)
    assert_that(crop_center_idx).is_less_than(coord.shape[0])
    assert_that(crop_size).is_greater_than(0)
    assert_that(jitter_scale).is_greater_than_or_equal_to(0)

    # Small Gaussian jitter breaks ties between equidistant tokens.
    jittered = coord
    if jitter_scale > 0:
        jittered = coord + np.random.normal(scale=jitter_scale, size=coord.shape)

    center = jittered[crop_center_idx]

    # Eligible tokens: finite coordinates (NaN marks unknown token centers,
    # i.e. unoccupied tokens) and not explicitly excluded.
    eligible = np.isfinite(jittered).all(axis=1) & ~mask_to_exclude

    # KD-tree over the eligible tokens only, for efficient neighbor queries.
    tree = KDTree(jittered[eligible])

    # Query the `crop_size` nearest neighbors of the center; when fewer than
    # `crop_size` points exist, missing neighbors come back as index `tree.n`
    # and are dropped here.
    _, neighbor_idxs = tree.query(center, k=crop_size, p=2)
    neighbor_idxs = neighbor_idxs[neighbor_idxs < tree.n]

    # Map tree-local neighbor indices back to positions in the full array.
    crop_mask = np.zeros(jittered.shape[0], dtype=bool)
    crop_mask[np.where(eligible)[0][neighbor_idxs]] = True

    return crop_mask
@@ -0,0 +1,116 @@
1
+ import numpy as np
2
+ from atomworks.ml.transforms.base import Transform
3
+ from atomworks.ml.transforms.sasa import calculate_atomwise_rasa
4
+ from atomworks.ml.utils.token import apply_and_spread_token_wise
5
+
6
+
7
class CalculateRASA(Transform):
    """Transform that annotates each atom of an AtomArray with its relative SASA ("rasa")."""

    def __init__(
        self,
        probe_radius: float = 1.4,
        atom_radii: str | np.ndarray = "ProtOr",
        point_number: int = 100,
        requires_ligand=False,
    ):
        """
        Args:
            probe_radius (float, optional): Van-der-Waals radius of the probe in Angstrom.
                Defaults to 1.4 (for water).
            atom_radii (str | np.ndarray, optional): Atom radii set to use for calculation.
                Defaults to "ProtOr", which yields no SASA for hydrogen atoms and some
                other atoms (e.g. ions or certain atoms with charges).
            point_number (int, optional): Number of points in the Shrake-Rupley algorithm
                to sample for calculating SASA. Defaults to 100.
            requires_ligand (bool, optional): If True, skip structures that contain no
                ligand atoms. Defaults to False.
        """
        self.probe_radius = probe_radius
        self.atom_radii = atom_radii
        self.point_number = point_number
        self.requires_ligand = requires_ligand

    def forward(self, data):
        atom_array = data["atom_array"]

        # Optionally skip examples without any ligand atom.
        if self.requires_ligand and not np.any(atom_array.is_ligand):
            return data

        # Compute exact per-atom relative SASA and attach it as an annotation.
        rasa = calculate_atomwise_rasa(
            atom_array, self.probe_radius, self.atom_radii, self.point_number
        )
        atom_array.set_annotation("rasa", rasa)

        data["atom_array"] = atom_array
        return data
42
+
43
+
44
def discretize_rasa(atom_array, low=0, high=0.2, n_bins=3, keep_protein_motif=False):
    """Discretize per-atom RASA into integer bins; excluded atoms get bin `n_bins`.

    An atom is included only when its RASA is not NaN and it is a motif token;
    protein atoms are additionally excluded unless `keep_protein_motif` is True.

    Args:
        atom_array: Array with `rasa`, `is_motif_token`, and `is_protein` annotations.
        low (float): Lower edge of the binning range.
        high (float): Upper edge of the binning range.
        n_bins (int): Number of bin edges; included atoms land in [0, n_bins - 1].
        keep_protein_motif (bool): Whether protein motif atoms stay in the binning.

    Returns:
        np.ndarray: Integer bin per atom; `n_bins` marks excluded atoms.
    """
    included = ~np.isnan(atom_array.rasa) & atom_array.is_motif_token
    if not keep_protein_motif:
        included = included & ~atom_array.is_protein

    # e.g. low=0, high=0.2, n_bins=3 -> edges [0.0, 0.1, 0.2]
    edges = np.linspace(low, high, n_bins)
    # Shift by one: the first digitize bin would imply negative RASA.
    bins = np.digitize(atom_array.rasa, edges, right=False) - 1
    # Route excluded atoms to an additional, unused bin.
    bins[~included] = n_bins
    return bins
57
+
58
+
59
class SetZeroOccOnDeltaRASA(Transform):
    """Zero out occupancy for sidechains whose RASA grew significantly.

    Recomputes atom-wise RASA (used to measure whether RASA changed during
    cropping) and, for tokens that have become significantly more exposed,
    sets the occupancy of their sidechain atoms to zero.
    """

    # Needs the initial "rasa" annotation to compare against...
    requires_previous_transforms = [CalculateRASA]
    # ...and must run before transforms that alter atom names / features.
    incompatible_previous_transforms = [
        "PadWithVirtualAtoms",  # must have the same atom names
        "CreateDesignReferenceFeatures",
        "AggregateFeaturesLikeAF3WithoutMSA",
    ]

    def __init__(
        self,
        probe_radius: float = 1.4,
        atom_radii: str | np.ndarray = "ProtOr",
        point_number: int = 100,
    ):
        self.probe_radius = probe_radius
        self.atom_radii = atom_radii
        self.point_number = point_number

    def check_input(self, data):
        assert "rasa" in data["atom_array"].get_annotation_categories()

    def forward(self, data):
        atom_array = data["atom_array"]

        previous_rasa = atom_array.rasa
        current_rasa = calculate_atomwise_rasa(
            atom_array, self.probe_radius, self.atom_radii, self.point_number
        )

        # Compare within the [0, 0.2] band; values above 0.2 are already
        # considered exposed, so further increases there are ignored.
        delta = np.clip(current_rasa, a_min=0, a_max=0.2) - np.clip(
            previous_rasa, a_min=0, a_max=0.2
        )
        became_exposed = np.nan_to_num(delta) > 0.075

        # A token counts as exposed if any of its atoms became exposed.
        token_became_exposed = apply_and_spread_token_wise(
            atom_array,
            became_exposed,
            function=np.any,
        )

        # Backbone atoms keep their occupancy; only residue sidechains are zeroed.
        is_backbone = np.isin(atom_array.atom_name, ["N", "CA", "C", "O"])
        sidechain_mask = ~is_backbone & atom_array.is_residue

        atom_array.occupancy[token_became_exposed & sidechain_mask] = 0.0

        data["atom_array"] = atom_array

        return data