PyPI - rc-foundry - Versions diffs - 0.1.1__py3-none-any.whl - Mend

rc-foundry 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

foundry/__init__.py +57 -0
foundry/callbacks/__init__.py +5 -0
foundry/callbacks/callback.py +116 -0
foundry/callbacks/health_logging.py +419 -0
foundry/callbacks/metrics_logging.py +211 -0
foundry/callbacks/timing_logging.py +67 -0
foundry/callbacks/train_logging.py +278 -0
foundry/common.py +108 -0
foundry/constants.py +28 -0
foundry/hydra/resolvers.py +77 -0
foundry/inference_engines/base.py +235 -0
foundry/inference_engines/checkpoint_registry.py +66 -0
foundry/metrics/__init__.py +12 -0
foundry/metrics/losses.py +30 -0
foundry/metrics/metric.py +319 -0
foundry/model/layers/blocks.py +47 -0
foundry/testing/__init__.py +6 -0
foundry/testing/fixtures.py +19 -0
foundry/testing/pytest_hooks.py +15 -0
foundry/trainers/fabric.py +923 -0
foundry/training/EMA.py +67 -0
foundry/training/checkpoint.py +61 -0
foundry/training/schedulers.py +91 -0
foundry/utils/alignment.py +86 -0
foundry/utils/components.py +415 -0
foundry/utils/datasets.py +405 -0
foundry/utils/ddp.py +103 -0
foundry/utils/instantiators.py +72 -0
foundry/utils/logging.py +279 -0
foundry/utils/rigid.py +1460 -0
foundry/utils/rotation_augmentation.py +65 -0
foundry/utils/squashfs.py +172 -0
foundry/utils/torch.py +317 -0
foundry/utils/weights.py +271 -0
foundry/version.py +34 -0
foundry_cli/__init__.py +3 -0
foundry_cli/download_checkpoints.py +281 -0
mpnn/__init__.py +1 -0
mpnn/collate/feature_collator.py +265 -0
mpnn/inference.py +53 -0
mpnn/inference_engines/mpnn.py +549 -0
mpnn/loss/nll_loss.py +122 -0
mpnn/metrics/nll.py +369 -0
mpnn/metrics/sequence_recovery.py +440 -0
mpnn/model/layers/graph_embeddings.py +2372 -0
mpnn/model/layers/message_passing.py +332 -0
mpnn/model/layers/position_wise_feed_forward.py +44 -0
mpnn/model/layers/positional_encoding.py +98 -0
mpnn/model/mpnn.py +2632 -0
mpnn/pipelines/mpnn.py +162 -0
mpnn/samplers/samplers.py +167 -0
mpnn/train.py +341 -0
mpnn/trainers/mpnn.py +193 -0
mpnn/transforms/feature_aggregation/mpnn.py +184 -0
mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
mpnn/transforms/feature_aggregation/user_settings.py +347 -0
mpnn/transforms/polymer_ligand_interface.py +164 -0
mpnn/utils/inference.py +2397 -0
mpnn/utils/probability.py +37 -0
mpnn/utils/weights.py +309 -0
rc_foundry-0.1.1.dist-info/METADATA +239 -0
rc_foundry-0.1.1.dist-info/RECORD +180 -0
rc_foundry-0.1.1.dist-info/WHEEL +4 -0
rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
rf3/__init__.py +3 -0
rf3/_version.py +33 -0
rf3/alignment.py +79 -0
rf3/callbacks/dump_validation_structures.py +101 -0
rf3/callbacks/metrics_logging.py +324 -0
rf3/chemical.py +1529 -0
rf3/cli.py +77 -0
rf3/data/cyclic_transform.py +78 -0
rf3/data/extra_xforms.py +36 -0
rf3/data/ground_truth_template.py +463 -0
rf3/data/paired_msa.py +206 -0
rf3/data/pipeline_utils.py +128 -0
rf3/data/pipelines.py +558 -0
rf3/diffusion_samplers/inference_sampler.py +222 -0
rf3/inference.py +65 -0
rf3/inference_engines/__init__.py +5 -0
rf3/inference_engines/rf3.py +735 -0
rf3/kinematics.py +354 -0
rf3/loss/af3_confidence_loss.py +515 -0
rf3/loss/af3_losses.py +655 -0
rf3/loss/loss.py +179 -0
rf3/metrics/chiral.py +179 -0
rf3/metrics/clashing_chains.py +68 -0
rf3/metrics/distogram.py +421 -0
rf3/metrics/lddt.py +523 -0
rf3/metrics/metadata.py +43 -0
rf3/metrics/metric_utils.py +192 -0
rf3/metrics/predicted_error.py +134 -0
rf3/metrics/rasa.py +108 -0
rf3/metrics/selected_distances.py +91 -0
rf3/model/RF3.py +527 -0
rf3/model/RF3_blocks.py +92 -0
rf3/model/RF3_structure.py +303 -0
rf3/model/layers/af3_auxiliary_heads.py +255 -0
rf3/model/layers/af3_diffusion_transformer.py +544 -0
rf3/model/layers/attention.py +313 -0
rf3/model/layers/layer_utils.py +127 -0
rf3/model/layers/mlff.py +118 -0
rf3/model/layers/outer_product.py +59 -0
rf3/model/layers/pairformer_layers.py +783 -0
rf3/model/layers/structure_bias.py +56 -0
rf3/scoring.py +1787 -0
rf3/symmetry/resolve.py +284 -0
rf3/train.py +194 -0
rf3/trainers/rf3.py +570 -0
rf3/util_module.py +47 -0
rf3/utils/frames.py +109 -0
rf3/utils/inference.py +665 -0
rf3/utils/io.py +198 -0
rf3/utils/loss.py +72 -0
rf3/utils/predict_and_score.py +165 -0
rf3/utils/predicted_error.py +673 -0
rf3/utils/recycling.py +42 -0
rf3/validate.py +140 -0
rfd3/.gitignore +7 -0
rfd3/Makefile +76 -0
rfd3/__init__.py +12 -0
rfd3/callbacks.py +66 -0
rfd3/cli.py +41 -0
rfd3/constants.py +212 -0
rfd3/engine.py +543 -0
rfd3/inference/datasets.py +193 -0
rfd3/inference/input_parsing.py +1123 -0
rfd3/inference/legacy_input_parsing.py +717 -0
rfd3/inference/parsing.py +165 -0
rfd3/inference/symmetry/atom_array.py +298 -0
rfd3/inference/symmetry/checks.py +241 -0
rfd3/inference/symmetry/contigs.py +63 -0
rfd3/inference/symmetry/frames.py +355 -0
rfd3/inference/symmetry/symmetry_utils.py +398 -0
rfd3/metrics/design_metrics.py +465 -0
rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
rfd3/metrics/hbonds_metrics.py +389 -0
rfd3/metrics/losses.py +325 -0
rfd3/metrics/metrics_utils.py +118 -0
rfd3/metrics/sidechain_metrics.py +349 -0
rfd3/model/RFD3.py +105 -0
rfd3/model/RFD3_diffusion_module.py +387 -0
rfd3/model/cfg_utils.py +81 -0
rfd3/model/inference_sampler.py +635 -0
rfd3/model/layers/attention.py +577 -0
rfd3/model/layers/block_utils.py +580 -0
rfd3/model/layers/blocks.py +777 -0
rfd3/model/layers/chunked_pairwise.py +377 -0
rfd3/model/layers/encoders.py +417 -0
rfd3/model/layers/layer_utils.py +197 -0
rfd3/model/layers/pairformer_layers.py +128 -0
rfd3/run_inference.py +45 -0
rfd3/testing/debug.py +139 -0
rfd3/testing/debug_utils.py +73 -0
rfd3/testing/testing_utils.py +356 -0
rfd3/train.py +194 -0
rfd3/trainer/dump_validation_structures.py +154 -0
rfd3/trainer/fabric_trainer.py +923 -0
rfd3/trainer/recycling.py +42 -0
rfd3/trainer/rfd3.py +485 -0
rfd3/trainer/trainer_utils.py +502 -0
rfd3/transforms/conditioning_base.py +508 -0
rfd3/transforms/conditioning_utils.py +200 -0
rfd3/transforms/design_transforms.py +807 -0
rfd3/transforms/dna_crop.py +523 -0
rfd3/transforms/hbonds.py +407 -0
rfd3/transforms/hbonds_hbplus.py +246 -0
rfd3/transforms/ncaa_transforms.py +153 -0
rfd3/transforms/pipelines.py +632 -0
rfd3/transforms/ppi_transforms.py +541 -0
rfd3/transforms/rasa.py +116 -0
rfd3/transforms/symmetry.py +76 -0
rfd3/transforms/training_conditions.py +552 -0
rfd3/transforms/util_transforms.py +498 -0
rfd3/transforms/virtual_atoms.py +305 -0
rfd3/utils/inference.py +648 -0
rfd3/utils/io.py +245 -0
rfd3/utils/vizualize.py +276 -0

rf3/data/pipelines.py ADDED Viewed

@@ -0,0 +1,558 @@
+from os import PathLike
+from pathlib import Path
+import numpy as np
+from atomworks.common import exists
+from atomworks.constants import (
+    AF3_EXCLUDED_LIGANDS,
+    STANDARD_AA,
+    STANDARD_DNA,
+    STANDARD_RNA,
+)
+from atomworks.enums import ChainType
+from atomworks.ml.encoding_definitions import RF2AA_ATOM36_ENCODING, AF3SequenceEncoding
+from atomworks.ml.transforms.af3_reference_molecule import (
+    GetAF3ReferenceMoleculeFeatures,
+    GroundTruthConformerPolicy,
+    RandomApplyGroundTruthConformerByChainType,
+)
+from atomworks.ml.transforms.atom_array import (
+    AddGlobalAtomIdAnnotation,
+    AddGlobalResIdAnnotation,
+    AddGlobalTokenIdAnnotation,
+    AddWithinChainInstanceResIdx,
+    AddWithinPolyResIdxAnnotation,
+    ComputeAtomToTokenMap,
+    CopyAnnotation,
+)
+from atomworks.ml.transforms.atom_frames import (
+    AddAtomFrames,
+    AddIsRealAtom,
+    AddPolymerFrameIndices,
+)
+from atomworks.ml.transforms.atom_level_embeddings import FeaturizeAtomLevelEmbeddings
+from atomworks.ml.transforms.atomize import (
+    AtomizeByCCDName,
+    FlagNonPolymersForAtomization,
+)
+from atomworks.ml.transforms.base import (
+    AddData,
+    ApplyFunction,
+    Compose,
+    ConditionalRoute,
+    ConvertToTorch,
+    Identity,
+    RandomRoute,
+    SubsetToKeys,
+)
+from atomworks.ml.transforms.bfactor_conditioned_transforms import SetOccToZeroOnBfactor
+from atomworks.ml.transforms.bonds import (
+    AddAF3TokenBondFeatures,
+)
+from atomworks.ml.transforms.cached_residue_data import (
+    LoadCachedResidueLevelData,
+    RandomSubsampleCachedConformers,
+)
+from atomworks.ml.transforms.center_random_augmentation import CenterRandomAugmentation
+from atomworks.ml.transforms.chirals import AddAF3ChiralFeatures
+from atomworks.ml.transforms.covalent_modifications import (
+    FlagAndReassignCovalentModifications,
+)
+from atomworks.ml.transforms.crop import CropContiguousLikeAF3, CropSpatialLikeAF3
+from atomworks.ml.transforms.diffusion.batch_structures import (
+    BatchStructuresForDiffusionNoising,
+)
+from atomworks.ml.transforms.diffusion.edm import SampleEDMNoise
+from atomworks.ml.transforms.encoding import (
+    EncodeAF3TokenLevelFeatures,
+    EncodeAtomArray,
+)
+from atomworks.ml.transforms.feature_aggregation.af3 import AggregateFeaturesLikeAF3
+from atomworks.ml.transforms.feature_aggregation.confidence import (
+    PackageConfidenceFeats,
+)
+from atomworks.ml.transforms.featurize_unresolved_residues import (
+    MaskPolymerResiduesWithUnresolvedFrameAtoms,
+    PlaceUnresolvedTokenAtomsOnRepresentativeAtom,
+    PlaceUnresolvedTokenOnClosestResolvedTokenInSequence,
+)
+from atomworks.ml.transforms.filters import (
+    FilterToSpecifiedPNUnits,
+    HandleUndesiredResTokens,
+    RandomlyRemoveLigands,
+    RemoveHydrogens,
+    RemoveNucleicAcidTerminalOxygen,
+    RemovePolymersWithTooFewResolvedResidues,
+    RemoveTerminalOxygen,
+    RemoveUnresolvedPNUnits,
+)
+from atomworks.ml.transforms.mirror_transform import RandomlyMirrorInputs
+from atomworks.ml.transforms.msa.msa import (
+    EncodeMSA,
+    FeaturizeMSALikeAF3,
+    FillFullMSAFromEncoded,
+    LoadPolymerMSAs,
+    PairAndMergePolymerMSAs,
+)
+from atomworks.ml.transforms.random_atomize_residues import RandomAtomizeResidues
+from atomworks.ml.transforms.rdkit_utils import GetRDKitChiralCenters
+from atomworks.ml.transforms.symmetry import FindAutomorphismsWithNetworkX
+from omegaconf import DictConfig
+from rf3.data.cyclic_transform import AddCyclicBonds
+from rf3.data.extra_xforms import CheckForNaNsInInputs
+from rf3.data.pipeline_utils import (
+    annotate_post_crop_hash,
+    annotate_pre_crop_hash,
+    build_ground_truth_distogram_transform,
+    set_to_occupancy_0_where_crop_hashes_differ,
+)
+def TrainingRoute(transform):
+    return ConditionalRoute(
+        condition_func=lambda data: data["is_inference"],
+        transform_map={True: Identity(), False: transform},
+    )
+def InferenceRoute(transform):
+    return ConditionalRoute(
+        condition_func=lambda data: data["is_inference"],
+        transform_map={False: Identity(), True: transform},
+    )
+def build_af3_transform_pipeline(
+    *,
+    # Training or inference (required)
+    is_inference: bool,  # If True, we skip cropping, etc.
+    # MSA dirs
+    protein_msa_dirs: list[dict],
+    rna_msa_dirs: list[dict],
+    # Recycles
+    n_recycles: int = 5,
+    # Crop params
+    crop_size: int = 384,
+    crop_center_cutoff_distance: float = 15.0,
+    crop_contiguous_probability: float = 0.5,
+    crop_spatial_probability: float = 0.5,
+    max_atoms_in_crop: int | None = None,
+    # Undesired res names
+    # NOTE(review): the default is the shared module-level constant itself (not a copy)
+    undesired_res_names: list[str] = AF3_EXCLUDED_LIGANDS,
+    # Conformer generation params
+    conformer_generation_timeout: float = 5.0,  # seconds
+    use_element_for_atom_names_of_atomized_tokens: bool = False,
+    # MSA parameters
+    max_msa_sequences: int = 10_000,  # Paper: 16,000, but we only have 10K stored on disk
+    n_msa: int = 10_000,  # Paper: ?? I think ~12K?
+    dense_msa: bool = True,  # True for AF3
+    add_residue_is_paired_feature: bool = False,
+    # Cache paths
+    msa_cache_dir: PathLike | str | None = None,
+    # NOTE(review): the default below is a hard-coded absolute path; presumably site-specific,
+    # so callers elsewhere likely need to pass this explicitly -- confirm
+    residue_cache_dir: PathLike
+    | str
+    | None = "/net/tukwila/lschaaf/datahub/MACE-OMOL-Jul2025/mace_embeddings",
+    # Diffusion parameters
+    sigma_data: float = 16.0,
+    diffusion_batch_size: int = 48,
+    # Whether to include features for confidence head
+    run_confidence_head: bool = False,
+    return_atom_array: bool = True,
+    # DNA
+    pad_dna_p_skip: float = 0.0,  # Only referenced by the commented-out PadDNA step in the body
+    b_factor_min: float | None = None,
+    b_factor_max: float | None = None,
+    # ------ Atom-level conditioning ------ #
+    p_unconditional: float = 1.0,  # Show no conditioning, anywhere (i.e., unconditional)
+    # NOTE(review): mutable dict default, shared across calls; it is only forwarded (not mutated) here
+    template_noise_scales: dict | DictConfig = {
+        "atomized": 1e-5,  # No noise (for atomized tokens)
+        "not_atomized": 0.2,  # Up to 0.2A of noise (for non-atomized tokens)
+    },
+    allowed_chain_types_for_conditioning: list[int | str | ChainType]
+    | None = ChainType.get_all_types(),  # All chain types (None = no conditioning)
+    p_condition_per_token: float = 0.0,  # When sampling with conditions, X% of tokens are conditioned (e.g., X^2% of pairs have conditions)
+    p_provide_inter_molecule_distances: float = 0.0,  # When sampling with conditions, X% of the time, show any inter-molecule distances
+    # (Reference Conformer)
+    p_give_non_polymer_ref_conf: float = 0.0,  # When sampling with conditions, X% of non-polymer chains get a ground-truth reference conformer
+    p_give_polymer_ref_conf: float = 0.0,  # When sampling with conditions, X% of polymer chains get a ground-truth reference conformer
+    # -------------------------------------- #
+    take_first_chiral_subordering: bool = False,
+    mirror_prob: float = 0.0,
+    input_contains_explicit_msa: bool = False,  # NOTE(review): not referenced in this function body
+    atomization_prob: float = 0.0,
+    ligand_dropout_prob: float = 0.0,
+    raise_if_missing_msa_for_protein_of_length_n: int | None = None,
+    mask_crop_edges: bool = False,
+    p_dropout_atom_level_embeddings: float = 0.0,
+    embedding_dim: int = 384,
+    n_conformers: int = 8,
+    add_cyclic_bonds: bool = True,
+    metrics_tags: list[str] | set[str] | None = None,
+    p_dropout_ref_conf: float = 0.0,  # Unused
+):
+    """Build the AF3 pipeline with specified parameters.
+    This function constructs a pipeline of transforms for processing protein structures
+    in a manner similar to AlphaFold 3. The pipeline includes steps for removing hydrogens,
+    adding annotations, atomizing residues, cropping, adding templates, encoding features,
+    and generating reference molecule features.
+    All parameters are keyword-only. Parameters that are not described below are passed to the
+    transforms constructed in the body.
+    Args:
+        is_inference (bool): Stored in the data dictionary (via ``AddData``) and used to route
+            between training and inference branches. When True, the random mirroring, random
+            atomization, ligand dropout and cropping steps are routed to ``Identity()``, and
+            transforms wrapped in ``TrainingRoute`` are routed to ``Identity()`` as well.
+        protein_msa_dirs (list[dict]): Passed to ``LoadPolymerMSAs``.
+        rna_msa_dirs (list[dict]): Passed to ``LoadPolymerMSAs``.
+        n_recycles (int, optional): Passed to ``FeaturizeMSALikeAF3``. Defaults to 5.
+        crop_size (int, optional): The size of the crop. Defaults to 384. If None, the cropping
+            transform is ``Identity()``.
+        crop_center_cutoff_distance (float, optional): The cutoff distance for spatial cropping.
+            Defaults to 15.0.
+        crop_contiguous_probability (float, optional): The probability of using contiguous cropping.
+            Defaults to 0.5.
+        crop_spatial_probability (float, optional): The probability of using spatial cropping.
+            Defaults to 0.5.
+        max_atoms_in_crop (int | None, optional): Passed to both cropping transforms. Defaults to None.
+        undesired_res_names (list[str], optional): Residue names passed to ``HandleUndesiredResTokens``
+            on the training branch; the inference branch always uses ``["UNX"]`` instead.
+            Defaults to ``AF3_EXCLUDED_LIGANDS``.
+        conformer_generation_timeout (float, optional): The timeout for conformer generation in seconds.
+            Defaults to 5.0.
+        msa_cache_dir (PathLike | str | None, optional): Converted to ``Path`` and passed to
+            ``LoadPolymerMSAs`` when ``exists(msa_cache_dir)`` holds; otherwise None is passed.
+        residue_cache_dir (PathLike | str | None, optional): Converted to ``Path`` and passed to
+            ``LoadCachedResidueLevelData`` when ``exists(residue_cache_dir)`` holds; otherwise
+            None is passed. The default is a hard-coded absolute path.
+        run_confidence_head (bool, optional): Stored in the data dictionary; when True the
+            confidence-feature transforms are routed in and ``"confidence_feats"`` is kept in the
+            output. Defaults to False.
+        return_atom_array (bool, optional): If True, ``"atom_array"`` is kept in the output.
+            Defaults to True.
+        mask_crop_edges (bool, optional): If True, adds a training-only step that applies
+            ``set_to_occupancy_0_where_crop_hashes_differ``. Defaults to False.
+        add_cyclic_bonds (bool, optional): If True, adds ``AddCyclicBonds()``. Defaults to True.
+        metrics_tags (list[str] | set[str] | None, optional): Tags to use for determining which Metrics apply.
+            Defaults to None (tags not added).
+        pad_dna_p_skip (float, optional): Only referenced by a commented-out step; currently has no effect.
+        input_contains_explicit_msa (bool, optional): Not referenced in the function body.
+        p_dropout_ref_conf (float, optional): Unused.
+    Returns:
+        Transform: A composed pipeline of transforms.
+    Raises:
+        AssertionError: If the crop probabilities do not sum to 1.0, if the crop size is not positive,
+        or if the crop center cutoff distance is not positive. These checks only run when
+        ``is_inference`` is False and at least one crop probability is greater than 0.
+    Note:
+        The cropping method is chosen randomly based on the provided probabilities.
+        The pipeline includes steps for processing the structure, adding annotations,
+        and generating features required for AF3-like predictions.
+    References:
+        - AlphaFold 3 Supplementary Information.
+          https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07487-w/MediaObjects/41586_2024_7487_MOESM1_ESM.pdf
+    """
+    # Crop settings are only validated for training, and only if some crop probability is non-zero.
+    # NOTE(review): `crop_size > 0` is evaluated here, so a None crop_size would fail in this branch
+    # before reaching the `crop_size is not None` guard further down -- confirm this is intended.
+    if (
+        crop_contiguous_probability > 0 or crop_spatial_probability > 0
+    ) and not is_inference:
+        assert np.isclose(
+            crop_contiguous_probability + crop_spatial_probability, 1.0, atol=1e-6
+        ), "Crop probabilities must sum to 1.0"
+        assert crop_size > 0, "Crop size must be greater than 0"
+        assert (
+            crop_center_cutoff_distance > 0
+        ), "Crop center cutoff distance must be greater than 0"
+    # Two encodings: the AF3 one for token/MSA features, the RF2AA atom36 one for the confidence transforms
+    af3_sequence_encoding = AF3SequenceEncoding()
+    rf2aa_sequence_encoding = RF2AA_ATOM36_ENCODING
+    transforms = [
+        # Record the routing flags first; every later ConditionalRoute reads them from the data dict
+        AddData(
+            {"is_inference": is_inference, "run_confidence_head": run_confidence_head}
+        ),
+        # ... unconditional vs. conditional
+        TrainingRoute(
+            RandomRoute(
+                transforms=[
+                    AddData({"is_unconditional": True}),
+                    AddData({"is_unconditional": False}),
+                ],
+                probs=[p_unconditional, 1 - p_unconditional],
+            ),
+        ),
+        RemoveHydrogens(),
+        TrainingRoute(
+            FilterToSpecifiedPNUnits(
+                extra_info_key_with_pn_unit_iids_to_keep="all_pn_unit_iids_after_processing"
+            ),
+        ),
+    ]
+    if exists(metrics_tags):
+        transforms.append(AddData({"metrics_tags": metrics_tags}))
+    # Training-only random mirroring (Identity at inference)
+    transforms.append(
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                True: Identity(),
+                False: RandomlyMirrorInputs(mirror_prob),
+            },
+        )
+    )
+    transforms += [
+        RemoveTerminalOxygen(),
+        TrainingRoute(
+            SetOccToZeroOnBfactor(b_factor_min, b_factor_max),
+        ),
+        TrainingRoute(RemoveUnresolvedPNUnits()),
+        RemovePolymersWithTooFewResolvedResidues(min_residues=4),
+        MaskPolymerResiduesWithUnresolvedFrameAtoms(),
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                # UNX causes RDKit to crash (element is "X"), so we exclude even at inference
+                True: HandleUndesiredResTokens(undesired_res_tokens=["UNX"]),
+                False: HandleUndesiredResTokens(
+                    undesired_res_tokens=undesired_res_names
+                ),
+            },
+        ),
+        # NOTE: this is used in training to pad DNA sequences, but we don't use it in inference
+        # TrainingRoute(
+        # PadDNA(p_skip=pad_dna_p_skip),
+        # ),
+        FlagAndReassignCovalentModifications(),
+        FlagNonPolymersForAtomization(),
+    ]
+    # Training-only random atomization of residues (Identity at inference)
+    transforms.append(
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                True: Identity(),
+                False: RandomAtomizeResidues(atomization_prob),
+            },
+        )
+    )
+    # Training-only random ligand dropout (Identity at inference)
+    transforms.append(
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                True: Identity(),
+                False: RandomlyRemoveLigands(ligand_dropout_prob),
+            },
+        )
+    )
+    transforms += [
+        AddGlobalAtomIdAnnotation(),
+        AtomizeByCCDName(
+            atomize_by_default=True,
+            res_names_to_ignore=STANDARD_AA + STANDARD_RNA + STANDARD_DNA,
+            move_atomized_part_to_end=False,
+            validate_atomize=False,
+        ),
+        RemoveNucleicAcidTerminalOxygen(),
+        AddWithinChainInstanceResIdx(),
+        AddWithinPolyResIdxAnnotation(),
+    ]
+    # Crop
+    # ... crop around our query pn_unit(s) early, since we don't need the full structure moving forward
+    cropping_transform = Identity()
+    if crop_size is not None:
+        cropping_transform = RandomRoute(
+            transforms=[
+                CropContiguousLikeAF3(
+                    crop_size=crop_size,
+                    keep_uncropped_atom_array=True,
+                    max_atoms_in_crop=max_atoms_in_crop,
+                ),
+                CropSpatialLikeAF3(
+                    crop_size=crop_size,
+                    crop_center_cutoff_distance=crop_center_cutoff_distance,
+                    keep_uncropped_atom_array=True,
+                    max_atoms_in_crop=max_atoms_in_crop,
+                ),
+            ],
+            probs=[crop_contiguous_probability, crop_spatial_probability],
+        )
+    # The pre-/post-crop hash annotations bracket the crop; they are consumed below when `mask_crop_edges` is set
+    transforms += [
+        TrainingRoute(ApplyFunction(annotate_pre_crop_hash)),
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                True: Identity(),
+                False: cropping_transform,
+                # Default to Identity during inference (`is_inference == True`)
+            },
+        ),
+        TrainingRoute(ApplyFunction(annotate_post_crop_hash)),
+    ]
+    if mask_crop_edges:
+        transforms += [
+            TrainingRoute(ApplyFunction(set_to_occupancy_0_where_crop_hashes_differ)),
+        ]
+    # +-----------------------------------------------------------+
+    # +------------------ GROUND TRUTH TEMPLATE ------------------+
+    # +-----------------------------------------------------------+
+    # Ground truth template noising (for training)
+    transforms.append(
+        build_ground_truth_distogram_transform(
+            template_noise_scales=template_noise_scales,
+            allowed_chain_types_for_conditioning=allowed_chain_types_for_conditioning,
+            p_condition_per_token=p_condition_per_token,
+            p_provide_inter_molecule_distances=p_provide_inter_molecule_distances,
+            is_inference=is_inference,
+        )
+    )
+    # +----------------------------------------------------------------------+
+    # +------------------ GROUND TRUTH REFERENCE CONFORMER ------------------+
+    # +----------------------------------------------------------------------+
+    # NOTE(review): unlike the steps above, this transform is not wrapped in a training/inference route
+    transforms.append(
+        RandomApplyGroundTruthConformerByChainType(
+            chain_type_probabilities={
+                tuple(ChainType.get_polymers()): p_give_polymer_ref_conf,
+                tuple(ChainType.get_non_polymers()): p_give_non_polymer_ref_conf,
+            },
+            policy=GroundTruthConformerPolicy.ADD,
+        )
+    )
+    transforms += [
+        AddGlobalTokenIdAnnotation(),  # required for reference molecule features and TokenToAtomMap
+        AddGlobalResIdAnnotation(),
+        LoadCachedResidueLevelData(
+            dir=Path(residue_cache_dir) if exists(residue_cache_dir) else None,
+            sharding_depth=1,
+        ),
+        RandomSubsampleCachedConformers(n_conformers=n_conformers),
+        EncodeAF3TokenLevelFeatures(sequence_encoding=af3_sequence_encoding),
+        GetAF3ReferenceMoleculeFeatures(
+            conformer_generation_timeout=conformer_generation_timeout,
+            use_element_for_atom_names_of_atomized_tokens=use_element_for_atom_names_of_atomized_tokens,
+        ),
+        FeaturizeAtomLevelEmbeddings(
+            mask_rdkit_conformers=False,
+            p_dropout_atom_level_embeddings=p_dropout_atom_level_embeddings,
+            embedding_dim=embedding_dim,
+            n_conformers=n_conformers,
+        ),
+        FindAutomorphismsWithNetworkX(),  # Adds the "automorphisms" key to the data dictionary
+        ComputeAtomToTokenMap(),
+        GetRDKitChiralCenters(),
+        AddAF3ChiralFeatures(
+            take_first_chiral_subordering=take_first_chiral_subordering
+        ),
+    ]
+    transforms += [
+        # ... load and pair MSAs
+        LoadPolymerMSAs(
+            protein_msa_dirs=protein_msa_dirs,
+            rna_msa_dirs=rna_msa_dirs,
+            max_msa_sequences=max_msa_sequences,  # maximum number of sequences to load (we later subsample further)
+            msa_cache_dir=Path(msa_cache_dir) if exists(msa_cache_dir) else None,
+            use_paths_in_chain_info=True,  # if there are paths specified in the `chain_info` for a given chain, use them
+            raise_if_missing_msa_for_protein_of_length_n=raise_if_missing_msa_for_protein_of_length_n,
+        ),
+        PairAndMergePolymerMSAs(
+            dense=dense_msa, add_residue_is_paired_feature=add_residue_is_paired_feature
+        ),
+    ]
+    transforms += [
+        # ... encode MSA to AF-3 format
+        EncodeMSA(
+            encoding=af3_sequence_encoding,
+            token_to_use_for_gap=af3_sequence_encoding.token_to_idx["<G>"],
+        ),
+        # ... fill MSA, indexing into only the portions of the polymers that are present in the cropped structure
+        FillFullMSAFromEncoded(
+            pad_token=af3_sequence_encoding.token_to_idx["<G>"],
+            add_residue_is_paired_feature=add_residue_is_paired_feature,
+        ),
+        # Token bond features: inference passes np.inf as the positional argument, training uses the default
+        ConditionalRoute(
+            condition_func=lambda data: data.get("is_inference", False),
+            transform_map={
+                True: AddAF3TokenBondFeatures(np.inf),
+                False: AddAF3TokenBondFeatures(),
+            },
+        ),
+    ]
+    if add_cyclic_bonds:
+        transforms += [
+            AddCyclicBonds(),
+        ]
+    transforms += [
+        # ... featurize MSA
+        ConvertToTorch(
+            keys=[
+                "encoded",
+                "feats",
+                "full_msa_details",
+            ]
+        ),
+        FeaturizeMSALikeAF3(
+            encoding=af3_sequence_encoding,
+            n_recycles=n_recycles,
+            n_msa=n_msa,
+        ),
+        # Prepare coordinates for noising (without modifying the ground truth)
+        # ... add placeholder coordinates for noising
+        CopyAnnotation(annotation_to_copy="coord", new_annotation="coord_to_be_noised"),
+        # ... handling of unresolved residues (note that these Transforms create the "atom_array_to_noise" dictionary, if not already present)
+        PlaceUnresolvedTokenAtomsOnRepresentativeAtom(
+            annotation_to_update="coord_to_be_noised"
+        ),
+        PlaceUnresolvedTokenOnClosestResolvedTokenInSequence(
+            annotation_to_update="coord_to_be_noised",
+            annotation_to_copy="coord_to_be_noised",
+        ),
+        # Feature aggregation
+        AggregateFeaturesLikeAF3(),
+        # ... batching and noise sampling for diffusion
+        BatchStructuresForDiffusionNoising(batch_size=diffusion_batch_size),
+        CenterRandomAugmentation(batch_size=diffusion_batch_size),
+        SampleEDMNoise(
+            sigma_data=sigma_data, diffusion_batch_size=diffusion_batch_size
+        ),
+        CheckForNaNsInInputs(),
+    ]
+    # Built unconditionally, but only routed in when the data dict has `run_confidence_head` set to True
+    confidence_transforms = Compose(
+        [
+            # Additions required for confidence calculation
+            EncodeAtomArray(rf2aa_sequence_encoding),
+            AddAtomFrames(),
+            AddIsRealAtom(rf2aa_sequence_encoding),
+            AddPolymerFrameIndices(),
+            # wrap it all together
+            PackageConfidenceFeats(),
+        ]
+    )
+    transforms.append(
+        ConditionalRoute(
+            condition_func=lambda data: data.get("run_confidence_head", False),
+            transform_map={
+                True: confidence_transforms,
+                False: Identity(),
+            },
+        )
+    )
+    # Keys retained in the final output; optional keys are appended below based on the flags
+    keys_to_keep = [
+        "example_id",
+        "feats",
+        "t",
+        "noise",
+        "ground_truth",
+        "coord_atom_lvl_to_be_noised",
+        "automorphisms",
+        "symmetry_resolution",
+        "extra_info",
+    ]
+    if run_confidence_head:
+        keys_to_keep.append("confidence_feats")
+    if return_atom_array:  # and is_inference:
+        keys_to_keep.append("atom_array")
+    transforms += [
+        # Subset to only keys necessary
+        SubsetToKeys(keys_to_keep)
+    ]
+    # ... compose final pipeline
+    pipeline = Compose(transforms)
+    return pipeline