rc-foundry 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry/__init__.py +57 -0
- foundry/callbacks/__init__.py +5 -0
- foundry/callbacks/callback.py +116 -0
- foundry/callbacks/health_logging.py +419 -0
- foundry/callbacks/metrics_logging.py +211 -0
- foundry/callbacks/timing_logging.py +67 -0
- foundry/callbacks/train_logging.py +278 -0
- foundry/common.py +108 -0
- foundry/constants.py +28 -0
- foundry/hydra/resolvers.py +77 -0
- foundry/inference_engines/base.py +235 -0
- foundry/inference_engines/checkpoint_registry.py +66 -0
- foundry/metrics/__init__.py +12 -0
- foundry/metrics/losses.py +30 -0
- foundry/metrics/metric.py +319 -0
- foundry/model/layers/blocks.py +47 -0
- foundry/testing/__init__.py +6 -0
- foundry/testing/fixtures.py +19 -0
- foundry/testing/pytest_hooks.py +15 -0
- foundry/trainers/fabric.py +923 -0
- foundry/training/EMA.py +67 -0
- foundry/training/checkpoint.py +61 -0
- foundry/training/schedulers.py +91 -0
- foundry/utils/alignment.py +86 -0
- foundry/utils/components.py +415 -0
- foundry/utils/datasets.py +405 -0
- foundry/utils/ddp.py +103 -0
- foundry/utils/instantiators.py +72 -0
- foundry/utils/logging.py +279 -0
- foundry/utils/rigid.py +1460 -0
- foundry/utils/rotation_augmentation.py +65 -0
- foundry/utils/squashfs.py +172 -0
- foundry/utils/torch.py +317 -0
- foundry/utils/weights.py +271 -0
- foundry/version.py +34 -0
- foundry_cli/__init__.py +3 -0
- foundry_cli/download_checkpoints.py +281 -0
- mpnn/__init__.py +1 -0
- mpnn/collate/feature_collator.py +265 -0
- mpnn/inference.py +53 -0
- mpnn/inference_engines/mpnn.py +549 -0
- mpnn/loss/nll_loss.py +122 -0
- mpnn/metrics/nll.py +369 -0
- mpnn/metrics/sequence_recovery.py +440 -0
- mpnn/model/layers/graph_embeddings.py +2372 -0
- mpnn/model/layers/message_passing.py +332 -0
- mpnn/model/layers/position_wise_feed_forward.py +44 -0
- mpnn/model/layers/positional_encoding.py +98 -0
- mpnn/model/mpnn.py +2632 -0
- mpnn/pipelines/mpnn.py +162 -0
- mpnn/samplers/samplers.py +167 -0
- mpnn/train.py +341 -0
- mpnn/trainers/mpnn.py +193 -0
- mpnn/transforms/feature_aggregation/mpnn.py +184 -0
- mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
- mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
- mpnn/transforms/feature_aggregation/user_settings.py +347 -0
- mpnn/transforms/polymer_ligand_interface.py +164 -0
- mpnn/utils/inference.py +2397 -0
- mpnn/utils/probability.py +37 -0
- mpnn/utils/weights.py +309 -0
- rc_foundry-0.1.1.dist-info/METADATA +239 -0
- rc_foundry-0.1.1.dist-info/RECORD +180 -0
- rc_foundry-0.1.1.dist-info/WHEEL +4 -0
- rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
- rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
- rf3/__init__.py +3 -0
- rf3/_version.py +33 -0
- rf3/alignment.py +79 -0
- rf3/callbacks/dump_validation_structures.py +101 -0
- rf3/callbacks/metrics_logging.py +324 -0
- rf3/chemical.py +1529 -0
- rf3/cli.py +77 -0
- rf3/data/cyclic_transform.py +78 -0
- rf3/data/extra_xforms.py +36 -0
- rf3/data/ground_truth_template.py +463 -0
- rf3/data/paired_msa.py +206 -0
- rf3/data/pipeline_utils.py +128 -0
- rf3/data/pipelines.py +558 -0
- rf3/diffusion_samplers/inference_sampler.py +222 -0
- rf3/inference.py +65 -0
- rf3/inference_engines/__init__.py +5 -0
- rf3/inference_engines/rf3.py +735 -0
- rf3/kinematics.py +354 -0
- rf3/loss/af3_confidence_loss.py +515 -0
- rf3/loss/af3_losses.py +655 -0
- rf3/loss/loss.py +179 -0
- rf3/metrics/chiral.py +179 -0
- rf3/metrics/clashing_chains.py +68 -0
- rf3/metrics/distogram.py +421 -0
- rf3/metrics/lddt.py +523 -0
- rf3/metrics/metadata.py +43 -0
- rf3/metrics/metric_utils.py +192 -0
- rf3/metrics/predicted_error.py +134 -0
- rf3/metrics/rasa.py +108 -0
- rf3/metrics/selected_distances.py +91 -0
- rf3/model/RF3.py +527 -0
- rf3/model/RF3_blocks.py +92 -0
- rf3/model/RF3_structure.py +303 -0
- rf3/model/layers/af3_auxiliary_heads.py +255 -0
- rf3/model/layers/af3_diffusion_transformer.py +544 -0
- rf3/model/layers/attention.py +313 -0
- rf3/model/layers/layer_utils.py +127 -0
- rf3/model/layers/mlff.py +118 -0
- rf3/model/layers/outer_product.py +59 -0
- rf3/model/layers/pairformer_layers.py +783 -0
- rf3/model/layers/structure_bias.py +56 -0
- rf3/scoring.py +1787 -0
- rf3/symmetry/resolve.py +284 -0
- rf3/train.py +194 -0
- rf3/trainers/rf3.py +570 -0
- rf3/util_module.py +47 -0
- rf3/utils/frames.py +109 -0
- rf3/utils/inference.py +665 -0
- rf3/utils/io.py +198 -0
- rf3/utils/loss.py +72 -0
- rf3/utils/predict_and_score.py +165 -0
- rf3/utils/predicted_error.py +673 -0
- rf3/utils/recycling.py +42 -0
- rf3/validate.py +140 -0
- rfd3/.gitignore +7 -0
- rfd3/Makefile +76 -0
- rfd3/__init__.py +12 -0
- rfd3/callbacks.py +66 -0
- rfd3/cli.py +41 -0
- rfd3/constants.py +212 -0
- rfd3/engine.py +543 -0
- rfd3/inference/datasets.py +193 -0
- rfd3/inference/input_parsing.py +1123 -0
- rfd3/inference/legacy_input_parsing.py +717 -0
- rfd3/inference/parsing.py +165 -0
- rfd3/inference/symmetry/atom_array.py +298 -0
- rfd3/inference/symmetry/checks.py +241 -0
- rfd3/inference/symmetry/contigs.py +63 -0
- rfd3/inference/symmetry/frames.py +355 -0
- rfd3/inference/symmetry/symmetry_utils.py +398 -0
- rfd3/metrics/design_metrics.py +465 -0
- rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
- rfd3/metrics/hbonds_metrics.py +389 -0
- rfd3/metrics/losses.py +325 -0
- rfd3/metrics/metrics_utils.py +118 -0
- rfd3/metrics/sidechain_metrics.py +349 -0
- rfd3/model/RFD3.py +105 -0
- rfd3/model/RFD3_diffusion_module.py +387 -0
- rfd3/model/cfg_utils.py +81 -0
- rfd3/model/inference_sampler.py +635 -0
- rfd3/model/layers/attention.py +577 -0
- rfd3/model/layers/block_utils.py +580 -0
- rfd3/model/layers/blocks.py +777 -0
- rfd3/model/layers/chunked_pairwise.py +377 -0
- rfd3/model/layers/encoders.py +417 -0
- rfd3/model/layers/layer_utils.py +197 -0
- rfd3/model/layers/pairformer_layers.py +128 -0
- rfd3/run_inference.py +45 -0
- rfd3/testing/debug.py +139 -0
- rfd3/testing/debug_utils.py +73 -0
- rfd3/testing/testing_utils.py +356 -0
- rfd3/train.py +194 -0
- rfd3/trainer/dump_validation_structures.py +154 -0
- rfd3/trainer/fabric_trainer.py +923 -0
- rfd3/trainer/recycling.py +42 -0
- rfd3/trainer/rfd3.py +485 -0
- rfd3/trainer/trainer_utils.py +502 -0
- rfd3/transforms/conditioning_base.py +508 -0
- rfd3/transforms/conditioning_utils.py +200 -0
- rfd3/transforms/design_transforms.py +807 -0
- rfd3/transforms/dna_crop.py +523 -0
- rfd3/transforms/hbonds.py +407 -0
- rfd3/transforms/hbonds_hbplus.py +246 -0
- rfd3/transforms/ncaa_transforms.py +153 -0
- rfd3/transforms/pipelines.py +632 -0
- rfd3/transforms/ppi_transforms.py +541 -0
- rfd3/transforms/rasa.py +116 -0
- rfd3/transforms/symmetry.py +76 -0
- rfd3/transforms/training_conditions.py +552 -0
- rfd3/transforms/util_transforms.py +498 -0
- rfd3/transforms/virtual_atoms.py +305 -0
- rfd3/utils/inference.py +648 -0
- rfd3/utils/io.py +245 -0
- rfd3/utils/vizualize.py +276 -0
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
# see atomworks.ml.transforms.feature_aggregation
|
|
2
|
+
import time
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import torch
|
|
7
|
+
import torch.nn.functional as F
|
|
8
|
+
from atomworks.constants import STANDARD_AA
|
|
9
|
+
from atomworks.enums import ChainTypeInfo
|
|
10
|
+
from atomworks.io.utils.sequence import (
|
|
11
|
+
is_purine,
|
|
12
|
+
is_pyrimidine,
|
|
13
|
+
)
|
|
14
|
+
from atomworks.ml.encoding_definitions import AF3SequenceEncoding
|
|
15
|
+
from atomworks.ml.transforms._checks import (
|
|
16
|
+
check_atom_array_annotation,
|
|
17
|
+
check_contains_keys,
|
|
18
|
+
check_is_instance,
|
|
19
|
+
)
|
|
20
|
+
from atomworks.ml.transforms.atom_array import get_within_entity_idx
|
|
21
|
+
from atomworks.ml.transforms.base import Transform
|
|
22
|
+
from atomworks.ml.utils.token import (
|
|
23
|
+
get_token_count,
|
|
24
|
+
get_token_starts,
|
|
25
|
+
is_glycine,
|
|
26
|
+
is_protein_unknown,
|
|
27
|
+
is_standard_aa_not_glycine,
|
|
28
|
+
is_unknown_nucleotide,
|
|
29
|
+
spread_token_wise,
|
|
30
|
+
)
|
|
31
|
+
from biotite.structure import AtomArray
|
|
32
|
+
|
|
33
|
+
# Module-level AF3 sequence-encoding instance shared by the helpers in this file.
af3_sequence_encoding = AF3SequenceEncoding()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def assert_single_representative(token, central_atom="CB"):
    """Assert that *token* contains exactly one AF3 representative atom.

    Args:
        token: Atom array spanning a single token.
        central_atom: Atom name used as the representative for standard
            (non-glycine) amino acids.

    Raises:
        AssertionError: If the representative-atom mask does not select
            exactly one atom.
    """
    mask = get_af3_token_representative_masks(token, central_atom=central_atom)
    # Bug fix: the message previously hard-coded "(CB)" even when a different
    # central_atom was requested; report the actual atom name instead.
    assert (
        np.sum(mask) == 1
    ), f"No representative atom ({central_atom}) found. mask: {mask}\nToken: {token}"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def assert_single_token(token):
    """Assert that *token* is a single token with a single representative atom."""
    n_tokens = get_token_count(token)
    assert n_tokens == 1, f"Token is not a single token: {token}"
    assert_single_representative(token)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def add_representative_atom(token, central_atom="CB"):
    """Ensure *token* has a representative atom, atomizing its first atom if needed.

    If the AF3 representative mask already selects exactly one atom the token is
    returned unchanged; otherwise the first atom is flagged via the ``atomize``
    annotation so it becomes the representative.
    """
    rep_mask = get_af3_token_representative_masks(token, central_atom=central_atom)
    if rep_mask.sum() == 1:
        return token
    n_atoms = token.array_length()
    # Mark only the first atom as atomized (True, then all False).
    token.atomize = np.array([True] + [False] * (n_atoms - 1), dtype=bool)
    assert_single_representative(token)
    return token
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class TimerWrapper(Transform):
    """Wraps another transform and prints its wall-clock runtime on each call."""

    def __init__(self, transform):
        self.transform = transform

    def check_input(self, *args, **kwargs):
        # No requirements of its own; input validation is delegated to the
        # wrapped transform when it runs.
        pass

    def forward(self, data):
        start = time.time()
        data = self.transform.forward(data)
        print(f"Time taken: {time.time() - start} s || Transform: {self.transform}")
        return data
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class IPDB(Transform):
    """Debugging transform: drops into an ipdb shell mid-pipeline.

    Leaves ``data`` untouched; only useful during interactive development.
    """

    def forward(self, data):
        # Bind the atom array to a local so it is convenient to inspect
        # from the debugger prompt.
        aa = data["atom_array"]  # noqa
        # Imported lazily so ipdb is only required when this transform runs.
        import ipdb

        ipdb.set_trace()
        return data
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# NOTE(review): duplicates `af3_sequence_encoding` defined earlier in this
# module — consider reusing a single instance.
sequence_encoding = AF3SequenceEncoding()

# Residue-name vocabularies per molecule class, derived from the AF3 encoding.
_aa_like_res_names = sequence_encoding.all_res_names[sequence_encoding.is_aa_like]
_rna_like_res_names = sequence_encoding.all_res_names[sequence_encoding.is_rna_like]
_dna_like_res_names = sequence_encoding.all_res_names[sequence_encoding.is_dna_like]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class AssignTypes(Transform):
    """Annotate every atom with molecule-class flags via the AF3 sequence encoding.

    Delegates the actual annotation work to :func:`assign_types_`.
    """

    def check_input(self, data):
        assert "atom_array" in data, "Input data must contain 'atom_array'."

    def forward(self, data):
        atom_array = data["atom_array"]
        data["atom_array"] = assign_types_(atom_array)
        return data
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def assign_types_(atom_array):
    """Set per-atom boolean molecule-class annotations on *atom_array* in place.

    Adds ``is_protein``, ``is_rna``, ``is_dna``, ``is_ligand`` (anything that is
    none of the former three) and ``is_residue`` (standard amino acids), each
    computed at token level and spread to every atom of the token.

    Returns:
        The same ``AtomArray`` with the annotations attached.
    """
    token_starts = get_token_starts(atom_array)
    token_res_names = atom_array[token_starts].res_name

    # Map every atom to its token index so token-level masks can be spread atom-wise.
    token_idx = np.arange(get_token_count(atom_array), dtype=np.uint32)  # [n_tokens]
    atom_to_token = spread_token_wise(atom_array, token_idx)

    is_protein = np.isin(token_res_names, _aa_like_res_names)
    is_rna = np.isin(token_res_names, _rna_like_res_names)
    is_dna = np.isin(token_res_names, _dna_like_res_names)

    token_masks = {
        "is_protein": is_protein,
        "is_rna": is_rna,
        "is_dna": is_dna,
        # A ligand is anything that is not protein, RNA, or DNA.
        "is_ligand": ~(is_protein | is_rna | is_dna),
        "is_residue": np.isin(token_res_names, STANDARD_AA),
    }
    for annotation_name, token_mask in token_masks.items():
        atom_array.set_annotation(annotation_name, token_mask[atom_to_token])

    return atom_array
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class AggregateFeaturesLikeAF3WithoutMSA(Transform):
    """
    Exactly like AggregateFeaturesLikeAF3 but without MSAs

    Removed comments for readability, no additional code is in this function, just removed msa parts
    """

    # Transforms that must have run earlier in the pipeline.
    requires_previous_transforms = [
        "AtomizeByCCDName",
        "EncodeAF3TokenLevelFeatures",
        "AddAF3TokenBondFeatures",
        "UnindexFlaggedTokens",
    ]
    # Aggregation must not run twice (in either variant).
    incompatible_previous_transforms = [
        "AggregateFeaturesLikeAF3",
        "AggregateFeaturesLikeAF3WithoutMSA",
    ]

    def check_input(self, data) -> None:
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(
            data, ["coord_to_be_noised", "chain_iid", "occupancy"]
        )

    def forward(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Aggregates features into the format expected by AlphaFold 3.

        This method processes the input data, combining MSA features, ground truth
        structures, and other relevant information into a standardized format.

        Args:
            data (Dict[str, Any]): The input data dictionary containing MSA features,
                atom array, and other relevant information.

        Returns:
            Dict[str, Any]: The processed data dictionary with aggregated features.
        """
        # Initialize feats dictionary if not present
        if "feats" not in data:
            data["feats"] = {}

        # One-hot encode reference features prepared by earlier transforms.
        # NOTE(review): assumes data["feats"] already contains integer-coded
        # "ref_atom_name_chars" and "ref_element" tensors — produced upstream.
        data["feats"]["ref_atom_name_chars"] = F.one_hot(
            data["feats"]["ref_atom_name_chars"].long(), num_classes=64
        ).float()
        data["feats"]["ref_element"] = F.one_hot(
            data["feats"]["ref_element"].long(), num_classes=128
        ).float()
        # Replace NaNs in reference positions with zeros.
        data["feats"]["ref_pos"] = torch.nan_to_num(data["feats"]["ref_pos"], nan=0.0)

        # Process ground truth structure
        atom_array = data["atom_array"]

        coord_atom_lvl = atom_array.coord
        # An atom counts as resolved when its occupancy is strictly positive.
        mask_atom_lvl = atom_array.occupancy > 0.0
        token_starts = get_token_starts(atom_array)
        token_level_array = atom_array[token_starts]
        chain_iid_token_lvl = token_level_array.chain_iid
        if "ground_truth" not in data:
            data["ground_truth"] = {}

        # update() preserves any pre-existing ground-truth entries not set here.
        data["ground_truth"].update(
            {
                "coord_atom_lvl": torch.tensor(coord_atom_lvl),  # [n_atoms, 3]
                "mask_atom_lvl": torch.tensor(mask_atom_lvl),  # [n_atoms]
                "chain_iid_token_lvl": chain_iid_token_lvl,  # numpy.ndarray of strings with shape (n_tokens,)
                # Defaults to all-False when the annotation was never produced.
                "is_original_unindexed_token": torch.from_numpy(
                    data["ground_truth"].get(
                        "is_original_unindexed_token",
                        np.zeros(len(token_starts), dtype=bool),
                    )
                ).bool(),  # [n_tokens]
            }
        )
        data["coord_atom_lvl_to_be_noised"] = torch.tensor(
            atom_array.coord_to_be_noised
        )

        # Remove any token bond features relating to unindexed tokens
        if "token_bonds" in data["feats"]:
            token_bonds = data["feats"]["token_bonds"]
            mask = data["feats"]["is_motif_token_unindexed"]

            # tokens bonded to unindexed & unindexed bonded to tokens
            # (mutates the token_bonds matrix in place)
            token_bonds[mask, :] = False
            token_bonds[:, mask] = False

        # Add partial t during inference
        if "partial_t" in atom_array.get_annotation_categories():
            assert data["is_inference"], "Partial diffusion only inference!"
            data["feats"]["partial_t"] = torch.from_numpy(
                atom_array.get_annotation("partial_t")
            )

        return data
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def add_backbone_and_sidechain_annotations(atom_array: AtomArray) -> AtomArray:
    """Annotate each atom as backbone or sidechain.

    Backbone = protein N/CA/C/O atoms, plus every atomized atom.
    Sidechain = protein atoms that are neither backbone nor atomized.

    Args:
        atom_array (AtomArray): The AtomArray to annotate (modified in place).

    Returns:
        AtomArray: The same array with ``is_backbone`` / ``is_sidechain`` set.
    """
    atomized = atom_array.atomize
    is_protein = np.isin(atom_array.chain_type, ChainTypeInfo.PROTEINS)
    has_backbone_name = np.isin(atom_array.atom_name, ["N", "CA", "C", "O"])

    backbone_mask = (has_backbone_name & is_protein) | atomized
    sidechain_mask = is_protein & ~backbone_mask & ~atomized

    atom_array.set_annotation("is_backbone", backbone_mask)
    atom_array.set_annotation("is_sidechain", sidechain_mask)

    return atom_array
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
####################################################################################################
|
|
246
|
+
# Changes to datahub base transforms (instead of creating new branches)
|
|
247
|
+
####################################################################################################
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# from atomworks.ml.utils.token import get_af3_token_representative_masks
|
|
251
|
+
def get_af3_token_representative_masks(
    atom_array: AtomArray, central_atom: str = "CA"
) -> np.ndarray:
    """Boolean per-atom mask selecting each token's representative atom.

    Representative atoms per residue class: pyrimidines -> C2; purines and
    unknown nucleotides -> C4; glycine and unknown amino acids -> CA; standard
    non-glycine amino acids -> ``central_atom`` (the only configurable case,
    diverging from the upstream atomworks implementation); atomized atoms are
    always their own representatives.
    """
    res_name = atom_array.res_name
    atom_name = atom_array.atom_name

    # (residue-class mask, representative atom name) pairs.
    cases = [
        (is_pyrimidine(res_name), "C2"),
        (is_purine(res_name), "C4"),
        (is_unknown_nucleotide(res_name), "C4"),
        (is_glycine(res_name), "CA"),
        (is_standard_aa_not_glycine(res_name), central_atom),  # only change
        (is_protein_unknown(res_name), "CA"),
    ]

    # Atomized atoms are representatives regardless of residue class.
    # .copy() keeps the OR-accumulation from mutating the annotation.
    rep_mask = atom_array.atomize.copy()
    for class_mask, rep_atom in cases:
        rep_mask |= class_mask & (atom_name == rep_atom)
    return rep_mask
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class RemoveTokensWithoutCorrespondingCentralAtom(Transform):
    """
    Remove tokens with missing central atoms.
    """

    def __init__(self, central_atom: str = "CA"):
        # Representative atom name used for standard (non-glycine) amino acids.
        self.central_atom = central_atom

    def check_input(self, data):
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(data, ["atom_name", "res_name"])

    def forward(self, data):
        central_atom = self.central_atom
        atom_array = data["atom_array"]
        # Classify every atom by the residue class of its residue name; each
        # class has a fixed representative ("central") atom name.
        pyrimidine_mask = is_pyrimidine(atom_array.res_name)
        purine_mask = is_purine(atom_array.res_name)
        unknown_na_mask = is_unknown_nucleotide(atom_array.res_name)
        glycine_mask = is_glycine(atom_array.res_name)
        aa_not_glycine_mask = is_standard_aa_not_glycine(atom_array.res_name)
        unknown_aa_mask = is_protein_unknown(atom_array.res_name)

        # Atoms in none of the known classes are kept unconditionally.
        anything_else_mask = ~(
            pyrimidine_mask
            | purine_mask
            | unknown_na_mask
            | glycine_mask
            | aa_not_glycine_mask
            | unknown_aa_mask
        )

        def _get_if_central_atom_present_mask(atom_array, case_mask, central_atom):
            # Returns case_mask with atoms of tokens lacking the central atom
            # switched off.
            token_starts = get_token_starts(atom_array[case_mask])
            central_atom_mask = atom_array[case_mask].atom_name == central_atom
            if len(token_starts) == central_atom_mask.sum():
                ## all tokens have central atom, *vast majority*
                return case_mask
            else:
                ## find the missing ones, *very rare*
                out_mask = case_mask
                all_token_starts = get_token_starts(atom_array)
                token_start_mask = case_mask[all_token_starts]
                case_token_starts = all_token_starts[token_start_mask]

                for item in case_token_starts:
                    res_start = item
                    idx = all_token_starts.tolist().index(res_start)
                    # NOTE(review): np.bool_(np.zeros(...)) to build a bool
                    # array is unusual — np.zeros(len(atom_array), dtype=bool)
                    # would be the conventional spelling; verify behavior is
                    # identical on the installed numpy version.
                    res_mask = np.bool_(np.zeros(len(atom_array)))
                    if idx == len(all_token_starts) - 1:
                        # Last token: runs to the end of the array.
                        res_mask[res_start:] = True
                    else:
                        res_end = all_token_starts[idx + 1]
                        res_mask[res_start:res_end] = True
                    res_array = atom_array[res_mask]

                    # remove if central atom not present
                    if (res_array.atom_name == central_atom).sum() == 0:
                        out_mask = out_mask & ~res_mask
                return out_mask

        # An atom is kept if its token has the representative atom for its
        # residue class, or if it belongs to no known class.
        keep_mask = (
            _get_if_central_atom_present_mask(atom_array, pyrimidine_mask, "C2")
            | _get_if_central_atom_present_mask(atom_array, purine_mask, "C4")
            | _get_if_central_atom_present_mask(atom_array, unknown_na_mask, "C4")
            | _get_if_central_atom_present_mask(atom_array, glycine_mask, "CA")
            | _get_if_central_atom_present_mask(
                atom_array, aa_not_glycine_mask, central_atom
            )
            | _get_if_central_atom_present_mask(atom_array, unknown_aa_mask, "CA")
            | anything_else_mask
        )

        data["atom_array"] = atom_array[keep_mask]
        return data
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
class EncodeAF3TokenLevelFeatures(Transform):
    """Build AF3-style token-level features and store them under ``data["feats"]``.

    Produces identifier indices (residue/token/asym/entity/sym), molecule-class
    flags, one-hot residue types, terminus flags, and polarity flags, all at
    token resolution.
    """

    def __init__(
        # NOTE: encode_residues_to is a residue *name* (e.g. "UNK") written into
        # the res_name array below, not an int; None disables the spoofing.
        self, sequence_encoding: AF3SequenceEncoding, encode_residues_to: str = None
    ):
        self.sequence_encoding = sequence_encoding
        self.encode_residues_to = encode_residues_to  # for spoofing the restype

    def check_input(self, data: dict[str, Any]) -> None:
        check_contains_keys(data, ["atom_array"])
        check_is_instance(data, "atom_array", AtomArray)
        check_atom_array_annotation(
            data,
            [
                "atomize",
                "pn_unit_iid",
                "chain_entity",
                "res_name",
                "within_chain_res_idx",
            ],
        )

    def forward(self, data: dict[str, Any]) -> dict[str, Any]:
        atom_array = data["atom_array"]

        # ... get token-level array
        token_starts = get_token_starts(atom_array)
        token_level_array = atom_array[token_starts]

        # ... identifier tokens
        # ... (residue)
        residue_index = token_level_array.within_chain_res_idx
        # ... (token)
        token_index = np.arange(len(token_starts))
        # ... (chain instance)
        # np.unique's inverse indices give a dense integer id per unique name.
        asym_name, asym_id = np.unique(
            token_level_array.pn_unit_iid, return_inverse=True
        )
        # ... (chain entity)
        entity_name, entity_id = np.unique(
            token_level_array.pn_unit_entity, return_inverse=True
        )
        # ... (within chain entity)
        sym_name, sym_id = get_within_entity_idx(token_level_array, level="pn_unit")

        # ... molecule type
        _aa_like_res_names = self.sequence_encoding.all_res_names[
            self.sequence_encoding.is_aa_like
        ]
        is_protein = np.isin(token_level_array.res_name, _aa_like_res_names)

        _rna_like_res_names = self.sequence_encoding.all_res_names[
            self.sequence_encoding.is_rna_like
        ]
        is_rna = np.isin(token_level_array.res_name, _rna_like_res_names)

        _dna_like_res_names = self.sequence_encoding.all_res_names[
            self.sequence_encoding.is_dna_like
        ]
        is_dna = np.isin(token_level_array.res_name, _dna_like_res_names)

        # A ligand token is anything that is not protein, RNA, or DNA.
        is_ligand = ~(is_protein | is_rna | is_dna)

        # Get is_polar features
        polar_restypes = np.array(
            [
                "SER",
                "THR",
                "ASN",
                "GLN",
                "TYR",
                "CYS",
                "HIS",
                "LYS",
                "ARG",
                "ASP",
                "GLU",
            ]
        )
        is_polar = is_protein & np.isin(token_level_array.res_name, polar_restypes)

        # ... sequence tokens
        res_names = token_level_array.res_name
        if self.encode_residues_to is not None:
            # Spoof the residue name of every token whose sequence is not fixed.
            # NOTE(review): this assigns into res_names in place; it mutates
            # token_level_array's res_name annotation — presumably safe because
            # token_level_array is a fancy-indexed copy of atom_array; confirm.
            is_masked = ~token_level_array.is_motif_atom_with_fixed_seq
            res_names[is_masked] = np.full(
                np.sum(is_masked), self.encode_residues_to, dtype=res_names.dtype
            )

        restype = self.sequence_encoding.encode(res_names)
        data["encoded"] = {"seq": restype}  # For msa's
        restype = F.one_hot(
            torch.tensor(restype), num_classes=self.sequence_encoding.n_tokens
        ).numpy()

        # ... Add termini annotations (n_tok, 2)
        terminus_type = np.zeros(
            (
                len(token_level_array),
                2,
            ),
            dtype=restype.dtype,
        )
        # Column 0 flags C-termini, column 1 flags N-termini.
        terminus_type[token_level_array.is_C_terminus, 0] = 1
        terminus_type[token_level_array.is_N_terminus, 1] = 1

        # ... add to data dict
        if "feats" not in data:
            data["feats"] = {}
        if "feat_metadata" not in data:
            data["feat_metadata"] = {}

        # ... add to data dict
        data["feats"] |= {
            "residue_index": residue_index,  # (N_tokens) (int)
            "token_index": token_index,  # (N_tokens) (int)
            "asym_id": asym_id,  # (N_tokens) (int)
            "entity_id": entity_id,  # (N_tokens) (int)
            "sym_id": sym_id,  # (N_tokens) (int)
            "restype": restype,  # (N_tokens, 32) (float, one-hot)
            "is_protein": is_protein,  # (N_tokens) (bool)
            "is_rna": is_rna,  # (N_tokens) (bool)
            "is_dna": is_dna,  # (N_tokens) (bool)
            "is_ligand": is_ligand,  # (N_tokens) (bool)
            "terminus_type": terminus_type,  # (N_tokens, 2) (int)
            "is_polar": is_polar,  # (N_tokens) (bool)
        }
        data["feat_metadata"] |= {
            "asym_name": asym_name,  # (N_asyms)
            "entity_name": entity_name,  # (N_entities)
            "sym_name": sym_name,  # (N_entities)
        }

        return data
|