rc_foundry-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. foundry/__init__.py +57 -0
  2. foundry/callbacks/__init__.py +5 -0
  3. foundry/callbacks/callback.py +116 -0
  4. foundry/callbacks/health_logging.py +419 -0
  5. foundry/callbacks/metrics_logging.py +211 -0
  6. foundry/callbacks/timing_logging.py +67 -0
  7. foundry/callbacks/train_logging.py +278 -0
  8. foundry/common.py +108 -0
  9. foundry/constants.py +28 -0
  10. foundry/hydra/resolvers.py +77 -0
  11. foundry/inference_engines/base.py +235 -0
  12. foundry/inference_engines/checkpoint_registry.py +66 -0
  13. foundry/metrics/__init__.py +12 -0
  14. foundry/metrics/losses.py +30 -0
  15. foundry/metrics/metric.py +319 -0
  16. foundry/model/layers/blocks.py +47 -0
  17. foundry/testing/__init__.py +6 -0
  18. foundry/testing/fixtures.py +19 -0
  19. foundry/testing/pytest_hooks.py +15 -0
  20. foundry/trainers/fabric.py +923 -0
  21. foundry/training/EMA.py +67 -0
  22. foundry/training/checkpoint.py +61 -0
  23. foundry/training/schedulers.py +91 -0
  24. foundry/utils/alignment.py +86 -0
  25. foundry/utils/components.py +415 -0
  26. foundry/utils/datasets.py +405 -0
  27. foundry/utils/ddp.py +103 -0
  28. foundry/utils/instantiators.py +72 -0
  29. foundry/utils/logging.py +279 -0
  30. foundry/utils/rigid.py +1460 -0
  31. foundry/utils/rotation_augmentation.py +65 -0
  32. foundry/utils/squashfs.py +172 -0
  33. foundry/utils/torch.py +317 -0
  34. foundry/utils/weights.py +271 -0
  35. foundry/version.py +34 -0
  36. foundry_cli/__init__.py +3 -0
  37. foundry_cli/download_checkpoints.py +281 -0
  38. mpnn/__init__.py +1 -0
  39. mpnn/collate/feature_collator.py +265 -0
  40. mpnn/inference.py +53 -0
  41. mpnn/inference_engines/mpnn.py +549 -0
  42. mpnn/loss/nll_loss.py +122 -0
  43. mpnn/metrics/nll.py +369 -0
  44. mpnn/metrics/sequence_recovery.py +440 -0
  45. mpnn/model/layers/graph_embeddings.py +2372 -0
  46. mpnn/model/layers/message_passing.py +332 -0
  47. mpnn/model/layers/position_wise_feed_forward.py +44 -0
  48. mpnn/model/layers/positional_encoding.py +98 -0
  49. mpnn/model/mpnn.py +2632 -0
  50. mpnn/pipelines/mpnn.py +162 -0
  51. mpnn/samplers/samplers.py +167 -0
  52. mpnn/train.py +341 -0
  53. mpnn/trainers/mpnn.py +193 -0
  54. mpnn/transforms/feature_aggregation/mpnn.py +184 -0
  55. mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
  56. mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
  57. mpnn/transforms/feature_aggregation/user_settings.py +347 -0
  58. mpnn/transforms/polymer_ligand_interface.py +164 -0
  59. mpnn/utils/inference.py +2397 -0
  60. mpnn/utils/probability.py +37 -0
  61. mpnn/utils/weights.py +309 -0
  62. rc_foundry-0.1.1.dist-info/METADATA +239 -0
  63. rc_foundry-0.1.1.dist-info/RECORD +180 -0
  64. rc_foundry-0.1.1.dist-info/WHEEL +4 -0
  65. rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
  66. rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
  67. rf3/__init__.py +3 -0
  68. rf3/_version.py +33 -0
  69. rf3/alignment.py +79 -0
  70. rf3/callbacks/dump_validation_structures.py +101 -0
  71. rf3/callbacks/metrics_logging.py +324 -0
  72. rf3/chemical.py +1529 -0
  73. rf3/cli.py +77 -0
  74. rf3/data/cyclic_transform.py +78 -0
  75. rf3/data/extra_xforms.py +36 -0
  76. rf3/data/ground_truth_template.py +463 -0
  77. rf3/data/paired_msa.py +206 -0
  78. rf3/data/pipeline_utils.py +128 -0
  79. rf3/data/pipelines.py +558 -0
  80. rf3/diffusion_samplers/inference_sampler.py +222 -0
  81. rf3/inference.py +65 -0
  82. rf3/inference_engines/__init__.py +5 -0
  83. rf3/inference_engines/rf3.py +735 -0
  84. rf3/kinematics.py +354 -0
  85. rf3/loss/af3_confidence_loss.py +515 -0
  86. rf3/loss/af3_losses.py +655 -0
  87. rf3/loss/loss.py +179 -0
  88. rf3/metrics/chiral.py +179 -0
  89. rf3/metrics/clashing_chains.py +68 -0
  90. rf3/metrics/distogram.py +421 -0
  91. rf3/metrics/lddt.py +523 -0
  92. rf3/metrics/metadata.py +43 -0
  93. rf3/metrics/metric_utils.py +192 -0
  94. rf3/metrics/predicted_error.py +134 -0
  95. rf3/metrics/rasa.py +108 -0
  96. rf3/metrics/selected_distances.py +91 -0
  97. rf3/model/RF3.py +527 -0
  98. rf3/model/RF3_blocks.py +92 -0
  99. rf3/model/RF3_structure.py +303 -0
  100. rf3/model/layers/af3_auxiliary_heads.py +255 -0
  101. rf3/model/layers/af3_diffusion_transformer.py +544 -0
  102. rf3/model/layers/attention.py +313 -0
  103. rf3/model/layers/layer_utils.py +127 -0
  104. rf3/model/layers/mlff.py +118 -0
  105. rf3/model/layers/outer_product.py +59 -0
  106. rf3/model/layers/pairformer_layers.py +783 -0
  107. rf3/model/layers/structure_bias.py +56 -0
  108. rf3/scoring.py +1787 -0
  109. rf3/symmetry/resolve.py +284 -0
  110. rf3/train.py +194 -0
  111. rf3/trainers/rf3.py +570 -0
  112. rf3/util_module.py +47 -0
  113. rf3/utils/frames.py +109 -0
  114. rf3/utils/inference.py +665 -0
  115. rf3/utils/io.py +198 -0
  116. rf3/utils/loss.py +72 -0
  117. rf3/utils/predict_and_score.py +165 -0
  118. rf3/utils/predicted_error.py +673 -0
  119. rf3/utils/recycling.py +42 -0
  120. rf3/validate.py +140 -0
  121. rfd3/.gitignore +7 -0
  122. rfd3/Makefile +76 -0
  123. rfd3/__init__.py +12 -0
  124. rfd3/callbacks.py +66 -0
  125. rfd3/cli.py +41 -0
  126. rfd3/constants.py +212 -0
  127. rfd3/engine.py +543 -0
  128. rfd3/inference/datasets.py +193 -0
  129. rfd3/inference/input_parsing.py +1123 -0
  130. rfd3/inference/legacy_input_parsing.py +717 -0
  131. rfd3/inference/parsing.py +165 -0
  132. rfd3/inference/symmetry/atom_array.py +298 -0
  133. rfd3/inference/symmetry/checks.py +241 -0
  134. rfd3/inference/symmetry/contigs.py +63 -0
  135. rfd3/inference/symmetry/frames.py +355 -0
  136. rfd3/inference/symmetry/symmetry_utils.py +398 -0
  137. rfd3/metrics/design_metrics.py +465 -0
  138. rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
  139. rfd3/metrics/hbonds_metrics.py +389 -0
  140. rfd3/metrics/losses.py +325 -0
  141. rfd3/metrics/metrics_utils.py +118 -0
  142. rfd3/metrics/sidechain_metrics.py +349 -0
  143. rfd3/model/RFD3.py +105 -0
  144. rfd3/model/RFD3_diffusion_module.py +387 -0
  145. rfd3/model/cfg_utils.py +81 -0
  146. rfd3/model/inference_sampler.py +635 -0
  147. rfd3/model/layers/attention.py +577 -0
  148. rfd3/model/layers/block_utils.py +580 -0
  149. rfd3/model/layers/blocks.py +777 -0
  150. rfd3/model/layers/chunked_pairwise.py +377 -0
  151. rfd3/model/layers/encoders.py +417 -0
  152. rfd3/model/layers/layer_utils.py +197 -0
  153. rfd3/model/layers/pairformer_layers.py +128 -0
  154. rfd3/run_inference.py +45 -0
  155. rfd3/testing/debug.py +139 -0
  156. rfd3/testing/debug_utils.py +73 -0
  157. rfd3/testing/testing_utils.py +356 -0
  158. rfd3/train.py +194 -0
  159. rfd3/trainer/dump_validation_structures.py +154 -0
  160. rfd3/trainer/fabric_trainer.py +923 -0
  161. rfd3/trainer/recycling.py +42 -0
  162. rfd3/trainer/rfd3.py +485 -0
  163. rfd3/trainer/trainer_utils.py +502 -0
  164. rfd3/transforms/conditioning_base.py +508 -0
  165. rfd3/transforms/conditioning_utils.py +200 -0
  166. rfd3/transforms/design_transforms.py +807 -0
  167. rfd3/transforms/dna_crop.py +523 -0
  168. rfd3/transforms/hbonds.py +407 -0
  169. rfd3/transforms/hbonds_hbplus.py +246 -0
  170. rfd3/transforms/ncaa_transforms.py +153 -0
  171. rfd3/transforms/pipelines.py +632 -0
  172. rfd3/transforms/ppi_transforms.py +541 -0
  173. rfd3/transforms/rasa.py +116 -0
  174. rfd3/transforms/symmetry.py +76 -0
  175. rfd3/transforms/training_conditions.py +552 -0
  176. rfd3/transforms/util_transforms.py +498 -0
  177. rfd3/transforms/virtual_atoms.py +305 -0
  178. rfd3/utils/inference.py +648 -0
  179. rfd3/utils/io.py +245 -0
  180. rfd3/utils/vizualize.py +276 -0
rfd3/inference/parsing.py
@@ -0,0 +1,165 @@
+ from typing import Any, Dict, List, Optional, Union
+
+ import numpy as np
+ from biotite.structure import AtomArray, get_residue_starts
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     model_serializer,
+     model_validator,
+ )
+
+ from foundry.utils.components import (
+     ComponentStr,
+     fetch_mask_from_idx,
+     get_name_mask,
+     split_contig,
+     unravel_components,
+ )
+
+ # ============================================================================
+ # Input Specification & Validation
+ # ============================================================================
+
+
+ class InputSelection(BaseModel):
+     model_config = ConfigDict(
+         arbitrary_types_allowed=True,
+         str_strip_whitespace=True,
+         str_min_length=1,
+     )
+     data: Dict[ComponentStr | str, List[str]] = Field(
+         ..., description="Validated selection dictionary", exclude=True
+     )
+     raw: Any = Field(..., description="Original input value")
+     mask: np.ndarray[np.bool_] = Field(
+         ..., description="Boolean mask over atom array", exclude=True
+     )
+     tokens: Optional[Dict[ComponentStr | str, AtomArray]] = Field(
+         ..., description="Selected atom arrays per component", exclude=True
+     )
+
+     @classmethod
+     def from_any(
+         cls, v: Union[str, bool, dict, None], atom_array: AtomArray
+     ) -> Optional["InputSelection"]:
+         """Create InputSelection from various input types."""
+         if v is None:
+             return None
+         data, mask, _ = from_any_(v=v, atom_array=atom_array)
+         return cls(
+             raw=v,
+             data=data,
+             mask=mask,
+             tokens=None,
+         )
+
+     @model_validator(mode="after")
+     def check_keys(self):
+         # lightweight validation that all keys have contig format (are splittable indices)
+         assert all([split_contig(k) for k in self.data.keys()])
+         return self
+
+     # Wrapper functionality as dict-like
+     def __getitem__(self, key: str) -> List[str]:
+         """Allow dict-like access."""
+         return self.data[key]
+
+     def items(self):
+         return self.data.items()
+
+     def keys(self):
+         return self.data.keys()
+
+     def values(self):
+         return self.data.values()
+
+     def get(self, *args, **kwargs):
+         return self.data.get(*args, **kwargs)
+
+     # Serialization & repr
+     def __repr__(self) -> str:
+         num_atoms = self.mask.sum() if hasattr(self.mask, "sum") else 0
+         num_tokens = len(self.tokens) if self.tokens else 0
+         return (
+             f"InputSelection(raw={self.raw!r}, atoms={num_atoms}, tokens={num_tokens})"
+         )
+
+     @model_serializer
+     def serialize_raw(self) -> Any:
+         return self.raw
+
+     # Listed as separate methods for future changes to parsing.
+     def get_mask(self):
+         return self.mask
+
+     def get_tokens(self, aa):
+         _, _, tokens = from_any_(v=self.raw, atom_array=aa)
+         return tokens
+
+
+ def from_any_(v: Any, atom_array: AtomArray):
+     data_norm = canonicalize_(v, atom_array)
+
+     # Canonicalize dictionaries to SelectionDict (I.e. convert "ALL" / "TIP" -> concrete atom names)
+     data_split = {}
+     mask = np.array([False] * len(atom_array))
+     tokens = {}
+     for idx, atm_names in data_norm.items():
+         # Find atom array subset
+         comp_mask = fetch_mask_from_idx(idx, atom_array=atom_array)
+         token = atom_array[comp_mask]
+
+         comp_mask_subset = get_name_mask(
+             token.atom_name, atm_names, token.res_name[0]
+         )  # [N_atoms_in_token,]
+
+         # Split to atom names
+         data_split[idx] = token.atom_name[comp_mask_subset].tolist()
+
+         # Update mask & token dictionary
+         mask[comp_mask] = comp_mask_subset
+         tokens[idx] = token[comp_mask_subset]
+
+     return (data_split, mask, tokens)
+
+
+ def canonicalize_(v, atom_array: AtomArray):
+     # Canonicalize inputs to dictionaries of strings:
+     #   "A11-12" -> {"A11": "N,CA,C,...", "A12": "N,CA,C,..."}
+     #   True     -> {"A1": "ALL", "A2": "ALL", ...}
+     #   False    -> {"A1": "", "A2": "", ...}
+     #   "LIG"    -> {"B1": "ALL", "C1": "ALL"} (for two ligands named LIG)
+     data = {}
+     if isinstance(v, str):
+         for component in unravel_components(
+             v, atom_array=atom_array, allow_multiple_matches=True
+         ):
+             if (
+                 isinstance(component, str) and component[0].isalpha()
+             ):  # filter on valid chain IDs
+                 data[component] = "ALL"
+
+     elif isinstance(v, bool):
+         starts = get_residue_starts(atom_array, add_exclusive_stop=True)
+         for start, stop in zip(starts[:-1], starts[1:]):
+             token = atom_array[start:stop]
+             # All atoms selected for every token or None
+             data[f"{token.chain_id[0]}{token.res_id[0]}"] = "ALL" if v else ""
+
+     elif isinstance(v, dict):
+         # Ensure all values of dictionaries are strings
+         data = {}
+         for k, vv in v.items():
+             for component in unravel_components(
+                 k, atom_array=atom_array, allow_multiple_matches=True
+             ):
+                 if isinstance(vv, list):
+                     data[component] = ",".join(vv)
+                 else:
+                     data[component] = vv
+     else:
+         raise ValueError(f"Cannot convert {type(v)} to InputSelection")
+
+     return data
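
For orientation, the canonicalization rules documented in canonicalize_ above can be exercised directly. The following is a minimal sketch, not code shipped with the package: it assumes this hunk is rfd3/inference/parsing.py (item 131, +165), that rc-foundry and biotite are installed, and that the foundry.utils.components helpers resolve "ALL" to every atom name of a residue.

# Hypothetical usage sketch (not part of the package).
import numpy as np
from biotite.structure import AtomArray

from rfd3.inference.parsing import InputSelection, canonicalize_

# Two glycine residues on chain A, backbone atoms only
atoms = AtomArray(8)
atoms.coord = np.zeros((8, 3), dtype=np.float32)
atoms.chain_id = np.array(["A"] * 8)
atoms.res_id = np.array([1, 1, 1, 1, 2, 2, 2, 2])
atoms.res_name = np.array(["GLY"] * 8)
atoms.atom_name = np.array(["N", "CA", "C", "O"] * 2)

# Boolean input: every residue maps to "ALL", per the comment block in canonicalize_
print(canonicalize_(True, atoms))  # {'A1': 'ALL', 'A2': 'ALL'}

# Full parsing additionally resolves "ALL" to concrete atom names and builds the mask
# (resolution behavior of the foundry helpers is assumed, not shown in this diff)
selection = InputSelection.from_any(True, atoms)
print(selection)        # e.g. InputSelection(raw=True, atoms=8, tokens=0)
print(selection["A1"])  # e.g. ['N', 'CA', 'C', 'O']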
rfd3/inference/symmetry/atom_array.py
@@ -0,0 +1,298 @@
+ import numpy as np
+ from rfd3.inference.symmetry.frames import (
+     decompose_symmetry_frame,
+     get_symmetry_frames_from_symmetry_id,
+ )
+
+ from foundry.utils.ddp import RankedLogger
+
+ FIXED_TRANSFORM_ID = -1
+ FIXED_ENTITY_ID = -1
+ ranked_logger = RankedLogger(__name__, rank_zero_only=True)
+
+
+ ########################################################
+ # Symmetry annotations
+ ########################################################
+
+
+ def add_sym_annotations(atom_array, sym_conf):
+     """
+     Add symmetry base annotations to an atom array.
+     Arguments:
+         atom_array: atom array of symmetry subunit
+         sym_conf: symmetry configuration (dict, "id" key is required)
+     """
+     n = atom_array.shape[0]
+     # which is the asymmetric unit? At this point, we annotate everything as the asu
+     is_asu = np.full(n, True, dtype=np.bool_)
+     atom_array.set_annotation("is_sym_asu", is_asu)
+     # symmetry_id
+     symmetry_ids = np.full(n, sym_conf.get("id"), dtype="U6")
+     atom_array.set_annotation("symmetry_id", symmetry_ids)
+     return atom_array
+
+
+ def add_sym_annotations_to_fixed_motif(atom_array):
+     """
+     Add symmetry annotations to a motif atom array.
+     Arguments:
+         atom_array: atom array of symmetry subunit
+     """
+     n = atom_array.shape[0]
+
+     # setting the identity transform
+     Ori, X, Y = decompose_symmetry_frame((np.eye(3), np.zeros(3)))
+     Oris = np.full(n, Ori)
+     Xs = np.full(n, X)
+     Ys = np.full(n, Y)
+     atom_array.set_annotation("sym_transform_Ori", Oris)
+     atom_array.set_annotation("sym_transform_X", Xs)
+     atom_array.set_annotation("sym_transform_Y", Ys)
+
+     transform_ids = np.full(n, FIXED_TRANSFORM_ID, dtype=np.int32)
+     atom_array.set_annotation("sym_transform_id", transform_ids)
+     entity_ids = np.full(n, FIXED_ENTITY_ID, dtype=np.int32)
+     atom_array.set_annotation("sym_entity_id", entity_ids)
+     # make sure that the motif is not the asu
+     is_sym_asu = np.full(n, False, dtype=np.bool_)
+     atom_array.set_annotation("is_sym_asu", is_sym_asu)
+     return atom_array
+
+
+ def add_src_sym_component_annotations(atom_array):
+     """
+     Add src_sym_component annotations to an atom array.
+     This is used to correctly map the original motif id to diffused unindexed motifs.
+     Arguments:
+         atom_array: atom array with src_component annotated
+     """
+     if "src_component" not in atom_array.get_annotation_categories():
+         return atom_array
+
+     src_sym_component = atom_array.src_component.copy()
+     src_tokens = np.unique(atom_array.src_component)
+
+     for src_token in src_tokens:
+         # Skip non-alphabetic tokens
+         if len(src_token) == 0:
+             continue
+         if not src_token[0].isalpha():
+             continue
+
+         # Get block of atoms with this src token
+         src_block_mask = atom_array.src_component == src_token
+         src_block = atom_array[src_block_mask]
+
+         # Skip if not all unindexed motif atoms
+         if not src_block.is_motif_atom_unindexed.all():
+             continue
+
+         # Update src component with chain ID prefix
+         for chain_id in np.unique(src_block.chain_id):
+             chain_mask = src_block.chain_id == chain_id
+             src_block.src_component[chain_mask] = chain_id + src_token[1:]
+
+         src_sym_component[src_block_mask] = src_block.src_component
+
+     atom_array.set_annotation("src_sym_component", src_sym_component)
+     return atom_array
+
+
+ def fix_3D_sym_motif_annotations(atom_array):
+     """
+     Add fixed motif annotations to the 3D NON-indexed motifs (only unindexed and ligands).
+     Since indexed motifs are contiguously connected to generative residues,
+     they should NOT be fixed; instead they get symmetrized at each step.
+     Arguments:
+         atom_array: atom array
+     """
+     # fixed_motif_mask = atom_array.is_motif_atom_with_fixed_coord == 1
+     fixed_motif_mask = atom_array._is_motif & ~atom_array._is_indexed_motif
+     fixed_motif_array = atom_array[fixed_motif_mask].copy()
+     fixed_motif_array = add_sym_annotations_to_fixed_motif(fixed_motif_array)
+     atom_array[fixed_motif_mask] = fixed_motif_array
+     return atom_array
+
+
+ def add_sym_transform_annotations(atom_array, transform_id, frame, is_asu=False):
+     """
+     Add symmetry annotations to an atom array.
+     Arguments:
+         atom_array: atom array of symmetry subunit
+         transform_id: index of the transform frame
+         frame: symmetry frame (R, T)
+         is_asu: whether this is the asymmetric unit
+     Returns:
+         atom_array: atom array with symmetry annotations
+     """
+     Ori, X, Y = decompose_symmetry_frame(frame)
+     n = atom_array.shape[0]
+
+     # symmetry transform (decomposed into Ori, X, Y)
+     Oris = np.full(n, Ori)
+     Xs = np.full(n, X)
+     Ys = np.full(n, Y)
+     atom_array.set_annotation("sym_transform_Ori", Oris)
+     atom_array.set_annotation("sym_transform_X", Xs)
+     atom_array.set_annotation("sym_transform_Y", Ys)
+
+     # symmetry transform id
+     transform_ids = np.full(n, transform_id, dtype=np.int32)
+     atom_array.set_annotation("sym_transform_id", transform_ids)
+
+     # entity ids - this will help keep track of different multiplicities
+     # if there are small molecules, they will have different entity ids from the protein atoms
+     unique_chain_ids = np.unique(atom_array.chain_id).tolist()
+     unique_chain_ids.sort()
+     entity_ids = np.array([unique_chain_ids.index(id) for id in atom_array.chain_id])
+     atom_array.set_annotation("sym_entity_id", entity_ids)
+
+     is_sym_asu = np.full(n, is_asu, dtype=np.bool_)
+     atom_array.set_annotation("is_sym_asu", is_sym_asu)
+
+     return atom_array
+
+
+ def apply_symmetry_to_atomarray_coord(atom_array, frame):
+     """
+     Apply symmetry to the atom array coordinates.
+     Arguments:
+         atom_array: atom array
+         frame: symmetry frame (R, T)
+     """
+     R, T = frame
+     atom_array.coord = atom_array.coord @ R.T
+     atom_array.coord += T  # T should be 0 for most symmetry cases
+     return atom_array
+
+
+ ########################################################
+ # Motif functions
+ ########################################################
+
+
+ def annotate_unsym_atom_array(atom_array):
+     """
+     Annotate the unsym motif and return it.
+     Arguments:
+         atom_array: atom array
+     """
+     unsym_atom_array = atom_array.copy()
+     unsym_atom_array._is_asu = np.full(unsym_atom_array.shape[0], False, dtype=np.bool_)
+     unsym_atom_array.is_sym_asu = unsym_atom_array._is_asu
+     unsym_atom_array = reset_chain_ids(
+         unsym_atom_array, start_id="a"
+     )  # give it a lowercase chain id to avoid confusion with symmetry units
+     unsym_atom_array = add_sym_annotations_to_fixed_motif(unsym_atom_array)
+     return unsym_atom_array
+
+
+ ########################################################
+ # 2D conditioning functions
+ ########################################################
+
+
+ def add_2d_entity_annotations(atom_array):
+     entity_ids = np.zeros(atom_array.shape[0], dtype=np.int32)
+     categories = get_2d_annotation_categories(atom_array)
+     entity_id = 1
+     for i, anno in enumerate(categories):
+         entity_id = i + 1
+         entity_ids[atom_array.get_annotation(anno) == 1] = entity_id
+     atom_array.set_annotation("_2d_entity_id", entity_ids)
+     return atom_array
+
+
+ def reannotate_2d_entity_ids(atom_array, transform_id):
+     if "_2d_entity_id" not in atom_array.get_annotation_categories():
+         return atom_array
+     _2d_annos = get_2d_annotation_categories(atom_array)
+     frames = get_symmetry_frames_from_symmetry_id(atom_array.symmetry_id[0])
+     # NOTE: assumes either the 2D condition was specified within a subunit or all active sites were explicitly specified
+     max_entity_id = max(len(_2d_annos), len(frames))
+     mask = atom_array.get_annotation("_2d_entity_id") != 0
+     atom_array._2d_entity_id[mask] = (
+         (atom_array._2d_entity_id[mask] + transform_id - 1) % max_entity_id
+     ) + 1
+     return atom_array
+
+
+ def get_2d_annotation_categories(atom_array):
+     categories = []
+     for anno in atom_array.get_annotation_categories():
+         if "2d_condition" in anno:
+             categories.append(anno)
+     categories.sort()  # sort to make sure that categories are in ascending order
+     return categories
+
+
+ def reannotate_2d_conditions(atom_array):
+     entity_ids_anno = atom_array.get_annotation("_2d_entity_id")
+     entity_ids = [d for d in np.unique(entity_ids_anno) if d != 0]
+     categories = get_2d_annotation_categories(atom_array)
+     diff = len(entity_ids) - len(categories)
+     if diff > 0:
+         for i in range(len(categories), len(categories) + diff):
+             categories.append(f"{categories[0]}_{i}")
+     for d, anno in zip(entity_ids, categories):
+         atom_array.set_annotation(anno, entity_ids_anno == d)
+     atom_array.del_annotation("_2d_entity_id")
+     return atom_array
+
+
+ ########################################################
+ # Utility functions
+ ########################################################
+
+
+ def reset_chain_ids(atom_array, start_id):
+     """
+     Reset the chain ids and pn_unit_iids of an atom array to start from the given id.
+     Arguments:
+         atom_array: atom array with chain_ids and pn_unit_iids annotated
+         start_id: single character the new chain ids start from
+     """
+     chain_ids = np.unique(atom_array.chain_id)
+     new_chain_range = range(ord(start_id), ord(start_id) + len(chain_ids))
+     for new_id, old_id in zip(new_chain_range, chain_ids):
+         atom_array.chain_id[atom_array.chain_id == old_id] = chr(new_id)
+     atom_array.pn_unit_iid = atom_array.chain_id
+     return atom_array
+
+
+ def reannotate_chain_ids(atom_array, offset, multiplier=0):
+     """
+     Reannotate the chain ids and pn_unit_iids of an atom array.
+     Arguments:
+         atom_array: protein atom array with chain_ids and pn_unit_iids annotated
+         offset: offset added to the chain ids (scaled by multiplier)
+         multiplier: factor the offset is multiplied by (e.g. the transform index)
+     """
+     chain_ids_int = (
+         np.array([ord(c) for c in atom_array.chain_id]) + offset * multiplier
+     )
+     chain_ids = np.array([chr(id) for id in chain_ids_int], dtype=str)
+     atom_array.chain_id = chain_ids
+     atom_array.pn_unit_iid = chain_ids
+     return atom_array
+
+
+ def get_symmetry_unit(asu_atom_array, transform_id, frame):
+     """
+     Annotate the ASU protein atom array and return it for each symmetry unit.
+     Arguments:
+         asu_atom_array: atom array of the asymmetric unit, annotated with symmetry_id
+         transform_id: index of the symmetry unit
+         frame: symmetry frame
+     """
+     num_prot_chains = len(np.unique(asu_atom_array.chain_id))
+     symmetry_unit = asu_atom_array.copy()
+     symmetry_unit = reannotate_chain_ids(symmetry_unit, num_prot_chains, transform_id)
+     symmetry_unit = reannotate_2d_entity_ids(symmetry_unit, transform_id)
+     symmetry_unit = add_sym_transform_annotations(
+         symmetry_unit, transform_id, frame, is_asu=(transform_id == 0)
+     )
+     # apply symmetry to indexed motifs
+     # at this point, the diffused coordinates are at the origin / have no xyz
+     symmetry_unit = apply_symmetry_to_atomarray_coord(symmetry_unit, frame)
+     return symmetry_unit
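
Taken together, these helpers implement a simple pipeline: annotate the asymmetric unit, then stamp and transform one copy per symmetry frame via get_symmetry_unit. The sketch below is an illustration rather than package code; it assumes the input AtomArray carries the standard biotite annotations used above and that get_symmetry_frames_from_symmetry_id("C2") returns two (R, T) frames, as implied by apply_symmetry_to_atomarray_coord.

# Hypothetical assembly sketch (not shipped with the package).
import biotite.structure as struc

from rfd3.inference.symmetry.atom_array import add_sym_annotations, get_symmetry_unit
from rfd3.inference.symmetry.frames import get_symmetry_frames_from_symmetry_id


def build_assembly(asu_atom_array, symmetry_id="C2"):
    """Stack one transformed copy of the ASU per symmetry frame."""
    # Annotate symmetry_id / is_sym_asu on the input subunit
    asu = add_sym_annotations(asu_atom_array, {"id": symmetry_id})
    frames = get_symmetry_frames_from_symmetry_id(symmetry_id)
    units = [
        # transform_id == 0 is flagged as the asymmetric unit by get_symmetry_unit
        get_symmetry_unit(asu, transform_id, frame)
        for transform_id, frame in enumerate(frames)
    ]
    return struc.concatenate(units)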
rfd3/inference/symmetry/checks.py
@@ -0,0 +1,241 @@
+ import numpy as np
+ from rfd3.inference.symmetry.contigs import expand_contig_unsym_motif
+ from rfd3.transforms.conditioning_base import get_motif_features
+
+ from foundry.utils.ddp import RankedLogger
+
+ MIN_ATOMS_ALIGN = 100
+ MAX_TRANSFORMS = 10
+ RMSD_CUT = 1.0  # Angstroms
+
+ ranked_logger = RankedLogger(__name__, rank_zero_only=True)
+
+
+ def check_symmetry_config(
+     atom_array, sym_conf, sm, has_dist_cond, src_atom_array=None, partial=False
+ ):
+     """
+     Check if the symmetry configuration is valid. Add all basic checks here.
+     """
+
+     assert sym_conf.get("id"), "symmetry_id is required. e.g. {'id': 'C2'}"
+     # if unsym motif is provided, check that each motif name is in the atom array
+     if sym_conf.get("is_unsym_motif"):
+         assert (
+             src_atom_array is not None
+         ), "Source atom array must be provided for symmetric motifs"
+         unsym_motif_names = sym_conf["is_unsym_motif"].split(",")
+         unsym_motif_names = expand_contig_unsym_motif(unsym_motif_names)
+         for n in unsym_motif_names:
+             if (sm and n not in sm.split(",")) and (n not in atom_array.src_component):
+                 raise ValueError(f"Unsym motif {n} not found in atom_array")
+     if (
+         get_motif_features(atom_array)["is_motif_token"].any()
+         and not sym_conf.get("is_symmetric_motif")
+         and not has_dist_cond
+     ):
+         raise ValueError(
+             "Asymmetric motif inputs should be distance constrained. "
+             "Use atomwise_fixed_dist to constrain the distance between the motif atoms."
+         )
+     # else: if unconditional symmetry, no need to have symmetric input motif
+
+     if partial and not sym_conf.get("is_symmetric_motif"):
+         raise ValueError(
+             "Partial diffusion with symmetry is only supported for symmetric inputs."
+         )
+
+
+ def check_atom_array_is_symmetric(atom_array):
+     """
+     Check if the atom array is symmetric. This does NOT check that the atom array's symmetry matches that of the symmetry_id.
+     Arguments:
+         atom_array: atom array to check
+     Returns:
+         bool: True if the atom array is symmetric, False otherwise
+     """
+     # TODO: Implement something like this https://github.com/baker-laboratory/ipd/blob/main/ipd/sym/sym_detect.py#L303
+     # and maybe this https://github.com/baker-laboratory/ipd/blob/main/ipd/sym/sym_detect.py#L231
+
+     import biotite.structure as struc
+     from rfd3.inference.symmetry.atom_array import (
+         apply_symmetry_to_atomarray_coord,
+     )
+     from rfd3.inference.symmetry.frames import (
+         get_symmetry_frames_from_symmetry_id,
+     )
+
+     # remove hetero atoms
+     atom_array = atom_array[~atom_array.hetero]
+     if len(atom_array) == 0:
+         ranked_logger.info("Atom array has no protein chains. Please check your input.")
+         return False
+
+     chains = np.unique(atom_array.chain_id)
+     asu_mask = atom_array.chain_id == chains[0]
+     asu_atoms = atom_array[asu_mask].copy()
+
+     # Check that all chains have the same number of atoms
+     for chain in chains[1:]:
+         chain_mask = atom_array.chain_id == chain
+         if len(asu_atoms) != len(atom_array[chain_mask]):
+             ranked_logger.info(
+                 f"Atom array has different number of atoms in chain {chain}. {len(asu_atoms)} != {len(atom_array[chain_mask])}"
+             )
+             return False
+
+     # Check that all chains have the same atoms
+     for chain in chains[1:]:
+         chain_mask = atom_array.chain_id == chain
+         for i in range(len(asu_atoms)):
+             if asu_atoms.atom_name[i] != atom_array[chain_mask].atom_name[i]:
+                 ranked_logger.info(
+                     f"Atom array has different atoms in chain {chain}. {asu_atoms.atom_name[i]} != {atom_array[chain_mask].atom_name[i]}"
+                 )
+                 return False
+
+     # Check that the atom array aligns with the standard symmetry frames
+     standard_frames = get_symmetry_frames_from_symmetry_id(atom_array.symmetry_id[0])
+     standard_atom_array = []
+     for frame in standard_frames:
+         symmed_atoms = apply_symmetry_to_atomarray_coord(asu_atoms, frame)
+         standard_atom_array.append(symmed_atoms)
+     standard_atom_array = struc.concatenate(standard_atom_array)
+
+     R_standard_obtained = find_optimal_rotation(
+         standard_atom_array.coord, atom_array.coord
+     )
+
+     if R_standard_obtained is None:
+         ranked_logger.info(
+             "Atom array does not align with the standard symmetry frames."
+         )
+         return False
+
+     return True
+
+
+ def find_optimal_rotation(coords1, coords2, max_points=1000):
+     """
+     Find the optimal rotation matrix between two sets of coordinates using the Kabsch algorithm.
+
+     Args:
+         coords1: reference coordinates (N, 3)
+         coords2: target coordinates (N, 3)
+         max_points: maximum number of points to use for efficiency
+
+     Returns:
+         rotation_matrix: 3x3 rotation matrix or None if failed
+     """
+     if len(coords1) > max_points:
+         indices = np.random.choice(len(coords1), max_points, replace=False)
+         coords1 = coords1[indices]
+         coords2 = coords2[indices]
+
+     # Ensure same number of points
+     min_len = min(len(coords1), len(coords2))
+     coords1 = coords1[:min_len]
+     coords2 = coords2[:min_len]
+     if min_len < 3:
+         return None
+
+     # Kabsch algorithm
+     try:
+         centroid1 = np.mean(coords1, axis=0)
+         centroid2 = np.mean(coords2, axis=0)
+         coords1_centered = coords1 - centroid1
+         coords2_centered = coords2 - centroid2
+
+         # Compute covariance matrix
+         H = coords1_centered.T @ coords2_centered
+
+         U, S, Vt = np.linalg.svd(H)
+         R = Vt.T @ U.T
+         # Ensure proper rotation matrix
+         if np.linalg.det(R) < 0:
+             Vt[-1, :] *= -1
+             R = Vt.T @ U.T
+         return R
+
+     except Exception as e:
+         print(f"Error in rotation calculation: {e}")
+         return None
+
+
+ def check_input_frames_match_symmetry_frames(computed_frames, original_frames) -> None:
+     """
+     Check that the frames computed from the input match the frames of the symmetry_id.
+     Arguments:
+         computed_frames: list of computed frames
+         original_frames: list of original frames
+     """
+     assert len(computed_frames) == len(
+         original_frames
+     ), "Number of computed frames does not match number of original frames"
+
+
+ def check_valid_multiplicity(nids_by_entity) -> None:
+     """
+     Check if the multiplicity is valid.
+     Arguments:
+         nids_by_entity: dict mapping entity to ids
+     """
+     # get multiplicities of subunits
+     multiplicity = min([len(i) for i in nids_by_entity.values()])
+     if multiplicity == 1:  # no possible symmetry
+         raise ValueError(
+             "Input has no possible symmetry. If asymmetric motif, please use 2D conditioning inference instead."
+         )
+
+     # Check that the input is not asymmetric
+     multiplicity_good = [len(i) % multiplicity == 0 for i in nids_by_entity.values()]
+     if not all(multiplicity_good):
+         raise ValueError("Invalid multiplicities of subunits. Please check your input.")
+
+
+ def check_valid_subunit_size(nids_by_entity, pn_unit_id) -> None:
+     """
+     Check that the subunits in the input are of the same size.
+     Arguments:
+         nids_by_entity: dict mapping entity to ids
+         pn_unit_id: per-atom array of pn_unit ids
+     """
+     for i, js in nids_by_entity.items():
+         for j in js[1:]:
+             if (pn_unit_id == js[0]).sum() != (pn_unit_id == j).sum():
+                 raise ValueError("Size mismatch in the input. Please check your file.")
+
+
+ def check_min_atoms_to_align(natm_per_unique, reference_entity) -> None:
+     """
+     Check that we have enough atoms to align.
+     Arguments:
+         natm_per_unique: dict mapping entity to number of atoms
+         reference_entity: entity used as the alignment reference
+     """
+     if natm_per_unique[reference_entity] < MIN_ATOMS_ALIGN:
+         raise ValueError("Not enough atoms to align. Please check your input.")
+
+
+ def check_max_transforms(chains_to_consider) -> None:
+     """
+     Check that we are not exceeding the max number of transforms (MAX_TRANSFORMS).
+     Arguments:
+         chains_to_consider: list of chains to consider
+     """
+     if len(chains_to_consider) > MAX_TRANSFORMS:
+         raise ValueError(
+             "Number of transforms exceeds the max number of transforms (10)"
+         )
+
+
+ def check_max_rmsds(rmsds) -> None:
+     """
+     Check that the RMSD between the reference molecule and the other molecules is not too big.
+     Arguments:
+         rmsds: dict mapping chain to RMSD
+     """
+     if max(rmsds.values()) > RMSD_CUT:
+         ranked_logger.warning(
+             f"RMSD between the reference molecule and the other molecules is too big ({max(rmsds.values())} > {RMSD_CUT}). Please provide a symmetric input PDB file."
+         )
+         # raise ValueError(f"RMSD between the reference molecule and the other molecules is too big ({max(rmsds.values())} > {RMSD_CUT}). Please provide a symmetric input PDB file.")
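
find_optimal_rotation is a self-contained Kabsch solver, so it can be sanity-checked without any structure data. The snippet below is a sketch, not a test shipped with the package; it only assumes NumPy and the import path of this hunk (rfd3/inference/symmetry/checks.py).

# Hypothetical sanity check: a known rotation should be recovered by Kabsch.
import numpy as np

from rfd3.inference.symmetry.checks import find_optimal_rotation

rng = np.random.default_rng(0)
coords1 = rng.normal(size=(50, 3))  # reference point cloud

# Rotate the cloud by 90 degrees about the z axis
theta = np.pi / 2
R_true = np.array(
    [
        [np.cos(theta), -np.sin(theta), 0.0],
        [np.sin(theta), np.cos(theta), 0.0],
        [0.0, 0.0, 1.0],
    ]
)
coords2 = coords1 @ R_true.T

# The returned R satisfies coords1_centered @ R.T ~= coords2_centered
R_est = find_optimal_rotation(coords1, coords2)
assert np.allclose(R_est, R_true, atol=1e-6)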