PyPI - rc-foundry - Versions diffs - 0.1.1__py3-none-any.whl - Mend

rc-foundry 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

foundry/__init__.py +57 -0
foundry/callbacks/__init__.py +5 -0
foundry/callbacks/callback.py +116 -0
foundry/callbacks/health_logging.py +419 -0
foundry/callbacks/metrics_logging.py +211 -0
foundry/callbacks/timing_logging.py +67 -0
foundry/callbacks/train_logging.py +278 -0
foundry/common.py +108 -0
foundry/constants.py +28 -0
foundry/hydra/resolvers.py +77 -0
foundry/inference_engines/base.py +235 -0
foundry/inference_engines/checkpoint_registry.py +66 -0
foundry/metrics/__init__.py +12 -0
foundry/metrics/losses.py +30 -0
foundry/metrics/metric.py +319 -0
foundry/model/layers/blocks.py +47 -0
foundry/testing/__init__.py +6 -0
foundry/testing/fixtures.py +19 -0
foundry/testing/pytest_hooks.py +15 -0
foundry/trainers/fabric.py +923 -0
foundry/training/EMA.py +67 -0
foundry/training/checkpoint.py +61 -0
foundry/training/schedulers.py +91 -0
foundry/utils/alignment.py +86 -0
foundry/utils/components.py +415 -0
foundry/utils/datasets.py +405 -0
foundry/utils/ddp.py +103 -0
foundry/utils/instantiators.py +72 -0
foundry/utils/logging.py +279 -0
foundry/utils/rigid.py +1460 -0
foundry/utils/rotation_augmentation.py +65 -0
foundry/utils/squashfs.py +172 -0
foundry/utils/torch.py +317 -0
foundry/utils/weights.py +271 -0
foundry/version.py +34 -0
foundry_cli/__init__.py +3 -0
foundry_cli/download_checkpoints.py +281 -0
mpnn/__init__.py +1 -0
mpnn/collate/feature_collator.py +265 -0
mpnn/inference.py +53 -0
mpnn/inference_engines/mpnn.py +549 -0
mpnn/loss/nll_loss.py +122 -0
mpnn/metrics/nll.py +369 -0
mpnn/metrics/sequence_recovery.py +440 -0
mpnn/model/layers/graph_embeddings.py +2372 -0
mpnn/model/layers/message_passing.py +332 -0
mpnn/model/layers/position_wise_feed_forward.py +44 -0
mpnn/model/layers/positional_encoding.py +98 -0
mpnn/model/mpnn.py +2632 -0
mpnn/pipelines/mpnn.py +162 -0
mpnn/samplers/samplers.py +167 -0
mpnn/train.py +341 -0
mpnn/trainers/mpnn.py +193 -0
mpnn/transforms/feature_aggregation/mpnn.py +184 -0
mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
mpnn/transforms/feature_aggregation/user_settings.py +347 -0
mpnn/transforms/polymer_ligand_interface.py +164 -0
mpnn/utils/inference.py +2397 -0
mpnn/utils/probability.py +37 -0
mpnn/utils/weights.py +309 -0
rc_foundry-0.1.1.dist-info/METADATA +239 -0
rc_foundry-0.1.1.dist-info/RECORD +180 -0
rc_foundry-0.1.1.dist-info/WHEEL +4 -0
rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
rf3/__init__.py +3 -0
rf3/_version.py +33 -0
rf3/alignment.py +79 -0
rf3/callbacks/dump_validation_structures.py +101 -0
rf3/callbacks/metrics_logging.py +324 -0
rf3/chemical.py +1529 -0
rf3/cli.py +77 -0
rf3/data/cyclic_transform.py +78 -0
rf3/data/extra_xforms.py +36 -0
rf3/data/ground_truth_template.py +463 -0
rf3/data/paired_msa.py +206 -0
rf3/data/pipeline_utils.py +128 -0
rf3/data/pipelines.py +558 -0
rf3/diffusion_samplers/inference_sampler.py +222 -0
rf3/inference.py +65 -0
rf3/inference_engines/__init__.py +5 -0
rf3/inference_engines/rf3.py +735 -0
rf3/kinematics.py +354 -0
rf3/loss/af3_confidence_loss.py +515 -0
rf3/loss/af3_losses.py +655 -0
rf3/loss/loss.py +179 -0
rf3/metrics/chiral.py +179 -0
rf3/metrics/clashing_chains.py +68 -0
rf3/metrics/distogram.py +421 -0
rf3/metrics/lddt.py +523 -0
rf3/metrics/metadata.py +43 -0
rf3/metrics/metric_utils.py +192 -0
rf3/metrics/predicted_error.py +134 -0
rf3/metrics/rasa.py +108 -0
rf3/metrics/selected_distances.py +91 -0
rf3/model/RF3.py +527 -0
rf3/model/RF3_blocks.py +92 -0
rf3/model/RF3_structure.py +303 -0
rf3/model/layers/af3_auxiliary_heads.py +255 -0
rf3/model/layers/af3_diffusion_transformer.py +544 -0
rf3/model/layers/attention.py +313 -0
rf3/model/layers/layer_utils.py +127 -0
rf3/model/layers/mlff.py +118 -0
rf3/model/layers/outer_product.py +59 -0
rf3/model/layers/pairformer_layers.py +783 -0
rf3/model/layers/structure_bias.py +56 -0
rf3/scoring.py +1787 -0
rf3/symmetry/resolve.py +284 -0
rf3/train.py +194 -0
rf3/trainers/rf3.py +570 -0
rf3/util_module.py +47 -0
rf3/utils/frames.py +109 -0
rf3/utils/inference.py +665 -0
rf3/utils/io.py +198 -0
rf3/utils/loss.py +72 -0
rf3/utils/predict_and_score.py +165 -0
rf3/utils/predicted_error.py +673 -0
rf3/utils/recycling.py +42 -0
rf3/validate.py +140 -0
rfd3/.gitignore +7 -0
rfd3/Makefile +76 -0
rfd3/__init__.py +12 -0
rfd3/callbacks.py +66 -0
rfd3/cli.py +41 -0
rfd3/constants.py +212 -0
rfd3/engine.py +543 -0
rfd3/inference/datasets.py +193 -0
rfd3/inference/input_parsing.py +1123 -0
rfd3/inference/legacy_input_parsing.py +717 -0
rfd3/inference/parsing.py +165 -0
rfd3/inference/symmetry/atom_array.py +298 -0
rfd3/inference/symmetry/checks.py +241 -0
rfd3/inference/symmetry/contigs.py +63 -0
rfd3/inference/symmetry/frames.py +355 -0
rfd3/inference/symmetry/symmetry_utils.py +398 -0
rfd3/metrics/design_metrics.py +465 -0
rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
rfd3/metrics/hbonds_metrics.py +389 -0
rfd3/metrics/losses.py +325 -0
rfd3/metrics/metrics_utils.py +118 -0
rfd3/metrics/sidechain_metrics.py +349 -0
rfd3/model/RFD3.py +105 -0
rfd3/model/RFD3_diffusion_module.py +387 -0
rfd3/model/cfg_utils.py +81 -0
rfd3/model/inference_sampler.py +635 -0
rfd3/model/layers/attention.py +577 -0
rfd3/model/layers/block_utils.py +580 -0
rfd3/model/layers/blocks.py +777 -0
rfd3/model/layers/chunked_pairwise.py +377 -0
rfd3/model/layers/encoders.py +417 -0
rfd3/model/layers/layer_utils.py +197 -0
rfd3/model/layers/pairformer_layers.py +128 -0
rfd3/run_inference.py +45 -0
rfd3/testing/debug.py +139 -0
rfd3/testing/debug_utils.py +73 -0
rfd3/testing/testing_utils.py +356 -0
rfd3/train.py +194 -0
rfd3/trainer/dump_validation_structures.py +154 -0
rfd3/trainer/fabric_trainer.py +923 -0
rfd3/trainer/recycling.py +42 -0
rfd3/trainer/rfd3.py +485 -0
rfd3/trainer/trainer_utils.py +502 -0
rfd3/transforms/conditioning_base.py +508 -0
rfd3/transforms/conditioning_utils.py +200 -0
rfd3/transforms/design_transforms.py +807 -0
rfd3/transforms/dna_crop.py +523 -0
rfd3/transforms/hbonds.py +407 -0
rfd3/transforms/hbonds_hbplus.py +246 -0
rfd3/transforms/ncaa_transforms.py +153 -0
rfd3/transforms/pipelines.py +632 -0
rfd3/transforms/ppi_transforms.py +541 -0
rfd3/transforms/rasa.py +116 -0
rfd3/transforms/symmetry.py +76 -0
rfd3/transforms/training_conditions.py +552 -0
rfd3/transforms/util_transforms.py +498 -0
rfd3/transforms/virtual_atoms.py +305 -0
rfd3/utils/inference.py +648 -0
rfd3/utils/io.py +245 -0
rfd3/utils/vizualize.py +276 -0

rfd3/transforms/hbonds.py ADDED Viewed

@@ -0,0 +1,407 @@
+from typing import Any, Literal, Tuple
+import biotite.structure as struc
+import hydride
+import numpy as np
+from atomworks.io.transforms.atom_array import remove_hydrogens
+from atomworks.io.utils.ccd import atom_array_from_ccd_code
+from atomworks.ml.transforms._checks import (
+    check_atom_array_annotation,
+    check_contains_keys,
+    check_is_instance,
+)
+from atomworks.ml.transforms.base import Transform
+from biotite.structure import AtomArray, AtomArrayStack
+from rfd3.constants import SELECTION_NONPROTEIN, SELECTION_PROTEIN
+from foundry.utils.ddp import RankedLogger
+# Module-level logger instance, created once when this module is imported.
+ranked_logger = RankedLogger()
+# Symbols grouped as "hydrogen-like"; presumably H, H2, deuterium and tritium — TODO confirm.
+# NOTE(review): not referenced by any definition visible in this file.
+HYDROGEN_LIKE_SYMBOLS = ("H", "H2", "D", "T")
+# TODO: Once the cifutils submodule is bumped, we can use the built-in add_hydrogen_atom_positions function
+def add_hydrogen_atom_positions(
+    atom_array: AtomArray | AtomArrayStack,
+) -> AtomArray | AtomArrayStack:
+    """Add hydrogens using biotite supported hydride library
+    Args:
+        atom_array (AtomArray | AtomArrayStack): The atom array containing the chain information.
+    Returns:
+        AtomArray: The updated atom array with hydrogens added.
+    """
+    def _get_charge_from_ccd_code(atom):
+        try:
+            ccd_array = atom_array_from_ccd_code(atom.res_name)
+            charge = ccd_array[
+                ccd_array.atom_name.tolist().index(atom.atom_name)
+            ].charge
+        except Exception:
+            ## res_name not found in ccd or atom_name not found in ccd_array
+            charge = 0
+        return charge
+    if "charge" not in atom_array.get_annotation_categories():
+        charges = np.vectorize(_get_charge_from_ccd_code)(atom_array)
+        atom_array.set_annotation("charge", charges)
+    # Add as a custom annotation
+    array = remove_hydrogens(atom_array)
+    fields_to_copy_from_residue_if_present = [
+        "auth_seq_id",
+        "label_entity_id",
+        "is_can_prot",
+        "is_can_nucl",
+        "is_sm",
+        "chain_type",
+    ]
+    fields_to_copy_from_residue_if_present = list(
+        set(fields_to_copy_from_residue_if_present).intersection(
+            set(atom_array.get_annotation_categories())
+        )
+    )
+    def _copy_missing_annotations_residue_wise(
+        arr_to_copy_from: AtomArray,
+        arr_to_update: AtomArray,
+        fields_to_copy_from_residue_if_present: list[str],
+    ) -> AtomArray:
+        """Copy specified annotations residue-wise from one AtomArray to another. Updates annotations in-place."""
+        residue_starts = struc.get_residue_starts(arr_to_copy_from)
+        residue_starts_atom_array = arr_to_copy_from[residue_starts]
+        annot = {
+            item: getattr(residue_starts_atom_array, item)
+            for item in fields_to_copy_from_residue_if_present
+        }
+        for field in fields_to_copy_from_residue_if_present:
+            updated_field = struc.spread_residue_wise(arr_to_update, annot[field])
+            arr_to_update.set_annotation(field, updated_field)
+        return arr_to_update
+    def _handle_nan_coords(atom_array, noise_level=1e-3):
+        coords = atom_array.coord
+        # Find NaNs
+        nan_mask = np.isnan(coords)
+        # Replace NaNs with 0 + small random offset
+        coords[nan_mask] = np.random.uniform(
+            -noise_level, noise_level, size=nan_mask.sum()
+        )
+        # Update atom_array in-place
+        atom_array.coord = coords
+        return atom_array, nan_mask
+    if isinstance(array, AtomArrayStack):
+        updated_arrays = []
+        for old_arr in array:
+            if old_arr.bonds is None:
+                old_arr.bonds = struc.connect_via_distances(old_arr)
+            ## give some values to nan
+            old_arr, nan_mask = _handle_nan_coords(old_arr)
+            arr, mask = hydride.add_hydrogen(old_arr)
+            ## put back nans
+            arr.coord[mask, :][nan_mask] = np.nan
+            arr = _copy_missing_annotations_residue_wise(
+                old_arr, arr, fields_to_copy_from_residue_if_present
+            )
+            updated_arrays.append(arr)
+        ret_array = struc.stack(updated_arrays)
+    elif isinstance(array, AtomArray):
+        if array.bonds is None:
+            array.bonds = struc.connect_via_distances(array)
+        ## give some values to nan
+        array, nan_mask = _handle_nan_coords(array)
+        arr, mask = hydride.add_hydrogen(array)
+        ## put back nans
+        arr.coord[mask, :][nan_mask] = np.nan
+        ret_array = _copy_missing_annotations_residue_wise(
+            array, arr, fields_to_copy_from_residue_if_present
+        )
+    return ret_array
+def check_atom_array_has_hydrogen(data: dict[str, Any]):
+    """Check if `atom_array` key has bonds."""
+    import numpy as np
+    if not np.any(data["atom_array"].element == "H"):
+        raise ValueError("Key `atom_array` in data has no hydrogens.")
+def calculate_hbonds(
+    atom_array: AtomArray,
+    selection1: np.ndarray = None,
+    selection2: np.ndarray = None,
+    selection1_type: Literal["acceptor", "donor", "both"] = "both",
+    cutoff_dist: float = 3,
+    cutoff_angle: float = 120,
+    donor_elements: Tuple[str] = ("O", "N", "S", "F"),
+    acceptor_elements: Tuple[str] = ("O", "N", "S", "F"),
+    periodic: bool = False,
+) -> Tuple[np.ndarray, np.ndarray, AtomArray]:
+    """
+    Calculates Hbonds with biotite.struc.Hbond.
+    Assigns donor, acceptor annotation for each heavy atom involved.
+    Args:
+            atom_array (AtomArray):Expects the atom_array that contains hydrogens.
+            selection1 and selection2 (np.ndarray, optional): (Boolean mask for atoms to limit the hydrogen bond search to specific sections of the model.
+            The shape must match the shape of the atoms argument. If None is given, the whole atoms stack is used instead. (Default: None))
+            selection1_type (Literal, optional): Determines the type of selection1. The type of selection2 is chosen accordingly (‘both’ or the opposite).
+                                               (Default: 'both')
+            cutoff_dist (float, optional): The maximal distance between the hydrogen and acceptor to be considered a hydrogen bond. (Default: 2.5)
+            cutoff_angle (float, optional): The angle cutoff in degree between Donor-H..Acceptor to be considered a hydrogen bond. (Default: 120)
+            donor_elements, acceptor_elements (tuple of str): Elements to be considered as possible donors or acceptors. (Default: O, N, S)
+            periodic (bool, optional): If true, hydrogen bonds can also be detected in periodic boundary conditions. The box attribute of atoms is required in this case. (Default: False)
+    """
+    # Remove NaN coordinates
+    has_resolved_coordinates = ~np.isnan(atom_array.coord).any(axis=-1)
+    nonNaN_array = atom_array[has_resolved_coordinates]
+    # update selections if any
+    if selection1 is not None:
+        selection1 = selection1[has_resolved_coordinates]
+    if selection2 is not None:
+        selection2 = selection2[has_resolved_coordinates]
+    ## index map from nonNaN_array to original
+    index_map = {
+        counter: i for counter, i in enumerate(has_resolved_coordinates.nonzero()[0])
+    }
+    if selection1.sum() == 0 or selection2.sum() == 0:
+        # no ligand, or ligand is of same type as selection1 (e.g. 6) (peptide)
+        triplets = np.array([])
+    else:
+        # Compute H bonds
+        triplets = struc.hbond(  ## assuming AtomArray, not AtomArrayStack (returns an extra masks in that case)
+            nonNaN_array,
+            selection1=selection1,
+            selection2=selection2,
+            selection1_type=selection1_type,
+            cutoff_dist=cutoff_dist,
+            cutoff_angle=cutoff_angle,
+            donor_elements=donor_elements,
+            acceptor_elements=acceptor_elements,
+            periodic=periodic,
+        )
+    ## map back triplet indices, nonNaN indices to original indices
+    flattened = triplets.flatten()
+    triplets = np.array([index_map[i] for i in flattened]).reshape(-1, 3)
+    ## add back NaNs
+    donor_array = np.array([[0.0] * len(atom_array)])
+    acceptor_array = np.array([[0.0] * len(atom_array)])
+    if len(triplets) > 0:
+        donor_array[:, triplets[:, 0]] = 1.0
+        acceptor_array[:, triplets[:, 2]] = 1.0
+    ## [is_active_donor, is_active_acceptor] per atom
+    types = np.vstack((donor_array, acceptor_array)).T
+    return triplets, types, atom_array
+class CalculateHbonds(Transform):
+    """Transform for calculating Hbonds, expects an AtomArray containing hydrogens."""
+    def __init__(
+        self,
+        selection1_type: Literal["acceptor", "donor", "both"] = "both",
+        cutoff_dist: float = 3,
+        cutoff_angle: float = 120,
+        donor_elements: Tuple[str] = ("O", "N", "S", "F"),
+        acceptor_elements: Tuple[str] = ("O", "N", "S", "F"),
+        periodic: bool = False,
+        make2d: bool = False,
+    ):
+        """
+        Initialize the Hbonds transform.
+        Args:
+            selection1 and selection2 (list[str], optional): Specify a list of ChainTypes as in atomworks.enums. e.g. selectoin1 = ['POLYPEPTIDE(L)'], selection2 = ['NON-POLYMER', 'POLYRIBONUCLEOTIDE']
+            Allowed values: {'PEPTIDE NUCLEIC ACID', 'BRANCHED', 'POLYDEOXYRIBONUCLEOTIDE', 'POLYRIBONUCLEOTIDE', 'CYCLIC-PSEUDO-PEPTIDE', 'MACROLIDE', 'POLYDEOXYRIBONUCLEOTIDE/POLYRIBONUCLEOTIDE HYBRID', 'OTHER', 'POLYPEPTIDE(L)', 'NON-POLYMER', 'POLYPEPTIDE(D)', 'WATER'}
+            selection1_type (Literal, optional): Determines the type of selection1. The type of selection2 is chosen accordingly (‘both’ or the opposite).
+                                               (Default: 'both')
+            cutoff_dist (float, optional): The maximal distance between the hydrogen and acceptor to be considered a hydrogen bond. (Default: 2.5)
+            cutoff_angle (float, optional): The angle cutoff in degree between Donor-H..Acceptor to be considered a hydrogen bond. (Default: 120)
+            donor_elements, acceptor_elements (tuple of str): Elements to be considered as possible donors or acceptors. (Default: O, N, S)
+            periodic (bool, optional): If true, hydrogen bonds can also be detected in periodic boundary conditions. The box attribute of atoms is required in this case. (Default: False)
+        """
+        self.selection1_type = selection1_type
+        self.cutoff_dist = cutoff_dist
+        self.cutoff_angle = cutoff_angle
+        self.donor_elements = donor_elements
+        self.acceptor_elements = acceptor_elements
+        self.periodic = periodic
+        self.make2d = make2d
+    def check_input(self, data: dict[str, Any]) -> None:
+        check_contains_keys(data, ["atom_array"])
+        check_is_instance(data, "atom_array", AtomArray)
+        check_atom_array_annotation(data, ["res_name"])
+        ## turn off cause H addition debug ongoing
+        # check_atom_array_has_hydrogen(data)
+    def forward(self, data: dict) -> dict:
+        """
+        Calculates Hbonds and adds it to the data dictionary under the key `hbonds`.
+        Args:
+            data: dict
+                A dictionary containing the input data atomarray.
+                Expects the atom_array in data["atom_array"] contains hydrogens.
+        Returns:
+            dict: The data dictionary with hbonds added.
+            Sets hbond_type = [Donor, Acceptor] annotation to each atom. Donor, Acceptor can be both 0 or 1 (float). size: Lx2 (L: length of AtomArray)
+        """
+        atom_array: AtomArray = data["atom_array"]
+        try:
+            atom_array = add_hydrogen_atom_positions(atom_array)
+        except Exception as e:
+            print(
+                f"WARNING: problem adding hydrogens: {e}.\nThis example will get no hydrogen bond annotations."
+            )
+            atom_array.set_annotation(
+                "active_donor", np.zeros(atom_array.array_length(), dtype=bool)
+            )
+            atom_array.set_annotation(
+                "active_acceptor", np.zeros(atom_array.array_length(), dtype=bool)
+            )
+            data["atom_array"] = atom_array
+            return data
+        ## These are the only two use-cases we have so far. Can be extended as needed
+        if data["sampled_condition_name"] == "ppi":
+            selection1_chain_types = ["POLYPEPTIDE(D)", "POLYPEPTIDE(L)"]
+            selection2_chain_types = ["POLYPEPTIDE(D)", "POLYPEPTIDE(L)"]
+            separate_selections_for_motif_and_diffused = True
+        else:
+            selection1_chain_types = SELECTION_PROTEIN
+            selection2_chain_types = SELECTION_NONPROTEIN
+            separate_selections_for_motif_and_diffused = False
+        selection1 = np.isin(atom_array.chain_type, selection1_chain_types)
+        selection2 = np.isin(atom_array.chain_type, selection2_chain_types)
+        # Optionally restrict to Hbonds between motif and diffused regions
+        if separate_selections_for_motif_and_diffused:
+            selection1 = selection1 & atom_array.is_motif_atom
+            selection2 = selection2 & ~atom_array.is_motif_atom
+        else:
+            # Include fixed motif atoms for hbond calculations
+            selection2 |= np.array(atom_array.is_motif_atom, dtype=bool)
+            selection1 = ~selection2
+        hbonds, hbond_types, atom_array = calculate_hbonds(
+            atom_array,
+            selection1=selection1,
+            selection2=selection2,
+            selection1_type=self.selection1_type,
+            cutoff_dist=self.cutoff_dist,
+            cutoff_angle=self.cutoff_angle,
+            donor_elements=self.donor_elements,
+            acceptor_elements=self.acceptor_elements,
+            periodic=self.periodic,
+        )
+        # Initialize log_dict if not present
+        data.setdefault("log_dict", {})
+        log_dict = data["log_dict"]
+        # Log hbond statistics
+        log_dict["hbond_total_count"] = len(hbonds)
+        log_dict["hbond_total_atoms"] = hbond_types.sum()
+        # Subsample if hbond_subsample is set and number of atoms is bigger than 3
+        final_hbond_types = hbond_types
+        final_hbond_types[:, 0] = final_hbond_types[:, 0] * np.array(
+            atom_array.is_motif_atom
+        )
+        final_hbond_types[:, 1] = final_hbond_types[:, 1] * np.array(
+            atom_array.is_motif_atom
+        )
+        if data["conditions"]["hbond_subsample"] and np.sum(hbond_types) > 3:
+            # Linear correlation: fewer hbonds = higher fraction
+            base_fraction = 0.1  # minimum fraction (when many hbonds)
+            max_fraction = 0.9  # maximum fraction (when few hbonds)
+            n_hbonds = len(hbonds)
+            max_hbonds = 50  # Expected maximum number of hbonds for scaling
+            # Linear interpolation: fraction decreases linearly with number of hbonds
+            fraction = max_fraction - (max_fraction - base_fraction) * min(
+                n_hbonds / max_hbonds, 1.0
+            )
+            final_hbond_types = subsample_one_hot_np(hbond_types, fraction)
+        # Set annotations and log subsample atoms
+        atom_array.set_annotation("active_donor", final_hbond_types[:, 0])
+        atom_array.set_annotation("active_acceptor", final_hbond_types[:, 1])
+        log_dict["hbond_subsample_atoms"] = final_hbond_types.sum()
+        # Remove hydrogens after processing
+        atom_array = remove_hydrogens(atom_array)
+        data["log_dict"] = log_dict
+        data["atom_array"] = atom_array
+        return data
+def subsample_one_hot_np(array, fraction):
+    """
+    Subsamples a one-hot encoded NumPy array by randomly keeping a given fraction of the 1s.
+    Args:
+        array (np.ndarray): One-hot array of 0s and 1s.
+        fraction (float): Fraction of 1s to keep (0 < fraction <= 1).
+    Returns:
+        np.ndarray: Subsampled array with same shape.
+    """
+    if not (0 < fraction <= 1):
+        raise ValueError("Fraction must be in the range (0, 1].")
+    array = array.copy()  # Don't modify original
+    one_indices = np.argwhere(array == 1)
+    num_ones = len(one_indices)
+    keep_count = int(num_ones * fraction)
+    # Shuffle and choose a subset of indices to keep
+    np.random.shuffle(one_indices)
+    keep_indices = one_indices[:keep_count]
+    # Create new zero array
+    new_array = np.zeros_like(array)
+    # Set selected indices to 1
+    for i, j in keep_indices:
+        new_array[i, j] = 1
+    return new_array

rfd3/transforms/hbonds_hbplus.py ADDED Viewed

@@ -0,0 +1,246 @@
+import os
+import string
+import subprocess
+from datetime import datetime
+from typing import Any, Tuple
+import numpy as np
+from atomworks.ml.transforms._checks import (
+    check_atom_array_annotation,
+    check_contains_keys,
+    check_is_instance,
+)
+from atomworks.ml.transforms.base import Transform
+from biotite.structure import AtomArray
+from biotite.structure.io.pdb import PDBFile
+def save_atomarray_to_pdb(atom_array, output_path):
+    def _handle_nan_coords(atom_array, noise_level=1e-3):
+        coords = atom_array.coord
+        nan_mask = np.isnan(coords)
+        coords[nan_mask] = np.random.uniform(
+            -noise_level, noise_level, size=nan_mask.sum()
+        )
+        atom_array.coord = coords
+        return atom_array, nan_mask
+    atom_array, nan_mask = _handle_nan_coords(atom_array)
+    chain_iids = np.unique(atom_array.chain_iid)
+    if len(chain_iids) > 52:
+        raise ValueError(
+            "Too many chain_iids, cannot convert to PDB", "skipping HBPLUS"
+        )
+    all_possible_chainIDS = string.ascii_letters
+    chain_map = {}
+    for item in chain_iids:
+        if len(item) == 1:
+            chain_map[item] = item
+            all_possible_chainIDS = all_possible_chainIDS.replace(item, "")
+    for item in chain_iids:
+        if len(item) > 1:
+            chain_map[item] = all_possible_chainIDS[0]
+            all_possible_chainIDS = all_possible_chainIDS.replace(chain_map[item], "")
+    new_chain_ids = [chain_map[i] for i in atom_array.chain_iid]
+    inverted_chain_map = {v: k for k, v in chain_map.items()}
+    atom_array.chain_id = new_chain_ids
+    atom_array.b_factor = np.zeros(len(atom_array))
+    pdb = PDBFile()
+    pdb.set_structure(atom_array)
+    pdb.write(output_path)
+    return atom_array, nan_mask, inverted_chain_map
+def check_atom_array_has_hydrogen(data: dict[str, Any]):
+    if not np.any(data["atom_array"].element == "H"):
+        raise ValueError("Key `atom_array` in data has no hydrogens.")
+def calculate_hbonds(
+    atom_array: AtomArray,
+    cutoff_HA_dist: float = 3,
+    cutoff_DA_distance: float = 3.5,
+) -> Tuple[np.ndarray, np.ndarray, AtomArray]:
+    dtstr = datetime.now().strftime("%Y%m%d%H%M%S")
+    pdb_path = f"{dtstr}_{np.random.randint(10000)}.pdb"
+    atom_array, nan_mask, chain_map = save_atomarray_to_pdb(atom_array, pdb_path)
+    hbplus_exe = os.environ.get("HBPLUS_PATH")
+    if hbplus_exe is None or hbplus_exe == "":
+        raise ValueError(
+            "HBPLUS_PATH environment variable not set. "
+            "Please set it to the path of the hbplus executable in order to calculate hydrogen bonds."
+        )
+    subprocess.call(
+        [
+            hbplus_exe,
+            "-h",
+            str(cutoff_HA_dist),
+            "-d",
+            str(cutoff_DA_distance),
+            pdb_path,
+            pdb_path,
+        ],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    HB = open(pdb_path.replace("pdb", "hb2"), "r").readlines()
+    hbonds = []
+    for i in range(8, len(HB)):
+        d_chain = HB[i][0]
+        d_resi = str(int(HB[i][1:5].strip()))
+        d_resn = HB[i][6:9].strip()
+        d_ins = HB[i][5].replace("-", " ")
+        d_atom = HB[i][9:13].strip()
+        a_chain = HB[i][14]
+        a_resi = str(int(HB[i][15:19].strip()))
+        a_ins = HB[i][19].replace("-", " ")
+        a_resn = HB[i][20:23].strip()
+        a_atom = HB[i][23:27].strip()
+        dist = float(HB[i][27:32].strip())
+        items = {
+            "d_chain": chain_map[d_chain],
+            "d_resi": d_resi,
+            "d_resn": d_resn,
+            "d_ins": d_ins,
+            "d_atom": d_atom,
+            "a_chain": chain_map[a_chain],
+            "a_resi": a_resi,
+            "a_resn": a_resn,
+            "a_ins": a_ins,
+            "a_atom": a_atom,
+            "dist": dist,
+        }
+        hbonds.append(items)
+    donor_array = np.zeros(len(atom_array))
+    acceptor_array = np.zeros(len(atom_array))
+    donor_mask = np.bool_(donor_array)
+    acceptor_mask = np.bool_(acceptor_array)
+    motif_hbonds = []
+    for item in hbonds:
+        current_donor_mask = (
+            (atom_array.chain_iid == item["d_chain"])
+            & (atom_array.res_id == float(item["d_resi"]))
+            & (atom_array.atom_name == item["d_atom"])
+        )
+        current_acceptor_mask = (
+            (atom_array.chain_iid == item["a_chain"])
+            & (atom_array.res_id == float(item["a_resi"]))
+            & (atom_array.atom_name == item["a_atom"])
+        )
+        # Ensure that we can uniquely identify the donor and acceptor atoms
+        if current_donor_mask.sum() != 1:
+            raise ValueError(
+                f"Unable to uniquely identify a donor atom with chain_iid={item['d_chain']}, res_id={item['d_resi']}, atom_name={item['d_atom']}."
+            )
+        if current_acceptor_mask.sum() != 1:
+            raise ValueError(
+                f"Unable to uniquely identify an acceptor atom with chain_iid={item['a_chain']}, res_id={item['a_resi']}, atom_name={item['a_atom']}."
+            )
+        current_donor_is_motif = atom_array.is_motif_atom[current_donor_mask][0]
+        current_acceptor_is_motif = atom_array.is_motif_atom[current_acceptor_mask][0]
+        # Only keep hbonds between the motif and diffused regions
+        if current_donor_is_motif != current_acceptor_is_motif:
+            motif_hbonds.append(item)
+            donor_mask |= current_donor_mask
+            acceptor_mask |= current_acceptor_mask
+    donor_array[donor_mask] = 1
+    acceptor_array[acceptor_mask] = 1
+    os.remove(pdb_path)
+    os.remove(pdb_path.replace("pdb", "hb2"))
+    atom_array.set_annotation("active_donor", donor_array)
+    atom_array.set_annotation("active_acceptor", acceptor_array)
+    return atom_array, motif_hbonds, len(motif_hbonds)
+class CalculateHbondsPlus(Transform):
+    """Transform for calculating Hbonds, expects an AtomArray containing hydrogens."""
+    def __init__(
+        self,
+        cutoff_HA_dist: float = 3,
+        cutoff_DA_distance: float = 3.5,
+    ):
+        self.cutoff_HA_dist = cutoff_HA_dist
+        self.cutoff_DA_distance = cutoff_DA_distance
+    def check_input(self, data: dict[str, Any]) -> None:
+        check_contains_keys(data, ["atom_array"])
+        check_is_instance(data, "atom_array", AtomArray)
+        check_atom_array_annotation(data, ["res_name"])
+        # check_atom_array_has_hydrogen(data)
+    def forward(self, data: dict) -> dict:
+        atom_array: AtomArray = data["atom_array"]
+        atom_array, hbonds, _ = calculate_hbonds(
+            atom_array,
+            cutoff_HA_dist=self.cutoff_HA_dist,
+            cutoff_DA_distance=self.cutoff_DA_distance,
+        )
+        data.setdefault("log_dict", {})
+        log_dict = data["log_dict"]
+        hbond_types = np.vstack((atom_array.active_donor, atom_array.active_acceptor)).T
+        final_hbond_types = hbond_types
+        final_hbond_types[:, 0] *= np.array(atom_array.is_motif_atom)
+        final_hbond_types[:, 1] *= np.array(atom_array.is_motif_atom)
+        log_dict["hbond_total_count"] = np.sum(final_hbond_types)
+        if data["conditions"]["hbond_subsample"] and np.sum(final_hbond_types) > 3:
+            base_fraction = 0.1
+            max_fraction = 0.9
+            n_hbonds = np.sum(final_hbond_types)
+            max_hbonds = 50
+            fraction = max_fraction - (max_fraction - base_fraction) * min(
+                n_hbonds / max_hbonds, 1.0
+            )
+            final_hbond_types = subsample_one_hot_np(final_hbond_types, fraction)
+        atom_array.set_annotation("active_donor", final_hbond_types[:, 0])
+        atom_array.set_annotation("active_acceptor", final_hbond_types[:, 1])
+        log_dict["hbond_subsample_atoms"] = np.sum(final_hbond_types)
+        data["log_dict"] = log_dict
+        data["atom_array"] = atom_array
+        return data
+def subsample_one_hot_np(array, fraction):
+    if not (0 < fraction <= 1):
+        raise ValueError("Fraction must be in the range (0, 1].")
+    array = array.copy()
+    one_indices = np.argwhere(array == 1)
+    num_ones = len(one_indices)
+    keep_count = int(num_ones * fraction)
+    np.random.shuffle(one_indices)
+    keep_indices = one_indices[:keep_count]
+    new_array = np.zeros_like(array)
+    for i, j in keep_indices:
+        new_array[i, j] = 1
+    return new_array