rc-foundry 0.1.1 (rc_foundry-0.1.1-py3-none-any.whl)

This diff is provided for informational purposes only; it reflects the contents of publicly released package versions, and the changes between them, as they appear in their respective public registries.
Files changed (180)
  1. foundry/__init__.py +57 -0
  2. foundry/callbacks/__init__.py +5 -0
  3. foundry/callbacks/callback.py +116 -0
  4. foundry/callbacks/health_logging.py +419 -0
  5. foundry/callbacks/metrics_logging.py +211 -0
  6. foundry/callbacks/timing_logging.py +67 -0
  7. foundry/callbacks/train_logging.py +278 -0
  8. foundry/common.py +108 -0
  9. foundry/constants.py +28 -0
  10. foundry/hydra/resolvers.py +77 -0
  11. foundry/inference_engines/base.py +235 -0
  12. foundry/inference_engines/checkpoint_registry.py +66 -0
  13. foundry/metrics/__init__.py +12 -0
  14. foundry/metrics/losses.py +30 -0
  15. foundry/metrics/metric.py +319 -0
  16. foundry/model/layers/blocks.py +47 -0
  17. foundry/testing/__init__.py +6 -0
  18. foundry/testing/fixtures.py +19 -0
  19. foundry/testing/pytest_hooks.py +15 -0
  20. foundry/trainers/fabric.py +923 -0
  21. foundry/training/EMA.py +67 -0
  22. foundry/training/checkpoint.py +61 -0
  23. foundry/training/schedulers.py +91 -0
  24. foundry/utils/alignment.py +86 -0
  25. foundry/utils/components.py +415 -0
  26. foundry/utils/datasets.py +405 -0
  27. foundry/utils/ddp.py +103 -0
  28. foundry/utils/instantiators.py +72 -0
  29. foundry/utils/logging.py +279 -0
  30. foundry/utils/rigid.py +1460 -0
  31. foundry/utils/rotation_augmentation.py +65 -0
  32. foundry/utils/squashfs.py +172 -0
  33. foundry/utils/torch.py +317 -0
  34. foundry/utils/weights.py +271 -0
  35. foundry/version.py +34 -0
  36. foundry_cli/__init__.py +3 -0
  37. foundry_cli/download_checkpoints.py +281 -0
  38. mpnn/__init__.py +1 -0
  39. mpnn/collate/feature_collator.py +265 -0
  40. mpnn/inference.py +53 -0
  41. mpnn/inference_engines/mpnn.py +549 -0
  42. mpnn/loss/nll_loss.py +122 -0
  43. mpnn/metrics/nll.py +369 -0
  44. mpnn/metrics/sequence_recovery.py +440 -0
  45. mpnn/model/layers/graph_embeddings.py +2372 -0
  46. mpnn/model/layers/message_passing.py +332 -0
  47. mpnn/model/layers/position_wise_feed_forward.py +44 -0
  48. mpnn/model/layers/positional_encoding.py +98 -0
  49. mpnn/model/mpnn.py +2632 -0
  50. mpnn/pipelines/mpnn.py +162 -0
  51. mpnn/samplers/samplers.py +167 -0
  52. mpnn/train.py +341 -0
  53. mpnn/trainers/mpnn.py +193 -0
  54. mpnn/transforms/feature_aggregation/mpnn.py +184 -0
  55. mpnn/transforms/feature_aggregation/polymer_ligand_interface.py +76 -0
  56. mpnn/transforms/feature_aggregation/token_encodings.py +132 -0
  57. mpnn/transforms/feature_aggregation/user_settings.py +347 -0
  58. mpnn/transforms/polymer_ligand_interface.py +164 -0
  59. mpnn/utils/inference.py +2397 -0
  60. mpnn/utils/probability.py +37 -0
  61. mpnn/utils/weights.py +309 -0
  62. rc_foundry-0.1.1.dist-info/METADATA +239 -0
  63. rc_foundry-0.1.1.dist-info/RECORD +180 -0
  64. rc_foundry-0.1.1.dist-info/WHEEL +4 -0
  65. rc_foundry-0.1.1.dist-info/entry_points.txt +5 -0
  66. rc_foundry-0.1.1.dist-info/licenses/LICENSE.md +28 -0
  67. rf3/__init__.py +3 -0
  68. rf3/_version.py +33 -0
  69. rf3/alignment.py +79 -0
  70. rf3/callbacks/dump_validation_structures.py +101 -0
  71. rf3/callbacks/metrics_logging.py +324 -0
  72. rf3/chemical.py +1529 -0
  73. rf3/cli.py +77 -0
  74. rf3/data/cyclic_transform.py +78 -0
  75. rf3/data/extra_xforms.py +36 -0
  76. rf3/data/ground_truth_template.py +463 -0
  77. rf3/data/paired_msa.py +206 -0
  78. rf3/data/pipeline_utils.py +128 -0
  79. rf3/data/pipelines.py +558 -0
  80. rf3/diffusion_samplers/inference_sampler.py +222 -0
  81. rf3/inference.py +65 -0
  82. rf3/inference_engines/__init__.py +5 -0
  83. rf3/inference_engines/rf3.py +735 -0
  84. rf3/kinematics.py +354 -0
  85. rf3/loss/af3_confidence_loss.py +515 -0
  86. rf3/loss/af3_losses.py +655 -0
  87. rf3/loss/loss.py +179 -0
  88. rf3/metrics/chiral.py +179 -0
  89. rf3/metrics/clashing_chains.py +68 -0
  90. rf3/metrics/distogram.py +421 -0
  91. rf3/metrics/lddt.py +523 -0
  92. rf3/metrics/metadata.py +43 -0
  93. rf3/metrics/metric_utils.py +192 -0
  94. rf3/metrics/predicted_error.py +134 -0
  95. rf3/metrics/rasa.py +108 -0
  96. rf3/metrics/selected_distances.py +91 -0
  97. rf3/model/RF3.py +527 -0
  98. rf3/model/RF3_blocks.py +92 -0
  99. rf3/model/RF3_structure.py +303 -0
  100. rf3/model/layers/af3_auxiliary_heads.py +255 -0
  101. rf3/model/layers/af3_diffusion_transformer.py +544 -0
  102. rf3/model/layers/attention.py +313 -0
  103. rf3/model/layers/layer_utils.py +127 -0
  104. rf3/model/layers/mlff.py +118 -0
  105. rf3/model/layers/outer_product.py +59 -0
  106. rf3/model/layers/pairformer_layers.py +783 -0
  107. rf3/model/layers/structure_bias.py +56 -0
  108. rf3/scoring.py +1787 -0
  109. rf3/symmetry/resolve.py +284 -0
  110. rf3/train.py +194 -0
  111. rf3/trainers/rf3.py +570 -0
  112. rf3/util_module.py +47 -0
  113. rf3/utils/frames.py +109 -0
  114. rf3/utils/inference.py +665 -0
  115. rf3/utils/io.py +198 -0
  116. rf3/utils/loss.py +72 -0
  117. rf3/utils/predict_and_score.py +165 -0
  118. rf3/utils/predicted_error.py +673 -0
  119. rf3/utils/recycling.py +42 -0
  120. rf3/validate.py +140 -0
  121. rfd3/.gitignore +7 -0
  122. rfd3/Makefile +76 -0
  123. rfd3/__init__.py +12 -0
  124. rfd3/callbacks.py +66 -0
  125. rfd3/cli.py +41 -0
  126. rfd3/constants.py +212 -0
  127. rfd3/engine.py +543 -0
  128. rfd3/inference/datasets.py +193 -0
  129. rfd3/inference/input_parsing.py +1123 -0
  130. rfd3/inference/legacy_input_parsing.py +717 -0
  131. rfd3/inference/parsing.py +165 -0
  132. rfd3/inference/symmetry/atom_array.py +298 -0
  133. rfd3/inference/symmetry/checks.py +241 -0
  134. rfd3/inference/symmetry/contigs.py +63 -0
  135. rfd3/inference/symmetry/frames.py +355 -0
  136. rfd3/inference/symmetry/symmetry_utils.py +398 -0
  137. rfd3/metrics/design_metrics.py +465 -0
  138. rfd3/metrics/hbonds_hbplus_metrics.py +308 -0
  139. rfd3/metrics/hbonds_metrics.py +389 -0
  140. rfd3/metrics/losses.py +325 -0
  141. rfd3/metrics/metrics_utils.py +118 -0
  142. rfd3/metrics/sidechain_metrics.py +349 -0
  143. rfd3/model/RFD3.py +105 -0
  144. rfd3/model/RFD3_diffusion_module.py +387 -0
  145. rfd3/model/cfg_utils.py +81 -0
  146. rfd3/model/inference_sampler.py +635 -0
  147. rfd3/model/layers/attention.py +577 -0
  148. rfd3/model/layers/block_utils.py +580 -0
  149. rfd3/model/layers/blocks.py +777 -0
  150. rfd3/model/layers/chunked_pairwise.py +377 -0
  151. rfd3/model/layers/encoders.py +417 -0
  152. rfd3/model/layers/layer_utils.py +197 -0
  153. rfd3/model/layers/pairformer_layers.py +128 -0
  154. rfd3/run_inference.py +45 -0
  155. rfd3/testing/debug.py +139 -0
  156. rfd3/testing/debug_utils.py +73 -0
  157. rfd3/testing/testing_utils.py +356 -0
  158. rfd3/train.py +194 -0
  159. rfd3/trainer/dump_validation_structures.py +154 -0
  160. rfd3/trainer/fabric_trainer.py +923 -0
  161. rfd3/trainer/recycling.py +42 -0
  162. rfd3/trainer/rfd3.py +485 -0
  163. rfd3/trainer/trainer_utils.py +502 -0
  164. rfd3/transforms/conditioning_base.py +508 -0
  165. rfd3/transforms/conditioning_utils.py +200 -0
  166. rfd3/transforms/design_transforms.py +807 -0
  167. rfd3/transforms/dna_crop.py +523 -0
  168. rfd3/transforms/hbonds.py +407 -0
  169. rfd3/transforms/hbonds_hbplus.py +246 -0
  170. rfd3/transforms/ncaa_transforms.py +153 -0
  171. rfd3/transforms/pipelines.py +632 -0
  172. rfd3/transforms/ppi_transforms.py +541 -0
  173. rfd3/transforms/rasa.py +116 -0
  174. rfd3/transforms/symmetry.py +76 -0
  175. rfd3/transforms/training_conditions.py +552 -0
  176. rfd3/transforms/util_transforms.py +498 -0
  177. rfd3/transforms/virtual_atoms.py +305 -0
  178. rfd3/utils/inference.py +648 -0
  179. rfd3/utils/io.py +245 -0
  180. rfd3/utils/vizualize.py +276 -0
rf3/utils/inference.py ADDED
@@ -0,0 +1,665 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from concurrent.futures import ProcessPoolExecutor
5
+ from dataclasses import dataclass
6
+ from os import PathLike
7
+ from pathlib import Path
8
+ from typing import Iterable
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from atomworks.common import as_list
13
+ from atomworks.enums import GroundTruthConformerPolicy
14
+ from atomworks.io import parse
15
+ from atomworks.io.parser import parse_atom_array
16
+ from atomworks.io.tools.inference import (
17
+ build_msa_paths_by_chain_id_from_component_list,
18
+ components_to_atom_array,
19
+ )
20
+ from atomworks.io.transforms.categories import category_to_dict
21
+ from atomworks.io.utils.selection import AtomSelectionStack
22
+ from atomworks.ml.transforms.atom_array import add_global_token_id_annotation
23
+ from biotite.structure import AtomArray
24
+ from rf3.utils.io import (
25
+ CIF_LIKE_EXTENSIONS,
26
+ DICTIONARY_LIKE_EXTENSIONS,
27
+ get_sharded_output_path,
28
+ )
29
+ from torch.utils.data import Dataset
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ def _resolve_override(override_value, source_value, param_name: str, example_id: str):
35
+ """Resolve CLI override vs source value with warning."""
36
+ if override_value is not None and source_value:
37
+ logger.warning(f"CLI {param_name} overriding source value for {example_id}")
38
+ return override_value
39
+ return override_value if override_value is not None else source_value
40
+
41
+
42
+ def extract_example_id_from_path(path: Path) -> str:
43
+ """Extract example ID from file path."""
44
+ path_str = str(path.name)
45
+ # Check for known extensions (longer matches first to handle .cif.gz before .gz)
46
+ for ext in sorted(CIF_LIKE_EXTENSIONS | {".json"}, key=len, reverse=True):
47
+ if path_str.endswith(ext):
48
+ return path_str[: -len(ext)]
49
+ # Fallback to simple stem
50
+ return path.stem
51
+
52
+
53
+ def extract_example_ids_from_json(path: Path) -> list[str]:
54
+ """Extract example IDs from a JSON file containing one or more examples."""
55
+ with open(path, "r") as f:
56
+ data = json.load(f)
57
+ return [ex["name"] for ex in data]
58
+
59
+
60
+ @dataclass
61
+ class InferenceInput:
62
+ """Input specification for RF3 inference."""
63
+
64
+ atom_array: AtomArray
65
+ chain_info: dict
66
+ example_id: str
67
+ template_selection: list[str] | None = None
68
+ ground_truth_conformer_selection: list[str] | None = None
69
+ cyclic_chains: list[str] | None = None
70
+
71
+ @classmethod
72
+ def from_cif_path(
73
+ cls,
74
+ path: PathLike,
75
+ example_id: str | None = None,
76
+ template_selection: list[str] | str | None = None,
77
+ ground_truth_conformer_selection: list[str] | str | None = None,
78
+ ) -> "InferenceInput":
79
+ """Load from CIF/PDB file.
80
+
81
+ Args:
82
+ path: Path to CIF/PDB file.
83
+ example_id: Example ID. Defaults to filename stem.
84
+ template_selection: Template selection override.
85
+ ground_truth_conformer_selection: Conformer selection override.
86
+
87
+ Returns:
88
+ InferenceInput object.
89
+ """
90
+ parsed = parse(path, hydrogen_policy="remove", keep_cif_block=True)
91
+
92
+ atom_array = (
93
+ parsed["assemblies"]["1"][0]
94
+ if "assemblies" in parsed
95
+ else parsed["asym_unit"][0]
96
+ )
97
+
98
+ example_id = example_id or extract_example_id_from_path(Path(path))
99
+
100
+ # Extract from CIF
101
+ cif_template_sel = None
102
+ cif_conformer_sel = None
103
+ if "cif_block" in parsed:
104
+ template_dict = category_to_dict(parsed["cif_block"], "template_selection")
105
+ if template_dict:
106
+ cif_template_sel = list(template_dict.get("template_selection", []))
107
+
108
+ conformer_dict = category_to_dict(
109
+ parsed["cif_block"], "ground_truth_conformer_selection"
110
+ )
111
+ if conformer_dict:
112
+ cif_conformer_sel = list(
113
+ conformer_dict.get("ground_truth_conformer_selection", [])
114
+ )
115
+
116
+ # Resolve overrides (CLI priority)
117
+ final_template_sel = _resolve_override(
118
+ template_selection, cif_template_sel, "template_selection", example_id
119
+ )
120
+ final_conformer_sel = _resolve_override(
121
+ ground_truth_conformer_selection,
122
+ cif_conformer_sel,
123
+ "ground_truth_conformer_selection",
124
+ example_id,
125
+ )
126
+
127
+ return cls(
128
+ atom_array=atom_array,
129
+ chain_info=parsed["chain_info"],
130
+ example_id=example_id,
131
+ template_selection=final_template_sel,
132
+ ground_truth_conformer_selection=final_conformer_sel,
133
+ )
134
+
135
+ @classmethod
136
+ def from_json_dict(
137
+ cls,
138
+ data: dict,
139
+ template_selection: list[str] | str | None = None,
140
+ ground_truth_conformer_selection: list[str] | str | None = None,
141
+ ) -> "InferenceInput":
142
+ """Create from JSON dict with components.
143
+
144
+ CLI args override JSON metadata.
145
+
146
+ Args:
147
+ data: JSON dictionary with components.
148
+ template_selection: Template selection override.
149
+ ground_truth_conformer_selection: Conformer selection override.
150
+
151
+ Returns:
152
+ InferenceInput object.
153
+ """
154
+ # Build atom_array from components
155
+ atom_array, component_list = components_to_atom_array(
156
+ data["components"],
157
+ bonds=data.get("bonds"),
158
+ return_components=True,
159
+ )
160
+
161
+ parsed = parse_atom_array(
162
+ atom_array,
163
+ build_assembly="_spoof",
164
+ hydrogen_policy="keep",
165
+ )
166
+
167
+ chain_info = parsed.get("chain_info", {})
168
+ atom_array = (
169
+ parsed["assemblies"]["1"][0]
170
+ if "assemblies" in parsed
171
+ else parsed["asym_unit"][0]
172
+ )
173
+
174
+ # Merge MSA paths into chain_info
175
+ msa_paths_by_chain_id = build_msa_paths_by_chain_id_from_component_list(
176
+ component_list
177
+ )
178
+ if data.get("msa_paths") and isinstance(data.get("msa_paths"), dict):
179
+ msa_paths_by_chain_id.update(data.get("msa_paths"))
180
+
181
+ for chain_id, msa_path in msa_paths_by_chain_id.items():
182
+ if chain_id in chain_info:
183
+ chain_info[chain_id]["msa_path"] = msa_path
184
+
185
+ # Resolve overrides (CLI priority)
186
+ final_template_sel = _resolve_override(
187
+ template_selection,
188
+ data.get("template_selection"),
189
+ "template_selection",
190
+ data["name"],
191
+ )
192
+ final_conformer_sel = _resolve_override(
193
+ ground_truth_conformer_selection,
194
+ data.get("ground_truth_conformer_selection"),
195
+ "ground_truth_conformer_selection",
196
+ data["name"],
197
+ )
198
+
199
+ return cls(
200
+ atom_array=atom_array,
201
+ chain_info=chain_info,
202
+ example_id=data["name"],
203
+ template_selection=final_template_sel,
204
+ ground_truth_conformer_selection=final_conformer_sel,
205
+ )
206
+
207
+ @classmethod
208
+ def from_atom_array(
209
+ cls,
210
+ atom_array: AtomArray,
211
+ chain_info: dict | None = None,
212
+ example_id: str | None = None,
213
+ template_selection: list[str] | str | None = None,
214
+ ground_truth_conformer_selection: list[str] | str | None = None,
215
+ ) -> "InferenceInput":
216
+ """Create from AtomArray.
217
+
218
+ Args:
219
+ atom_array: Input AtomArray.
220
+ chain_info: Chain info dict. Defaults to extracted from atom_array.
221
+ example_id: Example ID. Defaults to generated ID.
222
+ template_selection: Template selection.
223
+ ground_truth_conformer_selection: Conformer selection.
224
+
225
+ Returns:
226
+ InferenceInput object.
227
+ """
228
+ # Use parse_atom_array
229
+ parsed = parse_atom_array(
230
+ atom_array,
231
+ build_assembly="_spoof",
232
+ hydrogen_policy="keep",
233
+ extra_fields="all",
234
+ )
235
+
236
+ extracted_chain_info = parsed.get("chain_info", {})
237
+
238
+ # Merge with provided chain_info (provided takes priority)
239
+ if chain_info is not None:
240
+ for chain_id, chain_data in chain_info.items():
241
+ if chain_id in extracted_chain_info:
242
+ extracted_chain_info[chain_id].update(chain_data)
243
+ else:
244
+ extracted_chain_info[chain_id] = chain_data
245
+
246
+ final_atom_array = (
247
+ parsed["assemblies"]["1"][0]
248
+ if "assemblies" in parsed
249
+ else parsed["asym_unit"][0]
250
+ )
251
+
252
+ return cls(
253
+ atom_array=final_atom_array,
254
+ chain_info=extracted_chain_info,
255
+ example_id=example_id or f"inference_{id(atom_array)}",
256
+ template_selection=template_selection,
257
+ ground_truth_conformer_selection=ground_truth_conformer_selection,
258
+ )
259
+
260
+ def to_pipeline_input(self) -> dict:
261
+ """Apply transformations and return input for Transform pipeline.
262
+
263
+ Returns:
264
+ Pipeline input dict with example_id, atom_array, and chain_info.
265
+ """
266
+ atom_array = self.atom_array.copy()
267
+
268
+ # Apply template and conformer selections
269
+ atom_array = apply_conformer_and_template_selections(
270
+ atom_array,
271
+ template_selection=self.template_selection,
272
+ ground_truth_conformer_selection=self.ground_truth_conformer_selection,
273
+ )
274
+
275
+ if self.cyclic_chains:
276
+ atom_array = cyclize_atom_array(atom_array, self.cyclic_chains)
277
+
278
+ return {
279
+ "example_id": self.example_id,
280
+ "atom_array": atom_array,
281
+ "chain_info": self.chain_info,
282
+ }
283
+
284
+
285
+ def _process_single_path(
286
+ path: Path,
287
+ existing_outputs_dir: Path | None,
288
+ sharding_pattern: str | None,
289
+ template_selection: list[str] | str | None,
290
+ ground_truth_conformer_selection: list[str] | str | None,
291
+ ) -> list[InferenceInput]:
292
+ """Worker function to process a single input file path.
293
+
294
+ This function is defined at module level to be picklable for multiprocessing.
295
+
296
+ Args:
297
+ path: Path to a single input file.
298
+ existing_outputs_dir: If set, skip examples with existing outputs.
299
+ sharding_pattern: Sharding pattern for output paths.
300
+ template_selection: Override for template selection.
301
+ ground_truth_conformer_selection: Override for conformer selection.
302
+
303
+ Returns:
304
+ List of InferenceInput objects (may be empty if file is skipped).
305
+ """
306
+
307
+ def example_exists(example_id: str) -> bool:
308
+ """Check if example already has predictions (sharding-aware)."""
309
+ if not existing_outputs_dir:
310
+ return False
311
+ example_dir = get_sharded_output_path(
312
+ example_id, existing_outputs_dir, sharding_pattern
313
+ )
314
+ return (example_dir / f"{example_id}_metrics.csv").exists()
315
+
316
+ inference_inputs = []
317
+
318
+ if path.suffix == ".json":
319
+ # Load JSON and convert each entry
320
+ with open(path, "r") as f:
321
+ data = json.load(f)
322
+
323
+ # Normalize to list
324
+ if isinstance(data, dict):
325
+ data = [data]
326
+
327
+ for item in data:
328
+ example_id = item["name"]
329
+ if not example_exists(example_id):
330
+ inference_inputs.append(
331
+ InferenceInput.from_json_dict(
332
+ item,
333
+ template_selection=template_selection,
334
+ ground_truth_conformer_selection=ground_truth_conformer_selection,
335
+ )
336
+ )
337
+
338
+ elif any(path.name.endswith(ext) for ext in CIF_LIKE_EXTENSIONS):
339
+ # CIF/PDB file
340
+ example_id = extract_example_id_from_path(path)
341
+ if not example_exists(example_id):
342
+ inference_inputs.append(
343
+ InferenceInput.from_cif_path(
344
+ path,
345
+ example_id=example_id,
346
+ template_selection=template_selection,
347
+ ground_truth_conformer_selection=ground_truth_conformer_selection,
348
+ )
349
+ )
350
+ else:
351
+ raise ValueError(
352
+ f"Unsupported file type: {path.suffix} (path: {path}). "
353
+ f"Supported: {CIF_LIKE_EXTENSIONS | DICTIONARY_LIKE_EXTENSIONS}"
354
+ )
355
+
356
+ return inference_inputs
357
+
358
+
359
+ def prepare_inference_inputs_from_paths(
360
+ inputs: PathLike | list[PathLike],
361
+ existing_outputs_dir: PathLike | None = None,
362
+ sharding_pattern: str | None = None,
363
+ template_selection: list[str] | str | None = None,
364
+ ground_truth_conformer_selection: list[str] | str | None = None,
365
+ ) -> list[InferenceInput]:
366
+ """Load InferenceInput objects from file paths.
367
+
368
+ Handles CIF, PDB, and JSON files. Filters out existing outputs if requested.
369
+ Uses multiprocessing to parallelize file loading across all available CPUs.
370
+
371
+ Args:
372
+ inputs: File path(s) or directory path(s).
373
+ existing_outputs_dir: If set, skip examples with existing outputs.
374
+ sharding_pattern: Sharding pattern for output paths.
375
+ template_selection: Override for template selection (applied to all inputs).
376
+ ground_truth_conformer_selection: Override for conformer selection (applied to all inputs).
377
+
378
+ Returns:
379
+ List of InferenceInput objects.
380
+ """
381
+ input_paths = as_list(inputs)
382
+
383
+ # Collect all raw input files (reusing logic from build_file_paths_for_prediction)
384
+ paths_to_raw_input_files = []
385
+ for _path in input_paths:
386
+ if Path(_path).is_dir():
387
+ # Scan directory for supported file types (JSON + CIF-like)
388
+ for file_type in CIF_LIKE_EXTENSIONS | DICTIONARY_LIKE_EXTENSIONS:
389
+ paths_to_raw_input_files.extend(Path(_path).glob(f"*{file_type}"))
390
+ else:
391
+ paths_to_raw_input_files.append(Path(_path))
392
+
393
+ # Determine number of CPUs to use
394
+ num_cpus = min(os.cpu_count() or 1, len(paths_to_raw_input_files))
395
+ logger.info(
396
+ f"Processing {len(paths_to_raw_input_files)} files using {num_cpus} CPUs"
397
+ )
398
+
399
+ # Convert existing_outputs_dir to Path if needed
400
+ existing_outputs_dir_path = (
401
+ Path(existing_outputs_dir) if existing_outputs_dir else None
402
+ )
403
+
404
+ # Process files in parallel using all available CPUs
405
+ inference_inputs = []
406
+ with ProcessPoolExecutor(max_workers=num_cpus) as executor:
407
+ # Submit all tasks
408
+ futures = [
409
+ executor.submit(
410
+ _process_single_path,
411
+ path,
412
+ existing_outputs_dir_path,
413
+ sharding_pattern,
414
+ template_selection,
415
+ ground_truth_conformer_selection,
416
+ )
417
+ for path in paths_to_raw_input_files
418
+ ]
419
+
420
+ # Collect results as they complete
421
+ for future in futures:
422
+ result = future.result()
423
+ inference_inputs.extend(result)
424
+
425
+ logger.info(f"Loaded {len(inference_inputs)} inference inputs")
426
+ return inference_inputs
427
+
428
+
429
+ def apply_atom_selection_mask(
430
+ atom_array: AtomArray, selection_list: Iterable[str]
431
+ ) -> np.ndarray:
432
+ """Return a combined boolean mask for a list of AtomSelectionStack queries.
433
+
434
+ Args:
435
+ atom_array: AtomArray to select from.
436
+ selection_list: Iterable of AtomSelectionStack queries (e.g., "*/LIG", "A1-10").
437
+
438
+ Returns:
439
+ A boolean numpy array of shape (num_atoms,) where True indicates a selected atom.
440
+ """
441
+ selection_mask = np.zeros(len(atom_array), dtype=bool)
442
+ for selection in selection_list:
443
+ if not selection:
444
+ continue
445
+ try:
446
+ selector = AtomSelectionStack.from_query(selection)
447
+ mask = selector.get_mask(atom_array)
448
+ selection_mask = selection_mask | mask
449
+ except Exception as exc: # Defensive: keep going if one selection fails
450
+ logging.warning(
451
+ "Failed to parse selection '%s': %s. Skipping.", selection, exc
452
+ )
453
+ return selection_mask
454
+
455
+
456
+ def apply_template_selection(
457
+ atom_array: AtomArray, template_selection: list[str] | str | None
458
+ ) -> AtomArray:
459
+ """Apply token-level template selection to `atom_array` with OR semantics.
460
+
461
+ If the `is_input_file_templated` annotation already exists, this function ORs
462
+ the new selection with the existing annotation. Otherwise, it creates it.
463
+
464
+ Args:
465
+ atom_array: AtomArray to annotate.
466
+ template_selection: Selection string(s). Single strings are converted to lists. If None/empty, no-op.
467
+
468
+ Returns:
469
+ The same AtomArray with `is_input_file_templated` updated.
470
+ """
471
+ # Convert to list if needed
472
+ template_selection_list = as_list(template_selection) if template_selection else []
473
+
474
+ if not template_selection_list:
475
+ # Ensure the annotation exists even if no selection provided
476
+ if "is_input_file_templated" not in atom_array.get_annotation_categories():
477
+ atom_array.set_annotation(
478
+ "is_input_file_templated", np.zeros(len(atom_array), dtype=bool)
479
+ )
480
+ return atom_array
481
+
482
+ # Build new mask
483
+ selection_mask = apply_atom_selection_mask(atom_array, template_selection_list)
484
+ logging.info(
485
+ "Selected %d atoms for token-level templating with %d syntaxes",
486
+ int(np.sum(selection_mask)),
487
+ len([s for s in template_selection_list if s]),
488
+ )
489
+
490
+ # OR with existing annotation if present
491
+ if "is_input_file_templated" in atom_array.get_annotation_categories():
492
+ existing = atom_array.get_annotation("is_input_file_templated").astype(bool)
493
+ selection_mask = existing | selection_mask
494
+ atom_array.set_annotation("is_input_file_templated", selection_mask)
495
+ return atom_array
496
+
497
+
498
+ def apply_ground_truth_conformer_selection(
499
+ atom_array: AtomArray, ground_truth_conformer_selection: list[str] | str | None
500
+ ) -> AtomArray:
501
+ """Apply ground-truth conformer policy selection with union semantics.
502
+
503
+ Behavior:
504
+ - Creates `ground_truth_conformer_policy` if missing and initializes to IGNORE.
505
+ - For selected atoms, sets policy to at least ADD without downgrading any
506
+ existing policy (e.g., preserves REPLACE if present).
507
+
508
+ Args:
509
+ atom_array: AtomArray to annotate.
510
+ ground_truth_conformer_selection: Selection string(s). Single strings are converted to lists. If None/empty, no-op.
511
+
512
+ Returns:
513
+ The same AtomArray with `ground_truth_conformer_policy` updated.
514
+ """
515
+ # Convert to list if needed
516
+ ground_truth_conformer_selection_list = (
517
+ as_list(ground_truth_conformer_selection)
518
+ if ground_truth_conformer_selection
519
+ else []
520
+ )
521
+
522
+ if not ground_truth_conformer_selection_list:
523
+ if (
524
+ "ground_truth_conformer_policy"
525
+ not in atom_array.get_annotation_categories()
526
+ ):
527
+ atom_array.set_annotation(
528
+ "ground_truth_conformer_policy",
529
+ np.full(
530
+ len(atom_array), GroundTruthConformerPolicy.IGNORE, dtype=np.int8
531
+ ),
532
+ )
533
+ return atom_array
534
+
535
+ # Ensure annotation exists
536
+ if "ground_truth_conformer_policy" not in atom_array.get_annotation_categories():
537
+ atom_array.set_annotation(
538
+ "ground_truth_conformer_policy",
539
+ np.full(len(atom_array), GroundTruthConformerPolicy.IGNORE, dtype=np.int8),
540
+ )
541
+
542
+ selection_mask = apply_atom_selection_mask(
543
+ atom_array, ground_truth_conformer_selection_list
544
+ )
545
+ logging.info(
546
+ "Selected %d atoms for ground-truth conformer policy with %d syntaxes",
547
+ int(np.sum(selection_mask)),
548
+ len([s for s in ground_truth_conformer_selection_list if s]),
549
+ )
550
+
551
+ existing = atom_array.get_annotation("ground_truth_conformer_policy")
552
+ existing[selection_mask] = GroundTruthConformerPolicy.ADD
553
+ atom_array.set_annotation("ground_truth_conformer_policy", existing)
554
+
555
+ return atom_array
556
+
557
+
558
+ def apply_conformer_and_template_selections(
559
+ atom_array: AtomArray,
560
+ template_selection: list[str] | str | None = None,
561
+ ground_truth_conformer_selection: list[str] | str | None = None,
562
+ ) -> AtomArray:
563
+ """Apply template and conformer selections and basic preprocessing.
564
+
565
+ This function replaces the former class method `prepare_atom_array`.
566
+
567
+ - Applies `apply_template_selection` then `apply_ground_truth_conformer_selection`.
568
+ - Replaces NaN coordinates with -1 for safety.
569
+
570
+ Args:
571
+ atom_array: AtomArray to prepare.
572
+ template_selection: Template selection string(s). Single strings are converted to lists.
573
+ ground_truth_conformer_selection: Ground-truth conformer selection string(s). Single strings are converted to lists.
574
+
575
+ Returns:
576
+ The same AtomArray with `is_input_file_templated` and `ground_truth_conformer_policy` updated.
577
+ """
578
+ atom_array = apply_template_selection(atom_array, template_selection)
579
+ atom_array = apply_ground_truth_conformer_selection(
580
+ atom_array, ground_truth_conformer_selection
581
+ )
582
+ # Safety: avoid unexpected behavior downstream
583
+ atom_array.coord[np.isnan(atom_array.coord)] = -1
584
+ return atom_array
585
+
586
+
587
+ def cyclize_atom_array(atom_array: AtomArray, cyclic_chains: list[str]) -> AtomArray:
588
+ """Cyclize the atom array by positioining the termini properly if not already done.
589
+
590
+ Behavior:
591
+ - Positions the last carbon atom in the chain to be 1.3 Angstroms away from the first nitrogen atom if they are not already close.
592
+ - Adds a bond between the termini for proper CIF output.
593
+
594
+ Args:
595
+ atom_array: AtomArray to cyclize.
596
+ cyclic_chains: List of chain IDs to cyclize.
597
+
598
+ Returns:
599
+ The same AtomArray with the specified chains cyclized.
600
+ """
601
+ for chain in cyclic_chains:
602
+ # Find the first nitrogen atom in the chain
603
+ nitrogen_mask = (atom_array.chain_id == chain) & (atom_array.atom_name == "N")
604
+ nitrogen_mask_indices = np.where(nitrogen_mask)[0]
605
+ first_nitrogen_index = nitrogen_mask_indices[0]
606
+ nitrogen_coord = atom_array.coord[first_nitrogen_index]
607
+
608
+ # move the last carbon atom in the chain to be 1.3 Angstroms away from the nitrogen
609
+ carbon_mask = (atom_array.chain_id == chain) & (atom_array.atom_name == "C")
610
+ carbon_mask_indices = np.where(carbon_mask)[0]
611
+ last_carbon_index = carbon_mask_indices[-1]
612
+ # check if the last carbon is already close to the nitrogen
613
+ termini_distance = np.linalg.norm(
614
+ atom_array.coord[last_carbon_index] - nitrogen_coord
615
+ )
616
+ if not (termini_distance < 1.5 and termini_distance > 0.5):
617
+ atom_array.coord[last_carbon_index] = nitrogen_coord + np.array(
618
+ [1.3, 0.0, 0.0]
619
+ )
620
+
621
+ # add a bond between the nitrogen and carbon so output cif has a connection
622
+ atom_array.bonds.add_bond(first_nitrogen_index, last_carbon_index)
623
+ atom_array.bonds.add_bond(last_carbon_index, first_nitrogen_index)
624
+
625
+ return atom_array
626
+
627
+
628
+ class InferenceInputDataset(Dataset):
629
+ """
630
+ Dataset for inference inputs. Also has a length key telling you the number of tokens in each example for LoadBalancedDistributedSampler.
631
+
632
+ To calculate the length of each example, we need to add the token_id annotation to the atom_array. If it doesn't exist yet, we add it,
633
+ calculate the length, and then remove it since the downstream pipeline may not be expecting it. That means the num_tokens key may not ultimately
634
+ be the same as what's actually used in the model, but this is a close enough approximation for load balancing.
635
+
636
+ Args:
637
+ inference_inputs: List of InferenceInput objects to wrap in a Dataset.
638
+ """
639
+
640
+ def __init__(self, inference_inputs: list[InferenceInput]):
641
+ self.inference_inputs = inference_inputs
642
+ self.key_to_balance = "num_tokens_approximate"
643
+
644
+ # LoadBalancedDistributedSampler checks in dataset.data[key_to_balance] to determine balancing.
645
+ # That means we need to make a dataframe in self.data that has a column with the key_to_balance.
646
+ atom_array_token_lens = []
647
+ for inf_input in self.inference_inputs:
648
+ if "token_id" not in inf_input.atom_array.get_annotation_categories():
649
+ inf_input.atom_array = add_global_token_id_annotation(
650
+ inf_input.atom_array
651
+ )
652
+ num_tokens = len(np.unique(inf_input.atom_array.token_id))
653
+
654
+ # remove the token_id annotation since the pipeline may not be expecting it
655
+ inf_input.atom_array.del_annotation("token_id")
656
+ else:
657
+ num_tokens = len(np.unique(inf_input.atom_array.token_id))
658
+ atom_array_token_lens.append(num_tokens)
659
+ self.data = pd.DataFrame({self.key_to_balance: atom_array_token_lens})
660
+
661
+ def __len__(self):
662
+ return len(self.inference_inputs)
663
+
664
+ def __getitem__(self, idx):
665
+ return self.inference_inputs[idx]
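
The template and conformer selection helpers above use OR/union semantics, so repeated calls accumulate annotations instead of overwriting them. Below is a minimal sketch of that behavior, assuming a local example.cif and the AtomSelectionStack query syntax quoted in the docstrings ("*/LIG", "A1-10"); the file name is hypothetical and the snippet is illustrative, not part of the wheel:

import numpy as np
from atomworks.io import parse
from rf3.utils.inference import apply_template_selection

# Parse a structure the same way InferenceInput.from_cif_path does (assumed local file).
parsed = parse("example.cif", hydrogen_policy="remove", keep_cif_block=True)
atom_array = (
    parsed["assemblies"]["1"][0] if "assemblies" in parsed else parsed["asym_unit"][0]
)

# First call creates the `is_input_file_templated` annotation for residues 1-10 of chain A.
atom_array = apply_template_selection(atom_array, "A1-10")
# Second call ORs in a ligand selection rather than replacing the existing annotation.
atom_array = apply_template_selection(atom_array, ["*/LIG"])

mask = atom_array.get_annotation("is_input_file_templated")
print(int(np.sum(mask)), "atoms flagged for token-level templating")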
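
Taken together, the loading helpers and the dataset wrapper give a short path from raw input files to pipeline inputs. A usage sketch follows, assuming an installed rf3 package, an inputs/ directory of CIF or JSON files, and a previous_runs/ directory from an earlier run (all hypothetical names, not part of the wheel):

from torch.utils.data import DataLoader

from rf3.utils.inference import (
    InferenceInputDataset,
    prepare_inference_inputs_from_paths,
)

# Scan inputs/ for supported files, skipping any example that already has a
# "<example_id>_metrics.csv" under previous_runs/.
inference_inputs = prepare_inference_inputs_from_paths(
    "inputs/",
    existing_outputs_dir="previous_runs/",
)

# The dataset exposes approximate per-example token counts in
# dataset.data["num_tokens_approximate"] for a load-balanced sampler.
dataset = InferenceInputDataset(inference_inputs)

# Each item is an InferenceInput; to_pipeline_input() produces the dict consumed
# by the downstream Transform pipeline.
loader = DataLoader(
    dataset,
    batch_size=1,
    collate_fn=lambda batch: [item.to_pipeline_input() for item in batch],
)
for batch in loader:
    print(batch[0]["example_id"], len(batch[0]["atom_array"]), "atoms")

Keeping batch_size at 1 mirrors the per-example inference pattern; the custom collate_fn simply defers the pipeline-input conversion until iteration.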