profam 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. profam-0.1.1/.github/workflows/pre-commit.yml +15 -0
  2. profam-0.1.1/.github/workflows/publish.yml +49 -0
  3. profam-0.1.1/.github/workflows/train.yml +75 -0
  4. profam-0.1.1/.gitignore +64 -0
  5. profam-0.1.1/.pre-commit-config.yaml +38 -0
  6. profam-0.1.1/.project-root +0 -0
  7. profam-0.1.1/.syncignore +27 -0
  8. profam-0.1.1/LICENSE +20 -0
  9. profam-0.1.1/PKG-INFO +296 -0
  10. profam-0.1.1/README.md +250 -0
  11. profam-0.1.1/README_ProFam_atlas.md +61 -0
  12. profam-0.1.1/configs/data/profam.yaml +153 -0
  13. profam-0.1.1/configs/data/train_example.yaml +155 -0
  14. profam-0.1.1/configs/experiment/replicate_profam1_preprint_version.yaml +243 -0
  15. profam-0.1.1/configs/experiment/train_profam_example.yaml +23 -0
  16. profam-0.1.1/configs/train.yaml +194 -0
  17. profam-0.1.1/data/.gitkeep +0 -0
  18. profam-0.1.1/data/generate_sequences_example/4_1_1_39_cluster.filtered.fasta +130 -0
  19. profam-0.1.1/data/generate_sequences_example/generate_sequences_test_case.fasta +4 -0
  20. profam-0.1.1/data/profam_logo_grey.png +0 -0
  21. profam-0.1.1/data/profam_tokenizer.json +290 -0
  22. profam-0.1.1/data/score_sequences_example/CCDB_ECOLI_Adkar_2012.a3m +2000 -0
  23. profam-0.1.1/data/score_sequences_example/CCDB_ECOLI_Adkar_2012.csv +1177 -0
  24. profam-0.1.1/data/train_example/OpenFold_OpenProteinSet/OpenFold_OpenProteinSet_example.mapping +6 -0
  25. profam-0.1.1/data/train_example/OpenFold_OpenProteinSet/OpenFold_OpenProteinSet_example.sequences +18 -0
  26. profam-0.1.1/data/train_example/foldseek_s50/foldseek_s50_example.mapping +6 -0
  27. profam-0.1.1/data/train_example/foldseek_s50/foldseek_s50_example.sequences +14 -0
  28. profam-0.1.1/data/train_example/ted/ted_example.mapping +6 -0
  29. profam-0.1.1/data/train_example/ted/ted_example.sequences +60 -0
  30. profam-0.1.1/data/train_example/uniref90/uniref90_example.mapping +6 -0
  31. profam-0.1.1/data/train_example/uniref90/uniref90_example.sequences +6 -0
  32. profam-0.1.1/data_creation_scripts/create_ted_text_min_20_max_90.py +305 -0
  33. profam-0.1.1/data_creation_scripts/fasta_to_text_memmap.py +96 -0
  34. profam-0.1.1/data_creation_scripts/mmseqs_train_test_split.py +700 -0
  35. profam-0.1.1/data_creation_scripts/openfold_process_msa_fragments.py +610 -0
  36. profam-0.1.1/launch.sh +468 -0
  37. profam-0.1.1/profam/__init__.py +6 -0
  38. profam-0.1.1/profam/cli/__init__.py +0 -0
  39. profam-0.1.1/profam/cli/generate_sequences.py +362 -0
  40. profam-0.1.1/profam/cli/score_sequences.py +418 -0
  41. profam-0.1.1/profam/configs/__init__.py +0 -0
  42. profam-0.1.1/profam/configs/data/__init__.py +0 -0
  43. profam-0.1.1/profam/configs/data/profam.yaml +153 -0
  44. profam-0.1.1/profam/configs/data/train_example.yaml +155 -0
  45. profam-0.1.1/profam/configs/experiment/__init__.py +0 -0
  46. profam-0.1.1/profam/configs/experiment/replicate_profam1_preprint_version.yaml +243 -0
  47. profam-0.1.1/profam/configs/experiment/train_profam_example.yaml +23 -0
  48. profam-0.1.1/profam/configs/train.yaml +194 -0
  49. profam-0.1.1/profam/constants.py +113 -0
  50. profam-0.1.1/profam/data/__init__.py +10 -0
  51. profam-0.1.1/profam/data/builders/__init__.py +2 -0
  52. profam-0.1.1/profam/data/builders/family_text_memmap_datasets.py +278 -0
  53. profam-0.1.1/profam/data/builders/proteingym.py +538 -0
  54. profam-0.1.1/profam/data/collators.py +412 -0
  55. profam-0.1.1/profam/data/datamodule.py +301 -0
  56. profam-0.1.1/profam/data/msa_subsampling.py +383 -0
  57. profam-0.1.1/profam/data/objects.py +350 -0
  58. profam-0.1.1/profam/data/online_sample_mapping.py +598 -0
  59. profam-0.1.1/profam/data/processors/__init__.py +5 -0
  60. profam-0.1.1/profam/data/processors/batch_transforms.py +133 -0
  61. profam-0.1.1/profam/data/processors/preprocessing.py +169 -0
  62. profam-0.1.1/profam/data/processors/transforms.py +394 -0
  63. profam-0.1.1/profam/data/profam_tokenizer.json +290 -0
  64. profam-0.1.1/profam/data/samplers.py +85 -0
  65. profam-0.1.1/profam/data/text_memmap_datasets.py +657 -0
  66. profam-0.1.1/profam/data/tokenizers.py +229 -0
  67. profam-0.1.1/profam/data/utils.py +30 -0
  68. profam-0.1.1/profam/download_checkpoint.py +76 -0
  69. profam-0.1.1/profam/evaluators/base.py +51 -0
  70. profam-0.1.1/profam/evaluators/esmfold.py +357 -0
  71. profam-0.1.1/profam/evaluators/hmmer.py +286 -0
  72. profam-0.1.1/profam/evaluators/identity.py +70 -0
  73. profam-0.1.1/profam/model_summary.py +37 -0
  74. profam-0.1.1/profam/models/__init__.py +0 -0
  75. profam-0.1.1/profam/models/base.py +1011 -0
  76. profam-0.1.1/profam/models/inference.py +849 -0
  77. profam-0.1.1/profam/models/llama.py +59 -0
  78. profam-0.1.1/profam/models/metrics.py +189 -0
  79. profam-0.1.1/profam/models/utils.py +100 -0
  80. profam-0.1.1/profam/pipelines/callback.py +111 -0
  81. profam-0.1.1/profam/pipelines/pipeline.py +588 -0
  82. profam-0.1.1/profam/sequence/alignment.py +373 -0
  83. profam-0.1.1/profam/sequence/fasta.py +151 -0
  84. profam-0.1.1/profam/sequence/utils.py +55 -0
  85. profam-0.1.1/profam/train.py +179 -0
  86. profam-0.1.1/profam/utils/__init__.py +7 -0
  87. profam-0.1.1/profam/utils/callbacks.py +547 -0
  88. profam-0.1.1/profam/utils/config_validation.py +88 -0
  89. profam-0.1.1/profam/utils/evaluation_utils.py +627 -0
  90. profam-0.1.1/profam/utils/instantiators.py +79 -0
  91. profam-0.1.1/profam/utils/loggers.py +99 -0
  92. profam-0.1.1/profam/utils/logging_utils.py +46 -0
  93. profam-0.1.1/profam/utils/profilers.py +57 -0
  94. profam-0.1.1/profam/utils/pylogger.py +55 -0
  95. profam-0.1.1/profam/utils/rich_utils.py +99 -0
  96. profam-0.1.1/profam/utils/sampling_utils.py +43 -0
  97. profam-0.1.1/profam/utils/throughput.py +275 -0
  98. profam-0.1.1/profam/utils/trainer.py +167 -0
  99. profam-0.1.1/profam/utils/utils.py +220 -0
  100. profam-0.1.1/pyproject.toml +102 -0
  101. profam-0.1.1/requirements-cpu.txt +86 -0
  102. profam-0.1.1/requirements-dev.txt +2 -0
  103. profam-0.1.1/requirements.txt +100 -0
  104. profam-0.1.1/scripts/remove_all_memmap_idx_files.py +34 -0
  105. profam-0.1.1/scripts/vocab.json +68 -0
  106. profam-0.1.1/scripts/wandb_sync_daemon.py +70 -0
  107. profam-0.1.1/tests/__init__.py +0 -0
  108. profam-0.1.1/tests/conftest.py +162 -0
  109. profam-0.1.1/tests/data/test_dataclasses.py +16 -0
  110. profam-0.1.1/tests/data/test_fasta.py +80 -0
  111. profam-0.1.1/tests/data/test_packing.py +74 -0
  112. profam-0.1.1/tests/helpers/__init__.py +0 -0
  113. profam-0.1.1/tests/helpers/package_available.py +32 -0
  114. profam-0.1.1/tests/helpers/run_if.py +140 -0
  115. profam-0.1.1/tests/helpers/run_sh_command.py +22 -0
  116. profam-0.1.1/tests/model/test_scoring.py +181 -0
  117. profam-0.1.1/tests/model/test_sequence_packing.py +139 -0
  118. profam-0.1.1/tests/test_convert_sequence_with_positions.py +128 -0
  119. profam-0.1.1/tests/test_metrics.py +390 -0
  120. profam-0.1.1/tests/test_packaging.py +32 -0
  121. profam-0.1.1/tests/test_tokenizer.py +25 -0
  122. profam-0.1.1/tests/test_train_sample_score.py +191 -0
  123. profam-0.1.1/tests/test_transforms.py +47 -0
  124. profam-0.1.1/uv.lock +3827 -0
@@ -0,0 +1,15 @@
1
+ # https://github.com/pre-commit/action?tab=readme-ov-file
2
+ name: pre-commit
3
+
4
+ on:
5
+ pull_request:
6
+ push:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ pre-commit:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ - uses: actions/setup-python@v3
15
+ - uses: pre-commit/action@v3.0.1
@@ -0,0 +1,49 @@
1
+ name: Publish
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ publish-testpypi:
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+
18
+ - name: Set up uv
19
+ uses: astral-sh/setup-uv@v7
20
+ with:
21
+ enable-cache: true
22
+
23
+ - name: Install development dependencies
24
+ run: uv sync --group dev
25
+
26
+ - name: Run packaging tests
27
+ run: uv run --group dev pytest tests/test_packaging.py
28
+
29
+ - name: Build package
30
+ run: uv build --no-sources
31
+
32
+ - name: Check package metadata
33
+ run: uv tool run twine check dist/*
34
+
35
+ - name: Smoke test built wheel
36
+ run: |
37
+ tmpdir="$(mktemp -d)"
38
+ uv venv "$tmpdir/venv"
39
+ uv pip install --python "$tmpdir/venv/bin/python" dist/profam-*.whl
40
+ cd "$tmpdir"
41
+ "$tmpdir/venv/bin/python" -c "import profam; from profam.constants import CONFIGS_DIR, TOKENIZER_FILE; print(profam.__version__); assert CONFIGS_DIR.is_dir(); assert TOKENIZER_FILE.is_file()"
42
+ "$tmpdir/venv/bin/profam-download-checkpoint" --help
43
+ "$tmpdir/venv/bin/profam-train" --help
44
+ "$tmpdir/venv/bin/profam-model-summary" --help
45
+ "$tmpdir/venv/bin/profam-generate-sequences" --help
46
+ "$tmpdir/venv/bin/profam-score-sequences" --help
47
+
48
+ - name: Publish to TestPyPI
49
+ run: uv publish --index testpypi --token "${{ secrets.TEST_PYPI_API_TOKEN }}"
@@ -0,0 +1,75 @@
1
+ name: Train Model
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ pull_request:
6
+ branches: [master, main]
7
+
8
+ jobs:
9
+ train:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: '3.11'
19
+
20
+ - name: Set up uv
21
+ uses: astral-sh/setup-uv@v7
22
+ with:
23
+ enable-cache: true
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ uv sync --group dev
28
+
29
+ - name: Build package
30
+ run: |
31
+ uv build --no-sources
32
+
33
+ - name: Check package metadata
34
+ run: |
35
+ uv tool run twine check dist/*
36
+
37
+ - name: Smoke test built wheel
38
+ run: |
39
+ tmpdir="$(mktemp -d)"
40
+ uv venv "$tmpdir/venv"
41
+ uv pip install --python "$tmpdir/venv/bin/python" dist/profam-*.whl
42
+ cd "$tmpdir"
43
+ "$tmpdir/venv/bin/python" -c "import profam; from profam.constants import CONFIGS_DIR, TOKENIZER_FILE; print(profam.__version__); assert CONFIGS_DIR.is_dir(); assert TOKENIZER_FILE.is_file()"
44
+ "$tmpdir/venv/bin/profam-download-checkpoint" --help
45
+ "$tmpdir/venv/bin/profam-train" --help
46
+ "$tmpdir/venv/bin/profam-model-summary" --help
47
+ "$tmpdir/venv/bin/profam-generate-sequences" --help
48
+ "$tmpdir/venv/bin/profam-score-sequences" --help
49
+
50
+ # TODO: seed?
51
+ - name: Test
52
+ run: |
53
+ pytest -k "not example"
54
+ - name: Train model
55
+ run: |
56
+ HYDRA_FULL_ERROR=1 OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
57
+ python -m profam.train \
58
+ data=train_example \
59
+ model.config.hidden_size=64 \
60
+ model.config.intermediate_size=128 \
61
+ model.config.num_attention_heads=1 \
62
+ model.config.num_key_value_heads=1 \
63
+ model.config.num_hidden_layers=1 \
64
+ model.config.attn_implementation=eager \
65
+ trainer.accelerator=cpu \
66
+ trainer.strategy=auto \
67
+ trainer.devices=1 \
68
+ trainer.max_steps=2 \
69
+ +trainer.num_sanity_val_steps=0 \
70
+ +trainer.limit_val_batches=1 \
71
+ +trainer.enable_checkpointing=false \
72
+ logger=null \
73
+ callbacks=null \
74
+ data.num_workers=0 \
75
+ data.prefetch_factor=null
@@ -0,0 +1,64 @@
1
+ **/__pycache__/
2
+ **/*.pyc
3
+ *.pyc
4
+ .vscode/
5
+ benchmark_results/
6
+ logs/
7
+ .idea/
8
+ *.DS_Store
9
+ *.egg-info/
10
+ build/
11
+ venvPF/
12
+ .vscode/
13
+ wandb
14
+ outputs
15
+ .git
16
+ *.o*
17
+ nohup.out
18
+ commit_hash.txt
19
+ .pytest_cache
20
+ data/val_test/pfam/test/clustered_split_fastas/
21
+ data/val_test/pfam/val/clustered_split_fastas/
22
+ data/val_test/overlap_counts/
23
+ proteingym_variants/
24
+ debug_gym_results/
25
+ data/val_test/pfam/test/random_split_fastas/
26
+ out/
27
+ data/val_test/pfam/val/random_split_fastas/
28
+ data/all_heldout_sequences.csv
29
+ data/val_test_heldout_representative_sequences.fasta
30
+ data/val_test/ted_esmif_accessions_split.json
31
+ src/models/tranception
32
+ data/val_test/overlap_counts/
33
+ src/models/tranception
34
+ data/val_test/overlap_counts/
35
+ data/cath_chain_topology_class.pickle
36
+ scripts/adhoc_analysis/redundant_scripts/
37
+ */.ipynb_checkpoints/
38
+ .ipynb_checkpoints/
39
+ .env
40
+ .bash_history
41
+ .cache
42
+ .ipython
43
+ .local
44
+ .python_history
45
+ .dotnet
46
+ .gitconfig
47
+ .gnupg
48
+ .ssh
49
+ .vscode-server
50
+ src/tools/foldtoken/
51
+ results/
52
+ out/
53
+ .lesshst
54
+ .zshrc
55
+ .zcompdump
56
+ .viminfo
57
+ .venv/
58
+ evo_finetune_gym/
59
+ model_checkpoints/profam-1/checkpoints/last.ckpt
60
+ tests/test_min20_max90_mapping_alignment.py
61
+ model_checkpoints/
62
+ data/train_example/*/*.idx.info
63
+ data/train_example/*/*.idx.npy
64
+ data/score_sequences_example/*_weights.npz
@@ -0,0 +1,38 @@
1
+ repos:
2
+ - repo: https://github.com/PyCQA/isort
3
+ rev: 5.13.2
4
+ hooks:
5
+ - id: isort
6
+ args: ["--profile", "black"]
7
+ exclude: '^data'
8
+
9
+ - repo: https://github.com/psf/black
10
+ rev: 22.10.0
11
+ hooks:
12
+ - id: black
13
+ exclude: '^data'
14
+ - repo: https://github.com/pre-commit/pre-commit-hooks
15
+ rev: v2.3.0
16
+ hooks:
17
+ - id: check-yaml
18
+ exclude: '^data'
19
+ - id: end-of-file-fixer
20
+ exclude: '^data'
21
+ - id: trailing-whitespace
22
+ exclude: '^data'
23
+ # exclude: '^data|^scripts/gvp'
24
+ # - repo: https://github.com/PyCQA/flake8
25
+ # rev: 4.0.1
26
+ # hooks:
27
+ # - id: flake8
28
+ # name: "Linter"
29
+ # args:
30
+ # - --config=setup.cfg
31
+ # additional_dependencies:
32
+ # - pep8-naming
33
+ # - flake8-builtins
34
+ # - flake8-comprehensions
35
+ # - flake8-bugbear
36
+ # - flake8-pytest-style
37
+ # - flake8-cognitive-complexity
38
+ # exclude: '^evoif/gvp|^evoif/lm-design'
File without changes
@@ -0,0 +1,27 @@
1
+ # ignore these files from rsync --exclude-from=.syncignore
2
+ # n.b. commit_hash.txt is ignored by .gitignore but not here!
3
+ **/__pycache__/
4
+ **/*.pyc
5
+ .vscode/
6
+ benchmark_results/
7
+ logs/
8
+ .idea/
9
+ *.DS_Store
10
+ *.egg-info/
11
+ build/
12
+ venvPF/
13
+ .vscode/
14
+ wandb
15
+ outputs
16
+ .git
17
+ nohup.out
18
+ .pytest_cache
19
+ data/val_test/pfam/test/clustered_split_fastas/
20
+ data/val_test/pfam/val/clustered_split_fastas/
21
+ data/val_test/overlap_counts/
22
+ data/val_test/pfam/test/random_split_fastas/
23
+ data/val_test/pfam/val/random_split_fastas/
24
+ src/models/tranception
25
+ data/val_test/overlap_counts/
26
+ src/models/tranception
27
+ data/val_test/overlap_counts/
profam-0.1.1/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+
2
+ Copyright (c) 2025 UCL & NVIDIA
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all
12
+ copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ SOFTWARE.
profam-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,296 @@
1
+ Metadata-Version: 2.4
2
+ Name: profam
3
+ Version: 0.1.1
4
+ Summary: Protein family language models
5
+ Project-URL: Homepage, https://github.com/alex-hh/profam
6
+ Project-URL: Repository, https://github.com/alex-hh/profam
7
+ Project-URL: Issues, https://github.com/alex-hh/profam/issues
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: bioinformatics,machine-learning,protein-design,protein-language-model
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: accelerate>=0.30
21
+ Requires-Dist: biopython>=1.80
22
+ Requires-Dist: biotite>=1.0
23
+ Requires-Dist: huggingface-hub>=0.23
24
+ Requires-Dist: hydra-core>=1.3
25
+ Requires-Dist: lightning>=2.1
26
+ Requires-Dist: matplotlib>=3.7
27
+ Requires-Dist: numba>=0.58
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: omegaconf>=2.3
30
+ Requires-Dist: pandas>=2.0
31
+ Requires-Dist: python-dotenv>=1.0
32
+ Requires-Dist: rich>=13.0
33
+ Requires-Dist: safetensors>=0.4
34
+ Requires-Dist: scikit-learn>=1.3
35
+ Requires-Dist: scipy>=1.10
36
+ Requires-Dist: tokenizers>=0.19
37
+ Requires-Dist: torch>=2.1
38
+ Requires-Dist: torchmetrics>=1.0
39
+ Requires-Dist: tqdm>=4.65
40
+ Requires-Dist: transformers<5,>=4.40
41
+ Requires-Dist: typer-slim>=0.9
42
+ Requires-Dist: wandb>=0.16
43
+ Provides-Extra: flash-attn
44
+ Requires-Dist: flash-attn>=2.5; extra == 'flash-attn'
45
+ Description-Content-Type: text/markdown
46
+
47
+ <div align="center">
48
+
49
+ <img src="data/profam_logo_grey.png" alt="ProFam logo" width="720" />
50
+
51
+ # ProFam: Open-Source Protein Family Language Modelling for Fitness Prediction and Design
52
+
53
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
54
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
55
+ [![PyPI version](https://img.shields.io/pypi/v/profam.svg)](https://pypi.org/project/profam/)
56
+ [![DOI](https://img.shields.io/badge/DOI-10.64898%2F2025.12.19.695431-blue.svg)](https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1)
57
+
58
+ </div>
59
+
60
+ ProFam is an open-source toolkit for training, scoring, and generating protein sequences with protein family language models (pfLMs). It packages the **ProFam-1** 251M-parameter pfLM together with open training and inference workflows, a downloadable pretrained checkpoint, and an open dataset release for reproducible experimentation.
61
+
62
+ ## Installation
63
+
64
+ ### From PyPI
65
+
66
+ Install ProFam as a standard Python package:
67
+
68
+ ```bash
69
+ uv pip install profam
70
+ ```
71
+
72
+ or
73
+
74
+ ```bash
75
+ pip install profam
76
+ ```
77
+
78
+ ### From Source
79
+
80
+ If you want the full repository workflows, example data, and inference scripts:
81
+
82
+ ```bash
83
+ git clone https://github.com/alex-hh/profam.git
84
+ cd profam
85
+ uv sync
86
+ uv run profam-download-checkpoint
87
+ ```
88
+
89
+ Optional installs:
90
+
91
+ - Development tooling: `uv sync --group dev`
92
+ - FlashAttention 2: `uv sync --extra flash-attn`
93
+
94
+ If you run into CUDA or `flash-attn` issues, see [Installation Details](#installation-details).
95
+
96
+ ## Quickstart
97
+
98
+ ### Verify the installed package
99
+
100
+ ```bash
101
+ uv run --with profam --no-project -- python -c "import profam; print(profam.__version__)"
102
+ ```
103
+
104
+ ### Run a lightweight training example
105
+
106
+ The bundled example config uses the small dataset under `data/train_example`:
107
+
108
+ ```bash
109
+ uv run profam-train experiment=train_profam_example logger=null_logger
110
+ ```
111
+
112
+ ### Download the pretrained checkpoint
113
+
114
+ ```bash
115
+ uv run profam-download-checkpoint
116
+ ```
117
+
118
+ ## Main Workflows
119
+
120
+ | Workflow | Purpose | Command |
121
+ | --- | --- | --- |
122
+ | Train | Train a ProFam model with Hydra configs | `uv run profam-train` |
123
+ | Example training | Run a lightweight smoke test on example data | `uv run profam-train experiment=train_profam_example logger=null_logger` |
124
+ | Model summary | Print a model architecture summary | `uv run profam-model-summary` |
125
+ | Download checkpoint | Fetch the pretrained `ProFam-1` checkpoint | `uv run profam-download-checkpoint` |
126
+ | Generate sequences | Sample new sequences from family prompts | `uv run profam-generate-sequences ...` |
127
+ | Score sequences | Score candidate sequences with family context | `uv run profam-score-sequences ...` |
128
+
129
+ The packaged CLI now covers the main package entrypoints, including training, checkpoint download, sequence generation, and sequence scoring.
130
+
131
+ ## Input Sequence Formats
132
+
133
+ ProFam supports:
134
+
135
+ - **Unaligned FASTA** for standard protein sequence inputs
136
+ - **Aligned / MSA-style files** such as A2M/A3M content with gaps and insertions
137
+
138
+ For `profam-score-sequences`, we recommend providing an aligned MSA file because sequence weighting is used to encourage diversity when subsampling prompt sequences. Even when aligned inputs are provided, the standard ProFam model converts them into unaligned gap-free sequences before the forward pass.
139
+
140
+ During preprocessing:
141
+
142
+ - gaps (`-` and alignment-like `.`) are removed
143
+ - lowercase insertions are converted to uppercase
144
+ - `U -> C` and `O -> K`
145
+ - remaining out-of-vocabulary characters map to `[UNK]` only when `allow_unk=true`
146
+
147
+ ## Training
148
+
149
+ ### Run a lightweight example
150
+
151
+ `configs/experiment/train_profam_example.yaml` is configured to run on the bundled example data:
152
+
153
+ ```bash
154
+ uv run profam-train experiment=train_profam_example logger=null_logger
155
+ ```
156
+
157
+ ### Train with the ProFam-Atlas dataset
158
+
159
+ Training data for ProFam can be downloaded from:
160
+
161
+ - [Zenodo: ProFam Atlas Dataset](https://zenodo.org/records/17713590)
162
+
163
+ The default configuration in `configs/train.yaml` is compatible with the latest ProFam-Atlas release:
164
+
165
+ ```bash
166
+ uv run profam-train
167
+ ```
168
+
169
+ ## Resources
170
+
171
+ - [bioRxiv preprint](https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1)
172
+ - [Hugging Face: ProFam-1 checkpoint](https://huggingface.co/judewells/ProFam-1)
173
+ - [Zenodo: ProFam Atlas Dataset](https://zenodo.org/records/17713590)
174
+ - [GitHub repository](https://github.com/alex-hh/profam)
175
+
176
+ ## Citation
177
+
178
+ If you use ProFam in your work, please cite the preprint:
179
+
180
+ ```bibtex
181
+ @article{wells2025profam,
182
+ title = {ProFam: Open-Source Protein Family Language Modelling for Fitness Prediction and Design},
183
+ author = {Wells, Jude and Hawkins Hooker, Alex and Livne, Micha and Lin, Weining and Miller, David and Dallago, Christian and Bordin, Nicola and Paige, Brooks and Rost, Burkhard and Orengo, Christine and Heinzinger, Michael},
184
+ journal = {bioRxiv},
185
+ year = {2025},
186
+ doi = {10.64898/2025.12.19.695431},
187
+ url = {https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1}
188
+ }
189
+ ```
190
+
191
+ ## Installation Details
192
+
193
+ ### CPU-only installation
194
+
195
+ ```bash
196
+ uv sync
197
+ uv pip install torch --index-url https://download.pytorch.org/whl/cpu
198
+ ```
199
+
200
+ ### FlashAttention 2
201
+
202
+ We recommend installing FlashAttention 2 for faster scoring and generation. For training, it is strongly recommended because ProFam uses sequence packing with `batch_size=1` and no padding.
203
+
204
+ If you need to train without Flash Attention, update the configuration to set `data.pack_to_max_tokens=null`.
205
+
206
+ ```bash
207
+ uv sync --extra flash-attn
208
+ python -c "import flash_attn; print(flash_attn.__version__)"
209
+ ```
210
+
211
+ ### Troubleshooting: conda fallback
212
+
213
+ If a matching `flash-attn` wheel is unavailable and a source build is required, this conda-based fallback is often the easiest route:
214
+
215
+ ```bash
216
+ conda create -n pfenv python=3.11 -y
217
+ conda activate pfenv
218
+
219
+ conda install -c conda-forge ninja packaging -y
220
+ conda install -c nvidia cuda-toolkit=12.4 -y
221
+
222
+ pip install profam
223
+
224
+ # install a CUDA-enabled PyTorch build (adjust CUDA version/index-url to match your setup)
225
+ pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 --index-url https://download.pytorch.org/whl/cu121
226
+
227
+ pip install setuptools wheel packaging psutil numpy
228
+ pip install flash-attn==2.5.6 --no-build-isolation
229
+
230
+ python -c "import flash_attn; print(flash_attn.__version__)"
231
+ ```
232
+
233
+ ## Development
234
+
235
+ We're using pre-commit to format code and pytest to run tests.
236
+
237
+ Pull requests will automatically have pre-commit and pytest run on them
238
+ and will only be approved once these checks are all passing.
239
+
240
+ Before submitting a pull request, run the checks locally with:
241
+
242
+ ```bash
243
+ uv run --group dev pre-commit run --all-files
244
+ ```
245
+
246
+ and
247
+
248
+ ```bash
249
+ uv run --group dev pytest -k 'not example'
250
+ ```
251
+
252
+ Pull requests adding complex new features or making any significant changes
253
+ or additions should be accompanied by associated tests in the tests/ directory.
254
+
255
+ ## Concepts
256
+
257
+ ### Data loading
258
+
259
+ ProFam uses **text memmap datasets**
260
+ for fast random access over large corpora:
261
+
262
+ - `profam/data/text_memmap_datasets.py`: generic **memory-mapped** line access + index building (`*.idx.{npy,info}`)
263
+ - `profam/data/builders/family_text_memmap_datasets.py`: ProFam-Atlas-specific datasets built on top of the memmap layer
264
+
265
+ #### ProFam-Atlas on-disk format (`.mapping` / `.sequences`)
266
+
267
+ The ProFam-Atlas dataset is distributed as paired files:
268
+
269
+ - **`*.mapping`**: family id + indices into one or more `*.sequences` files
270
+ - **Format**:
271
+ - Line 1: `>FAMILY_ID`
272
+ - Line 2+: `sequences_filename:idx0,idx1,idx2,...`
273
+ - **Important**: `*.mapping` files **must not** have a trailing newline at end-of-file.
274
+ - **`*.sequences`**: FASTA-like accessions + sequences
275
+ - **Format** (repeated):
276
+ - `>ACCESSION ...`
277
+ - `SEQUENCE`
278
+ - **Important**: `*.sequences` files **should** have a final trailing newline.
279
+
280
+ See `README_ProFam_atlas.md` for examples and additional details.
281
+
282
+ #### How it’s loaded
283
+
284
+ At a high level, training loads one **protein family** at a time by:
285
+
286
+ 1. Reading a family record from `MappingProteinFamilyMemmapDataset` (a memmapped `*.mapping` dataset)
287
+ 2. Fetching the referenced sequences from `SequencesProteinFamilyMemmapDataset` (memmapped `*.sequences` files)
288
+ 3. Building a `ProteinDocument` and preprocessing it (see `profam/data/processors/preprocessing.py`)
289
+ 4. Encoding with `ProFamTokenizer` and forming batches (optionally with packing)
290
+
291
+ #### Converting FASTA → text memmap
292
+
293
+ If you have a directory of per-family FASTA files and want to create `*.mapping` / `*.sequences` files for training,
294
+ see:
295
+
296
+ - `data_creation_scripts/fasta_to_text_memmap.py`