profam 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. profam-0.1.1/.github/workflows/pre-commit.yml +15 -0
  2. profam-0.1.1/.github/workflows/publish.yml +49 -0
  3. profam-0.1.1/.github/workflows/train.yml +75 -0
  4. profam-0.1.1/.gitignore +64 -0
  5. profam-0.1.1/.pre-commit-config.yaml +38 -0
  6. profam-0.1.1/.project-root +0 -0
  7. profam-0.1.1/.syncignore +27 -0
  8. profam-0.1.1/LICENSE +20 -0
  9. profam-0.1.1/PKG-INFO +296 -0
  10. profam-0.1.1/README.md +250 -0
  11. profam-0.1.1/README_ProFam_atlas.md +61 -0
  12. profam-0.1.1/configs/data/profam.yaml +153 -0
  13. profam-0.1.1/configs/data/train_example.yaml +155 -0
  14. profam-0.1.1/configs/experiment/replicate_profam1_preprint_version.yaml +243 -0
  15. profam-0.1.1/configs/experiment/train_profam_example.yaml +23 -0
  16. profam-0.1.1/configs/train.yaml +194 -0
  17. profam-0.1.1/data/.gitkeep +0 -0
  18. profam-0.1.1/data/generate_sequences_example/4_1_1_39_cluster.filtered.fasta +130 -0
  19. profam-0.1.1/data/generate_sequences_example/generate_sequences_test_case.fasta +4 -0
  20. profam-0.1.1/data/profam_logo_grey.png +0 -0
  21. profam-0.1.1/data/profam_tokenizer.json +290 -0
  22. profam-0.1.1/data/score_sequences_example/CCDB_ECOLI_Adkar_2012.a3m +2000 -0
  23. profam-0.1.1/data/score_sequences_example/CCDB_ECOLI_Adkar_2012.csv +1177 -0
  24. profam-0.1.1/data/train_example/OpenFold_OpenProteinSet/OpenFold_OpenProteinSet_example.mapping +6 -0
  25. profam-0.1.1/data/train_example/OpenFold_OpenProteinSet/OpenFold_OpenProteinSet_example.sequences +18 -0
  26. profam-0.1.1/data/train_example/foldseek_s50/foldseek_s50_example.mapping +6 -0
  27. profam-0.1.1/data/train_example/foldseek_s50/foldseek_s50_example.sequences +14 -0
  28. profam-0.1.1/data/train_example/ted/ted_example.mapping +6 -0
  29. profam-0.1.1/data/train_example/ted/ted_example.sequences +60 -0
  30. profam-0.1.1/data/train_example/uniref90/uniref90_example.mapping +6 -0
  31. profam-0.1.1/data/train_example/uniref90/uniref90_example.sequences +6 -0
  32. profam-0.1.1/data_creation_scripts/create_ted_text_min_20_max_90.py +305 -0
  33. profam-0.1.1/data_creation_scripts/fasta_to_text_memmap.py +96 -0
  34. profam-0.1.1/data_creation_scripts/mmseqs_train_test_split.py +700 -0
  35. profam-0.1.1/data_creation_scripts/openfold_process_msa_fragments.py +610 -0
  36. profam-0.1.1/launch.sh +468 -0
  37. profam-0.1.1/profam/__init__.py +6 -0
  38. profam-0.1.1/profam/cli/__init__.py +0 -0
  39. profam-0.1.1/profam/cli/generate_sequences.py +362 -0
  40. profam-0.1.1/profam/cli/score_sequences.py +418 -0
  41. profam-0.1.1/profam/configs/__init__.py +0 -0
  42. profam-0.1.1/profam/configs/data/__init__.py +0 -0
  43. profam-0.1.1/profam/configs/data/profam.yaml +153 -0
  44. profam-0.1.1/profam/configs/data/train_example.yaml +155 -0
  45. profam-0.1.1/profam/configs/experiment/__init__.py +0 -0
  46. profam-0.1.1/profam/configs/experiment/replicate_profam1_preprint_version.yaml +243 -0
  47. profam-0.1.1/profam/configs/experiment/train_profam_example.yaml +23 -0
  48. profam-0.1.1/profam/configs/train.yaml +194 -0
  49. profam-0.1.1/profam/constants.py +113 -0
  50. profam-0.1.1/profam/data/__init__.py +10 -0
  51. profam-0.1.1/profam/data/builders/__init__.py +2 -0
  52. profam-0.1.1/profam/data/builders/family_text_memmap_datasets.py +278 -0
  53. profam-0.1.1/profam/data/builders/proteingym.py +538 -0
  54. profam-0.1.1/profam/data/collators.py +412 -0
  55. profam-0.1.1/profam/data/datamodule.py +301 -0
  56. profam-0.1.1/profam/data/msa_subsampling.py +383 -0
  57. profam-0.1.1/profam/data/objects.py +350 -0
  58. profam-0.1.1/profam/data/online_sample_mapping.py +598 -0
  59. profam-0.1.1/profam/data/processors/__init__.py +5 -0
  60. profam-0.1.1/profam/data/processors/batch_transforms.py +133 -0
  61. profam-0.1.1/profam/data/processors/preprocessing.py +169 -0
  62. profam-0.1.1/profam/data/processors/transforms.py +394 -0
  63. profam-0.1.1/profam/data/profam_tokenizer.json +290 -0
  64. profam-0.1.1/profam/data/samplers.py +85 -0
  65. profam-0.1.1/profam/data/text_memmap_datasets.py +657 -0
  66. profam-0.1.1/profam/data/tokenizers.py +229 -0
  67. profam-0.1.1/profam/data/utils.py +30 -0
  68. profam-0.1.1/profam/download_checkpoint.py +76 -0
  69. profam-0.1.1/profam/evaluators/base.py +51 -0
  70. profam-0.1.1/profam/evaluators/esmfold.py +357 -0
  71. profam-0.1.1/profam/evaluators/hmmer.py +286 -0
  72. profam-0.1.1/profam/evaluators/identity.py +70 -0
  73. profam-0.1.1/profam/model_summary.py +37 -0
  74. profam-0.1.1/profam/models/__init__.py +0 -0
  75. profam-0.1.1/profam/models/base.py +1011 -0
  76. profam-0.1.1/profam/models/inference.py +849 -0
  77. profam-0.1.1/profam/models/llama.py +59 -0
  78. profam-0.1.1/profam/models/metrics.py +189 -0
  79. profam-0.1.1/profam/models/utils.py +100 -0
  80. profam-0.1.1/profam/pipelines/callback.py +111 -0
  81. profam-0.1.1/profam/pipelines/pipeline.py +588 -0
  82. profam-0.1.1/profam/sequence/alignment.py +373 -0
  83. profam-0.1.1/profam/sequence/fasta.py +151 -0
  84. profam-0.1.1/profam/sequence/utils.py +55 -0
  85. profam-0.1.1/profam/train.py +179 -0
  86. profam-0.1.1/profam/utils/__init__.py +7 -0
  87. profam-0.1.1/profam/utils/callbacks.py +547 -0
  88. profam-0.1.1/profam/utils/config_validation.py +88 -0
  89. profam-0.1.1/profam/utils/evaluation_utils.py +627 -0
  90. profam-0.1.1/profam/utils/instantiators.py +79 -0
  91. profam-0.1.1/profam/utils/loggers.py +99 -0
  92. profam-0.1.1/profam/utils/logging_utils.py +46 -0
  93. profam-0.1.1/profam/utils/profilers.py +57 -0
  94. profam-0.1.1/profam/utils/pylogger.py +55 -0
  95. profam-0.1.1/profam/utils/rich_utils.py +99 -0
  96. profam-0.1.1/profam/utils/sampling_utils.py +43 -0
  97. profam-0.1.1/profam/utils/throughput.py +275 -0
  98. profam-0.1.1/profam/utils/trainer.py +167 -0
  99. profam-0.1.1/profam/utils/utils.py +220 -0
  100. profam-0.1.1/pyproject.toml +102 -0
  101. profam-0.1.1/requirements-cpu.txt +86 -0
  102. profam-0.1.1/requirements-dev.txt +2 -0
  103. profam-0.1.1/requirements.txt +100 -0
  104. profam-0.1.1/scripts/remove_all_memmap_idx_files.py +34 -0
  105. profam-0.1.1/scripts/vocab.json +68 -0
  106. profam-0.1.1/scripts/wandb_sync_daemon.py +70 -0
  107. profam-0.1.1/tests/__init__.py +0 -0
  108. profam-0.1.1/tests/conftest.py +162 -0
  109. profam-0.1.1/tests/data/test_dataclasses.py +16 -0
  110. profam-0.1.1/tests/data/test_fasta.py +80 -0
  111. profam-0.1.1/tests/data/test_packing.py +74 -0
  112. profam-0.1.1/tests/helpers/__init__.py +0 -0
  113. profam-0.1.1/tests/helpers/package_available.py +32 -0
  114. profam-0.1.1/tests/helpers/run_if.py +140 -0
  115. profam-0.1.1/tests/helpers/run_sh_command.py +22 -0
  116. profam-0.1.1/tests/model/test_scoring.py +181 -0
  117. profam-0.1.1/tests/model/test_sequence_packing.py +139 -0
  118. profam-0.1.1/tests/test_convert_sequence_with_positions.py +128 -0
  119. profam-0.1.1/tests/test_metrics.py +390 -0
  120. profam-0.1.1/tests/test_packaging.py +32 -0
  121. profam-0.1.1/tests/test_tokenizer.py +25 -0
  122. profam-0.1.1/tests/test_train_sample_score.py +191 -0
  123. profam-0.1.1/tests/test_transforms.py +47 -0
  124. profam-0.1.1/uv.lock +3827 -0
@@ -0,0 +1,15 @@
1
+ # https://github.com/pre-commit/action?tab=readme-ov-file
2
+ name: pre-commit
3
+
4
+ on:
5
+ pull_request:
6
+ push:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ pre-commit:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ - uses: actions/setup-python@v3
15
+ - uses: pre-commit/action@v3.0.1
@@ -0,0 +1,49 @@
1
+ name: Publish
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ publish-testpypi:
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+
18
+ - name: Set up uv
19
+ uses: astral-sh/setup-uv@v7
20
+ with:
21
+ enable-cache: true
22
+
23
+ - name: Install development dependencies
24
+ run: uv sync --group dev
25
+
26
+ - name: Run packaging tests
27
+ run: uv run --group dev pytest tests/test_packaging.py
28
+
29
+ - name: Build package
30
+ run: uv build --no-sources
31
+
32
+ - name: Check package metadata
33
+ run: uv tool run twine check dist/*
34
+
35
+ - name: Smoke test built wheel
36
+ run: |
37
+ tmpdir="$(mktemp -d)"
38
+ uv venv "$tmpdir/venv"
39
+ uv pip install --python "$tmpdir/venv/bin/python" dist/profam-*.whl
40
+ cd "$tmpdir"
41
+ "$tmpdir/venv/bin/python" -c "import profam; from profam.constants import CONFIGS_DIR, TOKENIZER_FILE; print(profam.__version__); assert CONFIGS_DIR.is_dir(); assert TOKENIZER_FILE.is_file()"
42
+ "$tmpdir/venv/bin/profam-download-checkpoint" --help
43
+ "$tmpdir/venv/bin/profam-train" --help
44
+ "$tmpdir/venv/bin/profam-model-summary" --help
45
+ "$tmpdir/venv/bin/profam-generate-sequences" --help
46
+ "$tmpdir/venv/bin/profam-score-sequences" --help
47
+
48
+ - name: Publish to TestPyPI
49
+ run: uv publish --index testpypi --token "${{ secrets.TEST_PYPI_API_TOKEN }}"
@@ -0,0 +1,75 @@
1
+ name: Train Model
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ pull_request:
6
+ branches: [master, main]
7
+
8
+ jobs:
9
+ train:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: '3.11'
19
+
20
+ - name: Set up uv
21
+ uses: astral-sh/setup-uv@v7
22
+ with:
23
+ enable-cache: true
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ uv sync --group dev
28
+
29
+ - name: Build package
30
+ run: |
31
+ uv build --no-sources
32
+
33
+ - name: Check package metadata
34
+ run: |
35
+ uv tool run twine check dist/*
36
+
37
+ - name: Smoke test built wheel
38
+ run: |
39
+ tmpdir="$(mktemp -d)"
40
+ uv venv "$tmpdir/venv"
41
+ uv pip install --python "$tmpdir/venv/bin/python" dist/profam-*.whl
42
+ cd "$tmpdir"
43
+ "$tmpdir/venv/bin/python" -c "import profam; from profam.constants import CONFIGS_DIR, TOKENIZER_FILE; print(profam.__version__); assert CONFIGS_DIR.is_dir(); assert TOKENIZER_FILE.is_file()"
44
+ "$tmpdir/venv/bin/profam-download-checkpoint" --help
45
+ "$tmpdir/venv/bin/profam-train" --help
46
+ "$tmpdir/venv/bin/profam-model-summary" --help
47
+ "$tmpdir/venv/bin/profam-generate-sequences" --help
48
+ "$tmpdir/venv/bin/profam-score-sequences" --help
49
+
50
+ # TODO: seed?
51
+ - name: Test
52
+ run: |
53
+ pytest -k "not example"
54
+ - name: Train model
55
+ run: |
56
+ HYDRA_FULL_ERROR=1 OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
57
+ python -m profam.train \
58
+ data=train_example \
59
+ model.config.hidden_size=64 \
60
+ model.config.intermediate_size=128 \
61
+ model.config.num_attention_heads=1 \
62
+ model.config.num_key_value_heads=1 \
63
+ model.config.num_hidden_layers=1 \
64
+ model.config.attn_implementation=eager \
65
+ trainer.accelerator=cpu \
66
+ trainer.strategy=auto \
67
+ trainer.devices=1 \
68
+ trainer.max_steps=2 \
69
+ +trainer.num_sanity_val_steps=0 \
70
+ +trainer.limit_val_batches=1 \
71
+ +trainer.enable_checkpointing=false \
72
+ logger=null \
73
+ callbacks=null \
74
+ data.num_workers=0 \
75
+ data.prefetch_factor=null
@@ -0,0 +1,64 @@
1
+ **/__pycache__/
2
+ **/*.pyc
3
+ *.pyc
4
+ .vscode/
5
+ benchmark_results/
6
+ logs/
7
+ .idea/
8
+ *.DS_Store
9
+ *.egg-info/
10
+ build/
11
+ venvPF/
12
+ .vscode/
13
+ wandb
14
+ outputs
15
+ .git
16
+ *.o*
17
+ nohup.out
18
+ commit_hash.txt
19
+ .pytest_cache
20
+ data/val_test/pfam/test/clustered_split_fastas/
21
+ data/val_test/pfam/val/clustered_split_fastas/
22
+ data/val_test/overlap_counts/
23
+ proteingym_variants/
24
+ debug_gym_results/
25
+ data/val_test/pfam/test/random_split_fastas/
26
+ out/
27
+ data/val_test/pfam/val/random_split_fastas/
28
+ data/all_heldout_sequences.csv
29
+ data/val_test_heldout_representative_sequences.fasta
30
+ data/val_test/ted_esmif_accessions_split.json
31
+ src/models/tranception
32
+ data/val_test/overlap_counts/
33
+ src/models/tranception
34
+ data/val_test/overlap_counts/
35
+ data/cath_chain_topology_class.pickle
36
+ scripts/adhoc_analysis/redundant_scripts/
37
+ */.ipynb_checkpoints/
38
+ .ipynb_checkpoints/
39
+ .env
40
+ .bash_history
41
+ .cache
42
+ .ipython
43
+ .local
44
+ .python_history
45
+ .dotnet
46
+ .gitconfig
47
+ .gnupg
48
+ .ssh
49
+ .vscode-server
50
+ src/tools/foldtoken/
51
+ results/
52
+ out/
53
+ .lesshst
54
+ .zshrc
55
+ .zcompdump
56
+ .viminfo
57
+ .venv/
58
+ evo_finetune_gym/
59
+ model_checkpoints/profam-1/checkpoints/last.ckpt
60
+ tests/test_min20_max90_mapping_alignment.py
61
+ model_checkpoints/
62
+ data/train_example/*/*.idx.info
63
+ data/train_example/*/*.idx.npy
64
+ data/score_sequences_example/*_weights.npz
@@ -0,0 +1,38 @@
1
+ repos:
2
+ - repo: https://github.com/PyCQA/isort
3
+ rev: 5.13.2
4
+ hooks:
5
+ - id: isort
6
+ args: ["--profile", "black"]
7
+ exclude: '^data'
8
+
9
+ - repo: https://github.com/psf/black
10
+ rev: 22.10.0
11
+ hooks:
12
+ - id: black
13
+ exclude: '^data'
14
+ - repo: https://github.com/pre-commit/pre-commit-hooks
15
+ rev: v2.3.0
16
+ hooks:
17
+ - id: check-yaml
18
+ exclude: '^data'
19
+ - id: end-of-file-fixer
20
+ exclude: '^data'
21
+ - id: trailing-whitespace
22
+ exclude: '^data'
23
+ # exclude: '^data|^scripts/gvp'
24
+ # - repo: https://github.com/PyCQA/flake8
25
+ # rev: 4.0.1
26
+ # hooks:
27
+ # - id: flake8
28
+ # name: "Linter"
29
+ # args:
30
+ # - --config=setup.cfg
31
+ # additional_dependencies:
32
+ # - pep8-naming
33
+ # - flake8-builtins
34
+ # - flake8-comprehensions
35
+ # - flake8-bugbear
36
+ # - flake8-pytest-style
37
+ # - flake8-cognitive-complexity
38
+ # exclude: '^evoif/gvp|^evoif/lm-design'
File without changes
@@ -0,0 +1,27 @@
1
+ # ignore these files from rsync --exclude-from=.syncignore
2
+ # n.b. commit_hash.txt is ignored by .gitignore but not here!
3
+ **/__pycache__/
4
+ **/*.pyc
5
+ .vscode/
6
+ benchmark_results/
7
+ logs/
8
+ .idea/
9
+ *.DS_Store
10
+ *.egg-info/
11
+ build/
12
+ venvPF/
13
+ .vscode/
14
+ wandb
15
+ outputs
16
+ .git
17
+ nohup.out
18
+ .pytest_cache
19
+ data/val_test/pfam/test/clustered_split_fastas/
20
+ data/val_test/pfam/val/clustered_split_fastas/
21
+ data/val_test/overlap_counts/
22
+ data/val_test/pfam/test/random_split_fastas/
23
+ data/val_test/pfam/val/random_split_fastas/
24
+ src/models/tranception
25
+ data/val_test/overlap_counts/
26
+ src/models/tranception
27
+ data/val_test/overlap_counts/
profam-0.1.1/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+
2
+ Copyright (c) 2025 UCL & NVIDIA
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all
12
+ copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ SOFTWARE.
profam-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,296 @@
1
+ Metadata-Version: 2.4
2
+ Name: profam
3
+ Version: 0.1.1
4
+ Summary: Protein family language models
5
+ Project-URL: Homepage, https://github.com/alex-hh/profam
6
+ Project-URL: Repository, https://github.com/alex-hh/profam
7
+ Project-URL: Issues, https://github.com/alex-hh/profam/issues
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: bioinformatics,machine-learning,protein-design,protein-language-model
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: accelerate>=0.30
21
+ Requires-Dist: biopython>=1.80
22
+ Requires-Dist: biotite>=1.0
23
+ Requires-Dist: huggingface-hub>=0.23
24
+ Requires-Dist: hydra-core>=1.3
25
+ Requires-Dist: lightning>=2.1
26
+ Requires-Dist: matplotlib>=3.7
27
+ Requires-Dist: numba>=0.58
28
+ Requires-Dist: numpy>=1.24
29
+ Requires-Dist: omegaconf>=2.3
30
+ Requires-Dist: pandas>=2.0
31
+ Requires-Dist: python-dotenv>=1.0
32
+ Requires-Dist: rich>=13.0
33
+ Requires-Dist: safetensors>=0.4
34
+ Requires-Dist: scikit-learn>=1.3
35
+ Requires-Dist: scipy>=1.10
36
+ Requires-Dist: tokenizers>=0.19
37
+ Requires-Dist: torch>=2.1
38
+ Requires-Dist: torchmetrics>=1.0
39
+ Requires-Dist: tqdm>=4.65
40
+ Requires-Dist: transformers<5,>=4.40
41
+ Requires-Dist: typer-slim>=0.9
42
+ Requires-Dist: wandb>=0.16
43
+ Provides-Extra: flash-attn
44
+ Requires-Dist: flash-attn>=2.5; extra == 'flash-attn'
45
+ Description-Content-Type: text/markdown
46
+
47
+ <div align="center">
48
+
49
+ <img src="data/profam_logo_grey.png" alt="ProFam logo" width="720" />
50
+
51
+ # ProFam: Open-Source Protein Family Language Modelling for Fitness Prediction and Design
52
+
53
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
54
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
55
+ [![PyPI version](https://img.shields.io/pypi/v/profam.svg)](https://pypi.org/project/profam/)
56
+ [![DOI](https://img.shields.io/badge/DOI-10.64898%2F2025.12.19.695431-blue.svg)](https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1)
57
+
58
+ </div>
59
+
60
+ ProFam is an open-source toolkit for training, scoring, and generating protein sequences with protein family language models (pfLMs). It packages the **ProFam-1** 251M-parameter pfLM together with open training and inference workflows, a downloadable pretrained checkpoint, and an open dataset release for reproducible experimentation.
61
+
62
+ ## Installation
63
+
64
+ ### From PyPI
65
+
66
+ Install ProFam as a standard Python package:
67
+
68
+ ```bash
69
+ uv pip install profam
70
+ ```
71
+
72
+ or
73
+
74
+ ```bash
75
+ pip install profam
76
+ ```
77
+
78
+ ### From Source
79
+
80
+ If you want the full repository workflows, example data, and inference scripts:
81
+
82
+ ```bash
83
+ git clone https://github.com/alex-hh/profam.git
84
+ cd profam
85
+ uv sync
86
+ uv run profam-download-checkpoint
87
+ ```
88
+
89
+ Optional installs:
90
+
91
+ - Development tooling: `uv sync --group dev`
92
+ - FlashAttention 2: `uv sync --extra flash-attn`
93
+
94
+ If you run into CUDA or `flash-attn` issues, see [Installation Details](#installation-details).
95
+
96
+ ## Quickstart
97
+
98
+ ### Verify the installed package
99
+
100
+ ```bash
101
+ uv run --with profam --no-project -- python -c "import profam; print(profam.__version__)"
102
+ ```
103
+
104
+ ### Run a lightweight training example
105
+
106
+ The bundled example config uses the small dataset under `data/train_example`:
107
+
108
+ ```bash
109
+ uv run profam-train experiment=train_profam_example logger=null_logger
110
+ ```
111
+
112
+ ### Download the pretrained checkpoint
113
+
114
+ ```bash
115
+ uv run profam-download-checkpoint
116
+ ```
117
+
118
+ ## Main Workflows
119
+
120
+ | Workflow | Purpose | Command |
121
+ | --- | --- | --- |
122
+ | Train | Train a ProFam model with Hydra configs | `uv run profam-train` |
123
+ | Example training | Run a lightweight smoke test on example data | `uv run profam-train experiment=train_profam_example logger=null_logger` |
124
+ | Model summary | Print a model architecture summary | `uv run profam-model-summary` |
125
+ | Download checkpoint | Fetch the pretrained `ProFam-1` checkpoint | `uv run profam-download-checkpoint` |
126
+ | Generate sequences | Sample new sequences from family prompts | `uv run profam-generate-sequences ...` |
127
+ | Score sequences | Score candidate sequences with family context | `uv run profam-score-sequences ...` |
128
+
129
+ The packaged CLI now covers the main package entrypoints, including training, checkpoint download, sequence generation, and sequence scoring.
130
+
131
+ ## Input Sequence Formats
132
+
133
+ ProFam supports:
134
+
135
+ - **Unaligned FASTA** for standard protein sequence inputs
136
+ - **Aligned / MSA-style files** such as A2M/A3M content with gaps and insertions
137
+
138
+ For `profam-score-sequences`, we recommend providing an aligned MSA file because sequence weighting is used to encourage diversity when subsampling prompt sequences. Even when aligned inputs are provided, the standard ProFam model converts them into unaligned gap-free sequences before the forward pass.
139
+
140
+ During preprocessing:
141
+
142
+ - gaps (`-` and alignment-like `.`) are removed
143
+ - lowercase insertions are converted to uppercase
144
+ - `U -> C` and `O -> K`
145
+ - remaining out-of-vocabulary characters map to `[UNK]` only when `allow_unk=true`
146
+
147
+ ## Training
148
+
149
+ ### Run a lightweight example
150
+
151
+ `configs/experiment/train_profam_example.yaml` is configured to run on the bundled example data:
152
+
153
+ ```bash
154
+ uv run profam-train experiment=train_profam_example logger=null_logger
155
+ ```
156
+
157
+ ### Train with the ProFam-Atlas dataset
158
+
159
+ Training data for ProFam can be downloaded from:
160
+
161
+ - [Zenodo: ProFam Atlas Dataset](https://zenodo.org/records/17713590)
162
+
163
+ The default configuration in `configs/train.yaml` is compatible with the latest ProFam-Atlas release:
164
+
165
+ ```bash
166
+ uv run profam-train
167
+ ```
168
+
169
+ ## Resources
170
+
171
+ - [bioRxiv preprint](https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1)
172
+ - [Hugging Face: ProFam-1 checkpoint](https://huggingface.co/judewells/ProFam-1)
173
+ - [Zenodo: ProFam Atlas Dataset](https://zenodo.org/records/17713590)
174
+ - [GitHub repository](https://github.com/alex-hh/profam)
175
+
176
+ ## Citation
177
+
178
+ If you use ProFam in your work, please cite the preprint:
179
+
180
+ ```bibtex
181
+ @article{wells2025profam,
182
+ title = {ProFam: Open-Source Protein Family Language Modelling for Fitness Prediction and Design},
183
+ author = {Wells, Jude and Hawkins Hooker, Alex and Livne, Micha and Lin, Weining and Miller, David and Dallago, Christian and Bordin, Nicola and Paige, Brooks and Rost, Burkhard and Orengo, Christine and Heinzinger, Michael},
184
+ journal = {bioRxiv},
185
+ year = {2025},
186
+ doi = {10.64898/2025.12.19.695431},
187
+ url = {https://www.biorxiv.org/content/10.64898/2025.12.19.695431v1}
188
+ }
189
+ ```
190
+
191
+ ## Installation Details
192
+
193
+ ### CPU-only installation
194
+
195
+ ```bash
196
+ uv sync
197
+ uv pip install torch --index-url https://download.pytorch.org/whl/cpu
198
+ ```
199
+
200
+ ### FlashAttention 2
201
+
202
+ We recommend installing FlashAttention 2 for faster scoring and generation. For training, it is strongly recommended because ProFam uses sequence packing with `batch_size=1` and no padding.
203
+
204
+ If you need to train without Flash Attention, update the configuration to set `data.pack_to_max_tokens=null`.
205
+
206
+ ```bash
207
+ uv sync --extra flash-attn
208
+ python -c "import flash_attn; print(flash_attn.__version__)"
209
+ ```
210
+
211
+ ### Troubleshooting: conda fallback
212
+
213
+ If a matching `flash-attn` wheel is unavailable and a source build is required, this conda-based fallback is often the easiest route:
214
+
215
+ ```bash
216
+ conda create -n pfenv python=3.11 -y
217
+ conda activate pfenv
218
+
219
+ conda install -c conda-forge ninja packaging -y
220
+ conda install -c nvidia cuda-toolkit=12.4 -y
221
+
222
+ pip install profam
223
+
224
+ # install a CUDA-enabled PyTorch build (adjust CUDA version/index-url to match your setup)
225
+ pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 --index-url https://download.pytorch.org/whl/cu121
226
+
227
+ pip install setuptools wheel packaging psutil numpy
228
+ pip install flash-attn==2.5.6 --no-build-isolation
229
+
230
+ python -c "import flash_attn; print(flash_attn.__version__)"
231
+ ```
232
+
233
+ ## Development
234
+
235
+ We're using pre-commit to format code and pytest to run tests.
236
+
237
+ Pull requests will automatically have pre-commit and pytest run on them
238
+ and will only be approved once these checks are all passing.
239
+
240
+ Before submitting a pull request, run the checks locally with:
241
+
242
+ ```bash
243
+ uv run --group dev pre-commit run --all-files
244
+ ```
245
+
246
+ and
247
+
248
+ ```bash
249
+ uv run --group dev pytest -k 'not example'
250
+ ```
251
+
252
+ Pull requests adding complex new features or making any significant changes
253
+ or additions should be accompanied by associated tests in the tests/ directory.
254
+
255
+ ## Concepts
256
+
257
+ ### Data loading
258
+
259
+ ProFam uses **text memmap datasets**
260
+ for fast random access over large corpora:
261
+
262
+ - `profam/data/text_memmap_datasets.py`: generic **memory-mapped** line access + index building (`*.idx.{npy,info}`)
263
+ - `profam/data/builders/family_text_memmap_datasets.py`: ProFam-Atlas-specific datasets built on top of the memmap layer
264
+
265
+ #### ProFam-Atlas on-disk format (`.mapping` / `.sequences`)
266
+
267
+ The ProFam-Atlas dataset is distributed as paired files:
268
+
269
+ - **`*.mapping`**: family id + indices into one or more `*.sequences` files
270
+ - **Format**:
271
+ - Line 1: `>FAMILY_ID`
272
+ - Line 2+: `sequences_filename:idx0,idx1,idx2,...`
273
+ - **Important**: `*.mapping` files **must not** have a trailing newline at end-of-file.
274
+ - **`*.sequences`**: FASTA-like accessions + sequences
275
+ - **Format** (repeated):
276
+ - `>ACCESSION ...`
277
+ - `SEQUENCE`
278
+ - **Important**: `*.sequences` files **should** have a final trailing newline.
279
+
280
+ See `README_ProFam_atlas.md` for examples and additional details.
281
+
282
+ #### How it’s loaded
283
+
284
+ At a high level, training loads one **protein family** at a time by:
285
+
286
+ 1. Reading a family record from `MappingProteinFamilyMemmapDataset` (a memmapped `*.mapping` dataset)
287
+ 2. Fetching the referenced sequences from `SequencesProteinFamilyMemmapDataset` (memmapped `*.sequences` files)
288
+ 3. Building a `ProteinDocument` and preprocessing it (see `profam/data/processors/preprocessing.py`)
289
+ 4. Encoding with `ProFamTokenizer` and forming batches (optionally with packing)
290
+
291
+ #### Converting FASTA → text memmap
292
+
293
+ If you have a directory of per-family FASTA files and want to create `*.mapping` / `*.sequences` files for training,
294
+ see:
295
+
296
+ - `data_creation_scripts/fasta_to_text_memmap.py`