RP3Net 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. rp3net-0.0.1/.github/workflows/pypi-publish.yml +49 -0
  2. rp3net-0.0.1/.github/workflows/python-app-training.yml +29 -0
  3. rp3net-0.0.1/.github/workflows/python-app.yml +29 -0
  4. rp3net-0.0.1/.github/workflows/testpypi-publish.yml +53 -0
  5. rp3net-0.0.1/.gitignore +182 -0
  6. rp3net-0.0.1/.vscode/launch.json +23 -0
  7. rp3net-0.0.1/.vscode/settings.json +18 -0
  8. rp3net-0.0.1/LICENSE +21 -0
  9. rp3net-0.0.1/PKG-INFO +77 -0
  10. rp3net-0.0.1/README.md +53 -0
  11. rp3net-0.0.1/config/trainer_a.yml +59 -0
  12. rp3net-0.0.1/config/trainer_b.yml +66 -0
  13. rp3net-0.0.1/config/trainer_d.yml +143 -0
  14. rp3net-0.0.1/config/trainer_ebi_test.yml +10 -0
  15. rp3net-0.0.1/config/trainer_mac.yml +11 -0
  16. rp3net-0.0.1/pyproject.toml +39 -0
  17. rp3net-0.0.1/setup.cfg +4 -0
  18. rp3net-0.0.1/src/RP3Net/__init__.py +8 -0
  19. rp3net-0.0.1/src/RP3Net/fm_cfg/esm2_650m/config.json +29 -0
  20. rp3net-0.0.1/src/RP3Net/fm_cfg/esm2_650m/special_tokens_map.json +7 -0
  21. rp3net-0.0.1/src/RP3Net/fm_cfg/esm2_650m/tokenizer_config.json +4 -0
  22. rp3net-0.0.1/src/RP3Net/fm_cfg/esm2_650m/vocab.txt +33 -0
  23. rp3net-0.0.1/src/RP3Net/model/__init__.py +1 -0
  24. rp3net-0.0.1/src/RP3Net/model/layers.py +171 -0
  25. rp3net-0.0.1/src/RP3Net/model/model.py +233 -0
  26. rp3net-0.0.1/src/RP3Net/rp3_main.py +85 -0
  27. rp3net-0.0.1/src/RP3Net/rp3_train.py +18 -0
  28. rp3net-0.0.1/src/RP3Net/training/__init__.py +6 -0
  29. rp3net-0.0.1/src/RP3Net/training/cli.py +166 -0
  30. rp3net-0.0.1/src/RP3Net/training/data.py +300 -0
  31. rp3net-0.0.1/src/RP3Net/training/data_emlc.py +94 -0
  32. rp3net-0.0.1/src/RP3Net/training/lm.py +123 -0
  33. rp3net-0.0.1/src/RP3Net/training/lm_emlc.py +400 -0
  34. rp3net-0.0.1/src/RP3Net/training/metrics.py +357 -0
  35. rp3net-0.0.1/src/RP3Net/util/__init__.py +3 -0
  36. rp3net-0.0.1/src/RP3Net/util/fasta.py +26 -0
  37. rp3net-0.0.1/src/RP3Net/util/torch.py +89 -0
  38. rp3net-0.0.1/src/RP3Net/util/util.py +65 -0
  39. rp3net-0.0.1/src/RP3Net.egg-info/PKG-INFO +77 -0
  40. rp3net-0.0.1/src/RP3Net.egg-info/SOURCES.txt +49 -0
  41. rp3net-0.0.1/src/RP3Net.egg-info/dependency_links.txt +1 -0
  42. rp3net-0.0.1/src/RP3Net.egg-info/entry_points.txt +3 -0
  43. rp3net-0.0.1/src/RP3Net.egg-info/requires.txt +11 -0
  44. rp3net-0.0.1/src/RP3Net.egg-info/top_level.txt +1 -0
  45. rp3net-0.0.1/tests/__init__.py +0 -0
  46. rp3net-0.0.1/tests/rp3_test.py +73 -0
  47. rp3net-0.0.1/tests/split_test.py +106 -0
  48. rp3net-0.0.1/tests_ebi/__init__.py +0 -0
  49. rp3net-0.0.1/tests_ebi/rp3_ebi_test.py +70 -0
  50. rp3net-0.0.1/tests_training/__init__.py +0 -0
  51. rp3net-0.0.1/tests_training/rp3_public_checkpoint_test.py +35 -0
@@ -0,0 +1,49 @@
1
+ name: Publish Python distribution to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v[0-9]+.[0-9]+.[0-9]+' # Trigger on version tags like v1.0.0
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ persist-credentials: false
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+ - name: Install pypa/build
22
+ run: python3 -m pip install build --user
23
+ - name: Build a binary wheel and a source tarball
24
+ run: python3 -m build
25
+ - name: Store the distribution packages
26
+ uses: actions/upload-artifact@v4
27
+ with:
28
+ name: python-package-distributions
29
+ path: dist/
30
+
31
+ publish-to-pypi:
32
+ name: Publish Python distribution to PyPI
33
+ needs:
34
+ - build
35
+ runs-on: ubuntu-latest
36
+ environment:
37
+ name: pypi
38
+ url: https://pypi.org/p/RP3Net
39
+ permissions:
40
+ id-token: write # IMPORTANT: mandatory for trusted publishing
41
+
42
+ steps:
43
+ - name: Download all the dists
44
+ uses: actions/download-artifact@v4
45
+ with:
46
+ name: python-package-distributions
47
+ path: dist/
48
+ - name: Publish distribution to PyPI
49
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,29 @@
1
+ # This workflow installs RP3Net with its training extras and runs the unit tests in tests_training with a single version of Python
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Build and test RP3Net with training dependencies
5
+
6
+ on:
7
+ push:
8
+ branches: [ "main" ]
9
+ pull_request:
10
+ branches: [ "main" ]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ test:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: '3.12'
24
+ - name: Install
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install .[training]
28
+ - name: Test with unittest
29
+ run: python -m unittest discover -s ./tests_training -p '*test.py'
@@ -0,0 +1,29 @@
1
+ # This workflow installs RP3Net without its training extras and runs the unit tests in tests with a single version of Python
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Build and test RP3Net without training dependencies
5
+
6
+ on:
7
+ push:
8
+ branches: [ "main" ]
9
+ pull_request:
10
+ branches: [ "main" ]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ test:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: '3.12'
24
+ - name: Install
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install .
28
+ - name: Test with unittest
29
+ run: python -m unittest discover -s ./tests -p '*test.py'
@@ -0,0 +1,53 @@
1
+ name: Publish Python distribution to TestPyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v[0-9]+.[0-9]+.[0-9]+*' # Trigger on version tags like v0.0.1rc1
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ persist-credentials: false
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+ - name: Install pypa/build
22
+ run: python3 -m pip install build --user
23
+ - name: Build a binary wheel and a source tarball
24
+ run: python3 -m build
25
+ - name: Store the distribution packages
26
+ uses: actions/upload-artifact@v4
27
+ with:
28
+ name: python-package-distributions
29
+ path: dist/
30
+
31
+ publish-to-testpypi:
32
+ name: Publish Python distribution to TestPyPI
33
+ needs:
34
+ - build
35
+ runs-on: ubuntu-latest
36
+
37
+ environment:
38
+ name: testpypi
39
+ url: https://test.pypi.org/p/RP3Net
40
+
41
+ permissions:
42
+ id-token: write # IMPORTANT: mandatory for trusted publishing
43
+
44
+ steps:
45
+ - name: Download all the dists
46
+ uses: actions/download-artifact@v4
47
+ with:
48
+ name: python-package-distributions
49
+ path: dist/
50
+ - name: Publish distribution to TestPyPI
51
+ uses: pypa/gh-action-pypi-publish@release/v1
52
+ with:
53
+ repository-url: https://test.pypi.org/legacy/
@@ -0,0 +1,182 @@
1
+ .scripts
2
+ .vscode
3
+ .venv
4
+ scratch*
5
+ .secrets
6
+ .env
7
+ .nb
8
+ ~$*
9
+ .DS_Store
10
+ *log
11
+ .history
12
+
13
+ # LaTeX
14
+ *.aux
15
+ *.fdb_latexmk
16
+ *.fls
17
+ *.synctex.gz
18
+
19
+ .idea/
20
+ .RData
21
+
22
+ *.session.sql
23
+
24
+ # Byte-compiled / optimized / DLL files
25
+ __pycache__/
26
+ *.py[cod]
27
+ *$py.class
28
+
29
+ # C extensions
30
+ *.so
31
+
32
+ # Distribution / packaging
33
+ .Python
34
+ build/
35
+ develop-eggs/
36
+ dist/
37
+ downloads/
38
+ eggs/
39
+ .eggs/
40
+ lib/
41
+ lib64/
42
+ parts/
43
+ sdist/
44
+ var/
45
+ wheels/
46
+ share/python-wheels/
47
+ *.egg-info/
48
+ .installed.cfg
49
+ *.egg
50
+ MANIFEST
51
+
52
+ # PyInstaller
53
+ # Usually these files are written by a python script from a template
54
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
55
+ *.manifest
56
+ *.spec
57
+
58
+ # Installer logs
59
+ pip-log.txt
60
+ pip-delete-this-directory.txt
61
+
62
+ # Unit test / coverage reports
63
+ htmlcov/
64
+ .tox/
65
+ .nox/
66
+ .coverage
67
+ .coverage.*
68
+ .cache
69
+ nosetests.xml
70
+ coverage.xml
71
+ *.cover
72
+ *.py,cover
73
+ .hypothesis/
74
+ .pytest_cache/
75
+ cover/
76
+
77
+ # Translations
78
+ *.mo
79
+ *.pot
80
+
81
+ # Django stuff:
82
+ *.log
83
+ local_settings.py
84
+ db.sqlite3
85
+ db.sqlite3-journal
86
+
87
+ # Flask stuff:
88
+ instance/
89
+ .webassets-cache
90
+
91
+ # Scrapy stuff:
92
+ .scrapy
93
+
94
+ # Sphinx documentation
95
+ docs/_build/
96
+
97
+ # PyBuilder
98
+ .pybuilder/
99
+ target/
100
+
101
+ # Jupyter Notebook
102
+ .ipynb_checkpoints
103
+
104
+ # IPython
105
+ profile_default/
106
+ ipython_config.py
107
+
108
+ # pyenv
109
+ # For a library or package, you might want to ignore these files since the code is
110
+ # intended to run in multiple environments; otherwise, check them in:
111
+ # .python-version
112
+
113
+ # pipenv
114
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
115
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
116
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
117
+ # install all needed dependencies.
118
+ #Pipfile.lock
119
+
120
+ # poetry
121
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
122
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
123
+ # commonly ignored for libraries.
124
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
125
+ #poetry.lock
126
+
127
+ # pdm
128
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
129
+ #pdm.lock
130
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
131
+ # in version control.
132
+ # https://pdm.fming.dev/#use-with-ide
133
+ .pdm.toml
134
+
135
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
136
+ __pypackages__/
137
+
138
+ # Celery stuff
139
+ celerybeat-schedule
140
+ celerybeat.pid
141
+
142
+ # SageMath parsed files
143
+ *.sage.py
144
+
145
+ # Environments
146
+ # .venv
147
+ # env/
148
+ # venv/
149
+ # ENV/
150
+ # env.bak/
151
+ # venv.bak/
152
+
153
+ # Spyder project settings
154
+ .spyderproject
155
+ .spyproject
156
+
157
+ # Rope project settings
158
+ .ropeproject
159
+
160
+ # mkdocs documentation
161
+ /site
162
+
163
+ # mypy
164
+ .mypy_cache/
165
+ .dmypy.json
166
+ dmypy.json
167
+
168
+ # Pyre type checker
169
+ .pyre/
170
+
171
+ # pytype static type analyzer
172
+ .pytype/
173
+
174
+ # Cython debug symbols
175
+ cython_debug/
176
+
177
+ # PyCharm
178
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
179
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
180
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
181
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
182
+ #.idea/
@@ -0,0 +1,23 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Fit",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "program": "src/RP3Net/rp3_train.py",
12
+ "console": "integratedTerminal",
13
+ "justMyCode": false,
14
+ // "args": ["fit", "-c", "config/trainer_a.yml",
15
+ // "--trainer.default_root_dir", "/tmp/pp",
16
+ // "--trainer.accelerator", "gpu",
17
+ // ],
18
+ "args": ["fit", "-c", "~/test/rp3/emlc.yml"],
19
+ "cwd": "${workspaceFolder}"
20
+ },
21
+
22
+ ]
23
+ }
@@ -0,0 +1,18 @@
1
+ {
2
+ "python.testing.unittestArgs": [
3
+ "-v",
4
+ "-s",
5
+ ".",
6
+ "-p",
7
+ "*_test.py"
8
+ ],
9
+ "python.testing.pytestEnabled": false,
10
+ "python.testing.unittestEnabled": true,
11
+ "jupyter.notebookFileRoot": "${workspaceFolder}/src",
12
+ "terminal.integrated.env.linux": {
13
+ "PYTHONPATH": "${workspaceFolder}/src"
14
+ },
15
+ "terminal.integrated.env.osx": {
16
+ "PYTHONPATH": "${workspaceFolder}/src"
17
+ },
18
+ }
rp3net-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 tanhevg
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
rp3net-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.4
2
+ Name: RP3Net
3
+ Version: 0.0.1
4
+ Summary: RP3Net: Modelling of recombinant soluble protein production in E. coli
5
+ Maintainer-email: Evgeny Tankhilevich <evgeny@ebi.ac.uk>
6
+ License: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: <4.0,>=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: torch==2.5.*
14
+ Requires-Dist: pandas
15
+ Requires-Dist: transformers==4.46.*
16
+ Requires-Dist: ml_collections
17
+ Requires-Dist: peft
18
+ Requires-Dist: tqdm
19
+ Provides-Extra: training
20
+ Requires-Dist: lightning[pytorch-extra]==2.4.*; extra == "training"
21
+ Requires-Dist: polars==1.27.*; extra == "training"
22
+ Requires-Dist: wandb; extra == "training"
23
+ Dynamic: license-file
24
+
25
+ # RP3Net
26
+ RP3Net is an AI model for predicting the results of recombinant small-scale protein production in _E. coli_ from the construct sequence. See [the preprint](https://www.biorxiv.org/content/10.1101/2025.05.13.652824v1) and [supplements](https://ftp.ebi.ac.uk/pub/software/RP3Net/) for more details on how it works.
27
+
28
+ # Checkpoints
29
+ * https://ftp.ebi.ac.uk/pub/software/RP3Net/v0.1/checkpoints/
30
+
31
+ # Inference
32
+ ## Installation
33
+ ```
34
+ pip install RP3Net
35
+ ```
36
+
37
+ ## Command line
38
+ Simple usage:
39
+ ```
40
+ rp3 -p <path_to_checkpoint_file> -f <in_fasta_file> -o <out_csv_file>
41
+ ```
42
+ The `out_csv_file` will contain the dataframe with the ids from the `in_fasta_file` and the predicted probabilities of successful recombinant small-scale protein production in _E. coli_.
43
+ For more information on the command line arguments, type `rp3 -h`.
44
+
45
+ ## Python interface
46
+ ```python
47
+ import RP3Net as rp3
48
+ m = rp3.load_model(rp3.RP3_DEFAULT_CONFIG, '/path/to/checkpoint')
49
+ scores = m.predict(['PRTEINWQENCE', 'PRTEIN', 'SQWENCE'])
50
+ print(scores)
51
+ # tensor([0.4223, 0.4134, 0.4165])
52
+ score_map = m.predict({'seq1': 'PRTEINWQENCE', 'seq2': 'PRTEIN', 'seq3': 'SQWENCE'})
53
+ print(score_map)
54
+ # {'seq1': 0.4223055839538574, 'seq2': 0.41336774826049805, 'seq3': 0.4165498912334442}
55
+ ```
56
+
57
+ The `load_model` function returns the model object that can be used directly for prediction (`predict`), and is otherwise a fully functional implementation of a [PyTorch module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), so it can be used for computing gradients and training as well. The `predict` method accepts either a list of sequences as strings, or a dictionary of sequences keyed by their ids. The return type depends on the input, and is either a one-dimensional tensor or a dictionary of floats. In the former case the order of the scores corresponds to the order of the input sequences; in the latter case the dictionary is keyed by the sequence ids.
58
+
59
+ ## Performance and resource usage
60
+ The command-line version on a modern CPU (base frequency 2.6 GHz) for a batch of 16 constructs with length under 500 aa runs in about 3 minutes, using under 5 GB of RAM.
61
+
62
+ # Training
63
+ Note that installation for inference does not bring in the libraries that are used for training.
64
+
65
+ ## Installation
66
+ ```
67
+ pip install 'RP3Net[training]'
68
+ ```
69
+
70
+ ## Command line
71
+ ```
72
+ rp3_train fit -c <training_config_file>
73
+ ```
74
+ Examples of trainer configs can be found under the `config` folder. Training is managed by the [PyTorch Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html); more information can be found by typing `rp3_train -h`.
75
+
76
+ ## Training data
77
+ * https://ftp.ebi.ac.uk/pub/software/RP3Net/v0.1/data/
rp3net-0.0.1/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # RP3Net
2
+ RP3Net is an AI model for predicting the results of recombinant small-scale protein production in _E. coli_ from the construct sequence. See [the preprint](https://www.biorxiv.org/content/10.1101/2025.05.13.652824v1) and [supplements](https://ftp.ebi.ac.uk/pub/software/RP3Net/) for more details on how it works.
3
+
4
+ # Checkpoints
5
+ * https://ftp.ebi.ac.uk/pub/software/RP3Net/v0.1/checkpoints/
6
+
7
+ # Inference
8
+ ## Installation
9
+ ```
10
+ pip install RP3Net
11
+ ```
12
+
13
+ ## Command line
14
+ Simple usage:
15
+ ```
16
+ rp3 -p <path_to_checkpoint_file> -f <in_fasta_file> -o <out_csv_file>
17
+ ```
18
+ The `out_csv_file` will contain the dataframe with the ids from the `in_fasta_file` and the predicted probabilities of successful recombinant small-scale protein production in _E. coli_.
19
+ For more information on the command line arguments, type `rp3 -h`.
20
+
21
+ ## Python interface
22
+ ```python
23
+ import RP3Net as rp3
24
+ m = rp3.load_model(rp3.RP3_DEFAULT_CONFIG, '/path/to/checkpoint')
25
+ scores = m.predict(['PRTEINWQENCE', 'PRTEIN', 'SQWENCE'])
26
+ print(scores)
27
+ # tensor([0.4223, 0.4134, 0.4165])
28
+ score_map = m.predict({'seq1': 'PRTEINWQENCE', 'seq2': 'PRTEIN', 'seq3': 'SQWENCE'})
29
+ print(score_map)
30
+ # {'seq1': 0.4223055839538574, 'seq2': 0.41336774826049805, 'seq3': 0.4165498912334442}
31
+ ```
32
+
33
+ The `load_model` function returns the model object that can be used directly for prediction (`predict`), and is otherwise a fully functional implementation of a [PyTorch module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), so it can be used for computing gradients and training as well. The `predict` method accepts either a list of sequences as strings, or a dictionary of sequences keyed by their ids. The return type depends on the input, and is either a one-dimensional tensor or a dictionary of floats. In the former case the order of the scores corresponds to the order of the input sequences; in the latter case the dictionary is keyed by the sequence ids.
34
+
35
+ ## Performance and resource usage
36
+ The command-line version on a modern CPU (base frequency 2.6 GHz) for a batch of 16 constructs with length under 500 aa runs in about 3 minutes, using under 5 GB of RAM.
37
+
38
+ # Training
39
+ Note that installation for inference does not bring in the libraries that are used for training.
40
+
41
+ ## Installation
42
+ ```
43
+ pip install 'RP3Net[training]'
44
+ ```
45
+
46
+ ## Command line
47
+ ```
48
+ rp3_train fit -c <training_config_file>
49
+ ```
50
+ Examples of trainer configs can be found under the `config` folder. Training is managed by the [PyTorch Lightning CLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html); more information can be found by typing `rp3_train -h`.
51
+
52
+ ## Training data
53
+ * https://ftp.ebi.ac.uk/pub/software/RP3Net/v0.1/data/
@@ -0,0 +1,59 @@
1
+ ckpt_path: last
2
+ seed_everything: 42
3
+ test_after_fit_metric: full_val_auc_roc
4
+ track_metric_checkpoints: best
5
+ data:
6
+ class_path: RP3Net.training.data.RP3GlobalEmbeddingsLDM
7
+ init_args:
8
+ hypers:
9
+ sources:
10
+ - SGC_Stockholm
11
+ ds_path: ~/rp3/prod/v0.1/data/rp3.csv.gz
12
+ embeddings_file: ~/data/pp/ai/embeddings/16_01/esm2_t33_650M_UR50D_mean.pt
13
+ test_val_seed: 42
14
+ seed: 42
15
+ training_batch_size: 8
16
+ val_test_batch_size: 8
17
+ model:
18
+ class_path: RP3Net.training.lm.RP3LM
19
+ init_args:
20
+ hypers:
21
+ sources:
22
+ - SGC_Stockholm
23
+ model:
24
+ mode: Training_A
25
+ fm:
26
+ type: esm2_650m
27
+ aggregation: mean
28
+ classification_head:
29
+ embedding_dim: 1280
30
+ bias: false
31
+ end_bias: true
32
+ layer_norm: false
33
+ p_drop: 0.1
34
+ layers:
35
+ d: 1280
36
+ n: 1
37
+ nonlinearity: SiLU
38
+ optimizer:
39
+ class_path: torch.optim.Adam
40
+ init_args:
41
+ lr: 1.0e-5
42
+ weight_decay: 0.0
43
+ trainer:
44
+ enable_progress_bar: false
45
+ callbacks:
46
+ - class_path: lightning.pytorch.callbacks.EarlyStopping
47
+ init_args:
48
+ check_on_train_epoch_end: false
49
+ log_rank_zero_only: true
50
+ min_delta: 0.0001
51
+ mode: max
52
+ monitor: full_val_auc_roc
53
+ patience: 10
54
+ stopping_threshold: 0.99
55
+ strict: true
56
+ verbose: true
57
+ - class_path: lightning.pytorch.callbacks.LearningRateMonitor
58
+ init_args:
59
+ logging_interval: epoch
@@ -0,0 +1,66 @@
1
+ ckpt_path: last
2
+ seed_everything: 42
3
+ test_after_fit_metric: full_val_auc_roc
4
+ track_metric_checkpoints: best
5
+ data:
6
+ class_path: RP3Net.training.data.RP3SequenceEmbeddingsLDM
7
+ init_args:
8
+ hypers:
9
+ sources:
10
+ - SGC_Stockholm
11
+ - SGC_Toronto
12
+ ds_path: ~/rp3/prod/v0.1/data/rp3.csv.gz
13
+ embeddings_file: ~/data/pp/ai/embeddings/16_01/esm2_t33_650M_UR50D_unpooled.zip
14
+ test_val_seed: 42
15
+ seed: 42
16
+ training_batch_size: 8
17
+ val_test_batch_size: 8
18
+ model:
19
+ class_path: RP3Net.training.lm.RP3LM
20
+ init_args:
21
+ hypers:
22
+ sources:
23
+ - SGC_Stockholm
24
+ - SGC_Toronto
25
+ model:
26
+ mode: Training_B
27
+ fm:
28
+ type: esm2_650m
29
+ aggregation: stp
30
+ stp:
31
+ seq_dim: 1280
32
+ d: 256
33
+ num_heads: 8
34
+ layer_norm: True
35
+ p_drop: 0.1
36
+ classification_head:
37
+ embedding_dim: 256
38
+ bias: false
39
+ end_bias: true
40
+ layer_norm: false
41
+ p_drop: 0.1
42
+ layers:
43
+ d: 256
44
+ n: 1
45
+ nonlinearity: SiLU
46
+ optimizer:
47
+ class_path: torch.optim.Adam
48
+ init_args:
49
+ lr: 1.0e-5
50
+ weight_decay: 0.0
51
+ trainer:
52
+ callbacks:
53
+ - class_path: lightning.pytorch.callbacks.EarlyStopping
54
+ init_args:
55
+ check_on_train_epoch_end: false
56
+ log_rank_zero_only: true
57
+ min_delta: 0.0001
58
+ mode: max
59
+ monitor: full_val_auc_roc
60
+ patience: 10
61
+ stopping_threshold: 0.99
62
+ strict: true
63
+ verbose: true
64
+ - class_path: lightning.pytorch.callbacks.LearningRateMonitor
65
+ init_args:
66
+ logging_interval: epoch