sbol-torch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. sbol_torch-0.1.0/.flake8 +11 -0
  2. sbol_torch-0.1.0/.gitignore +20 -0
  3. sbol_torch-0.1.0/.pre-commit-config.yaml +25 -0
  4. sbol_torch-0.1.0/LICENSE +21 -0
  5. sbol_torch-0.1.0/PKG-INFO +135 -0
  6. sbol_torch-0.1.0/README.md +103 -0
  7. sbol_torch-0.1.0/docs/architecture.md +88 -0
  8. sbol_torch-0.1.0/docs/backbones.md +47 -0
  9. sbol_torch-0.1.0/docs/capabilities.md +90 -0
  10. sbol_torch-0.1.0/docs/configuration.md +152 -0
  11. sbol_torch-0.1.0/docs/data.md +90 -0
  12. sbol_torch-0.1.0/docs/extending.md +91 -0
  13. sbol_torch-0.1.0/docs/images/wandb_structure_aware.png +0 -0
  14. sbol_torch-0.1.0/docs/images/wandb_train_graph.png +0 -0
  15. sbol_torch-0.1.0/examples/README.md +52 -0
  16. sbol_torch-0.1.0/examples/configs/finetune_expression.yaml +46 -0
  17. sbol_torch-0.1.0/examples/configs/finetune_structure_aware.yaml +59 -0
  18. sbol_torch-0.1.0/examples/configs/pretrain_mlm.yaml +51 -0
  19. sbol_torch-0.1.0/examples/configs/train_graph.yaml +50 -0
  20. sbol_torch-0.1.0/examples/quickstart.py +31 -0
  21. sbol_torch-0.1.0/examples/run_wandb_examples.py +50 -0
  22. sbol_torch-0.1.0/pyproject.toml +64 -0
  23. sbol_torch-0.1.0/src/sboltorch/__init__.py +32 -0
  24. sbol_torch-0.1.0/src/sboltorch/cli.py +53 -0
  25. sbol_torch-0.1.0/src/sboltorch/config.py +181 -0
  26. sbol_torch-0.1.0/src/sboltorch/data/__init__.py +21 -0
  27. sbol_torch-0.1.0/src/sboltorch/data/corpus.py +36 -0
  28. sbol_torch-0.1.0/src/sboltorch/data/local.py +150 -0
  29. sbol_torch-0.1.0/src/sboltorch/data/materialize.py +166 -0
  30. sbol_torch-0.1.0/src/sboltorch/data/sbol_db.py +226 -0
  31. sbol_torch-0.1.0/src/sboltorch/data/synthetic.py +195 -0
  32. sbol_torch-0.1.0/src/sboltorch/datasets/__init__.py +8 -0
  33. sbol_torch-0.1.0/src/sboltorch/datasets/dataset.py +62 -0
  34. sbol_torch-0.1.0/src/sboltorch/datasets/mlm_collator.py +57 -0
  35. sbol_torch-0.1.0/src/sboltorch/datasets/splits.py +87 -0
  36. sbol_torch-0.1.0/src/sboltorch/encoders/__init__.py +19 -0
  37. sbol_torch-0.1.0/src/sboltorch/encoders/base.py +76 -0
  38. sbol_torch-0.1.0/src/sboltorch/encoders/graph.py +111 -0
  39. sbol_torch-0.1.0/src/sboltorch/encoders/sequence.py +34 -0
  40. sbol_torch-0.1.0/src/sboltorch/encoders/structure.py +113 -0
  41. sbol_torch-0.1.0/src/sboltorch/engine/__init__.py +20 -0
  42. sbol_torch-0.1.0/src/sboltorch/engine/batch.py +51 -0
  43. sbol_torch-0.1.0/src/sboltorch/engine/callbacks.py +185 -0
  44. sbol_torch-0.1.0/src/sboltorch/engine/trainer.py +179 -0
  45. sbol_torch-0.1.0/src/sboltorch/exceptions.py +27 -0
  46. sbol_torch-0.1.0/src/sboltorch/models/__init__.py +55 -0
  47. sbol_torch-0.1.0/src/sboltorch/models/backbone.py +41 -0
  48. sbol_torch-0.1.0/src/sboltorch/models/graph.py +73 -0
  49. sbol_torch-0.1.0/src/sboltorch/models/heads.py +26 -0
  50. sbol_torch-0.1.0/src/sboltorch/models/mlm.py +53 -0
  51. sbol_torch-0.1.0/src/sboltorch/models/sequence_model.py +39 -0
  52. sbol_torch-0.1.0/src/sboltorch/pipeline.py +157 -0
  53. sbol_torch-0.1.0/src/sboltorch/reproducibility.py +17 -0
  54. sbol_torch-0.1.0/src/sboltorch/tasks/__init__.py +8 -0
  55. sbol_torch-0.1.0/src/sboltorch/tasks/base.py +49 -0
  56. sbol_torch-0.1.0/src/sboltorch/tasks/mlm.py +39 -0
  57. sbol_torch-0.1.0/src/sboltorch/tasks/supervised.py +63 -0
  58. sbol_torch-0.1.0/src/sboltorch/tokenize/__init__.py +17 -0
  59. sbol_torch-0.1.0/src/sboltorch/tokenize/base.py +65 -0
  60. sbol_torch-0.1.0/src/sboltorch/tokenize/char.py +45 -0
  61. sbol_torch-0.1.0/src/sboltorch/tokenize/hf.py +59 -0
  62. sbol_torch-0.1.0/src/sboltorch/tokenize/kmer.py +61 -0
  63. sbol_torch-0.1.0/src/sboltorch/types.py +225 -0
  64. sbol_torch-0.1.0/tests/conftest.py +73 -0
  65. sbol_torch-0.1.0/tests/fixtures/sbol/COMMIT_SHA.txt +1 -0
  66. sbol_torch-0.1.0/tests/fixtures/sbol/PROVENANCE.md +22 -0
  67. sbol_torch-0.1.0/tests/fixtures/sbol/sbol2/pICH44179.xml +160 -0
  68. sbol_torch-0.1.0/tests/fixtures/sbol/sbol3/BBa_F2620_PoPSReceiver.ttl +354 -0
  69. sbol_torch-0.1.0/tests/fixtures/sbol/sbol3/toggle_switch.nt +371 -0
  70. sbol_torch-0.1.0/tests/fixtures/sbol/sbol3/toggle_switch.ttl +464 -0
  71. sbol_torch-0.1.0/tests/test_config.py +56 -0
  72. sbol_torch-0.1.0/tests/test_engine.py +108 -0
  73. sbol_torch-0.1.0/tests/test_fixtures_sbol.py +49 -0
  74. sbol_torch-0.1.0/tests/test_graph.py +70 -0
  75. sbol_torch-0.1.0/tests/test_learning.py +156 -0
  76. sbol_torch-0.1.0/tests/test_local_corpus.py +33 -0
  77. sbol_torch-0.1.0/tests/test_materialize.py +53 -0
  78. sbol_torch-0.1.0/tests/test_mlm.py +114 -0
  79. sbol_torch-0.1.0/tests/test_sbol_db_client.py +64 -0
  80. sbol_torch-0.1.0/tests/test_splits.py +33 -0
  81. sbol_torch-0.1.0/tests/test_structure.py +86 -0
  82. sbol_torch-0.1.0/tests/test_synthetic.py +70 -0
  83. sbol_torch-0.1.0/tests/test_tokenize.py +47 -0
  84. sbol_torch-0.1.0/tests/test_types.py +29 -0
  85. sbol_torch-0.1.0/tests/test_wandb.py +270 -0
  86. sbol_torch-0.1.0/uv.lock +2629 -0
@@ -0,0 +1,11 @@
1
+ [flake8]
2
+ max-line-length = 120
3
+ extend-ignore = E402,W503,E203,W605
4
+ per-file-ignores =
5
+ __init__.py:F401
6
+ exclude =
7
+ .git,
8
+ __pycache__,
9
+ .venv,
10
+ build,
11
+ dist
@@ -0,0 +1,20 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ venv/
5
+ *.egg-info/
6
+ build/
7
+ dist/
8
+ .mypy_cache/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+
12
+ # Local materialized corpora and run outputs
13
+ .sboltorch_cache/
14
+ runs/
15
+ *.parquet
16
+ wandb/
17
+
18
+ # Environment
19
+ .env
20
+ .env.*
@@ -0,0 +1,25 @@
1
+ repos:
2
+ - repo: https://github.com/pycqa/isort
3
+ rev: 5.13.2
4
+ hooks:
5
+ - id: isort
6
+ - repo: https://github.com/psf/black
7
+ rev: 24.3.0
8
+ hooks:
9
+ - id: black
10
+ - repo: https://github.com/pycqa/flake8
11
+ rev: 7.0.0
12
+ hooks:
13
+ - id: flake8
14
+ # Run mypy in the project environment so it type-checks against the real
15
+ # dependencies (torch, transformers, wandb). An isolated env would treat them
16
+ # as Any and miss real type errors.
17
+ - repo: local
18
+ hooks:
19
+ - id: mypy
20
+ name: mypy
21
+ entry: uv run mypy
22
+ language: system
23
+ types: [python]
24
+ pass_filenames: false
25
+ args: [--config-file=pyproject.toml]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mike Arpaia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: sbol-torch
3
+ Version: 0.1.0
4
+ Summary: A PyTorch library for synthetic biology and biodesign automation
5
+ Author-email: Mike Arpaia <mike@arpaia.co>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.11
9
+ Requires-Dist: einops>=0.7
10
+ Requires-Dist: httpx>=0.27
11
+ Requires-Dist: numpy>=1.26
12
+ Requires-Dist: pyarrow>=15
13
+ Requires-Dist: pydantic>=2.6
14
+ Requires-Dist: pyyaml>=6
15
+ Requires-Dist: rdflib>=7
16
+ Requires-Dist: tokenizers>=0.19
17
+ Requires-Dist: torch-geometric>=2.5
18
+ Requires-Dist: torch>=2.2
19
+ Requires-Dist: transformers<5,>=4.40
20
+ Requires-Dist: wandb>=0.17
21
+ Provides-Extra: dev
22
+ Requires-Dist: black>=24.3; extra == 'dev'
23
+ Requires-Dist: flake8>=7; extra == 'dev'
24
+ Requires-Dist: isort>=5.13; extra == 'dev'
25
+ Requires-Dist: mypy>=1.9; extra == 'dev'
26
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
27
+ Requires-Dist: pytest-mock>=3.12; extra == 'dev'
28
+ Requires-Dist: pytest>=8; extra == 'dev'
29
+ Requires-Dist: respx>=0.21; extra == 'dev'
30
+ Requires-Dist: types-pyyaml; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # sbol-torch
34
+
35
+ A PyTorch library for synthetic biology and biodesign automation.
36
+
37
+ Installed as `sbol-torch`, imported as `sboltorch` (commonly `import sboltorch as st`).
38
+
39
+ sbol-torch pulls designs from a running [sbol-db](https://github.com/marpaia/sbol-db)
40
+ instance (or local SBOL/FASTA files), normalizes them into a single record type,
41
+ and trains transformer models against them. The input modality, tokenizer, and
42
+ training objective are all set in configuration, so trying a new combination
43
+ never means forking the pipeline.
44
+
45
+ ## Capabilities
46
+
47
+ | Axis | Options |
48
+ |------|---------|
49
+ | **Data sources** | sbol-db REST API, local SBOL/FASTA files, or a synthetic generator |
50
+ | **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or IUPAC character |
51
+ | **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
52
+ | **Objectives** | `supervised` fine-tuning, `frozen`-backbone head, `mlm` pretraining (from-scratch and continued) |
53
+ | **Engine** | raw-PyTorch loop, early stopping, checkpointing, AMP, LR schedule, gradient accumulation |
54
+ | **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
55
+ | **Reproducibility** | one validated config per run, seeded splits, content-fingerprinted Parquet cache |
56
+
57
+ ## Install
58
+
59
+ ```bash
60
+ pip install sbol-torch
61
+ ```
62
+
63
+ For development:
64
+
65
+ ```bash
66
+ uv venv
67
+ uv pip install -e '.[dev]'
68
+ ```
69
+
70
+ ## Quickstart
71
+
72
+ A run is fully specified by one YAML config. From the command line:
73
+
74
+ ```bash
75
+ # Materialize a corpus to the local Parquet cache (offline, reproducible).
76
+ sboltorch ingest examples/configs/finetune_expression.yaml
77
+
78
+ # Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
79
+ sboltorch train examples/configs/finetune_expression.yaml
80
+ ```
81
+
82
+ Or from Python:
83
+
84
+ ```python
85
+ import sboltorch as st
86
+
87
+ config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
88
+ metrics = st.run_training(config)
89
+ ```
90
+
91
+ ### Example configs
92
+
93
+ | Config | What it does |
94
+ |--------|--------------|
95
+ | [`finetune_expression.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
96
+ | [`pretrain_mlm.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
97
+ | [`finetune_structure_aware.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
98
+ | [`train_graph.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
99
+
100
+ ## Experiment tracking
101
+
102
+ The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/train_graph.yaml)
103
+ and [`finetune_structure_aware.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
104
+ ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
105
+ in a `.env` at the repo root and run both:
106
+
107
+ ```bash
108
+ python examples/run_wandb_examples.py
109
+ ```
110
+
111
+ Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
112
+ resolved config, the corpus fingerprint and split sizes as lineage, and the best
113
+ checkpoint as a model artifact.
114
+
115
+ | Graph transformer | Structure-aware sequence |
116
+ |-------------------|--------------------------|
117
+ | ![train_graph W&B run](https://raw.githubusercontent.com/marpaia/sbol-torch/master/docs/images/wandb_train_graph.png) | ![structure_aware W&B run](https://raw.githubusercontent.com/marpaia/sbol-torch/master/docs/images/wandb_structure_aware.png) |
118
+
119
+ ## Documentation
120
+
121
+ | Doc | Contents |
122
+ |-----|----------|
123
+ | [architecture.md](https://github.com/marpaia/sbol-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
124
+ | [capabilities.md](https://github.com/marpaia/sbol-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
125
+ | [configuration.md](https://github.com/marpaia/sbol-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
126
+ | [data.md](https://github.com/marpaia/sbol-torch/blob/master/docs/data.md) | Data sources, the sbol-db client, materialization, fixtures. |
127
+ | [backbones.md](https://github.com/marpaia/sbol-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
128
+ | [extending.md](https://github.com/marpaia/sbol-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
129
+
130
+ ## Develop
131
+
132
+ ```bash
133
+ uv run pytest
134
+ pre-commit run --all-files
135
+ ```
@@ -0,0 +1,103 @@
1
+ # sbol-torch
2
+
3
+ A PyTorch library for synthetic biology and biodesign automation.
4
+
5
+ Installed as `sbol-torch`, imported as `sboltorch` (commonly `import sboltorch as st`).
6
+
7
+ sbol-torch pulls designs from a running [sbol-db](https://github.com/marpaia/sbol-db)
8
+ instance (or local SBOL/FASTA files), normalizes them into a single record type,
9
+ and trains transformer models against them. The input modality, tokenizer, and
10
+ training objective are all set in configuration, so trying a new combination
11
+ never means forking the pipeline.
12
+
13
+ ## Capabilities
14
+
15
+ | Axis | Options |
16
+ |------|---------|
17
+ | **Data sources** | sbol-db REST API, local SBOL/FASTA files, or a synthetic generator |
18
+ | **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or IUPAC character |
19
+ | **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
20
+ | **Objectives** | `supervised` fine-tuning, `frozen`-backbone head, `mlm` pretraining (from-scratch and continued) |
21
+ | **Engine** | raw-PyTorch loop, early stopping, checkpointing, AMP, LR schedule, gradient accumulation |
22
+ | **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
23
+ | **Reproducibility** | one validated config per run, seeded splits, content-fingerprinted Parquet cache |
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install sbol-torch
29
+ ```
30
+
31
+ For development:
32
+
33
+ ```bash
34
+ uv venv
35
+ uv pip install -e '.[dev]'
36
+ ```
37
+
38
+ ## Quickstart
39
+
40
+ A run is fully specified by one YAML config. From the command line:
41
+
42
+ ```bash
43
+ # Materialize a corpus to the local Parquet cache (offline, reproducible).
44
+ sboltorch ingest examples/configs/finetune_expression.yaml
45
+
46
+ # Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
47
+ sboltorch train examples/configs/finetune_expression.yaml
48
+ ```
49
+
50
+ Or from Python:
51
+
52
+ ```python
53
+ import sboltorch as st
54
+
55
+ config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
56
+ metrics = st.run_training(config)
57
+ ```
58
+
59
+ ### Example configs
60
+
61
+ | Config | What it does |
62
+ |--------|--------------|
63
+ | [`finetune_expression.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
64
+ | [`pretrain_mlm.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
65
+ | [`finetune_structure_aware.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
66
+ | [`train_graph.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
67
+
68
+ ## Experiment tracking
69
+
70
+ The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/train_graph.yaml)
71
+ and [`finetune_structure_aware.yaml`](https://github.com/marpaia/sbol-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
72
+ ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
73
+ in a `.env` at the repo root and run both:
74
+
75
+ ```bash
76
+ python examples/run_wandb_examples.py
77
+ ```
78
+
79
+ Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
80
+ resolved config, the corpus fingerprint and split sizes as lineage, and the best
81
+ checkpoint as a model artifact.
82
+
83
+ | Graph transformer | Structure-aware sequence |
84
+ |-------------------|--------------------------|
85
+ | ![train_graph W&B run](https://raw.githubusercontent.com/marpaia/sbol-torch/master/docs/images/wandb_train_graph.png) | ![structure_aware W&B run](https://raw.githubusercontent.com/marpaia/sbol-torch/master/docs/images/wandb_structure_aware.png) |
86
+
87
+ ## Documentation
88
+
89
+ | Doc | Contents |
90
+ |-----|----------|
91
+ | [architecture.md](https://github.com/marpaia/sbol-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
92
+ | [capabilities.md](https://github.com/marpaia/sbol-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
93
+ | [configuration.md](https://github.com/marpaia/sbol-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
94
+ | [data.md](https://github.com/marpaia/sbol-torch/blob/master/docs/data.md) | Data sources, the sbol-db client, materialization, fixtures. |
95
+ | [backbones.md](https://github.com/marpaia/sbol-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
96
+ | [extending.md](https://github.com/marpaia/sbol-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
97
+
98
+ ## Develop
99
+
100
+ ```bash
101
+ uv run pytest
102
+ pre-commit run --all-files
103
+ ```
@@ -0,0 +1,88 @@
1
+ # Architecture
2
+
3
+ sbol-torch turns SBOL designs into trained transformer models. Three ideas shape
4
+ it: every source normalizes to one record type, the parts that vary plug in
5
+ behind protocols, and the training engine stays small and explicit.
6
+
7
+ ## One record type
8
+
9
+ Every data source — the sbol-db REST API, local SBOL/FASTA files, or the synthetic
10
+ generator — is normalized into `SbolObject` (`sboltorch.types`):
11
+
12
+ ```
13
+ SbolObject(iri, sbol_class, roles, types, sequence, features, neighbors, label, raw)
14
+ ```
15
+
16
+ Training code consumes only `SbolObject`s and never branches on provenance. A
17
+ source is anything satisfying the `Corpus` protocol (`__iter__` yielding
18
+ `SbolObject`s, plus a `fingerprint()` for caching).
19
+
20
+ ## Swappable plug points
21
+
22
+ Three independent axes are each a `Protocol` with interchangeable implementations,
23
+ selected by configuration:
24
+
25
+ - **Tokenizer** — how a sequence becomes tokens (`hf`, `kmer`, `char`).
26
+ - **Encoder** — the input modality, turning an `SbolObject` into model input
27
+ (`sequence`, `structure_aware`, `graph`).
28
+ - **Task** — the training objective, owning loss and metrics (`supervised`,
29
+ `frozen`, `mlm`).
30
+
31
+ Adding an implementation and registering it in the matching `build_*` factory
32
+ extends a capability without touching the engine. See [extending.md](extending.md).
33
+
34
+ ## The training engine
35
+
36
+ The training loop (`sboltorch.engine`) is plain PyTorch: AMP, gradient
37
+ accumulation and clipping, a linear warmup/decay schedule, and a list of
38
+ callbacks (`EarlyStopping`, `ModelCheckpoint`, `MetricLogger`). It sees the
39
+ batch shape only through a `BatchAdapter`, so the same loop trains a sequence
40
+ model (tensor-dict batches) or a graph model (PyG `Batch` objects).
41
+
42
+ ## Data flow
43
+
44
+ A `RunConfig` drives the whole pipeline. The configured `Corpus` source is
45
+ materialized to Parquet and split into train/val/test (seeded). The `Encoder`
46
+ turns each `SbolObject` into model input for its modality, using the `Tokenizer`,
47
+ and a `DataLoader` batches the result through a collator. The `Trainer` then runs
48
+ the loop under the `Task` and `BatchAdapter`, writing a checkpoint and metrics.
49
+
50
+ ## Layers
51
+
52
+ | Layer | Module | Responsibility |
53
+ |-------|--------|----------------|
54
+ | Config | `sboltorch.config` | One Pydantic `RunConfig` per run; validated, serialized. |
55
+ | Data | `sboltorch.data` | Corpus sources (`SbolDbClient`, `LocalFileCorpus`, synthetic) and Parquet materialization. |
56
+ | Tokenize | `sboltorch.tokenize` | `hf` / `kmer` / `char` behind one protocol. |
57
+ | Encoders | `sboltorch.encoders` | Turn an `SbolObject` into model input, per modality. |
58
+ | Datasets | `sboltorch.datasets` | Torch `Dataset`, padding collator, MLM collator, seeded splits. |
59
+ | Models | `sboltorch.models` | Backbone (pretrained or from-scratch) + pooling + head; MLM and graph models. |
60
+ | Tasks | `sboltorch.tasks` | Loss, metrics, label dtype, target transform. |
61
+ | Engine | `sboltorch.engine` | Training loop, callbacks, batch adapters. |
62
+ | Pipeline | `sboltorch.pipeline` | Wires the layers from a `RunConfig`. |
63
+
64
+ ## Key protocols
65
+
66
+ - `Corpus`: `__iter__() -> Iterator[SbolObject]`, `fingerprint() -> str`.
67
+ - `Tokenizer`: `encode`, `tokenize_content`, `vocab_size`, `pad_token_id`,
68
+ `mask_token_id`, `special_token_ids`, `max_length`.
69
+ - `Encoder`: `encode(SbolObject) -> ModelInput`, `output_spec -> EncoderSpec`.
70
+ - `Task`: `loss`, `predict`, `epoch_metrics`, `primary_metric`, `label_dtype`.
71
+ - `BatchAdapter`: `to_device`, `forward(model, batch)`, `labels(batch)`.
72
+ - `Callback`: `on_train_start`, `on_epoch_end`, `on_train_end`.
73
+
74
+ ## Reproducibility
75
+
76
+ - A run is fully specified by its `RunConfig`; the resolved config is written to
77
+ `<output_dir>/config.resolved.yaml`.
78
+ - `seed` seeds Python, NumPy, and torch, and the train/val/test split is a pure
79
+ function of `(n, ratios, seed, strategy)`.
80
+ - Corpora are materialized to content-fingerprinted Parquet, so a run is offline
81
+ and byte-for-byte comparable across executions. See [data.md](data.md).
82
+
83
+ ## Consuming sbol-db
84
+
85
+ `SbolDbClient` reads designs over the sbol-db REST API: keyset-paginated object
86
+ listing, single/bulk IRI resolution, bounded neighborhood traversal, sequence
87
+ search, and ontology descendant expansion. Sequence elements are read from each
88
+ object's JSON-LD slice by predicate local-name. Details in [data.md](data.md).
@@ -0,0 +1,47 @@
1
+ # Backbones & environment
2
+
3
+ The backbone is any HuggingFace encoder, or one built from scratch. The `hf`
4
+ tokenizer is a generic `AutoTokenizer` adapter, so any model whose tokenizer accepts a raw
5
+ sequence string works by pointing `model.backbone` and `tokenizer.model_name` at
6
+ the same id.
7
+
8
+ ## Choosing a backbone
9
+
10
+ | Goal | `model` config |
11
+ |------|----------------|
12
+ | Fine-tune a pretrained DNA model | `backbone: <hub-id>`, `from_scratch: false` |
13
+ | Train a head over a frozen backbone | as above, with `task.kind: frozen` |
14
+ | Pretrain / train from scratch | `from_scratch: true` + `model.arch` (sized to the tokenizer vocab) |
15
+ | Reuse a model you pretrained | `backbone: <output_dir>/backbone` (a local path) |
16
+
17
+ `model.backbone` accepts either a hub id or a local directory. A masked-LM run
18
+ writes its encoder to `<output_dir>/backbone/`, which a later supervised run loads
19
+ directly.
20
+
21
+ ## transformers version
22
+
23
+ sbol-torch pins `transformers` to the 4.x line. The 5.x line changes the ESM and
24
+ custom modeling code that the pretrained DNA backbones rely on, so it is not
25
+ compatible with them.
26
+
27
+ ## Known backbone constraints
28
+
29
+ - **DNABERT-2 (`zhihan1996/DNABERT-2-117M`)** — its remote modeling code requires
30
+ `triton`, which has no macOS wheels, so DNABERT-2 runs on Linux/GPU only.
31
+ `einops` (also required by that code) is a declared dependency. The DNABERT-2
32
+ tokenizer loads on any platform; only the model weights need Linux/GPU.
33
+ - **Nucleotide Transformer v2** — the tokenizer loads everywhere; the checkpoint
34
+ needs a transformers version whose ESM implementation matches its gated-MLP
35
+ shapes. Verify the pairing before relying on it.
36
+
37
+ For local development on CPU/macOS, use a `from_scratch` model with the `kmer` or
38
+ `char` tokenizer, or a small standard encoder; run bio-specific backbones like
39
+ DNABERT-2 on a Linux/GPU host.
40
+
41
+ ## Structure-aware backbones
42
+
43
+ The structure-aware encoder adds feature-boundary markers to the vocabulary, so
44
+ its `output_spec.vocab_size` exceeds a base tokenizer's. A `from_scratch` model is
45
+ sized to that vocabulary automatically. To use a pretrained backbone with the
46
+ structure-aware encoder, resize its token embeddings to the encoder's vocab size
47
+ first.
@@ -0,0 +1,90 @@
1
+ # Capabilities
2
+
3
+ sbol-torch trains transformer models on SBOL data. The input modality, training
4
+ objective, data source, and tokenizer are independent axes, each picked in
5
+ configuration. Changing one reuses the same code path rather than branching it.
6
+
7
+ ## Input modalities (`encoder.kind`)
8
+
9
+ | Modality | Consumes | Model | Config |
10
+ |----------|----------|-------|--------|
11
+ | `sequence` | raw sequence elements | pretrained or from-scratch encoder + pooling + head | [finetune_expression.yaml](../examples/configs/finetune_expression.yaml) |
12
+ | `structure_aware` | sequence + feature boundaries (role, orientation) | same, over an extended vocabulary | [finetune_structure_aware.yaml](../examples/configs/finetune_structure_aware.yaml) |
13
+ | `graph` | the SBOL composition graph | PyG graph transformer (`TransformerConv`) | [train_graph.yaml](../examples/configs/train_graph.yaml) |
14
+
15
+ - **Sequence** tokenizes the object's elements directly.
16
+ - **Structure-aware** wraps each annotated feature span with role-keyed boundary
17
+ markers (e.g. `[promoter] … [/promoter]`) and a reverse-complement marker,
18
+ injected inline as tokens. The markers extend the base tokenizer's vocabulary,
19
+ so the model sees SBOL structure alongside sequence. Use it with a
20
+ `from_scratch` model, or a pretrained backbone whose embeddings you've resized.
21
+ - **Graph** turns each object's neighborhood into a graph: nodes carry a
22
+ `(sbol_class, role, identity)` feature triple, edges carry a predicate type,
23
+ edges are bidirectional, and a global mean pool feeds the task head.
24
+
25
+ All three produce batches consumed by one training engine through a
26
+ `BatchAdapter`, so the loop is modality-agnostic (see [extending.md](extending.md)).
27
+
28
+ ## Objectives (`task.kind`)
29
+
30
+ | Objective | Head / loss | Metrics | Label |
31
+ |-----------|-------------|---------|-------|
32
+ | `frozen` | regression or classification head; backbone frozen | `val_mae`/`val_mse`/`val_r2` or `val_accuracy` | required |
33
+ | `supervised` | same, fine-tuned end to end | same | required |
34
+ | `mlm` | tied LM head; masked cross-entropy | `val_loss` (= masked CE), `val_masked_accuracy` | none (self-supervised) |
35
+
36
+ - **Supervised regression** supports `target_transform: log1p` for
37
+ expression/fitness-style targets; metrics are reported back in the original space.
38
+ - **Classification** requires `task.num_classes`.
39
+ - **MLM** masks ~`mlm_probability` of content tokens (80% `<mask>`, 10% random,
40
+ 10% unchanged), never masking special tokens.
41
+
42
+ ## Pretrain, then fine-tune
43
+
44
+ An `mlm` run writes its trained encoder to `<output_dir>/backbone/` in
45
+ HuggingFace format. A later `supervised`/`frozen` run loads it by setting
46
+ `model.backbone` to that directory — the masked-LM pretraining and the
47
+ downstream task share one backbone.
48
+
49
+ MLM supports two modes via `model.from_scratch`:
50
+
51
+ - `true` — build a fresh architecture from `model.arch` + the tokenizer vocab
52
+ (pretrain a DNA LM on the SBOL corpus with the k-mer/char tokenizer).
53
+ - `false` — continued pretraining of a pretrained `model.backbone`.
54
+
55
+ ## Tokenizers (`tokenizer.kind`)
56
+
57
+ | Tokenizer | Description |
58
+ |-----------|-------------|
59
+ | `hf` | Any pretrained HuggingFace `AutoTokenizer` (DNABERT-2, Nucleotide Transformer, …), wrapped behind the library's tokenizer protocol. |
60
+ | `kmer` | Overlapping k-mers over `{A,C,G,T}`; ambiguous bases map to `<unk>`. |
61
+ | `char` | IUPAC character-level. |
62
+
63
+ All expose the same protocol (`vocab_size`, `pad_token_id`, `mask_token_id`,
64
+ `special_token_ids`, `tokenize_content`, `encode`), so tokenizers, encoders, and
65
+ objectives mix freely.
66
+
67
+ ## Metrics
68
+
69
+ - Regression: MAE, MSE, R² (`val_mae`, `val_mse`, `val_r2`).
70
+ - Classification: accuracy (`val_accuracy`).
71
+ - MLM: masked cross-entropy as `val_loss` (perplexity is `exp(val_loss)`) and
72
+ `val_masked_accuracy`.
73
+
74
+ Per-epoch metrics are printed and appended to `<output_dir>/metrics.jsonl`; the
75
+ best checkpoint (by the task's primary metric) is saved to `<output_dir>/best.pt`.
76
+
77
+ ## What the test suite verifies
78
+
79
+ `tests/test_learning.py` trains each capability on a learnable signal and asserts
80
+ that the loss drops and the model generalizes, not just that the code runs:
81
+
82
+ | Capability | Asserted |
83
+ |------------|----------|
84
+ | Supervised sequence (from scratch) | val_loss falls, `val_r2 > 0.5` |
85
+ | MLM (from scratch) | val_loss falls, masked accuracy rises |
86
+ | Structure-aware | val_loss falls, `val_r2 > 0.5` |
87
+ | Graph transformer | val_loss falls, `val_r2 > 0.5` |
88
+
89
+ Continued pretraining and pretrained-backbone loading are exercised against a
90
+ real hub model in the data/model tests.