synbio-torch 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synbio_torch-0.2.0/LICENSE +21 -0
- synbio_torch-0.2.0/PKG-INFO +166 -0
- synbio_torch-0.2.0/README.md +133 -0
- synbio_torch-0.2.0/pyproject.toml +77 -0
- synbio_torch-0.2.0/rust/Cargo.lock +1036 -0
- synbio_torch-0.2.0/rust/Cargo.toml +20 -0
- synbio_torch-0.2.0/rust/src/lib.rs +270 -0
- synbio_torch-0.2.0/src/synbiotorch/__init__.py +35 -0
- synbio_torch-0.2.0/src/synbiotorch/_sbol.pyi +9 -0
- synbio_torch-0.2.0/src/synbiotorch/cli.py +95 -0
- synbio_torch-0.2.0/src/synbiotorch/config.py +253 -0
- synbio_torch-0.2.0/src/synbiotorch/data/__init__.py +17 -0
- synbio_torch-0.2.0/src/synbiotorch/data/corpus.py +48 -0
- synbio_torch-0.2.0/src/synbiotorch/data/materialize.py +228 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/__init__.py +8 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/causal_collator.py +30 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/dataset.py +62 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/mlm_collator.py +57 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/packing.py +57 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/splits.py +108 -0
- synbio_torch-0.2.0/src/synbiotorch/datasets/streaming.py +93 -0
- synbio_torch-0.2.0/src/synbiotorch/distributed.py +109 -0
- synbio_torch-0.2.0/src/synbiotorch/encoders/__init__.py +19 -0
- synbio_torch-0.2.0/src/synbiotorch/encoders/base.py +76 -0
- synbio_torch-0.2.0/src/synbiotorch/encoders/graph.py +111 -0
- synbio_torch-0.2.0/src/synbiotorch/encoders/sequence.py +34 -0
- synbio_torch-0.2.0/src/synbiotorch/encoders/structure.py +113 -0
- synbio_torch-0.2.0/src/synbiotorch/engine/__init__.py +20 -0
- synbio_torch-0.2.0/src/synbiotorch/engine/batch.py +51 -0
- synbio_torch-0.2.0/src/synbiotorch/engine/callbacks.py +228 -0
- synbio_torch-0.2.0/src/synbiotorch/engine/trainer.py +346 -0
- synbio_torch-0.2.0/src/synbiotorch/exceptions.py +27 -0
- synbio_torch-0.2.0/src/synbiotorch/generate.py +104 -0
- synbio_torch-0.2.0/src/synbiotorch/models/__init__.py +64 -0
- synbio_torch-0.2.0/src/synbiotorch/models/backbone.py +63 -0
- synbio_torch-0.2.0/src/synbiotorch/models/causal.py +46 -0
- synbio_torch-0.2.0/src/synbiotorch/models/graph.py +73 -0
- synbio_torch-0.2.0/src/synbiotorch/models/heads.py +26 -0
- synbio_torch-0.2.0/src/synbiotorch/models/mlm.py +46 -0
- synbio_torch-0.2.0/src/synbiotorch/models/sequence_model.py +39 -0
- synbio_torch-0.2.0/src/synbiotorch/pipeline.py +268 -0
- synbio_torch-0.2.0/src/synbiotorch/reproducibility.py +42 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/__init__.py +27 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/fasta.py +105 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/files.py +27 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/genbank.py +51 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/sbol.py +211 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/sbol_db.py +227 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/synthetic.py +217 -0
- synbio_torch-0.2.0/src/synbiotorch/sources/table.py +94 -0
- synbio_torch-0.2.0/src/synbiotorch/tasks/__init__.py +8 -0
- synbio_torch-0.2.0/src/synbiotorch/tasks/base.py +53 -0
- synbio_torch-0.2.0/src/synbiotorch/tasks/causal.py +37 -0
- synbio_torch-0.2.0/src/synbiotorch/tasks/mlm.py +39 -0
- synbio_torch-0.2.0/src/synbiotorch/tasks/supervised.py +63 -0
- synbio_torch-0.2.0/src/synbiotorch/tokenize/__init__.py +17 -0
- synbio_torch-0.2.0/src/synbiotorch/tokenize/base.py +69 -0
- synbio_torch-0.2.0/src/synbiotorch/tokenize/char.py +56 -0
- synbio_torch-0.2.0/src/synbiotorch/tokenize/hf.py +62 -0
- synbio_torch-0.2.0/src/synbiotorch/tokenize/kmer.py +79 -0
- synbio_torch-0.2.0/src/synbiotorch/types.py +177 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mike Arpaia
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: synbio-torch
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Requires-Dist: httpx>=0.28
|
|
5
|
+
Requires-Dist: pydantic>=2.13
|
|
6
|
+
Requires-Dist: pyarrow>=24
|
|
7
|
+
Requires-Dist: numpy>=2.4
|
|
8
|
+
Requires-Dist: pyyaml>=6
|
|
9
|
+
Requires-Dist: torch>=2.12
|
|
10
|
+
Requires-Dist: transformers>=4.57,<5
|
|
11
|
+
Requires-Dist: tokenizers>=0.22
|
|
12
|
+
Requires-Dist: einops>=0.8
|
|
13
|
+
Requires-Dist: torch-geometric>=2.8
|
|
14
|
+
Requires-Dist: wandb>=0.27
|
|
15
|
+
Requires-Dist: pytest>=9 ; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest-mock>=3.15 ; extra == 'dev'
|
|
17
|
+
Requires-Dist: respx>=0.23 ; extra == 'dev'
|
|
18
|
+
Requires-Dist: black>=26.5 ; extra == 'dev'
|
|
19
|
+
Requires-Dist: isort>=8 ; extra == 'dev'
|
|
20
|
+
Requires-Dist: flake8>=7.3 ; extra == 'dev'
|
|
21
|
+
Requires-Dist: mypy>=2.1 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: pre-commit>=4.6 ; extra == 'dev'
|
|
23
|
+
Requires-Dist: types-pyyaml>=6.0 ; extra == 'dev'
|
|
24
|
+
Requires-Dist: maturin>=1.7 ; extra == 'dev'
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Summary: A PyTorch library for synthetic biology and biodesign machine learning
|
|
28
|
+
Author-email: Mike Arpaia <mike@arpaia.co>
|
|
29
|
+
License: MIT
|
|
30
|
+
Requires-Python: >=3.11
|
|
31
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
32
|
+
|
|
33
|
+
# synbio-torch
|
|
34
|
+
|
|
35
|
+
A PyTorch library for synthetic biology and biodesign machine learning.
|
|
36
|
+
|
|
37
|
+
Installed as `synbio-torch`, imported as `synbiotorch` (commonly `import synbiotorch as st`).
|
|
38
|
+
|
|
39
|
+
synbio-torch ingests biological designs and sequences from many sources — labeled
|
|
40
|
+
FASTA, CSV/TSV tables, GenBank, SBOL, an [sbol-db](https://github.com/marpaia/sbol-db)
|
|
41
|
+
instance, or a synthetic generator — normalizes them into a single record type
|
|
42
|
+
(`Design`), and trains transformer models against them. The input modality,
|
|
43
|
+
tokenizer, and training objective are all set in configuration, so trying a new
|
|
44
|
+
combination never means forking the pipeline. GenBank and SBOL are parsed
|
|
45
|
+
in-process by native [sbol-rs](https://github.com/marpaia/sbol-rs) bindings.
|
|
46
|
+
|
|
47
|
+
## Capabilities
|
|
48
|
+
|
|
49
|
+
| Axis | Options |
|
|
50
|
+
|------|---------|
|
|
51
|
+
| **Data sources** | labeled **FASTA**, CSV/TSV **tables**, **GenBank**, **SBOL** (2 & 3), the sbol-db REST API, or a **synthetic** generator; loaded in memory or **streamed** from sharded Parquet for corpora larger than RAM |
|
|
52
|
+
| **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or character-level over a nucleotide **or protein** alphabet (encode + decode) |
|
|
53
|
+
| **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
|
|
54
|
+
| **Objectives** | `supervised` / `frozen` heads, `mlm` and `causal` pretraining (from-scratch or continued) |
|
|
55
|
+
| **Architectures** | from-scratch or pretrained; absolute or **RoPE** positions (`gpt_neox`/`llama`/`modernbert`), SDPA/FlashAttention, configurable context length |
|
|
56
|
+
| **Generation** | autoregressive sampling (temperature / top-k / top-p) and design completion from a causal backbone (`synbiotorch generate`) |
|
|
57
|
+
| **Engine** | raw-PyTorch loop, epoch- or **step-budgeted**; AMP (`fp16`/`bf16`), gradient accumulation/clipping, gradient checkpointing, `torch.compile`; **resumable** checkpoints; early stopping; LR schedule |
|
|
58
|
+
| **Scaling** | token **packing**, multi-GPU **DDP** (data-parallel) via `torchrun` |
|
|
59
|
+
| **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
|
|
60
|
+
| **Reproducibility** | one validated config per run, seeded / hash splits, content-fingerprinted sharded Parquet cache, resumable runs |
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install synbio-torch
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
synbio-torch ships a native extension (PyO3 bindings to the sbol-rs Rust crates).
|
|
69
|
+
Building from source needs a Rust toolchain (≥ 1.93); a prebuilt wheel needs none.
|
|
70
|
+
|
|
71
|
+
For development, build the extension into the venv with maturin:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uv venv
|
|
75
|
+
uv pip install -e '.[dev]' # compiles the Rust extension on install
|
|
76
|
+
# After editing Rust under rust/, rebuild with:
|
|
77
|
+
uv run maturin develop
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Quickstart
|
|
81
|
+
|
|
82
|
+
A run is fully specified by one YAML config. From the command line:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Materialize a corpus to the local Parquet cache (offline, reproducible).
|
|
86
|
+
synbiotorch ingest examples/configs/finetune_expression.yaml
|
|
87
|
+
|
|
88
|
+
# Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
|
|
89
|
+
synbiotorch train examples/configs/finetune_expression.yaml
|
|
90
|
+
|
|
91
|
+
# Resume an interrupted run from its rolling checkpoint (requires checkpoint_every_n_steps to be set in the config).
|
|
92
|
+
synbiotorch train examples/configs/pretrain_mlm.yaml --resume runs/pretrain_mlm/last.pt
|
|
93
|
+
|
|
94
|
+
# Generate from a trained causal backbone — point model.backbone at the run's
|
|
95
|
+
# backbone/ directory (with from_scratch: false), then complete a design from a prompt.
|
|
96
|
+
synbiotorch generate my_causal_run.yaml --prompt ATGCGT --max-new-tokens 200 --temperature 0.8
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
For multi-GPU training, launch with `torchrun` and set `train.distributed.strategy: ddp` in the config:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
torchrun --nproc_per_node=<gpus> -m synbiotorch.cli train examples/configs/pretrain_causal_long.yaml
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Or from Python:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
import synbiotorch as st
|
|
109
|
+
|
|
110
|
+
config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
|
|
111
|
+
metrics = st.run_training(config)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Example configs
|
|
115
|
+
|
|
116
|
+
| Config | What it does |
|
|
117
|
+
|--------|--------------|
|
|
118
|
+
| [`finetune_expression.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
|
|
119
|
+
| [`pretrain_mlm.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
|
|
120
|
+
| [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
|
|
121
|
+
| [`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
|
|
122
|
+
| [`pretrain_causal_long.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_causal_long.yaml) | Long-context causal pretraining: RoPE decoder, SDPA, streamed + packed corpus. |
|
|
123
|
+
| [`finetune_protein.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_protein.yaml) | Protein regression from a labeled CSV table with the protein char tokenizer. |
|
|
124
|
+
| [`benchmark_dna_classification.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/benchmark_dna_classification.yaml) | Genomics-ML benchmark shape: a labeled table fed to a pretrained DNA backbone for classification. |
|
|
125
|
+
| [`ingest_genbank.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/ingest_genbank.yaml) | Import GenBank to the Parquet cache via the native binding. |
|
|
126
|
+
|
|
127
|
+
## Experiment tracking
|
|
128
|
+
|
|
129
|
+
The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml)
|
|
130
|
+
and [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
|
|
131
|
+
ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
|
|
132
|
+
in a `.env` file at the repo root, then run both with:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python examples/run_wandb_examples.py
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
|
|
139
|
+
resolved config, the corpus fingerprint and split sizes as lineage, and the best
|
|
140
|
+
checkpoint as a model artifact.
|
|
141
|
+
|
|
142
|
+
| Graph transformer | Structure-aware sequence |
|
|
143
|
+
|-------------------|--------------------------|
|
|
144
|
+
|  |  |
|
|
145
|
+
|
|
146
|
+
## Documentation
|
|
147
|
+
|
|
148
|
+
| Doc | Contents |
|
|
149
|
+
|-----|----------|
|
|
150
|
+
| [architecture.md](https://github.com/marpaia/synbio-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
|
|
151
|
+
| [capabilities.md](https://github.com/marpaia/synbio-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
|
|
152
|
+
| [configuration.md](https://github.com/marpaia/synbio-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
|
|
153
|
+
| [data.md](https://github.com/marpaia/synbio-torch/blob/master/docs/data.md) | Data sources, native parsing, materialization, fixtures. |
|
|
154
|
+
| [backbones.md](https://github.com/marpaia/synbio-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
|
|
155
|
+
| [extending.md](https://github.com/marpaia/synbio-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
|
|
156
|
+
|
|
157
|
+
Release history is in [CHANGELOG.md](https://github.com/marpaia/synbio-torch/blob/master/CHANGELOG.md).
|
|
158
|
+
|
|
159
|
+
## Develop
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
uv run maturin develop # rebuild the Rust extension after editing rust/
|
|
163
|
+
uv run pytest
|
|
164
|
+
pre-commit run --all-files
|
|
165
|
+
```
|
|
166
|
+
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# synbio-torch
|
|
2
|
+
|
|
3
|
+
A PyTorch library for synthetic biology and biodesign machine learning.
|
|
4
|
+
|
|
5
|
+
Installed as `synbio-torch`, imported as `synbiotorch` (commonly `import synbiotorch as st`).
|
|
6
|
+
|
|
7
|
+
synbio-torch ingests biological designs and sequences from many sources — labeled
|
|
8
|
+
FASTA, CSV/TSV tables, GenBank, SBOL, an [sbol-db](https://github.com/marpaia/sbol-db)
|
|
9
|
+
instance, or a synthetic generator — normalizes them into a single record type
|
|
10
|
+
(`Design`), and trains transformer models against them. The input modality,
|
|
11
|
+
tokenizer, and training objective are all set in configuration, so trying a new
|
|
12
|
+
combination never means forking the pipeline. GenBank and SBOL are parsed
|
|
13
|
+
in-process by native [sbol-rs](https://github.com/marpaia/sbol-rs) bindings.
|
|
14
|
+
|
|
15
|
+
## Capabilities
|
|
16
|
+
|
|
17
|
+
| Axis | Options |
|
|
18
|
+
|------|---------|
|
|
19
|
+
| **Data sources** | labeled **FASTA**, CSV/TSV **tables**, **GenBank**, **SBOL** (2 & 3), the sbol-db REST API, or a **synthetic** generator; loaded in memory or **streamed** from sharded Parquet for corpora larger than RAM |
|
|
20
|
+
| **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or character-level over a nucleotide **or protein** alphabet (encode + decode) |
|
|
21
|
+
| **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
|
|
22
|
+
| **Objectives** | `supervised` / `frozen` heads, `mlm` and `causal` pretraining (from-scratch or continued) |
|
|
23
|
+
| **Architectures** | from-scratch or pretrained; absolute or **RoPE** positions (`gpt_neox`/`llama`/`modernbert`), SDPA/FlashAttention, configurable context length |
|
|
24
|
+
| **Generation** | autoregressive sampling (temperature / top-k / top-p) and design completion from a causal backbone (`synbiotorch generate`) |
|
|
25
|
+
| **Engine** | raw-PyTorch loop, epoch- or **step-budgeted**; AMP (`fp16`/`bf16`), gradient accumulation/clipping, gradient checkpointing, `torch.compile`; **resumable** checkpoints; early stopping; LR schedule |
|
|
26
|
+
| **Scaling** | token **packing**, multi-GPU **DDP** (data-parallel) via `torchrun` |
|
|
27
|
+
| **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
|
|
28
|
+
| **Reproducibility** | one validated config per run, seeded / hash splits, content-fingerprinted sharded Parquet cache, resumable runs |
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install synbio-torch
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
synbio-torch ships a native extension (PyO3 bindings to the sbol-rs Rust crates).
|
|
37
|
+
Building from source needs a Rust toolchain (≥ 1.93); a prebuilt wheel needs none.
|
|
38
|
+
|
|
39
|
+
For development, build the extension into the venv with maturin:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv venv
|
|
43
|
+
uv pip install -e '.[dev]' # compiles the Rust extension on install
|
|
44
|
+
# After editing Rust under rust/, rebuild with:
|
|
45
|
+
uv run maturin develop
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Quickstart
|
|
49
|
+
|
|
50
|
+
A run is fully specified by one YAML config. From the command line:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Materialize a corpus to the local Parquet cache (offline, reproducible).
|
|
54
|
+
synbiotorch ingest examples/configs/finetune_expression.yaml
|
|
55
|
+
|
|
56
|
+
# Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
|
|
57
|
+
synbiotorch train examples/configs/finetune_expression.yaml
|
|
58
|
+
|
|
59
|
+
# Resume an interrupted run from its rolling checkpoint (requires checkpoint_every_n_steps to be set in the config).
|
|
60
|
+
synbiotorch train examples/configs/pretrain_mlm.yaml --resume runs/pretrain_mlm/last.pt
|
|
61
|
+
|
|
62
|
+
# Generate from a trained causal backbone — point model.backbone at the run's
|
|
63
|
+
# backbone/ directory (with from_scratch: false), then complete a design from a prompt.
|
|
64
|
+
synbiotorch generate my_causal_run.yaml --prompt ATGCGT --max-new-tokens 200 --temperature 0.8
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
For multi-GPU training, launch with `torchrun` and set `train.distributed.strategy: ddp` in the config:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
torchrun --nproc_per_node=<gpus> -m synbiotorch.cli train examples/configs/pretrain_causal_long.yaml
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or from Python:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import synbiotorch as st
|
|
77
|
+
|
|
78
|
+
config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
|
|
79
|
+
metrics = st.run_training(config)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Example configs
|
|
83
|
+
|
|
84
|
+
| Config | What it does |
|
|
85
|
+
|--------|--------------|
|
|
86
|
+
| [`finetune_expression.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
|
|
87
|
+
| [`pretrain_mlm.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
|
|
88
|
+
| [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
|
|
89
|
+
| [`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
|
|
90
|
+
| [`pretrain_causal_long.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_causal_long.yaml) | Long-context causal pretraining: RoPE decoder, SDPA, streamed + packed corpus. |
|
|
91
|
+
| [`finetune_protein.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_protein.yaml) | Protein regression from a labeled CSV table with the protein char tokenizer. |
|
|
92
|
+
| [`benchmark_dna_classification.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/benchmark_dna_classification.yaml) | Genomics-ML benchmark shape: a labeled table fed to a pretrained DNA backbone for classification. |
|
|
93
|
+
| [`ingest_genbank.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/ingest_genbank.yaml) | Import GenBank to the Parquet cache via the native binding. |
|
|
94
|
+
|
|
95
|
+
## Experiment tracking
|
|
96
|
+
|
|
97
|
+
The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml)
|
|
98
|
+
and [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
|
|
99
|
+
ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
|
|
100
|
+
in a `.env` file at the repo root, then run both with:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
python examples/run_wandb_examples.py
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
|
|
107
|
+
resolved config, the corpus fingerprint and split sizes as lineage, and the best
|
|
108
|
+
checkpoint as a model artifact.
|
|
109
|
+
|
|
110
|
+
| Graph transformer | Structure-aware sequence |
|
|
111
|
+
|-------------------|--------------------------|
|
|
112
|
+
|  |  |
|
|
113
|
+
|
|
114
|
+
## Documentation
|
|
115
|
+
|
|
116
|
+
| Doc | Contents |
|
|
117
|
+
|-----|----------|
|
|
118
|
+
| [architecture.md](https://github.com/marpaia/synbio-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
|
|
119
|
+
| [capabilities.md](https://github.com/marpaia/synbio-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
|
|
120
|
+
| [configuration.md](https://github.com/marpaia/synbio-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
|
|
121
|
+
| [data.md](https://github.com/marpaia/synbio-torch/blob/master/docs/data.md) | Data sources, native parsing, materialization, fixtures. |
|
|
122
|
+
| [backbones.md](https://github.com/marpaia/synbio-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
|
|
123
|
+
| [extending.md](https://github.com/marpaia/synbio-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
|
|
124
|
+
|
|
125
|
+
Release history is in [CHANGELOG.md](https://github.com/marpaia/synbio-torch/blob/master/CHANGELOG.md).
|
|
126
|
+
|
|
127
|
+
## Develop
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
uv run maturin develop # rebuild the Rust extension after editing rust/
|
|
131
|
+
uv run pytest
|
|
132
|
+
pre-commit run --all-files
|
|
133
|
+
```
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
# Distribution name on PyPI is hyphenated (`pip install synbio-torch`); the import
|
|
3
|
+
# package is `synbiotorch` (commonly `import synbiotorch as st`).
|
|
4
|
+
name = "synbio-torch"
|
|
5
|
+
version = "0.2.0"
|
|
6
|
+
description = "A PyTorch library for synthetic biology and biodesign machine learning"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
license = { text = "MIT" }
|
|
10
|
+
authors = [{ name = "Mike Arpaia", email = "mike@arpaia.co" }]
|
|
11
|
+
dependencies = [
|
|
12
|
+
"httpx>=0.28",
|
|
13
|
+
"pydantic>=2.13",
|
|
14
|
+
"pyarrow>=24",
|
|
15
|
+
"numpy>=2.4",
|
|
16
|
+
"pyyaml>=6",
|
|
17
|
+
"torch>=2.12",
|
|
18
|
+
# Pin to the 4.x line: transformers 5.x breaks the custom/ESM modeling code that
|
|
19
|
+
# the pretrained DNA backbones (Nucleotide Transformer, DNABERT) target.
|
|
20
|
+
"transformers>=4.57,<5",
|
|
21
|
+
"tokenizers>=0.22",
|
|
22
|
+
"einops>=0.8",
|
|
23
|
+
"torch-geometric>=2.8",
|
|
24
|
+
"wandb>=0.27",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
dev = [
|
|
29
|
+
"pytest>=9",
|
|
30
|
+
"pytest-mock>=3.15",
|
|
31
|
+
"respx>=0.23",
|
|
32
|
+
"black>=26.5",
|
|
33
|
+
"isort>=8",
|
|
34
|
+
"flake8>=7.3",
|
|
35
|
+
"mypy>=2.1",
|
|
36
|
+
"pre-commit>=4.6",
|
|
37
|
+
"types-PyYAML>=6.0",
|
|
38
|
+
"maturin>=1.7",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
synbiotorch = "synbiotorch.cli:main"
|
|
43
|
+
|
|
44
|
+
[build-system]
|
|
45
|
+
requires = ["maturin>=1.7,<2"]
|
|
46
|
+
build-backend = "maturin"
|
|
47
|
+
|
|
48
|
+
[tool.maturin]
|
|
49
|
+
# Mixed Python/Rust project: Python lives under src/, the Rust crate under rust/.
|
|
50
|
+
# The compiled extension is importable as ``synbiotorch._sbol``.
|
|
51
|
+
python-source = "src"
|
|
52
|
+
manifest-path = "rust/Cargo.toml"
|
|
53
|
+
module-name = "synbiotorch._sbol"
|
|
54
|
+
features = ["pyo3/extension-module"]
|
|
55
|
+
# Bundle the license into the sdist; its metadata declares License-File: LICENSE.
|
|
56
|
+
include = ["LICENSE"]
|
|
57
|
+
|
|
58
|
+
[tool.black]
|
|
59
|
+
line-length = 120
|
|
60
|
+
target-version = ["py311"]
|
|
61
|
+
|
|
62
|
+
[tool.isort]
|
|
63
|
+
profile = "black"
|
|
64
|
+
line_length = 120
|
|
65
|
+
# Confine first-party detection to the source tree so import grouping is
|
|
66
|
+
# deterministic across machines. Without this, isort's auto-detection varies by
|
|
67
|
+
# OS and installed packages (e.g. misclassifying wandb as first-party), making
|
|
68
|
+
# the same code pass on one host and fail on another.
|
|
69
|
+
src_paths = ["src"]
|
|
70
|
+
known_first_party = ["synbiotorch"]
|
|
71
|
+
|
|
72
|
+
[tool.mypy]
|
|
73
|
+
python_version = "3.11"
|
|
74
|
+
warn_unused_configs = true
|
|
75
|
+
disallow_untyped_defs = true
|
|
76
|
+
ignore_missing_imports = true
|
|
77
|
+
files = ["src/synbiotorch"]
|