synbio-torch 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. synbio_torch-0.2.0/LICENSE +21 -0
  2. synbio_torch-0.2.0/PKG-INFO +166 -0
  3. synbio_torch-0.2.0/README.md +133 -0
  4. synbio_torch-0.2.0/pyproject.toml +77 -0
  5. synbio_torch-0.2.0/rust/Cargo.lock +1036 -0
  6. synbio_torch-0.2.0/rust/Cargo.toml +20 -0
  7. synbio_torch-0.2.0/rust/src/lib.rs +270 -0
  8. synbio_torch-0.2.0/src/synbiotorch/__init__.py +35 -0
  9. synbio_torch-0.2.0/src/synbiotorch/_sbol.pyi +9 -0
  10. synbio_torch-0.2.0/src/synbiotorch/cli.py +95 -0
  11. synbio_torch-0.2.0/src/synbiotorch/config.py +253 -0
  12. synbio_torch-0.2.0/src/synbiotorch/data/__init__.py +17 -0
  13. synbio_torch-0.2.0/src/synbiotorch/data/corpus.py +48 -0
  14. synbio_torch-0.2.0/src/synbiotorch/data/materialize.py +228 -0
  15. synbio_torch-0.2.0/src/synbiotorch/datasets/__init__.py +8 -0
  16. synbio_torch-0.2.0/src/synbiotorch/datasets/causal_collator.py +30 -0
  17. synbio_torch-0.2.0/src/synbiotorch/datasets/dataset.py +62 -0
  18. synbio_torch-0.2.0/src/synbiotorch/datasets/mlm_collator.py +57 -0
  19. synbio_torch-0.2.0/src/synbiotorch/datasets/packing.py +57 -0
  20. synbio_torch-0.2.0/src/synbiotorch/datasets/splits.py +108 -0
  21. synbio_torch-0.2.0/src/synbiotorch/datasets/streaming.py +93 -0
  22. synbio_torch-0.2.0/src/synbiotorch/distributed.py +109 -0
  23. synbio_torch-0.2.0/src/synbiotorch/encoders/__init__.py +19 -0
  24. synbio_torch-0.2.0/src/synbiotorch/encoders/base.py +76 -0
  25. synbio_torch-0.2.0/src/synbiotorch/encoders/graph.py +111 -0
  26. synbio_torch-0.2.0/src/synbiotorch/encoders/sequence.py +34 -0
  27. synbio_torch-0.2.0/src/synbiotorch/encoders/structure.py +113 -0
  28. synbio_torch-0.2.0/src/synbiotorch/engine/__init__.py +20 -0
  29. synbio_torch-0.2.0/src/synbiotorch/engine/batch.py +51 -0
  30. synbio_torch-0.2.0/src/synbiotorch/engine/callbacks.py +228 -0
  31. synbio_torch-0.2.0/src/synbiotorch/engine/trainer.py +346 -0
  32. synbio_torch-0.2.0/src/synbiotorch/exceptions.py +27 -0
  33. synbio_torch-0.2.0/src/synbiotorch/generate.py +104 -0
  34. synbio_torch-0.2.0/src/synbiotorch/models/__init__.py +64 -0
  35. synbio_torch-0.2.0/src/synbiotorch/models/backbone.py +63 -0
  36. synbio_torch-0.2.0/src/synbiotorch/models/causal.py +46 -0
  37. synbio_torch-0.2.0/src/synbiotorch/models/graph.py +73 -0
  38. synbio_torch-0.2.0/src/synbiotorch/models/heads.py +26 -0
  39. synbio_torch-0.2.0/src/synbiotorch/models/mlm.py +46 -0
  40. synbio_torch-0.2.0/src/synbiotorch/models/sequence_model.py +39 -0
  41. synbio_torch-0.2.0/src/synbiotorch/pipeline.py +268 -0
  42. synbio_torch-0.2.0/src/synbiotorch/reproducibility.py +42 -0
  43. synbio_torch-0.2.0/src/synbiotorch/sources/__init__.py +27 -0
  44. synbio_torch-0.2.0/src/synbiotorch/sources/fasta.py +105 -0
  45. synbio_torch-0.2.0/src/synbiotorch/sources/files.py +27 -0
  46. synbio_torch-0.2.0/src/synbiotorch/sources/genbank.py +51 -0
  47. synbio_torch-0.2.0/src/synbiotorch/sources/sbol.py +211 -0
  48. synbio_torch-0.2.0/src/synbiotorch/sources/sbol_db.py +227 -0
  49. synbio_torch-0.2.0/src/synbiotorch/sources/synthetic.py +217 -0
  50. synbio_torch-0.2.0/src/synbiotorch/sources/table.py +94 -0
  51. synbio_torch-0.2.0/src/synbiotorch/tasks/__init__.py +8 -0
  52. synbio_torch-0.2.0/src/synbiotorch/tasks/base.py +53 -0
  53. synbio_torch-0.2.0/src/synbiotorch/tasks/causal.py +37 -0
  54. synbio_torch-0.2.0/src/synbiotorch/tasks/mlm.py +39 -0
  55. synbio_torch-0.2.0/src/synbiotorch/tasks/supervised.py +63 -0
  56. synbio_torch-0.2.0/src/synbiotorch/tokenize/__init__.py +17 -0
  57. synbio_torch-0.2.0/src/synbiotorch/tokenize/base.py +69 -0
  58. synbio_torch-0.2.0/src/synbiotorch/tokenize/char.py +56 -0
  59. synbio_torch-0.2.0/src/synbiotorch/tokenize/hf.py +62 -0
  60. synbio_torch-0.2.0/src/synbiotorch/tokenize/kmer.py +79 -0
  61. synbio_torch-0.2.0/src/synbiotorch/types.py +177 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mike Arpaia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,166 @@
1
+ Metadata-Version: 2.4
2
+ Name: synbio-torch
3
+ Version: 0.2.0
4
+ Requires-Dist: httpx>=0.28
5
+ Requires-Dist: pydantic>=2.13
6
+ Requires-Dist: pyarrow>=24
7
+ Requires-Dist: numpy>=2.4
8
+ Requires-Dist: pyyaml>=6
9
+ Requires-Dist: torch>=2.12
10
+ Requires-Dist: transformers>=4.57,<5
11
+ Requires-Dist: tokenizers>=0.22
12
+ Requires-Dist: einops>=0.8
13
+ Requires-Dist: torch-geometric>=2.8
14
+ Requires-Dist: wandb>=0.27
15
+ Requires-Dist: pytest>=9 ; extra == 'dev'
16
+ Requires-Dist: pytest-mock>=3.15 ; extra == 'dev'
17
+ Requires-Dist: respx>=0.23 ; extra == 'dev'
18
+ Requires-Dist: black>=26.5 ; extra == 'dev'
19
+ Requires-Dist: isort>=8 ; extra == 'dev'
20
+ Requires-Dist: flake8>=7.3 ; extra == 'dev'
21
+ Requires-Dist: mypy>=2.1 ; extra == 'dev'
22
+ Requires-Dist: pre-commit>=4.6 ; extra == 'dev'
23
+ Requires-Dist: types-pyyaml>=6.0 ; extra == 'dev'
24
+ Requires-Dist: maturin>=1.7 ; extra == 'dev'
25
+ Provides-Extra: dev
26
+ License-File: LICENSE
27
+ Summary: A PyTorch library for synthetic biology and biodesign machine learning
28
+ Author-email: Mike Arpaia <mike@arpaia.co>
29
+ License: MIT
30
+ Requires-Python: >=3.11
31
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
32
+
33
+ # synbio-torch
34
+
35
+ A PyTorch library for synthetic biology and biodesign machine learning.
36
+
37
+ Installed as `synbio-torch`, imported as `synbiotorch` (commonly `import synbiotorch as st`).
38
+
39
+ synbio-torch ingests biological designs and sequences from many sources — labeled
40
+ FASTA, CSV/TSV tables, GenBank, SBOL, an [sbol-db](https://github.com/marpaia/sbol-db)
41
+ instance, or a synthetic generator — normalizes them into a single record type
42
+ (`Design`), and trains transformer models against them. The input modality,
43
+ tokenizer, and training objective are all set in configuration, so trying a new
44
+ combination never means forking the pipeline. GenBank and SBOL are parsed
45
+ in-process by native [sbol-rs](https://github.com/marpaia/sbol-rs) bindings.
46
+
47
+ ## Capabilities
48
+
49
+ | Axis | Options |
50
+ |------|---------|
51
+ | **Data sources** | labeled **FASTA**, CSV/TSV **tables**, **GenBank**, **SBOL** (2 & 3), the sbol-db REST API, or a **synthetic** generator; loaded in-memory or **streamed** from sharded Parquet for corpora larger than RAM |
52
+ | **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or character-level over a nucleotide **or protein** alphabet (encode + decode) |
53
+ | **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
54
+ | **Objectives** | `supervised` / `frozen` heads, `mlm` and `causal` pretraining (from-scratch or continued) |
55
+ | **Architectures** | from-scratch or pretrained; absolute or **RoPE** positions (`gpt_neox`/`llama`/`modernbert`), SDPA/FlashAttention, configurable context length |
56
+ | **Generation** | autoregressive sampling (temperature / top-k / top-p) and design completion from a causal backbone (`synbiotorch generate`) |
57
+ | **Engine** | raw-PyTorch loop, epoch- or **step-budgeted**; AMP (`fp16`/`bf16`), gradient accumulation/clipping, gradient checkpointing, `torch.compile`; **resumable** checkpoints; early stopping; LR schedule |
58
+ | **Scaling** | token **packing**, multi-GPU **DDP** (data-parallel) via `torchrun` |
59
+ | **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
60
+ | **Reproducibility** | one validated config per run, seeded / hash splits, content-fingerprinted sharded Parquet cache, resumable runs |
61
+
62
+ ## Install
63
+
64
+ ```bash
65
+ pip install synbio-torch
66
+ ```
67
+
68
+ synbio-torch ships a native extension (PyO3 bindings to the sbol-rs Rust crates).
69
+ Building from source needs a Rust toolchain (≥ 1.93); a prebuilt wheel needs none.
70
+
71
+ For development, build the extension into the venv with maturin:
72
+
73
+ ```bash
74
+ uv venv
75
+ uv pip install -e '.[dev]' # compiles the Rust extension on install
76
+ # After editing Rust under rust/, rebuild with:
77
+ uv run maturin develop
78
+ ```
79
+
80
+ ## Quickstart
81
+
82
+ A run is fully specified by one YAML config. From the command line:
83
+
84
+ ```bash
85
+ # Materialize a corpus to the local Parquet cache (offline, reproducible).
86
+ synbiotorch ingest examples/configs/finetune_expression.yaml
87
+
88
+ # Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
89
+ synbiotorch train examples/configs/finetune_expression.yaml
90
+
91
+ # Resume an interrupted run from its rolling checkpoint (needs checkpoint_every_n_steps).
92
+ synbiotorch train examples/configs/pretrain_mlm.yaml --resume runs/pretrain_mlm/last.pt
93
+
94
+ # Generate from a trained causal backbone — point model.backbone at the run's
95
+ # backbone/ (with from_scratch: false), then complete a design from a prompt.
96
+ synbiotorch generate my_causal_run.yaml --prompt ATGCGT --max-new-tokens 200 --temperature 0.8
97
+ ```
98
+
99
+ Train multi-GPU with `torchrun` and `train.distributed.strategy: ddp`:
100
+
101
+ ```bash
102
+ torchrun --nproc_per_node=<gpus> -m synbiotorch.cli train examples/configs/pretrain_causal_long.yaml
103
+ ```
104
+
105
+ Or from Python:
106
+
107
+ ```python
108
+ import synbiotorch as st
109
+
110
+ config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
111
+ metrics = st.run_training(config)
112
+ ```
113
+
114
+ ### Example configs
115
+
116
+ | Config | What it does |
117
+ |--------|--------------|
118
+ | [`finetune_expression.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
119
+ | [`pretrain_mlm.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
120
+ | [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
121
+ | [`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
122
+ | [`pretrain_causal_long.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_causal_long.yaml) | Long-context causal pretraining: RoPE decoder, SDPA, streamed + packed corpus. |
123
+ | [`finetune_protein.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_protein.yaml) | Protein regression from a labeled CSV table with the protein char tokenizer. |
124
+ | [`benchmark_dna_classification.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/benchmark_dna_classification.yaml) | Genomics-ML benchmark shape: a labeled table fed to a pretrained DNA backbone for classification. |
125
+ | [`ingest_genbank.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/ingest_genbank.yaml) | Import GenBank to the Parquet cache via the native binding. |
126
+
127
+ ## Experiment tracking
128
+
129
+ The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml)
130
+ and [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
131
+ ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
132
+ in a `.env` file at the repo root, then run both:
133
+
134
+ ```bash
135
+ python examples/run_wandb_examples.py
136
+ ```
137
+
138
+ Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
139
+ resolved config, the corpus fingerprint and split sizes as lineage, and the best
140
+ checkpoint as a model artifact.
141
+
142
+ | Graph transformer | Structure-aware sequence |
143
+ |-------------------|--------------------------|
144
+ | ![train_graph W&B run](https://raw.githubusercontent.com/marpaia/synbio-torch/master/docs/images/wandb_train_graph.png) | ![structure_aware W&B run](https://raw.githubusercontent.com/marpaia/synbio-torch/master/docs/images/wandb_structure_aware.png) |
145
+
146
+ ## Documentation
147
+
148
+ | Doc | Contents |
149
+ |-----|----------|
150
+ | [architecture.md](https://github.com/marpaia/synbio-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
151
+ | [capabilities.md](https://github.com/marpaia/synbio-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
152
+ | [configuration.md](https://github.com/marpaia/synbio-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
153
+ | [data.md](https://github.com/marpaia/synbio-torch/blob/master/docs/data.md) | Data sources, native parsing, materialization, fixtures. |
154
+ | [backbones.md](https://github.com/marpaia/synbio-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
155
+ | [extending.md](https://github.com/marpaia/synbio-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
156
+
157
+ Release history is in [CHANGELOG.md](https://github.com/marpaia/synbio-torch/blob/master/CHANGELOG.md).
158
+
159
+ ## Develop
160
+
161
+ ```bash
162
+ uv run maturin develop # rebuild the Rust extension after editing rust/
163
+ uv run pytest
164
+ pre-commit run --all-files
165
+ ```
166
+
@@ -0,0 +1,133 @@
1
+ # synbio-torch
2
+
3
+ A PyTorch library for synthetic biology and biodesign machine learning.
4
+
5
+ Installed as `synbio-torch`, imported as `synbiotorch` (commonly `import synbiotorch as st`).
6
+
7
+ synbio-torch ingests biological designs and sequences from many sources — labeled
8
+ FASTA, CSV/TSV tables, GenBank, SBOL, an [sbol-db](https://github.com/marpaia/sbol-db)
9
+ instance, or a synthetic generator — normalizes them into a single record type
10
+ (`Design`), and trains transformer models against them. The input modality,
11
+ tokenizer, and training objective are all set in configuration, so trying a new
12
+ combination never means forking the pipeline. GenBank and SBOL are parsed
13
+ in-process by native [sbol-rs](https://github.com/marpaia/sbol-rs) bindings.
14
+
15
+ ## Capabilities
16
+
17
+ | Axis | Options |
18
+ |------|---------|
19
+ | **Data sources** | labeled **FASTA**, CSV/TSV **tables**, **GenBank**, **SBOL** (2 & 3), the sbol-db REST API, or a **synthetic** generator; loaded in-memory or **streamed** from sharded Parquet for corpora larger than RAM |
20
+ | **Tokenizers** | pretrained HuggingFace (`hf`), overlapping k-mer, or character-level over a nucleotide **or protein** alphabet (encode + decode) |
21
+ | **Modalities** | `sequence`, `structure_aware` (feature boundaries), `graph` (PyG composition transformer) |
22
+ | **Objectives** | `supervised` / `frozen` heads, `mlm` and `causal` pretraining (from-scratch or continued) |
23
+ | **Architectures** | from-scratch or pretrained; absolute or **RoPE** positions (`gpt_neox`/`llama`/`modernbert`), SDPA/FlashAttention, configurable context length |
24
+ | **Generation** | autoregressive sampling (temperature / top-k / top-p) and design completion from a causal backbone (`synbiotorch generate`) |
25
+ | **Engine** | raw-PyTorch loop, epoch- or **step-budgeted**; AMP (`fp16`/`bf16`), gradient accumulation/clipping, gradient checkpointing, `torch.compile`; **resumable** checkpoints; early stopping; LR schedule |
26
+ | **Scaling** | token **packing**, multi-GPU **DDP** (data-parallel) via `torchrun` |
27
+ | **Tracking** | per-epoch `metrics.jsonl`, optional [Weights & Biases](https://docs.wandb.ai/) (scalars, config, lineage, model artifact) |
28
+ | **Reproducibility** | one validated config per run, seeded / hash splits, content-fingerprinted sharded Parquet cache, resumable runs |
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ pip install synbio-torch
34
+ ```
35
+
36
+ synbio-torch ships a native extension (PyO3 bindings to the sbol-rs Rust crates).
37
+ Building from source needs a Rust toolchain (≥ 1.93); a prebuilt wheel needs none.
38
+
39
+ For development, build the extension into the venv with maturin:
40
+
41
+ ```bash
42
+ uv venv
43
+ uv pip install -e '.[dev]' # compiles the Rust extension on install
44
+ # After editing Rust under rust/, rebuild with:
45
+ uv run maturin develop
46
+ ```
47
+
48
+ ## Quickstart
49
+
50
+ A run is fully specified by one YAML config. From the command line:
51
+
52
+ ```bash
53
+ # Materialize a corpus to the local Parquet cache (offline, reproducible).
54
+ synbiotorch ingest examples/configs/finetune_expression.yaml
55
+
56
+ # Train. Resolved config, per-epoch metrics.jsonl, and best.pt land in output_dir.
57
+ synbiotorch train examples/configs/finetune_expression.yaml
58
+
59
+ # Resume an interrupted run from its rolling checkpoint (needs checkpoint_every_n_steps).
60
+ synbiotorch train examples/configs/pretrain_mlm.yaml --resume runs/pretrain_mlm/last.pt
61
+
62
+ # Generate from a trained causal backbone — point model.backbone at the run's
63
+ # backbone/ (with from_scratch: false), then complete a design from a prompt.
64
+ synbiotorch generate my_causal_run.yaml --prompt ATGCGT --max-new-tokens 200 --temperature 0.8
65
+ ```
66
+
67
+ Train multi-GPU with `torchrun` and `train.distributed.strategy: ddp`:
68
+
69
+ ```bash
70
+ torchrun --nproc_per_node=<gpus> -m synbiotorch.cli train examples/configs/pretrain_causal_long.yaml
71
+ ```
72
+
73
+ Or from Python:
74
+
75
+ ```python
76
+ import synbiotorch as st
77
+
78
+ config = st.RunConfig.from_yaml("examples/configs/train_graph.yaml")
79
+ metrics = st.run_training(config)
80
+ ```
81
+
82
+ ### Example configs
83
+
84
+ | Config | What it does |
85
+ |--------|--------------|
86
+ | [`finetune_expression.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_expression.yaml) | Frozen DNABERT-2 backbone feeding a regression head. |
87
+ | [`pretrain_mlm.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_mlm.yaml) | From-scratch masked-LM pretraining; writes a reusable backbone. |
88
+ | [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml) | Sequence + feature-boundary markers. |
89
+ | [`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml) | Graph transformer over the composition graph. |
90
+ | [`pretrain_causal_long.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/pretrain_causal_long.yaml) | Long-context causal pretraining: RoPE decoder, SDPA, streamed + packed corpus. |
91
+ | [`finetune_protein.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_protein.yaml) | Protein regression from a labeled CSV table with the protein char tokenizer. |
92
+ | [`benchmark_dna_classification.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/benchmark_dna_classification.yaml) | Genomics-ML benchmark shape: a labeled table fed to a pretrained DNA backbone for classification. |
93
+ | [`ingest_genbank.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/ingest_genbank.yaml) | Import GenBank to the Parquet cache via the native binding. |
94
+
95
+ ## Experiment tracking
96
+
97
+ The two synthetic-data configs ([`train_graph.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/train_graph.yaml)
98
+ and [`finetune_structure_aware.yaml`](https://github.com/marpaia/synbio-torch/blob/master/examples/configs/finetune_structure_aware.yaml))
99
+ ship with [Weights & Biases](https://docs.wandb.ai/) enabled. Set `WANDB_API_KEY`
100
+ in a `.env` file at the repo root, then run both:
101
+
102
+ ```bash
103
+ python examples/run_wandb_examples.py
104
+ ```
105
+
106
+ Each run logs per-step loss and learning rate, per-epoch train/val metrics, the
107
+ resolved config, the corpus fingerprint and split sizes as lineage, and the best
108
+ checkpoint as a model artifact.
109
+
110
+ | Graph transformer | Structure-aware sequence |
111
+ |-------------------|--------------------------|
112
+ | ![train_graph W&B run](https://raw.githubusercontent.com/marpaia/synbio-torch/master/docs/images/wandb_train_graph.png) | ![structure_aware W&B run](https://raw.githubusercontent.com/marpaia/synbio-torch/master/docs/images/wandb_structure_aware.png) |
113
+
114
+ ## Documentation
115
+
116
+ | Doc | Contents |
117
+ |-----|----------|
118
+ | [architecture.md](https://github.com/marpaia/synbio-torch/blob/master/docs/architecture.md) | How the system is built — record type, plug points, engine, data flow. |
119
+ | [capabilities.md](https://github.com/marpaia/synbio-torch/blob/master/docs/capabilities.md) | Modalities, objectives, tokenizers, metrics. |
120
+ | [configuration.md](https://github.com/marpaia/synbio-torch/blob/master/docs/configuration.md) | Complete `RunConfig` reference. |
121
+ | [data.md](https://github.com/marpaia/synbio-torch/blob/master/docs/data.md) | Data sources, native parsing, materialization, fixtures. |
122
+ | [backbones.md](https://github.com/marpaia/synbio-torch/blob/master/docs/backbones.md) | Choosing/loading backbones and environment constraints. |
123
+ | [extending.md](https://github.com/marpaia/synbio-torch/blob/master/docs/extending.md) | Adding a tokenizer, encoder, task, callback, or data source. |
124
+
125
+ Release history is in [CHANGELOG.md](https://github.com/marpaia/synbio-torch/blob/master/CHANGELOG.md).
126
+
127
+ ## Develop
128
+
129
+ ```bash
130
+ uv run maturin develop # rebuild the Rust extension after editing rust/
131
+ uv run pytest
132
+ pre-commit run --all-files
133
+ ```
@@ -0,0 +1,77 @@
1
+ [project]
2
+ # Distribution name on PyPI is hyphenated (`pip install synbio-torch`); the import
3
+ # package is `synbiotorch` (commonly `import synbiotorch as st`).
4
+ name = "synbio-torch"
5
+ version = "0.2.0"
6
+ description = "A PyTorch library for synthetic biology and biodesign machine learning"
7
+ readme = "README.md"
8
+ requires-python = ">=3.11"
9
+ license = { text = "MIT" }
10
+ authors = [{ name = "Mike Arpaia", email = "mike@arpaia.co" }]
11
+ dependencies = [
12
+ "httpx>=0.28",
13
+ "pydantic>=2.13",
14
+ "pyarrow>=24",
15
+ "numpy>=2.4",
16
+ "pyyaml>=6",
17
+ "torch>=2.12",
18
+ # Pin to the 4.x line: transformers 5.x breaks the custom/ESM modeling code
19
+ # that the pretrained DNA backbones (Nucleotide Transformer, DNABERT) target.
20
+ "transformers>=4.57,<5",
21
+ "tokenizers>=0.22",
22
+ "einops>=0.8",
23
+ "torch-geometric>=2.8",
24
+ "wandb>=0.27",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "pytest>=9",
30
+ "pytest-mock>=3.15",
31
+ "respx>=0.23",
32
+ "black>=26.5",
33
+ "isort>=8",
34
+ "flake8>=7.3",
35
+ "mypy>=2.1",
36
+ "pre-commit>=4.6",
37
+ "types-PyYAML>=6.0",
38
+ "maturin>=1.7",
39
+ ]
40
+
41
+ [project.scripts]
42
+ synbiotorch = "synbiotorch.cli:main"
43
+
44
+ [build-system]
45
+ requires = ["maturin>=1.7,<2"]
46
+ build-backend = "maturin"
47
+
48
+ [tool.maturin]
49
+ # Mixed Python/Rust project: Python lives under src/, the Rust crate under rust/.
50
+ # The compiled extension is importable as ``synbiotorch._sbol``.
51
+ python-source = "src"
52
+ manifest-path = "rust/Cargo.toml"
53
+ module-name = "synbiotorch._sbol"
54
+ features = ["pyo3/extension-module"]
55
+ # Bundle the license into the sdist; its metadata declares License-File: LICENSE.
56
+ include = ["LICENSE"]
57
+
58
+ [tool.black]
59
+ line-length = 120
60
+ target-version = ["py311"]
61
+
62
+ [tool.isort]
63
+ profile = "black"
64
+ line_length = 120
65
+ # Confine first-party detection to the source tree so import grouping is
66
+ # deterministic across machines. Without this, isort's auto-detection varies by
67
+ # OS and installed packages (e.g. misclassifying wandb as first-party), making
68
+ # the same code pass on one host and fail on another.
69
+ src_paths = ["src"]
70
+ known_first_party = ["synbiotorch"]
71
+
72
+ [tool.mypy]
73
+ python_version = "3.11"
74
+ warn_unused_configs = true
75
+ disallow_untyped_defs = true
76
+ ignore_missing_imports = true
77
+ files = ["src/synbiotorch"]