flockbio-bioscoring 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flockbio_bioscoring-0.4.0/LICENSE +27 -0
- flockbio_bioscoring-0.4.0/LICENSES.md +107 -0
- flockbio_bioscoring-0.4.0/PKG-INFO +162 -0
- flockbio_bioscoring-0.4.0/README.md +96 -0
- flockbio_bioscoring-0.4.0/pyproject.toml +104 -0
- flockbio_bioscoring-0.4.0/setup.cfg +4 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/__init__.py +103 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/cache.py +116 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/__init__.py +68 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/artifact_schema.py +216 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/merge_artifacts.py +262 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/nnls.py +312 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/ship_gate.py +493 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/upload_gate.py +102 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/dataset_spec.py +280 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/download.py +311 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/europepmc.py +131 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/__init__.py +44 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/csv_tsv.py +125 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/fasta.py +111 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/gtf.py +116 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/py.typed +0 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/__init__.py +86 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/chromium_pdf.py +160 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/figures.py +129 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/style.py +208 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/templates/base.html.j2 +127 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/templates_loader.py +123 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/sentinels.py +95 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/PKG-INFO +162 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/SOURCES.txt +41 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/dependency_links.txt +1 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/requires.txt +19 -0
- flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/top_level.txt +1 -0
- flockbio_bioscoring-0.4.0/tests/test_cache.py +173 -0
- flockbio_bioscoring-0.4.0/tests/test_calibration.py +657 -0
- flockbio_bioscoring-0.4.0/tests/test_dataset_spec.py +304 -0
- flockbio_bioscoring-0.4.0/tests/test_download.py +201 -0
- flockbio_bioscoring-0.4.0/tests/test_europepmc.py +133 -0
- flockbio_bioscoring-0.4.0/tests/test_merge_artifacts.py +619 -0
- flockbio_bioscoring-0.4.0/tests/test_parsers.py +198 -0
- flockbio_bioscoring-0.4.0/tests/test_reporting.py +486 -0
- flockbio_bioscoring-0.4.0/tests/test_sentinels.py +140 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Flock Bio
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
NOTE: The MIT license applies to the LIBRARY CODE in this repository. License
|
|
24
|
+
terms for downloaded / bundled DATASETS are tracked separately in
|
|
25
|
+
`LICENSES.md` per the L20 4-layer audit pattern (article + supplementary +
|
|
26
|
+
deposit + cross-walk). Datasets that fail the audit are HARD BLOCKED via the
|
|
27
|
+
SENTINEL_LICENSE_BLOCKED marker in their DatasetSpec entries.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# LICENSES
|
|
2
|
+
|
|
3
|
+
This file tracks licenses for **datasets** that the
|
|
4
|
+
`flockbio_bioscoring` library will eventually download or
|
|
5
|
+
bundle. The library code itself is MIT-licensed (see
|
|
6
|
+
`LICENSE`).
|
|
7
|
+
|
|
8
|
+
## License audit framework (per L20 + Part IX.7)
|
|
9
|
+
|
|
10
|
+
Every dataset registered as a `DatasetSpec` carries a
|
|
11
|
+
`license` field + a `license_audit_note` that documents the
|
|
12
|
+
**4-layer license audit**:
|
|
13
|
+
|
|
14
|
+
1. **Article layer**: the journal article's license (CC BY,
|
|
15
|
+
CC BY-NC, CC BY-NC-ND, etc.). HARD BLOCKED if NC or ND
|
|
16
|
+
per L5.
|
|
17
|
+
2. **Supplementary layer**: the supplementary file's license.
|
|
18
|
+
May differ from the article (some publishers grant CC BY
|
|
19
|
+
to the article but retain commercial rights for
|
|
20
|
+
supplementary data).
|
|
21
|
+
3. **Deposit layer**: the data deposit's license (figshare,
|
|
22
|
+
Zenodo, GEO, ArrayExpress, etc.). May further restrict.
|
|
23
|
+
4. **Cross-walk layer**: the per-row license check for
|
|
24
|
+
datasets that include multi-source data (e.g., an MPRA
|
|
25
|
+
library that lists sequences from licensed source X and
|
|
26
|
+
sequences from licensed source Y — each row inherits its
|
|
27
|
+
source's license).
|
|
28
|
+
|
|
29
|
+
Only datasets that pass ALL FOUR layers are registered as a
|
|
30
|
+
DatasetSpec with a real (non-`SENTINEL_LICENSE_BLOCKED`)
|
|
31
|
+
`expected_sha256`.
|
|
32
|
+
|
|
33
|
+
## Sentinel marker for blocked datasets
|
|
34
|
+
|
|
35
|
+
Datasets that fail any layer of the audit are recorded as
|
|
36
|
+
`DatasetSpec` entries with
|
|
37
|
+
`expected_sha256=SENTINEL_LICENSE_BLOCKED` + `parser=None`.
|
|
38
|
+
This documents the audit decision in the same registry as the
|
|
39
|
+
allowed datasets (forensic traceability) without any risk of
|
|
40
|
+
accidentally downloading the blocked data.
|
|
41
|
+
|
|
42
|
+
Example (from the codon repo's known-blocked list):
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
DatasetSpec(
|
|
46
|
+
name="mauger2019_human_5utr_mpra",
|
|
47
|
+
url="https://www.pnas.org/doi/10.1073/pnas.1909563116",
|
|
48
|
+
expected_sha256=SENTINEL_LICENSE_BLOCKED,
|
|
49
|
+
license="CC BY-NC 4.0",
|
|
50
|
+
license_audit_note=(
|
|
51
|
+
"PNAS 116:24075. Article published with CC BY-NC. "
|
|
52
|
+
"HARD BLOCKED per L5; not downloaded, not parsed. "
|
|
53
|
+
"Useful as a how-we-would-love-to-use-this footnote."
|
|
54
|
+
),
|
|
55
|
+
parser=None,
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Per-dataset audit log
|
|
60
|
+
|
|
61
|
+
When the Tier 1 extraction lands real datasets at Sprint 4
|
|
62
|
+
Day 3, this section will be populated with one entry per
|
|
63
|
+
DatasetSpec.
|
|
64
|
+
|
|
65
|
+
The Sprint 4 Day 1 scaffold (this commit) does NOT include
|
|
66
|
+
any real dataset entries — those migrate over from
|
|
67
|
+
codon / 5utr / 3utr's existing audit ledgers in subsequent
|
|
68
|
+
commits.
|
|
69
|
+
|
|
70
|
+
## Known HARD BLOCKED datasets (inherited from consumers)
|
|
71
|
+
|
|
72
|
+
These are blocked for ALL consumers + the orchestrator. Per
|
|
73
|
+
L5 + L20, NC- or ND-licensed datasets are HARD BLOCKED:
|
|
74
|
+
|
|
75
|
+
- **Mauger 2019 (PNAS 116:24075)**: CC BY-NC. 5'UTR MPRA.
|
|
76
|
+
- **Tani 2012 (Genome Res 22:947)**: CC BY-NC. mRNA half-life.
|
|
77
|
+
- **Schwanhäusser 2011 (Nature 473:337)**: CC BY-NC. mRNA
|
|
78
|
+
half-life + protein abundance.
|
|
79
|
+
- **Ginkgo 180k 3'UTR MPRA**: corresponding author no
|
|
80
|
+
longer at Ginkgo (2026-05-02 inquiry); license unknown.
|
|
81
|
+
Treated as blocked until license confirmed.
|
|
82
|
+
- **TargetScan pre-computed scores**: license uncertain
|
|
83
|
+
(CC BY-NC suspected but not confirmed); treated as blocked
|
|
84
|
+
until L20 audit completes. We CAN recompute seed-match
|
|
85
|
+
counts from miRBase (CC BY 4.0).
|
|
86
|
+
|
|
87
|
+
## Reference for new dataset additions
|
|
88
|
+
|
|
89
|
+
When adding a DatasetSpec, follow this checklist:
|
|
90
|
+
|
|
91
|
+
- [ ] Found article + checked CC license tier (Article layer).
|
|
92
|
+
- [ ] Found supplementary file + checked its license
|
|
93
|
+
(Supplementary layer).
|
|
94
|
+
- [ ] Located the data deposit (figshare / Zenodo / GEO /
|
|
95
|
+
ArrayExpress) + checked its license (Deposit layer).
|
|
96
|
+
- [ ] Checked any cross-walk references for per-row
|
|
97
|
+
license inheritance (Cross-walk layer).
|
|
98
|
+
- [ ] Recorded the audit decision in this file with a
|
|
99
|
+
direct link to the article + the deposit.
|
|
100
|
+
- [ ] If approved, computed the SHA-256 (or marked with the
|
|
101
|
+
appropriate sentinel).
|
|
102
|
+
- [ ] If blocked, added DatasetSpec with
|
|
103
|
+
`SENTINEL_LICENSE_BLOCKED` + `parser=None`.
|
|
104
|
+
|
|
105
|
+
The `phase_d_calibration_matrix.yml` GHA workflow runs the
|
|
106
|
+
download + the SHA verification fresh on every dispatch, so
|
|
107
|
+
audit decisions are revisited rather than cached.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flockbio_bioscoring
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Flock Bio shared scoring + calibration infrastructure for transcript-design sub-pipelines.
|
|
5
|
+
Author: Flock Bio
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Flock Bio
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
NOTE: The MIT license applies to the LIBRARY CODE in this repository. License
|
|
29
|
+
terms for downloaded / bundled DATASETS are tracked separately in
|
|
30
|
+
`LICENSES.md` per the L20 4-layer audit pattern (article + supplementary +
|
|
31
|
+
deposit + cross-walk). Datasets that fail the audit are HARD BLOCKED via the
|
|
32
|
+
SENTINEL_LICENSE_BLOCKED marker in their DatasetSpec entries.
|
|
33
|
+
|
|
34
|
+
Project-URL: Repository, https://github.com/flock-bio/bioscoring-shared-FB
|
|
35
|
+
Project-URL: Documentation, https://github.com/flock-bio/bioscoring-shared-FB/blob/main/docs/INTEGRATION_GUIDE.md
|
|
36
|
+
Project-URL: Changelog, https://github.com/flock-bio/bioscoring-shared-FB/blob/main/CHANGELOG.md
|
|
37
|
+
Keywords: bioinformatics,mRNA,calibration,MPRA,Flock Bio
|
|
38
|
+
Classifier: Development Status :: 3 - Alpha
|
|
39
|
+
Classifier: Intended Audience :: Science/Research
|
|
40
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
41
|
+
Classifier: Programming Language :: Python :: 3
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
44
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
45
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
46
|
+
Requires-Python: >=3.10
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
License-File: LICENSE
|
|
49
|
+
License-File: LICENSES.md
|
|
50
|
+
Requires-Dist: numpy<3,>=1.24
|
|
51
|
+
Requires-Dist: scipy<2,>=1.10
|
|
52
|
+
Provides-Extra: calibration
|
|
53
|
+
Requires-Dist: pandas<3,>=2.0; extra == "calibration"
|
|
54
|
+
Requires-Dist: openpyxl>=3.1; extra == "calibration"
|
|
55
|
+
Provides-Extra: reporting
|
|
56
|
+
Requires-Dist: Jinja2<4,>=3.1; extra == "reporting"
|
|
57
|
+
Requires-Dist: matplotlib<4,>=3.7; extra == "reporting"
|
|
58
|
+
Provides-Extra: dev
|
|
59
|
+
Requires-Dist: pytest<9,>=8.0; extra == "dev"
|
|
60
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
61
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
62
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
63
|
+
Provides-Extra: all
|
|
64
|
+
Requires-Dist: flockbio_bioscoring[calibration,dev,reporting]; extra == "all"
|
|
65
|
+
Dynamic: license-file
|
|
66
|
+
|
|
67
|
+
# bioscoring-shared-FB
|
|
68
|
+
|
|
69
|
+
**Flock Bio shared scoring + calibration infrastructure
|
|
70
|
+
library** (`flockbio_bioscoring`). The 5th repo in the Flock
|
|
71
|
+
Bio transcript-design suite.
|
|
72
|
+
|
|
73
|
+
This library packages the common machinery used by the four
|
|
74
|
+
sub-pipelines + the orchestrator:
|
|
75
|
+
|
|
76
|
+
- DatasetSpec dataclass + sentinel framework for offline
|
|
77
|
+
calibration runs
|
|
78
|
+
- License-aware download helpers with retries + EBI tier-0
|
|
79
|
+
probe
|
|
80
|
+
- Generic FASTA / CSV / GTF parsers
|
|
81
|
+
- Bootstrap LODO + null-model + ship gate framework
|
|
82
|
+
- NNLS calibration helpers
|
|
83
|
+
- HTML / PDF report scaffolding (Jinja2 + matplotlib +
|
|
84
|
+
Chromium)
|
|
85
|
+
|
|
86
|
+
Per the **Rule of Three** (L21 in `docs/CROSS_PIPELINE_LESSONS.md`;
|
|
87
|
+
canonical here since 2026-05-12, moved from codon's `docs/`),
|
|
88
|
+
this library is extracted from the **already-shipping** codon
|
|
89
|
+
+ 5'UTR + 3'UTR pipelines after they paid for the
|
|
90
|
+
infrastructure once each.
|
|
91
|
+
|
|
92
|
+
## Status
|
|
93
|
+
|
|
94
|
+
✅ **v0.4.0 shipped** (Sprint 4 Day 8). All three extraction
|
|
95
|
+
tiers are complete; the library is feature-complete for the
|
|
96
|
+
Sprint 4 scope. Next milestone is **v1.0.0** after the four
|
|
97
|
+
consumer pipelines migrate from their inline copies to the
|
|
98
|
+
shared lib (see `docs/CONSUMER_MIGRATION_PLAN.md` +
|
|
99
|
+
`docs/CONSUMER_MIGRATION_STATUS.md`).
|
|
100
|
+
|
|
101
|
+
| Tier | Sprint 4 days | Modules | Status |
|
|
102
|
+
|---|---|---|---|
|
|
103
|
+
| 1 | Days 1-3 | DatasetSpec + sentinels + download + parsers + cache + EuropePMC tier-0 | ✅ shipped (v0.1.0) |
|
|
104
|
+
| 2 | Days 4-5 | NNLS + ship gate + upload-success-gate + merge_artifacts + artifact_schema | ✅ shipped (v0.2.0) |
|
|
105
|
+
| 3 | Days 6-7 | HTML / PDF report scaffolding (style + figures + chromium_pdf + templates_loader) | ✅ shipped (v0.3.0) |
|
|
106
|
+
|
|
107
|
+
## Strategic position
|
|
108
|
+
|
|
109
|
+
| Repo | Role | Status |
|
|
110
|
+
|---|---|---|
|
|
111
|
+
| `flock-bio/codon_optimization-FB` | Codon optimization | ✅ shipping (v2.8.0) |
|
|
112
|
+
| `flock-bio/5utr_optimization-FB` | 5'UTR design | ✅ shipping (v2.8.24) |
|
|
113
|
+
| `flock-bio/3utr_optimization-FB` | 3'UTR design | ✅ shipping (v0.2.0) |
|
|
114
|
+
| `flock-bio/promoter_design-FB` | Tissue-specific promoter design | 📋 Sprint 8-12 |
|
|
115
|
+
| `flock-bio/bioscoring-shared-FB` (THIS REPO) | Shared library | ✅ v0.4.0 shipped (consumer-harmonized); 4-consumer migration in flight → v1.0.0 |
|
|
116
|
+
| `flock-bio/transcript_design-FB` | Orchestrator | 🟡 Sprint 13-16 (currently v0.4.1) |
|
|
117
|
+
|
|
118
|
+
## Installation
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pip install flockbio_bioscoring # once published
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
For development:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
pip install -e .[dev]
|
|
128
|
+
PYTHONPATH=src python3 -m pytest tests/ -q
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Documentation
|
|
132
|
+
|
|
133
|
+
- `CLAUDE.md` — Claude Code guidance for this repo.
|
|
134
|
+
- `CHANGELOG.md` — chronological history of behavior changes.
|
|
135
|
+
- `docs/INTEGRATION_GUIDE.md` — API surface reference + recipes
|
|
136
|
+
for consuming pipelines to migrate from inline copies to the
|
|
137
|
+
shared library.
|
|
138
|
+
- `docs/CONSUMER_MIGRATION_PLAN.md` — master migration plan
|
|
139
|
+
for the 4-consumer migration (binding order, per-consumer
|
|
140
|
+
effort, Step 0-9 recipe).
|
|
141
|
+
- `docs/CONSUMER_MIGRATION_STATUS.md` — live status tracker
|
|
142
|
+
for the 4-consumer migration.
|
|
143
|
+
- `docs/CALIBRATION_RUNBOOK_TEMPLATE.md` — template for
|
|
144
|
+
consumer-side `CALIBRATION_RUNBOOK.md` (Phase D / OFFLINE
|
|
145
|
+
calibration operator doc; consumers extend with pipeline
|
|
146
|
+
specifics).
|
|
147
|
+
- `docs/CROSS_PIPELINE_LESSONS.md` — canonical strategic doc
|
|
148
|
+
for the 6-repo suite (Strategic North Star + Part I L1-L21
|
|
149
|
+
lessons + Part X per-pipeline kickoff guides + Part XI
|
|
150
|
+
binding I/O contracts). Moved here from codon's `docs/`
|
|
151
|
+
on 2026-05-12 per L21.
|
|
152
|
+
- `docs/CYPHERBIO_PIPELINE_PLAYBOOK.md` +
|
|
153
|
+
`docs/CYPHERBIO_PLAYBOOK_v2.md` — canonical CypherBio
|
|
154
|
+
packaging conventions (moved here from per-consumer copies
|
|
155
|
+
on 2026-05-12 per L21).
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
The library code is licensed under the MIT license (see
|
|
160
|
+
`LICENSE`). License terms for bundled / downloaded datasets
|
|
161
|
+
are tracked separately in `LICENSES.md` per the L20 4-layer
|
|
162
|
+
audit pattern (article + supplementary + deposit + cross-walk).
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# bioscoring-shared-FB
|
|
2
|
+
|
|
3
|
+
**Flock Bio shared scoring + calibration infrastructure
|
|
4
|
+
library** (`flockbio_bioscoring`). The 5th repo in the Flock
|
|
5
|
+
Bio transcript-design suite.
|
|
6
|
+
|
|
7
|
+
This library packages the common machinery used by the four
|
|
8
|
+
sub-pipelines + the orchestrator:
|
|
9
|
+
|
|
10
|
+
- DatasetSpec dataclass + sentinel framework for offline
|
|
11
|
+
calibration runs
|
|
12
|
+
- License-aware download helpers with retries + EBI tier-0
|
|
13
|
+
probe
|
|
14
|
+
- Generic FASTA / CSV / GTF parsers
|
|
15
|
+
- Bootstrap LODO + null-model + ship gate framework
|
|
16
|
+
- NNLS calibration helpers
|
|
17
|
+
- HTML / PDF report scaffolding (Jinja2 + matplotlib +
|
|
18
|
+
Chromium)
|
|
19
|
+
|
|
20
|
+
Per the **Rule of Three** (L21 in `docs/CROSS_PIPELINE_LESSONS.md`;
|
|
21
|
+
canonical here since 2026-05-12, moved from codon's `docs/`),
|
|
22
|
+
this library is extracted from the **already-shipping** codon
|
|
23
|
+
+ 5'UTR + 3'UTR pipelines after they paid for the
|
|
24
|
+
infrastructure once each.
|
|
25
|
+
|
|
26
|
+
## Status
|
|
27
|
+
|
|
28
|
+
✅ **v0.4.0 shipped** (Sprint 4 Day 8). All three extraction
|
|
29
|
+
tiers are complete; the library is feature-complete for the
|
|
30
|
+
Sprint 4 scope. Next milestone is **v1.0.0** after the four
|
|
31
|
+
consumer pipelines migrate from their inline copies to the
|
|
32
|
+
shared lib (see `docs/CONSUMER_MIGRATION_PLAN.md` +
|
|
33
|
+
`docs/CONSUMER_MIGRATION_STATUS.md`).
|
|
34
|
+
|
|
35
|
+
| Tier | Sprint 4 days | Modules | Status |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| 1 | Days 1-3 | DatasetSpec + sentinels + download + parsers + cache + EuropePMC tier-0 | ✅ shipped (v0.1.0) |
|
|
38
|
+
| 2 | Days 4-5 | NNLS + ship gate + upload-success-gate + merge_artifacts + artifact_schema | ✅ shipped (v0.2.0) |
|
|
39
|
+
| 3 | Days 6-7 | HTML / PDF report scaffolding (style + figures + chromium_pdf + templates_loader) | ✅ shipped (v0.3.0) |
|
|
40
|
+
|
|
41
|
+
## Strategic position
|
|
42
|
+
|
|
43
|
+
| Repo | Role | Status |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| `flock-bio/codon_optimization-FB` | Codon optimization | ✅ shipping (v2.8.0) |
|
|
46
|
+
| `flock-bio/5utr_optimization-FB` | 5'UTR design | ✅ shipping (v2.8.24) |
|
|
47
|
+
| `flock-bio/3utr_optimization-FB` | 3'UTR design | ✅ shipping (v0.2.0) |
|
|
48
|
+
| `flock-bio/promoter_design-FB` | Tissue-specific promoter design | 📋 Sprint 8-12 |
|
|
49
|
+
| `flock-bio/bioscoring-shared-FB` (THIS REPO) | Shared library | ✅ v0.4.0 shipped (consumer-harmonized); 4-consumer migration in flight → v1.0.0 |
|
|
50
|
+
| `flock-bio/transcript_design-FB` | Orchestrator | 🟡 Sprint 13-16 (currently v0.4.1) |
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install flockbio_bioscoring # once published
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
For development:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -e .[dev]
|
|
62
|
+
PYTHONPATH=src python3 -m pytest tests/ -q
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Documentation
|
|
66
|
+
|
|
67
|
+
- `CLAUDE.md` — Claude Code guidance for this repo.
|
|
68
|
+
- `CHANGELOG.md` — chronological history of behavior changes.
|
|
69
|
+
- `docs/INTEGRATION_GUIDE.md` — API surface reference + recipes
|
|
70
|
+
for consuming pipelines to migrate from inline copies to the
|
|
71
|
+
shared library.
|
|
72
|
+
- `docs/CONSUMER_MIGRATION_PLAN.md` — master migration plan
|
|
73
|
+
for the 4-consumer migration (binding order, per-consumer
|
|
74
|
+
effort, Step 0-9 recipe).
|
|
75
|
+
- `docs/CONSUMER_MIGRATION_STATUS.md` — live status tracker
|
|
76
|
+
for the 4-consumer migration.
|
|
77
|
+
- `docs/CALIBRATION_RUNBOOK_TEMPLATE.md` — template for
|
|
78
|
+
consumer-side `CALIBRATION_RUNBOOK.md` (Phase D / OFFLINE
|
|
79
|
+
calibration operator doc; consumers extend with pipeline
|
|
80
|
+
specifics).
|
|
81
|
+
- `docs/CROSS_PIPELINE_LESSONS.md` — canonical strategic doc
|
|
82
|
+
for the 6-repo suite (Strategic North Star + Part I L1-L21
|
|
83
|
+
lessons + Part X per-pipeline kickoff guides + Part XI
|
|
84
|
+
binding I/O contracts). Moved here from codon's `docs/`
|
|
85
|
+
on 2026-05-12 per L21.
|
|
86
|
+
- `docs/CYPHERBIO_PIPELINE_PLAYBOOK.md` +
|
|
87
|
+
`docs/CYPHERBIO_PLAYBOOK_v2.md` — canonical CypherBio
|
|
88
|
+
packaging conventions (moved here from per-consumer copies
|
|
89
|
+
on 2026-05-12 per L21).
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
The library code is licensed under the MIT license (see
|
|
94
|
+
`LICENSE`). License terms for bundled / downloaded datasets
|
|
95
|
+
are tracked separately in `LICENSES.md` per the L20 4-layer
|
|
96
|
+
audit pattern (article + supplementary + deposit + cross-walk).
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "flockbio_bioscoring"
|
|
7
|
+
version = "0.4.0"
|
|
8
|
+
description = "Flock Bio shared scoring + calibration infrastructure for transcript-design sub-pipelines."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Flock Bio" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"bioinformatics",
|
|
17
|
+
"mRNA",
|
|
18
|
+
"calibration",
|
|
19
|
+
"MPRA",
|
|
20
|
+
"Flock Bio",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
# Core scientific stack. Pinned conservatively — the shared
|
|
34
|
+
# lib must remain compatible with each consumer's existing
|
|
35
|
+
# version. Loosen only after coordinating across all
|
|
36
|
+
# consumers.
|
|
37
|
+
# numpy + scipy are required by Tier 2 calibration
|
|
38
|
+
# (`fit_nnls_with_prior`, `bootstrap_lodo`,
|
|
39
|
+
# `null_model_baseline`) per Sprint 4 Day 5 implementation.
|
|
40
|
+
"numpy>=1.24,<3",
|
|
41
|
+
"scipy>=1.10,<2",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.optional-dependencies]
|
|
45
|
+
calibration = [
|
|
46
|
+
# Optional calibration extras. Required for the consumer-
|
|
47
|
+
# side Phase D scripts (DataFrame manipulation, xlsx
|
|
48
|
+
# parsing) but NOT for the shared lib's calibration core
|
|
49
|
+
# (NNLS / bootstrap / null model need only the core
|
|
50
|
+
# numpy + scipy deps).
|
|
51
|
+
"pandas>=2.0,<3",
|
|
52
|
+
"openpyxl>=3.1",
|
|
53
|
+
]
|
|
54
|
+
reporting = [
|
|
55
|
+
# Tier 3 reporting framework deps (extracted at Sprint 4
|
|
56
|
+
# Day 6-7).
|
|
57
|
+
"Jinja2>=3.1,<4",
|
|
58
|
+
"matplotlib>=3.7,<4",
|
|
59
|
+
]
|
|
60
|
+
dev = [
|
|
61
|
+
"pytest>=8.0,<9",
|
|
62
|
+
"pytest-cov>=4.1",
|
|
63
|
+
"ruff>=0.4",
|
|
64
|
+
"mypy>=1.8",
|
|
65
|
+
]
|
|
66
|
+
all = [
|
|
67
|
+
"flockbio_bioscoring[calibration,reporting,dev]",
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
[project.urls]
|
|
71
|
+
Repository = "https://github.com/flock-bio/bioscoring-shared-FB"
|
|
72
|
+
Documentation = "https://github.com/flock-bio/bioscoring-shared-FB/blob/main/docs/INTEGRATION_GUIDE.md"
|
|
73
|
+
Changelog = "https://github.com/flock-bio/bioscoring-shared-FB/blob/main/CHANGELOG.md"
|
|
74
|
+
|
|
75
|
+
[tool.setuptools.packages.find]
|
|
76
|
+
where = ["src"]
|
|
77
|
+
include = ["flockbio_bioscoring*"]
|
|
78
|
+
namespaces = false
|
|
79
|
+
|
|
80
|
+
[tool.setuptools.package-data]
|
|
81
|
+
flockbio_bioscoring = [
|
|
82
|
+
"py.typed",
|
|
83
|
+
# Tier 3 reporting framework: ship the Jinja2 base template
|
|
84
|
+
# so consumers can extend it via FileSystemLoader on the
|
|
85
|
+
# shared lib's installed templates dir.
|
|
86
|
+
"reporting/templates/*.html.j2",
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
[tool.pytest.ini_options]
|
|
90
|
+
testpaths = ["tests"]
|
|
91
|
+
python_files = "test_*.py"
|
|
92
|
+
addopts = "-q --tb=short"
|
|
93
|
+
pythonpath = ["src"]
|
|
94
|
+
|
|
95
|
+
[tool.ruff]
|
|
96
|
+
line-length = 100
|
|
97
|
+
target-version = "py310"
|
|
98
|
+
|
|
99
|
+
[tool.mypy]
|
|
100
|
+
python_version = "3.10"
|
|
101
|
+
strict = true
|
|
102
|
+
warn_return_any = true
|
|
103
|
+
warn_unused_configs = true
|
|
104
|
+
files = ["src"]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Flock Bio shared scoring + calibration infrastructure library
|
|
2
|
+
(`flockbio_bioscoring`).
|
|
3
|
+
|
|
4
|
+
The 5th repo in the Flock Bio transcript-design suite. This
|
|
5
|
+
library packages the common machinery used by the four
|
|
6
|
+
sub-pipelines (codon + 5'UTR + 3'UTR + promoter) + the
|
|
7
|
+
orchestrator (`transcript_design-FB`):
|
|
8
|
+
|
|
9
|
+
- DatasetSpec dataclass + sentinel framework for offline
|
|
10
|
+
calibration runs (Tier 1).
|
|
11
|
+
- License-aware download helpers with retries + EBI tier-0
|
|
12
|
+
probe (Tier 1).
|
|
13
|
+
- Generic FASTA / CSV / GTF parsers (Tier 1).
|
|
14
|
+
- Bootstrap LODO + null-model + ship gate framework (Tier 2,
|
|
15
|
+
Sprint 4 Day 4-5).
|
|
16
|
+
- NNLS calibration helpers (Tier 2).
|
|
17
|
+
- HTML / PDF report scaffolding (Tier 3, Sprint 4 Day 6-7).
|
|
18
|
+
|
|
19
|
+
Per the **Rule of Three** (L21), the library is extracted from
|
|
20
|
+
the already-shipping codon + 5'UTR + 3'UTR pipelines after
|
|
21
|
+
they each paid for the infrastructure once. The codon
|
|
22
|
+
pipeline's v1.0.1 byte-identical baseline is the hardest
|
|
23
|
+
invariant to preserve during migration.
|
|
24
|
+
|
|
25
|
+
Versioning per Part X.5.6:
|
|
26
|
+
|
|
27
|
+
- **Patch (0.x.Y)**: bug fixes, no API changes.
|
|
28
|
+
- **Minor (0.X.0)**: new APIs added, no removals.
|
|
29
|
+
- **Major (X.0.0)**: breaking changes; coordinated release
|
|
30
|
+
across all consumers; orchestrator bumps in lockstep.
|
|
31
|
+
|
|
32
|
+
Authoritative integration sources:
|
|
33
|
+
|
|
34
|
+
- `docs/INTEGRATION_GUIDE.md` — consumer-side migration recipes.
|
|
35
|
+
- `docs/CROSS_PIPELINE_LESSONS.md` — strategic doc for the
|
|
36
|
+
6-repo suite (canonical here since 2026-05-12, moved from
|
|
37
|
+
codon's `docs/` per L21).
|
|
38
|
+
- Each consumer's `CLAUDE.md` for repo-specific conventions.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
__version__ = "0.4.0"
|
|
44
|
+
|
|
45
|
+
# The shared lib's public schema version. Independent of
|
|
46
|
+
# __version__: bumped only when the public API contracts
|
|
47
|
+
# (DatasetSpec field names, sentinel string values, parser
|
|
48
|
+
# return shapes) change in breaking ways. Per Part XI.8:
|
|
49
|
+
# patch bumps for backwards-compatible additions; MAJOR bumps
|
|
50
|
+
# for breaking changes.
|
|
51
|
+
__schema_version__ = "1.0"
|
|
52
|
+
|
|
53
|
+
# Public re-exports — keep in sync with the per-module
|
|
54
|
+
# __all__ declarations + the INTEGRATION_GUIDE.md API surface.
|
|
55
|
+
from flockbio_bioscoring.cache import (
|
|
56
|
+
cache_size,
|
|
57
|
+
clear_cache_for_testing,
|
|
58
|
+
get_or_parse,
|
|
59
|
+
)
|
|
60
|
+
from flockbio_bioscoring.dataset_spec import (
|
|
61
|
+
DatasetSpec,
|
|
62
|
+
DatasetSpecValidationError,
|
|
63
|
+
DownloadStatus,
|
|
64
|
+
SentinelStatus,
|
|
65
|
+
)
|
|
66
|
+
from flockbio_bioscoring.europepmc import (
|
|
67
|
+
EuropePmcMirror,
|
|
68
|
+
get_mirror,
|
|
69
|
+
n_registered_mirrors,
|
|
70
|
+
probe_head,
|
|
71
|
+
register_mirror,
|
|
72
|
+
)
|
|
73
|
+
from flockbio_bioscoring.sentinels import (
|
|
74
|
+
SENTINEL_DEFERRED_BOT_DETECTION,
|
|
75
|
+
SENTINEL_GHA_DISPATCH_ONLY,
|
|
76
|
+
SENTINEL_LICENSE_BLOCKED,
|
|
77
|
+
SENTINEL_VERIFIED_IN_PARSER,
|
|
78
|
+
SHA256_SENTINELS,
|
|
79
|
+
is_sentinel_sha256,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
__all__ = [
|
|
83
|
+
"__version__",
|
|
84
|
+
"__schema_version__",
|
|
85
|
+
"DatasetSpec",
|
|
86
|
+
"DatasetSpecValidationError",
|
|
87
|
+
"DownloadStatus",
|
|
88
|
+
"SentinelStatus",
|
|
89
|
+
"SENTINEL_DEFERRED_BOT_DETECTION",
|
|
90
|
+
"SENTINEL_GHA_DISPATCH_ONLY",
|
|
91
|
+
"SENTINEL_LICENSE_BLOCKED",
|
|
92
|
+
"SENTINEL_VERIFIED_IN_PARSER",
|
|
93
|
+
"SHA256_SENTINELS",
|
|
94
|
+
"is_sentinel_sha256",
|
|
95
|
+
"EuropePmcMirror",
|
|
96
|
+
"register_mirror",
|
|
97
|
+
"get_mirror",
|
|
98
|
+
"n_registered_mirrors",
|
|
99
|
+
"probe_head",
|
|
100
|
+
"get_or_parse",
|
|
101
|
+
"cache_size",
|
|
102
|
+
"clear_cache_for_testing",
|
|
103
|
+
]
|