flockbio-bioscoring 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. flockbio_bioscoring-0.4.0/LICENSE +27 -0
  2. flockbio_bioscoring-0.4.0/LICENSES.md +107 -0
  3. flockbio_bioscoring-0.4.0/PKG-INFO +162 -0
  4. flockbio_bioscoring-0.4.0/README.md +96 -0
  5. flockbio_bioscoring-0.4.0/pyproject.toml +104 -0
  6. flockbio_bioscoring-0.4.0/setup.cfg +4 -0
  7. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/__init__.py +103 -0
  8. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/cache.py +116 -0
  9. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/__init__.py +68 -0
  10. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/artifact_schema.py +216 -0
  11. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/merge_artifacts.py +262 -0
  12. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/nnls.py +312 -0
  13. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/ship_gate.py +493 -0
  14. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/calibration/upload_gate.py +102 -0
  15. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/dataset_spec.py +280 -0
  16. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/download.py +311 -0
  17. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/europepmc.py +131 -0
  18. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/__init__.py +44 -0
  19. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/csv_tsv.py +125 -0
  20. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/fasta.py +111 -0
  21. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/parsers/gtf.py +116 -0
  22. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/py.typed +0 -0
  23. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/__init__.py +86 -0
  24. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/chromium_pdf.py +160 -0
  25. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/figures.py +129 -0
  26. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/style.py +208 -0
  27. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/templates/base.html.j2 +127 -0
  28. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/reporting/templates_loader.py +123 -0
  29. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring/sentinels.py +95 -0
  30. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/PKG-INFO +162 -0
  31. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/SOURCES.txt +41 -0
  32. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/dependency_links.txt +1 -0
  33. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/requires.txt +19 -0
  34. flockbio_bioscoring-0.4.0/src/flockbio_bioscoring.egg-info/top_level.txt +1 -0
  35. flockbio_bioscoring-0.4.0/tests/test_cache.py +173 -0
  36. flockbio_bioscoring-0.4.0/tests/test_calibration.py +657 -0
  37. flockbio_bioscoring-0.4.0/tests/test_dataset_spec.py +304 -0
  38. flockbio_bioscoring-0.4.0/tests/test_download.py +201 -0
  39. flockbio_bioscoring-0.4.0/tests/test_europepmc.py +133 -0
  40. flockbio_bioscoring-0.4.0/tests/test_merge_artifacts.py +619 -0
  41. flockbio_bioscoring-0.4.0/tests/test_parsers.py +198 -0
  42. flockbio_bioscoring-0.4.0/tests/test_reporting.py +486 -0
  43. flockbio_bioscoring-0.4.0/tests/test_sentinels.py +140 -0
@@ -0,0 +1,27 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Flock Bio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ NOTE: The MIT license applies to the LIBRARY CODE in this repository. License
24
+ terms for downloaded / bundled DATASETS are tracked separately in
25
+ `LICENSES.md` per the L20 4-layer audit pattern (article + supplementary +
26
+ deposit + cross-walk). Datasets that fail the audit are HARD BLOCKED via the
27
+ SENTINEL_LICENSE_BLOCKED marker in their DatasetSpec entries.
@@ -0,0 +1,107 @@
1
+ # LICENSES
2
+
3
+ This file tracks licenses for **datasets** that the
4
+ `flockbio_bioscoring` library will eventually download or
5
+ bundle. The library code itself is MIT-licensed (see
6
+ `LICENSE`).
7
+
8
+ ## License audit framework (per L20 + Part IX.7)
9
+
10
+ Every dataset registered as a `DatasetSpec` carries a
11
+ `license` field + a `license_audit_note` that documents the
12
+ **4-layer license audit**:
13
+
14
+ 1. **Article layer**: the journal article's license (CC BY,
15
+ CC BY-NC, CC BY-NC-ND, etc.). HARD BLOCKED if NC or ND
16
+ per L5.
17
+ 2. **Supplementary layer**: the supplementary file's license.
18
+ May differ from the article (some publishers grant CC BY
19
+ to the article but retain commercial rights for
20
+ supplementary data).
21
+ 3. **Deposit layer**: the data deposit's license (figshare,
22
+ Zenodo, GEO, ArrayExpress, etc.). May further restrict.
23
+ 4. **Cross-walk layer**: the per-row license check for
24
+ datasets that include multi-source data (e.g., an MPRA
25
+ library that lists sequences from licensed source X and
26
+ sequences from licensed source Y — each row inherits its
27
+ source's license).
28
+
29
+ Only datasets that pass ALL FOUR layers are added to a
30
+ DatasetSpec with a real (non-`SENTINEL_LICENSE_BLOCKED`)
31
+ `expected_sha256`.
32
+
33
+ ## Sentinel marker for blocked datasets
34
+
35
+ Datasets that fail any layer of the audit are recorded as
36
+ `DatasetSpec` entries with
37
+ `expected_sha256=SENTINEL_LICENSE_BLOCKED` + `parser=None`.
38
+ This documents the audit decision in the same registry as the
39
+ allowed datasets (forensic traceability) without any risk of
40
+ accidentally downloading the blocked data.
41
+
42
+ Example (from the codon repo's known-blocked list):
43
+
44
+ ```python
45
+ DatasetSpec(
46
+ name="mauger2019_human_5utr_mpra",
47
+ url="https://www.pnas.org/doi/10.1073/pnas.1909563116",
48
+ expected_sha256=SENTINEL_LICENSE_BLOCKED,
49
+ license="CC BY-NC 4.0",
50
+ license_audit_note=(
51
+ "PNAS 116:24075. Article published with CC BY-NC. "
52
+ "HARD BLOCKED per L5; not downloaded, not parsed. "
53
+ "Useful as a how-we-would-love-to-use-this footnote."
54
+ ),
55
+ parser=None,
56
+ )
57
+ ```
58
+
59
+ ## Per-dataset audit log
60
+
61
+ When the Tier 1 extraction lands real datasets at Sprint 4
62
+ Day 3, this section will be populated with one entry per
63
+ DatasetSpec.
64
+
65
+ The Sprint 4 Day 1 scaffold (this commit) does NOT include
66
+ any real dataset entries — those migrate over from
67
+ codon / 5utr / 3utr's existing audit ledgers in subsequent
68
+ commits.
69
+
70
+ ## Known HARD BLOCKED datasets (inherited from consumers)
71
+
72
+ These are blocked for ALL consumers + the orchestrator. Per
73
+ L5 + L20, NC- or ND-licensed datasets are HARD BLOCKED:
74
+
75
+ - **Mauger 2019 (PNAS 116:24075)**: CC BY-NC. 5'UTR MPRA.
76
+ - **Tani 2012 (Genome Res 22:947)**: CC BY-NC. mRNA half-life.
77
+ - **Schwanhäusser 2011 (Nature 473:337)**: CC BY-NC. mRNA
78
+ half-life + protein abundance.
79
+ - **Ginkgo 180k 3'UTR MPRA**: corresponding author no
80
+ longer at Ginkgo (2026-05-02 inquiry); license unknown.
81
+ Treated as blocked until license confirmed.
82
+ - **TargetScan pre-computed scores**: license uncertain
83
+ (CC BY-NC suspected but not confirmed); treated as blocked
84
+ until L20 audit completes. We CAN recompute seed-match
85
+ counts from miRBase (CC BY 4.0).
86
+
87
+ ## Reference for new dataset additions
88
+
89
+ When adding a DatasetSpec, follow this checklist:
90
+
91
+ - [ ] Found article + checked CC license tier (Article layer).
92
+ - [ ] Found supplementary file + checked its license
93
+ (Supplementary layer).
94
+ - [ ] Located the data deposit (figshare / Zenodo / GEO /
95
+ ArrayExpress) + checked its license (Deposit layer).
96
+ - [ ] Checked any cross-walk references for per-row
97
+ license inheritance (Cross-walk layer).
98
+ - [ ] Recorded the audit decision in this file with a
99
+ direct link to the article + the deposit.
100
+ - [ ] If approved, computed the SHA-256 (or marked with the
101
+ appropriate sentinel).
102
+ - [ ] If blocked, added DatasetSpec with
103
+ `SENTINEL_LICENSE_BLOCKED` + `parser=None`.
104
+
105
+ The `phase_d_calibration_matrix.yml` GHA workflow runs the
106
+ download + the SHA verification fresh on every dispatch, so
107
+ audit decisions are revisited rather than cached.
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: flockbio_bioscoring
3
+ Version: 0.4.0
4
+ Summary: Flock Bio shared scoring + calibration infrastructure for transcript-design sub-pipelines.
5
+ Author: Flock Bio
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Flock Bio
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ NOTE: The MIT license applies to the LIBRARY CODE in this repository. License
29
+ terms for downloaded / bundled DATASETS are tracked separately in
30
+ `LICENSES.md` per the L20 4-layer audit pattern (article + supplementary +
31
+ deposit + cross-walk). Datasets that fail the audit are HARD BLOCKED via the
32
+ SENTINEL_LICENSE_BLOCKED marker in their DatasetSpec entries.
33
+
34
+ Project-URL: Repository, https://github.com/flock-bio/bioscoring-shared-FB
35
+ Project-URL: Documentation, https://github.com/flock-bio/bioscoring-shared-FB/blob/main/docs/INTEGRATION_GUIDE.md
36
+ Project-URL: Changelog, https://github.com/flock-bio/bioscoring-shared-FB/blob/main/CHANGELOG.md
37
+ Keywords: bioinformatics,mRNA,calibration,MPRA,Flock Bio
38
+ Classifier: Development Status :: 3 - Alpha
39
+ Classifier: Intended Audience :: Science/Research
40
+ Classifier: License :: OSI Approved :: MIT License
41
+ Classifier: Programming Language :: Python :: 3
42
+ Classifier: Programming Language :: Python :: 3.10
43
+ Classifier: Programming Language :: Python :: 3.11
44
+ Classifier: Programming Language :: Python :: 3.12
45
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
46
+ Requires-Python: >=3.10
47
+ Description-Content-Type: text/markdown
48
+ License-File: LICENSE
49
+ License-File: LICENSES.md
50
+ Requires-Dist: numpy<3,>=1.24
51
+ Requires-Dist: scipy<2,>=1.10
52
+ Provides-Extra: calibration
53
+ Requires-Dist: pandas<3,>=2.0; extra == "calibration"
54
+ Requires-Dist: openpyxl>=3.1; extra == "calibration"
55
+ Provides-Extra: reporting
56
+ Requires-Dist: Jinja2<4,>=3.1; extra == "reporting"
57
+ Requires-Dist: matplotlib<4,>=3.7; extra == "reporting"
58
+ Provides-Extra: dev
59
+ Requires-Dist: pytest<9,>=8.0; extra == "dev"
60
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
61
+ Requires-Dist: ruff>=0.4; extra == "dev"
62
+ Requires-Dist: mypy>=1.8; extra == "dev"
63
+ Provides-Extra: all
64
+ Requires-Dist: flockbio_bioscoring[calibration,dev,reporting]; extra == "all"
65
+ Dynamic: license-file
66
+
67
+ # bioscoring-shared-FB
68
+
69
+ **Flock Bio shared scoring + calibration infrastructure
70
+ library** (`flockbio_bioscoring`). The 5th repo in the Flock
71
+ Bio transcript-design suite.
72
+
73
+ This library packages the common machinery used by the four
74
+ sub-pipelines + the orchestrator:
75
+
76
+ - DatasetSpec dataclass + sentinel framework for offline
77
+ calibration runs
78
+ - License-aware download helpers with retries + EBI tier-0
79
+ probe
80
+ - Generic FASTA / CSV / GTF parsers
81
+ - Bootstrap LODO + null-model + ship gate framework
82
+ - NNLS calibration helpers
83
+ - HTML / PDF report scaffolding (Jinja2 + matplotlib +
84
+ Chromium)
85
+
86
+ Per the **Rule of Three** (L21 in `docs/CROSS_PIPELINE_LESSONS.md`;
87
+ canonical here since 2026-05-12, moved from codon's `docs/`),
88
+ this library is extracted from the **already-shipping** codon
89
+ + 5'UTR + 3'UTR pipelines after they paid for the
90
+ infrastructure once each.
91
+
92
+ ## Status
93
+
94
+ ✅ **v0.4.0 shipped** (Sprint 4 Day 8). All three extraction
95
+ tiers are complete; the library is feature-complete for the
96
+ Sprint 4 scope. Next milestone is **v1.0.0** after the four
97
+ consumer pipelines migrate from their inline copies to the
98
+ shared lib (see `docs/CONSUMER_MIGRATION_PLAN.md` +
99
+ `docs/CONSUMER_MIGRATION_STATUS.md`).
100
+
101
+ | Tier | Sprint 4 days | Modules | Status |
102
+ |---|---|---|---|
103
+ | 1 | Days 1-3 | DatasetSpec + sentinels + download + parsers + cache + EuropePMC tier-0 | ✅ shipped (v0.1.0) |
104
+ | 2 | Days 4-5 | NNLS + ship gate + upload-success-gate + merge_artifacts + artifact_schema | ✅ shipped (v0.2.0) |
105
+ | 3 | Days 6-7 | HTML / PDF report scaffolding (style + figures + chromium_pdf + templates_loader) | ✅ shipped (v0.3.0) |
106
+
107
+ ## Strategic position
108
+
109
+ | Repo | Role | Status |
110
+ |---|---|---|
111
+ | `flock-bio/codon_optimization-FB` | Codon optimization | ✅ shipping (v2.8.0) |
112
+ | `flock-bio/5utr_optimization-FB` | 5'UTR design | ✅ shipping (v2.8.24) |
113
+ | `flock-bio/3utr_optimization-FB` | 3'UTR design | ✅ shipping (v0.2.0) |
114
+ | `flock-bio/promoter_design-FB` | Tissue-specific promoter design | 📋 Sprint 8-12 |
115
+ | `flock-bio/bioscoring-shared-FB` (THIS REPO) | Shared library | ✅ v0.4.0 shipped (consumer-harmonized); 4-consumer migration in flight → v1.0.0 |
116
+ | `flock-bio/transcript_design-FB` | Orchestrator | 🟡 Sprint 13-16 (currently v0.4.1) |
117
+
118
+ ## Installation
119
+
120
+ ```bash
121
+ pip install flockbio_bioscoring # once published
122
+ ```
123
+
124
+ For development:
125
+
126
+ ```bash
127
+ pip install -e .[dev]
128
+ PYTHONPATH=src python3 -m pytest tests/ -q
129
+ ```
130
+
131
+ ## Documentation
132
+
133
+ - `CLAUDE.md` — Claude Code guidance for this repo.
134
+ - `CHANGELOG.md` — chronological history of behavior changes.
135
+ - `docs/INTEGRATION_GUIDE.md` — API surface reference + recipes
136
+ for consuming pipelines to migrate from inline copies to the
137
+ shared library.
138
+ - `docs/CONSUMER_MIGRATION_PLAN.md` — master migration plan
139
+ for the 4-consumer migration (binding order, per-consumer
140
+ effort, Step 0-9 recipe).
141
+ - `docs/CONSUMER_MIGRATION_STATUS.md` — live status tracker
142
+ for the 4-consumer migration.
143
+ - `docs/CALIBRATION_RUNBOOK_TEMPLATE.md` — template for
144
+ consumer-side `CALIBRATION_RUNBOOK.md` (Phase D / OFFLINE
145
+ calibration operator doc; consumers extend with pipeline
146
+ specifics).
147
+ - `docs/CROSS_PIPELINE_LESSONS.md` — canonical strategic doc
148
+ for the 6-repo suite (Strategic North Star + Part I L1-L21
149
+ lessons + Part X per-pipeline kickoff guides + Part XI
150
+ binding I/O contracts). Moved here from codon's `docs/`
151
+ on 2026-05-12 per L21.
152
+ - `docs/CYPHERBIO_PIPELINE_PLAYBOOK.md` +
153
+ `docs/CYPHERBIO_PLAYBOOK_v2.md` — canonical CypherBio
154
+ packaging conventions (moved here from per-consumer copies
155
+ on 2026-05-12 per L21).
156
+
157
+ ## License
158
+
159
+ The library code is licensed under the MIT license (see
160
+ `LICENSE`). License terms for bundled / downloaded datasets
161
+ are tracked separately in `LICENSES.md` per the L20 4-layer
162
+ audit pattern (article + supplementary + deposit + cross-walk).
@@ -0,0 +1,96 @@
1
+ # bioscoring-shared-FB
2
+
3
+ **Flock Bio shared scoring + calibration infrastructure
4
+ library** (`flockbio_bioscoring`). The 5th repo in the Flock
5
+ Bio transcript-design suite.
6
+
7
+ This library packages the common machinery used by the four
8
+ sub-pipelines + the orchestrator:
9
+
10
+ - DatasetSpec dataclass + sentinel framework for offline
11
+ calibration runs
12
+ - License-aware download helpers with retries + EBI tier-0
13
+ probe
14
+ - Generic FASTA / CSV / GTF parsers
15
+ - Bootstrap LODO + null-model + ship gate framework
16
+ - NNLS calibration helpers
17
+ - HTML / PDF report scaffolding (Jinja2 + matplotlib +
18
+ Chromium)
19
+
20
+ Per the **Rule of Three** (L21 in `docs/CROSS_PIPELINE_LESSONS.md`;
21
+ canonical here since 2026-05-12, moved from codon's `docs/`),
22
+ this library is extracted from the **already-shipping** codon
23
+ + 5'UTR + 3'UTR pipelines after they paid for the
24
+ infrastructure once each.
25
+
26
+ ## Status
27
+
28
+ ✅ **v0.4.0 shipped** (Sprint 4 Day 8). All three extraction
29
+ tiers are complete; the library is feature-complete for the
30
+ Sprint 4 scope. Next milestone is **v1.0.0** after the four
31
+ consumer pipelines migrate from their inline copies to the
32
+ shared lib (see `docs/CONSUMER_MIGRATION_PLAN.md` +
33
+ `docs/CONSUMER_MIGRATION_STATUS.md`).
34
+
35
+ | Tier | Sprint 4 days | Modules | Status |
36
+ |---|---|---|---|
37
+ | 1 | Days 1-3 | DatasetSpec + sentinels + download + parsers + cache + EuropePMC tier-0 | ✅ shipped (v0.1.0) |
38
+ | 2 | Days 4-5 | NNLS + ship gate + upload-success-gate + merge_artifacts + artifact_schema | ✅ shipped (v0.2.0) |
39
+ | 3 | Days 6-7 | HTML / PDF report scaffolding (style + figures + chromium_pdf + templates_loader) | ✅ shipped (v0.3.0) |
40
+
41
+ ## Strategic position
42
+
43
+ | Repo | Role | Status |
44
+ |---|---|---|
45
+ | `flock-bio/codon_optimization-FB` | Codon optimization | ✅ shipping (v2.8.0) |
46
+ | `flock-bio/5utr_optimization-FB` | 5'UTR design | ✅ shipping (v2.8.24) |
47
+ | `flock-bio/3utr_optimization-FB` | 3'UTR design | ✅ shipping (v0.2.0) |
48
+ | `flock-bio/promoter_design-FB` | Tissue-specific promoter design | 📋 Sprint 8-12 |
49
+ | `flock-bio/bioscoring-shared-FB` (THIS REPO) | Shared library | ✅ v0.4.0 shipped (consumer-harmonized); 4-consumer migration in flight → v1.0.0 |
50
+ | `flock-bio/transcript_design-FB` | Orchestrator | 🟡 Sprint 13-16 (currently v0.4.1) |
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install flockbio_bioscoring # once published
56
+ ```
57
+
58
+ For development:
59
+
60
+ ```bash
61
+ pip install -e .[dev]
62
+ PYTHONPATH=src python3 -m pytest tests/ -q
63
+ ```
64
+
65
+ ## Documentation
66
+
67
+ - `CLAUDE.md` — Claude Code guidance for this repo.
68
+ - `CHANGELOG.md` — chronological history of behavior changes.
69
+ - `docs/INTEGRATION_GUIDE.md` — API surface reference + recipes
70
+ for consuming pipelines to migrate from inline copies to the
71
+ shared library.
72
+ - `docs/CONSUMER_MIGRATION_PLAN.md` — master migration plan
73
+ for the 4-consumer migration (binding order, per-consumer
74
+ effort, Step 0-9 recipe).
75
+ - `docs/CONSUMER_MIGRATION_STATUS.md` — live status tracker
76
+ for the 4-consumer migration.
77
+ - `docs/CALIBRATION_RUNBOOK_TEMPLATE.md` — template for
78
+ consumer-side `CALIBRATION_RUNBOOK.md` (Phase D / OFFLINE
79
+ calibration operator doc; consumers extend with pipeline
80
+ specifics).
81
+ - `docs/CROSS_PIPELINE_LESSONS.md` — canonical strategic doc
82
+ for the 6-repo suite (Strategic North Star + Part I L1-L21
83
+ lessons + Part X per-pipeline kickoff guides + Part XI
84
+ binding I/O contracts). Moved here from codon's `docs/`
85
+ on 2026-05-12 per L21.
86
+ - `docs/CYPHERBIO_PIPELINE_PLAYBOOK.md` +
87
+ `docs/CYPHERBIO_PLAYBOOK_v2.md` — canonical CypherBio
88
+ packaging conventions (moved here from per-consumer copies
89
+ on 2026-05-12 per L21).
90
+
91
+ ## License
92
+
93
+ The library code is licensed under the MIT license (see
94
+ `LICENSE`). License terms for bundled / downloaded datasets
95
+ are tracked separately in `LICENSES.md` per the L20 4-layer
96
+ audit pattern (article + supplementary + deposit + cross-walk).
@@ -0,0 +1,104 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "flockbio_bioscoring"
7
+ version = "0.4.0"
8
+ description = "Flock Bio shared scoring + calibration infrastructure for transcript-design sub-pipelines."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { file = "LICENSE" }
12
+ authors = [
13
+ { name = "Flock Bio" },
14
+ ]
15
+ keywords = [
16
+ "bioinformatics",
17
+ "mRNA",
18
+ "calibration",
19
+ "MPRA",
20
+ "Flock Bio",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
31
+ ]
32
+ dependencies = [
33
+ # Core scientific stack. Pinned conservatively — the shared
34
+ # lib must remain compatible with each consumer's existing
35
+ # version. Loosen only after coordinating across all 3
36
+ # consumers.
37
+ # numpy + scipy are required by Tier 2 calibration
38
+ # (`fit_nnls_with_prior`, `bootstrap_lodo`,
39
+ # `null_model_baseline`) per Sprint 4 Day 5 implementation.
40
+ "numpy>=1.24,<3",
41
+ "scipy>=1.10,<2",
42
+ ]
43
+
44
+ [project.optional-dependencies]
45
+ calibration = [
46
+ # Optional calibration extras. Required for the consumer-
47
+ # side Phase D scripts (DataFrame manipulation, xlsx
48
+ # parsing) but NOT for the shared lib's calibration core
49
+ # (NNLS / bootstrap / null model need only the core
50
+ # numpy + scipy deps).
51
+ "pandas>=2.0,<3",
52
+ "openpyxl>=3.1",
53
+ ]
54
+ reporting = [
55
+ # Tier 3 reporting framework deps (extracted at Sprint 4
56
+ # Day 6-7).
57
+ "Jinja2>=3.1,<4",
58
+ "matplotlib>=3.7,<4",
59
+ ]
60
+ dev = [
61
+ "pytest>=8.0,<9",
62
+ "pytest-cov>=4.1",
63
+ "ruff>=0.4",
64
+ "mypy>=1.8",
65
+ ]
66
+ all = [
67
+ "flockbio_bioscoring[calibration,reporting,dev]",
68
+ ]
69
+
70
+ [project.urls]
71
+ Repository = "https://github.com/flock-bio/bioscoring-shared-FB"
72
+ Documentation = "https://github.com/flock-bio/bioscoring-shared-FB/blob/main/docs/INTEGRATION_GUIDE.md"
73
+ Changelog = "https://github.com/flock-bio/bioscoring-shared-FB/blob/main/CHANGELOG.md"
74
+
75
+ [tool.setuptools.packages.find]
76
+ where = ["src"]
77
+ include = ["flockbio_bioscoring*"]
78
+ namespaces = false
79
+
80
+ [tool.setuptools.package-data]
81
+ flockbio_bioscoring = [
82
+ "py.typed",
83
+ # Tier 3 reporting framework: ship the Jinja2 base template
84
+ # so consumers can extend it via FileSystemLoader on the
85
+ # shared lib's installed templates dir.
86
+ "reporting/templates/*.html.j2",
87
+ ]
88
+
89
+ [tool.pytest.ini_options]
90
+ testpaths = ["tests"]
91
+ python_files = "test_*.py"
92
+ addopts = "-q --tb=short"
93
+ pythonpath = ["src"]
94
+
95
+ [tool.ruff]
96
+ line-length = 100
97
+ target-version = "py310"
98
+
99
+ [tool.mypy]
100
+ python_version = "3.10"
101
+ strict = true
102
+ warn_return_any = true
103
+ warn_unused_configs = true
104
+ files = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,103 @@
1
+ """Flock Bio shared scoring + calibration infrastructure library
2
+ (`flockbio_bioscoring`).
3
+
4
+ The 5th repo in the Flock Bio transcript-design suite. This
5
+ library packages the common machinery used by the four
6
+ sub-pipelines (codon + 5'UTR + 3'UTR + promoter) + the
7
+ orchestrator (`transcript_design-FB`):
8
+
9
+ - DatasetSpec dataclass + sentinel framework for offline
10
+ calibration runs (Tier 1).
11
+ - License-aware download helpers with retries + EBI tier-0
12
+ probe (Tier 1).
13
+ - Generic FASTA / CSV / GTF parsers (Tier 1).
14
+ - Bootstrap LODO + null-model + ship gate framework (Tier 2,
15
+ Sprint 4 Day 4-5).
16
+ - NNLS calibration helpers (Tier 2).
17
+ - HTML / PDF report scaffolding (Tier 3, Sprint 4 Day 6-7).
18
+
19
+ Per the **Rule of Three** (L21), the library is extracted from
20
+ the already-shipping codon + 5'UTR + 3'UTR pipelines after
21
+ they each paid for the infrastructure once. The codon
22
+ pipeline's v1.0.1 byte-identical baseline is the hardest
23
+ invariant to preserve during migration.
24
+
25
+ Versioning per Part X.5.6:
26
+
27
+ - **Patch (0.x.Y)**: bug fixes, no API changes.
28
+ - **Minor (0.X.0)**: new APIs added, no removals.
29
+ - **Major (X.0.0)**: breaking changes; coordinated release
30
+ across all consumers; orchestrator bumps in lockstep.
31
+
32
+ Authoritative integration sources:
33
+
34
+ - `docs/INTEGRATION_GUIDE.md` — consumer-side migration recipes.
35
+ - `docs/CROSS_PIPELINE_LESSONS.md` — strategic doc for the
36
+ 6-repo suite (canonical here since 2026-05-12, moved from
37
+ codon's `docs/` per L21).
38
+ - Each consumer's `CLAUDE.md` for repo-specific conventions.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ __version__ = "0.4.0"
44
+
45
+ # The shared lib's public schema version. Independent of
46
+ # __version__: bumped only when the public API contracts
47
+ # (DatasetSpec field names, sentinel string values, parser
48
+ # return shapes) change in breaking ways. Per Part XI.8:
49
+ # patch bumps for backwards-compatible additions; MAJOR bumps
50
+ # for breaking changes.
51
+ __schema_version__ = "1.0"
52
+
53
+ # Public re-exports — keep in sync with the per-module
54
+ # __all__ declarations + the INTEGRATION_GUIDE.md API surface.
55
+ from flockbio_bioscoring.cache import (
56
+ cache_size,
57
+ clear_cache_for_testing,
58
+ get_or_parse,
59
+ )
60
+ from flockbio_bioscoring.dataset_spec import (
61
+ DatasetSpec,
62
+ DatasetSpecValidationError,
63
+ DownloadStatus,
64
+ SentinelStatus,
65
+ )
66
+ from flockbio_bioscoring.europepmc import (
67
+ EuropePmcMirror,
68
+ get_mirror,
69
+ n_registered_mirrors,
70
+ probe_head,
71
+ register_mirror,
72
+ )
73
+ from flockbio_bioscoring.sentinels import (
74
+ SENTINEL_DEFERRED_BOT_DETECTION,
75
+ SENTINEL_GHA_DISPATCH_ONLY,
76
+ SENTINEL_LICENSE_BLOCKED,
77
+ SENTINEL_VERIFIED_IN_PARSER,
78
+ SHA256_SENTINELS,
79
+ is_sentinel_sha256,
80
+ )
81
+
82
+ __all__ = [
83
+ "__version__",
84
+ "__schema_version__",
85
+ "DatasetSpec",
86
+ "DatasetSpecValidationError",
87
+ "DownloadStatus",
88
+ "SentinelStatus",
89
+ "SENTINEL_DEFERRED_BOT_DETECTION",
90
+ "SENTINEL_GHA_DISPATCH_ONLY",
91
+ "SENTINEL_LICENSE_BLOCKED",
92
+ "SENTINEL_VERIFIED_IN_PARSER",
93
+ "SHA256_SENTINELS",
94
+ "is_sentinel_sha256",
95
+ "EuropePmcMirror",
96
+ "register_mirror",
97
+ "get_mirror",
98
+ "n_registered_mirrors",
99
+ "probe_head",
100
+ "get_or_parse",
101
+ "cache_size",
102
+ "clear_cache_for_testing",
103
+ ]