ct-validation 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ct_validation-0.1.0/LICENSE +21 -0
- ct_validation-0.1.0/PKG-INFO +240 -0
- ct_validation-0.1.0/README.md +195 -0
- ct_validation-0.1.0/pyproject.toml +77 -0
- ct_validation-0.1.0/src/ct_validation/__init__.py +35 -0
- ct_validation-0.1.0/src/ct_validation/api.py +268 -0
- ct_validation-0.1.0/src/ct_validation/cli.py +188 -0
- ct_validation-0.1.0/src/ct_validation/config/__init__.py +21 -0
- ct_validation-0.1.0/src/ct_validation/config/loader.py +41 -0
- ct_validation-0.1.0/src/ct_validation/config/schema.py +59 -0
- ct_validation-0.1.0/src/ct_validation/data/__init__.py +10 -0
- ct_validation-0.1.0/src/ct_validation/data/schema.py +33 -0
- ct_validation-0.1.0/src/ct_validation/mcp_server.py +153 -0
- ct_validation-0.1.0/src/ct_validation/plotting.py +110 -0
- ct_validation-0.1.0/src/ct_validation/validation/__init__.py +14 -0
- ct_validation-0.1.0/src/ct_validation/validation/enrichment.py +170 -0
- ct_validation-0.1.0/src/ct_validation/validation/matching.py +112 -0
- ct_validation-0.1.0/src/ct_validation/validation/statistics.py +115 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Klim Kostiuk
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ct-validation
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Benchmarking gene-indication evidence against clinical trial outcomes
|
|
5
|
+
Keywords: clinical-trials,target-validation,drug-discovery,genetics,enrichment
|
|
6
|
+
Author: Klim Kostiuk, Daniel Igumnov, Peter Fedichev, Amir Feizi
|
|
7
|
+
Author-email: Klim Kostiuk <2601074@gmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Dist: duckdb>=1.4.3
|
|
18
|
+
Requires-Dist: numpy>=2.3.3
|
|
19
|
+
Requires-Dist: pandas>=2.3.3
|
|
20
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
21
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
22
|
+
Requires-Dist: scipy>=1.16.2
|
|
23
|
+
Requires-Dist: typer>=0.24.0
|
|
24
|
+
Requires-Dist: chembl-webresource-client ; extra == 'fetch'
|
|
25
|
+
Requires-Dist: huggingface-hub ; extra == 'fetch'
|
|
26
|
+
Requires-Dist: joblib ; extra == 'fetch'
|
|
27
|
+
Requires-Dist: tqdm ; extra == 'fetch'
|
|
28
|
+
Requires-Dist: hail ; extra == 'genebass'
|
|
29
|
+
Requires-Dist: mcp[cli]>=1.0 ; extra == 'mcp'
|
|
30
|
+
Requires-Dist: polars>=1.5 ; extra == 'parse'
|
|
31
|
+
Requires-Dist: requests ; extra == 'parse'
|
|
32
|
+
Requires-Dist: tqdm ; extra == 'parse'
|
|
33
|
+
Requires-Dist: cyvcf2 ; extra == 'parse'
|
|
34
|
+
Requires-Dist: matplotlib ; extra == 'plot'
|
|
35
|
+
Requires-Python: >=3.11
|
|
36
|
+
Project-URL: Homepage, https://github.com/gero-science/ct-validation
|
|
37
|
+
Project-URL: Issues, https://github.com/gero-science/ct-validation/issues
|
|
38
|
+
Project-URL: Repository, https://github.com/gero-science/ct-validation
|
|
39
|
+
Provides-Extra: fetch
|
|
40
|
+
Provides-Extra: genebass
|
|
41
|
+
Provides-Extra: mcp
|
|
42
|
+
Provides-Extra: parse
|
|
43
|
+
Provides-Extra: plot
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# <img width="300" alt="ct-validation" src="https://github.com/user-attachments/assets/fc5443b6-e93b-4fbb-a841-fffca899eedf" />
|
|
47
|
+
|
|
48
|
+
An open framework for benchmarking gene-indication evidence against clinical trial outcomes.
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
`ct-validation` tests whether a set of gene-indication pairs is enriched for clinical success. It computes risk ratios and odds ratios with confidence intervals across clinical phase transitions and supports semantic disease matching through ontology-based similarity.
|
|
52
|
+
|
|
53
|
+
> **Paper:** Kostiuk K, Igumnov D, Fedichev P, Feizi A. _ct-validation: an open framework for benchmarking gene-indication evidence against clinical trial outcomes._ (2026)
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
Requires Python 3.11+.
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install ct-validation
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Optional extras:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install "ct-validation[plot]" # forest plot visualization
|
|
67
|
+
pip install "ct-validation[mcp]" # MCP server for agent workflows
|
|
68
|
+
pip install "ct-validation[parse]" # data source parsers
|
|
69
|
+
pip install "ct-validation[fetch]" # ChEMBL fetching script dependencies
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Quick start
|
|
73
|
+
|
|
74
|
+
### Python API
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import ct_validation as ctv
|
|
78
|
+
|
|
79
|
+
results = ctv.validate(
|
|
80
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
81
|
+
targets="data/genetic_evidence/genetic_evidence.parquet",
|
|
82
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
83
|
+
)
|
|
84
|
+
print(results)
|
|
85
|
+
# phase_label n_yes n_no rr rr_ci_lower rr_ci_upper ...
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Batch mode — compare multiple evidence sources at once:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
results = ctv.validate(
|
|
92
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
93
|
+
targets=[
|
|
94
|
+
"data/genetic_evidence/gwas_catalog.parquet",
|
|
95
|
+
"data/genetic_evidence/clinvar.parquet",
|
|
96
|
+
"data/genetic_evidence/omim.parquet",
|
|
97
|
+
],
|
|
98
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
99
|
+
)
|
|
100
|
+
# returns a list of DataFrames, one per evidence source
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Prioritized mode — test whether a novel source adds value over an established baseline:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
results = ctv.validate(
|
|
107
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
108
|
+
targets="data/genetic_evidence/novel_score.parquet",
|
|
109
|
+
baseline_evidence="data/genetic_evidence/established_genetics.parquet",
|
|
110
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
111
|
+
)
|
|
112
|
+
# pairs supported only by baseline are excluded
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Expand a disease set using semantic similarity:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
expanded = ctv.get_expanded_disease_set(
|
|
119
|
+
efo_ids={"EFO:0000270", "EFO:0000384"},
|
|
120
|
+
similarity_pairs="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
121
|
+
similarity_threshold=0.8,
|
|
122
|
+
)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### CLI
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# With config file
|
|
129
|
+
ct-validation --config configs/default.yaml
|
|
130
|
+
|
|
131
|
+
# With explicit arguments
|
|
132
|
+
ct-validation \
|
|
133
|
+
--clinical-trials ct.parquet \
|
|
134
|
+
--targets evidence.parquet \
|
|
135
|
+
--similarity-lookup similarity.parquet \
|
|
136
|
+
-o results/
|
|
137
|
+
|
|
138
|
+
# Batch mode (multiple evidence sources)
|
|
139
|
+
ct-validation \
|
|
140
|
+
--clinical-trials ct.parquet \
|
|
141
|
+
--targets gwas.parquet --targets clinvar.parquet --targets omim.parquet \
|
|
142
|
+
-o results/
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### MCP server
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
ct-validation-mcp
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Exposes two tools for agent-based workflows:
|
|
152
|
+
|
|
153
|
+
- `ct_validate` — compute phase-transition enrichment
|
|
154
|
+
- `expand_disease_set` — expand EFO IDs via semantic similarity
|
|
155
|
+
|
|
156
|
+
## Input schemas
|
|
157
|
+
|
|
158
|
+
| Input | Columns | Description |
|
|
159
|
+
| ------------------- | ------------------------------------ | -------------------------------------------------- |
|
|
160
|
+
| `clinical_trials` | `gene`, `efo_id`, `max_phase` | Target-indication pairs with highest phase reached |
|
|
161
|
+
| `targets` | `gene`, `efo_id` | Gene-indication pairs with supporting evidence |
|
|
162
|
+
| `similarity_lookup` | `efo_id_1`, `efo_id_2`, `similarity` | Pairwise EFO similarity (optional) |
|
|
163
|
+
| `baseline_evidence` | `gene`, `efo_id` | Baseline evidence for prioritized mode (optional) |
|
|
164
|
+
| `gene_universe` | one gene per line (text file) | Restrict analysis to these genes (optional) |
|
|
165
|
+
|
|
166
|
+
All inputs accept Parquet files or pandas DataFrames (except `gene_universe`, which is a text file or a Python set).
|
|
167
|
+
|
|
168
|
+
## Output schema
|
|
169
|
+
|
|
170
|
+
| Column | Description |
|
|
171
|
+
| ---------------------------------- | -------------------------------------------- |
|
|
172
|
+
| `phase_from`, `phase_to` | Phase transition (e.g. 1→2, 1→4) |
|
|
173
|
+
| `n_yes`, `n_no` | Pairs entering phase (with/without evidence) |
|
|
174
|
+
| `x_yes`, `x_no` | Pairs reaching target phase |
|
|
175
|
+
| `rate_yes`, `rate_no` | Progression rates |
|
|
176
|
+
| `rr`, `rr_ci_lower`, `rr_ci_upper` | Risk ratio with 95% CI (Katz log method) |
|
|
177
|
+
| `or`, `or_ci_lower`, `or_ci_upper` | Odds ratio with 95% CI (Woolf logit method) |
|
|
178
|
+
|
|
179
|
+
## Enrichment logic
|
|
180
|
+
|
|
181
|
+
For each phase transition, target-indication pairs that reached at least the starting phase are divided into supported and unsupported groups. The risk ratio is:
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
RR = (x_yes / n_yes) / (x_no / n_no)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
A risk ratio greater than one indicates that genetically supported pairs are more likely to progress. When a similarity lookup is provided, a pair (gene, disease) is considered supported if evidence exists for some pair (gene, disease') where the similarity between disease and disease' is above the threshold (default 0.8).
|
|
188
|
+
|
|
189
|
+
### Prioritized mode
|
|
190
|
+
|
|
191
|
+
When `baseline_evidence` is provided, pairs supported _only_ by the baseline are excluded. This tests whether a novel evidence source adds predictive value beyond an established benchmark.
|
|
192
|
+
|
|
193
|
+
## Visualization
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
import ct_validation as ctv
|
|
197
|
+
|
|
198
|
+
results = ctv.validate(...)
|
|
199
|
+
ctv.forest_plot(results, metric="rr", title="Phase I → Approved")
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Data source parsers
|
|
203
|
+
|
|
204
|
+
The `scripts/` directory of the source repository (not part of the installed package) contains reproducible parsers for public databases:
|
|
205
|
+
|
|
206
|
+
**Genetic evidence** (`scripts/parse/genetic_evidence/`):
|
|
207
|
+
|
|
208
|
+
- GWAS Catalog — genome-wide significant associations (p < 1e-8)
|
|
209
|
+
- ClinVar — pathogenic/likely pathogenic variants
|
|
210
|
+
- OMIM — established molecular basis (mapping code 3)
|
|
211
|
+
- Open Targets — genetic evidence streams (score ≥ 0.5)
|
|
212
|
+
- Genebass — exome-wide associations (p ≤ 1e-7)
|
|
213
|
+
|
|
214
|
+
**Clinical trials** (`scripts/parse/clinical_trials/`):
|
|
215
|
+
|
|
216
|
+
- ChEMBL — gene-drug and drug-indication links (pChEMBL > 7.0)
|
|
217
|
+
- Open Targets — known drug and indication data
|
|
218
|
+
- STITCH — high-confidence activation/inhibition links
|
|
219
|
+
- DGIdb — drug-gene interactions
|
|
220
|
+
- TrialPanorama — interventional studies
|
|
221
|
+
|
|
222
|
+
**Ontology** (`scripts/r/`):
|
|
223
|
+
|
|
224
|
+
- EFO semantic similarity matrix (Lin + Resnik information content)
|
|
225
|
+
|
|
226
|
+
See [DATA_SOURCES.md](DATA_SOURCES.md) for download links, versions, and fetching instructions.
|
|
227
|
+
|
|
228
|
+
Configure paths in `configs/parsing.yaml` and run:
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
python scripts/parse/run_parsing.py
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Configuration
|
|
235
|
+
|
|
236
|
+
See `configs/default.yaml` for validation settings and `configs/parsing.yaml` for data source paths. All config values can be overridden via CLI arguments.
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
MIT
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# <img width="300" alt="ct-validation" src="https://github.com/user-attachments/assets/fc5443b6-e93b-4fbb-a841-fffca899eedf" />
|
|
2
|
+
|
|
3
|
+
An open framework for benchmarking gene-indication evidence against clinical trial outcomes.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
`ct-validation` tests whether a set of gene-indication pairs is enriched for clinical success. It computes risk ratios and odds ratios with confidence intervals across clinical phase transitions and supports semantic disease matching through ontology-based similarity.
|
|
7
|
+
|
|
8
|
+
> **Paper:** Kostiuk K, Igumnov D, Fedichev P, Feizi A. _ct-validation: an open framework for benchmarking gene-indication evidence against clinical trial outcomes._ (2026)
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Requires Python 3.11+.
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install ct-validation
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Optional extras:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install "ct-validation[plot]" # forest plot visualization
|
|
22
|
+
pip install "ct-validation[mcp]" # MCP server for agent workflows
|
|
23
|
+
pip install "ct-validation[parse]" # data source parsers
|
|
24
|
+
pip install "ct-validation[fetch]" # ChEMBL fetching script dependencies
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick start
|
|
28
|
+
|
|
29
|
+
### Python API
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import ct_validation as ctv
|
|
33
|
+
|
|
34
|
+
results = ctv.validate(
|
|
35
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
36
|
+
targets="data/genetic_evidence/genetic_evidence.parquet",
|
|
37
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
38
|
+
)
|
|
39
|
+
print(results)
|
|
40
|
+
# phase_label n_yes n_no rr rr_ci_lower rr_ci_upper ...
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Batch mode — compare multiple evidence sources at once:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
results = ctv.validate(
|
|
47
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
48
|
+
targets=[
|
|
49
|
+
"data/genetic_evidence/gwas_catalog.parquet",
|
|
50
|
+
"data/genetic_evidence/clinvar.parquet",
|
|
51
|
+
"data/genetic_evidence/omim.parquet",
|
|
52
|
+
],
|
|
53
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
54
|
+
)
|
|
55
|
+
# returns a list of DataFrames, one per evidence source
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Prioritized mode — test whether a novel source adds value over an established baseline:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
results = ctv.validate(
|
|
62
|
+
clinical_trials="data/clinical_trials/gene_indication_max_phase.parquet",
|
|
63
|
+
targets="data/genetic_evidence/novel_score.parquet",
|
|
64
|
+
baseline_evidence="data/genetic_evidence/established_genetics.parquet",
|
|
65
|
+
similarity_lookup="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
66
|
+
)
|
|
67
|
+
# pairs supported only by baseline are excluded
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Expand a disease set using semantic similarity:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
expanded = ctv.get_expanded_disease_set(
|
|
74
|
+
efo_ids={"EFO:0000270", "EFO:0000384"},
|
|
75
|
+
similarity_pairs="data/mappings/efo_similarity_lookup_0.5.parquet",
|
|
76
|
+
similarity_threshold=0.8,
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### CLI
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# With config file
|
|
84
|
+
ct-validation --config configs/default.yaml
|
|
85
|
+
|
|
86
|
+
# With explicit arguments
|
|
87
|
+
ct-validation \
|
|
88
|
+
--clinical-trials ct.parquet \
|
|
89
|
+
--targets evidence.parquet \
|
|
90
|
+
--similarity-lookup similarity.parquet \
|
|
91
|
+
-o results/
|
|
92
|
+
|
|
93
|
+
# Batch mode (multiple evidence sources)
|
|
94
|
+
ct-validation \
|
|
95
|
+
--clinical-trials ct.parquet \
|
|
96
|
+
--targets gwas.parquet --targets clinvar.parquet --targets omim.parquet \
|
|
97
|
+
-o results/
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### MCP server
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
ct-validation-mcp
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Exposes two tools for agent-based workflows:
|
|
107
|
+
|
|
108
|
+
- `ct_validate` — compute phase-transition enrichment
|
|
109
|
+
- `expand_disease_set` — expand EFO IDs via semantic similarity
|
|
110
|
+
|
|
111
|
+
## Input schemas
|
|
112
|
+
|
|
113
|
+
| Input | Columns | Description |
|
|
114
|
+
| ------------------- | ------------------------------------ | -------------------------------------------------- |
|
|
115
|
+
| `clinical_trials` | `gene`, `efo_id`, `max_phase` | Target-indication pairs with highest phase reached |
|
|
116
|
+
| `targets` | `gene`, `efo_id` | Gene-indication pairs with supporting evidence |
|
|
117
|
+
| `similarity_lookup` | `efo_id_1`, `efo_id_2`, `similarity` | Pairwise EFO similarity (optional) |
|
|
118
|
+
| `baseline_evidence` | `gene`, `efo_id` | Baseline evidence for prioritized mode (optional) |
|
|
119
|
+
| `gene_universe` | one gene per line (text file) | Restrict analysis to these genes (optional) |
|
|
120
|
+
|
|
121
|
+
All inputs accept Parquet files or pandas DataFrames (except `gene_universe`, which is a text file or a Python set).
|
|
122
|
+
|
|
123
|
+
## Output schema
|
|
124
|
+
|
|
125
|
+
| Column | Description |
|
|
126
|
+
| ---------------------------------- | -------------------------------------------- |
|
|
127
|
+
| `phase_from`, `phase_to` | Phase transition (e.g. 1→2, 1→4) |
|
|
128
|
+
| `n_yes`, `n_no` | Pairs entering phase (with/without evidence) |
|
|
129
|
+
| `x_yes`, `x_no` | Pairs reaching target phase |
|
|
130
|
+
| `rate_yes`, `rate_no` | Progression rates |
|
|
131
|
+
| `rr`, `rr_ci_lower`, `rr_ci_upper` | Risk ratio with 95% CI (Katz log method) |
|
|
132
|
+
| `or`, `or_ci_lower`, `or_ci_upper` | Odds ratio with 95% CI (Woolf logit method) |
|
|
133
|
+
|
|
134
|
+
## Enrichment logic
|
|
135
|
+
|
|
136
|
+
For each phase transition, target-indication pairs that reached at least the starting phase are divided into supported and unsupported groups. The risk ratio is:
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
RR = (x_yes / n_yes) / (x_no / n_no)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
A risk ratio greater than one indicates that genetically supported pairs are more likely to progress. When a similarity lookup is provided, a pair (gene, disease) is considered supported if evidence exists for some pair (gene, disease') where the similarity between disease and disease' is above the threshold (default 0.8).
|
|
143
|
+
|
|
144
|
+
### Prioritized mode
|
|
145
|
+
|
|
146
|
+
When `baseline_evidence` is provided, pairs supported _only_ by the baseline are excluded. This tests whether a novel evidence source adds predictive value beyond an established benchmark.
|
|
147
|
+
|
|
148
|
+
## Visualization
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import ct_validation as ctv
|
|
152
|
+
|
|
153
|
+
results = ctv.validate(...)
|
|
154
|
+
ctv.forest_plot(results, metric="rr", title="Phase I → Approved")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Data source parsers
|
|
158
|
+
|
|
159
|
+
The `scripts/` directory of the source repository (not part of the installed package) contains reproducible parsers for public databases:
|
|
160
|
+
|
|
161
|
+
**Genetic evidence** (`scripts/parse/genetic_evidence/`):
|
|
162
|
+
|
|
163
|
+
- GWAS Catalog — genome-wide significant associations (p < 1e-8)
|
|
164
|
+
- ClinVar — pathogenic/likely pathogenic variants
|
|
165
|
+
- OMIM — established molecular basis (mapping code 3)
|
|
166
|
+
- Open Targets — genetic evidence streams (score ≥ 0.5)
|
|
167
|
+
- Genebass — exome-wide associations (p ≤ 1e-7)
|
|
168
|
+
|
|
169
|
+
**Clinical trials** (`scripts/parse/clinical_trials/`):
|
|
170
|
+
|
|
171
|
+
- ChEMBL — gene-drug and drug-indication links (pChEMBL > 7.0)
|
|
172
|
+
- Open Targets — known drug and indication data
|
|
173
|
+
- STITCH — high-confidence activation/inhibition links
|
|
174
|
+
- DGIdb — drug-gene interactions
|
|
175
|
+
- TrialPanorama — interventional studies
|
|
176
|
+
|
|
177
|
+
**Ontology** (`scripts/r/`):
|
|
178
|
+
|
|
179
|
+
- EFO semantic similarity matrix (Lin + Resnik information content)
|
|
180
|
+
|
|
181
|
+
See [DATA_SOURCES.md](DATA_SOURCES.md) for download links, versions, and fetching instructions.
|
|
182
|
+
|
|
183
|
+
Configure paths in `configs/parsing.yaml` and run:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python scripts/parse/run_parsing.py
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Configuration
|
|
190
|
+
|
|
191
|
+
See `configs/default.yaml` for validation settings and `configs/parsing.yaml` for data source paths. All config values can be overridden via CLI arguments.
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ct-validation"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Benchmarking gene-indication evidence against clinical trial outcomes"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Klim Kostiuk", email = "2601074@gmail.com" },
|
|
8
|
+
{ name = "Daniel Igumnov" },
|
|
9
|
+
{ name = "Peter Fedichev" },
|
|
10
|
+
{ name = "Amir Feizi" },
|
|
11
|
+
]
|
|
12
|
+
license = "MIT"
|
|
13
|
+
license-files = ["LICENSE"]
|
|
14
|
+
requires-python = ">=3.11"
|
|
15
|
+
keywords = ["clinical-trials", "target-validation", "drug-discovery", "genetics", "enrichment"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"duckdb>=1.4.3",
|
|
28
|
+
"numpy>=2.3.3",
|
|
29
|
+
"pandas>=2.3.3",
|
|
30
|
+
"pyarrow>=22.0.0",
|
|
31
|
+
"pyyaml>=6.0.3",
|
|
32
|
+
"scipy>=1.16.2",
|
|
33
|
+
"typer>=0.24.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/gero-science/ct-validation"
|
|
38
|
+
Repository = "https://github.com/gero-science/ct-validation"
|
|
39
|
+
Issues = "https://github.com/gero-science/ct-validation/issues"
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
ct-validation = "ct_validation.cli:cli"
|
|
43
|
+
ct-validation-mcp = "ct_validation.mcp_server:main"
|
|
44
|
+
|
|
45
|
+
[build-system]
|
|
46
|
+
requires = ["uv_build>=0.8.18,<0.9.0"]
|
|
47
|
+
build-backend = "uv_build"
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
parse = [
|
|
51
|
+
"polars>=1.5", # stitch.py (scan_csv on .gz requires >=1.5)
|
|
52
|
+
"requests",
|
|
53
|
+
"tqdm",
|
|
54
|
+
"cyvcf2", # clinvar.py
|
|
55
|
+
]
|
|
56
|
+
fetch = [
|
|
57
|
+
"chembl-webresource-client",
|
|
58
|
+
"huggingface_hub",
|
|
59
|
+
"joblib",
|
|
60
|
+
"tqdm",
|
|
61
|
+
]
|
|
62
|
+
genebass = [
|
|
63
|
+
"hail", # requires Java 11; for genebass_preprocess.py only
|
|
64
|
+
]
|
|
65
|
+
plot = [
|
|
66
|
+
"matplotlib",
|
|
67
|
+
]
|
|
68
|
+
mcp = [
|
|
69
|
+
"mcp[cli]>=1.0",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
[dependency-groups]
|
|
73
|
+
dev = [
|
|
74
|
+
"pytest>=9.0.2",
|
|
75
|
+
"pytest-cov>=7.0.0",
|
|
76
|
+
"ruff>=0.14.10",
|
|
77
|
+
]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Clinical trial validation package.
|
|
2
|
+
|
|
3
|
+
Main entry point:
|
|
4
|
+
- validate(): Run validation pipeline (with config and/or explicit args)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ct_validation.api import validate
|
|
8
|
+
from ct_validation.config import load_config
|
|
9
|
+
from ct_validation.config.schema import PHASE_NAMES, Config, phase_label
|
|
10
|
+
from ct_validation.plotting import forest_plot
|
|
11
|
+
from ct_validation.validation import (
|
|
12
|
+
EnrichmentResult,
|
|
13
|
+
calculate_enrichment,
|
|
14
|
+
create_matched_pairs_set,
|
|
15
|
+
get_expanded_disease_set,
|
|
16
|
+
katz_ci_risk_ratio,
|
|
17
|
+
woolf_ci_odds_ratio,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__version__ = "0.1.0"
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"PHASE_NAMES",
|
|
24
|
+
"Config",
|
|
25
|
+
"EnrichmentResult",
|
|
26
|
+
"calculate_enrichment",
|
|
27
|
+
"create_matched_pairs_set",
|
|
28
|
+
"forest_plot",
|
|
29
|
+
"get_expanded_disease_set",
|
|
30
|
+
"katz_ci_risk_ratio",
|
|
31
|
+
"load_config",
|
|
32
|
+
"phase_label",
|
|
33
|
+
"validate", # Main API
|
|
34
|
+
"woolf_ci_odds_ratio",
|
|
35
|
+
]
|