allelix 1.8.3__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {allelix-1.8.3 → allelix-1.9.0}/PKG-INFO +9 -6
- {allelix-1.8.3 → allelix-1.9.0}/README.md +8 -5
- {allelix-1.8.3 → allelix-1.9.0}/allelix/cli.py +78 -6
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/_pipeline.py +13 -3
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/html.py +2 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/json_report.py +4 -1
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/terminal.py +2 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/PKG-INFO +9 -6
- {allelix-1.8.3 → allelix-1.9.0}/pyproject.toml +1 -1
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_cli.py +247 -0
- {allelix-1.8.3 → allelix-1.9.0}/LICENSE +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/alphamissense.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/base.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/cadd.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/clinvar.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/gnomad.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/gwas.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/pharmgkb.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/snpedia.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/compare.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/config.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/data/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/data/high_value_snps.yaml +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/_versions.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/alphamissense_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/cadd_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/cpic_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/gnomad_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/gwas_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/loader_utils.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/manager.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/pharmgkb_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/schema.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/snpedia_loader.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/snpedia_parser.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/exporters/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/exporters/plink.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/models.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/_helpers.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/ancestrydna.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/base.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/ftdna.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/livingdna.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/myhappygenes.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/myheritage.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/twentythreeandme.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/py.typed +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/diff.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/high_value.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/methylation.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/__init__.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/allele.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/build_detect.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/SOURCES.txt +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/dependency_links.txt +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/entry_points.txt +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/requires.txt +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/top_level.txt +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/setup.cfg +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_cli_helpers.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_compare.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_config.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_end_to_end.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_mock_data_invariants.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_models.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_registry.py +0 -0
- {allelix-1.8.3 → allelix-1.9.0}/tests/test_version.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version: 1.8.3
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
5
|
Author-email: dial481 <dial481@users.noreply.github.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -44,8 +44,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
|
|
|
44
44
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
45
45
|
> commands, report diffing, persistent config with commercial-mode
|
|
46
46
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
47
|
-
> No regex on prose anywhere in production. **Latest: v1.
|
|
48
|
-
>
|
|
47
|
+
> No regex on prose anywhere in production. **Latest: v1.9.0** —
|
|
48
|
+
> `--filter-file` flag for custom-panel filtering on `analyze`.
|
|
49
49
|
> Release notes:
|
|
50
50
|
> [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
|
|
51
51
|
|
|
@@ -61,6 +61,9 @@ allelix db update
|
|
|
61
61
|
|
|
62
62
|
# Analyze a genotype file
|
|
63
63
|
allelix analyze your_genotype_file.txt --output report.html
|
|
64
|
+
|
|
65
|
+
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
66
|
+
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
64
67
|
```
|
|
65
68
|
|
|
66
69
|
Requires Python 3.11+. See [Development](#development) for source installs and running tests.
|
|
@@ -98,7 +101,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
98
101
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
99
102
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
100
103
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
101
|
-
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
104
|
+
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
102
105
|
|
|
103
106
|
### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
104
107
|
|
|
@@ -155,7 +158,7 @@ allelix config set license.commercial true
|
|
|
155
158
|
allelix config set license.cadd true
|
|
156
159
|
```
|
|
157
160
|
|
|
158
|
-
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
161
|
+
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
159
162
|
|
|
160
163
|
### Database sizes and download times
|
|
161
164
|
|
|
@@ -187,7 +190,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
187
190
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
188
191
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
189
192
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
190
|
-
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
|
|
193
|
+
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
191
194
|
|
|
192
195
|
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
193
196
|
|
|
@@ -10,8 +10,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
|
|
|
10
10
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
11
11
|
> commands, report diffing, persistent config with commercial-mode
|
|
12
12
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
13
|
-
> No regex on prose anywhere in production. **Latest: v1.
|
|
14
|
-
>
|
|
13
|
+
> No regex on prose anywhere in production. **Latest: v1.9.0** —
|
|
14
|
+
> `--filter-file` flag for custom-panel filtering on `analyze`.
|
|
15
15
|
> Release notes:
|
|
16
16
|
> [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
|
|
17
17
|
|
|
@@ -27,6 +27,9 @@ allelix db update
|
|
|
27
27
|
|
|
28
28
|
# Analyze a genotype file
|
|
29
29
|
allelix analyze your_genotype_file.txt --output report.html
|
|
30
|
+
|
|
31
|
+
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
32
|
+
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
30
33
|
```
|
|
31
34
|
|
|
32
35
|
Requires Python 3.11+. See [Development](#development) for source installs and running tests.
|
|
@@ -64,7 +67,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
64
67
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
65
68
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
66
69
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
67
|
-
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
70
|
+
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
68
71
|
|
|
69
72
|
### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
70
73
|
|
|
@@ -121,7 +124,7 @@ allelix config set license.commercial true
|
|
|
121
124
|
allelix config set license.cadd true
|
|
122
125
|
```
|
|
123
126
|
|
|
124
|
-
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
127
|
+
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
125
128
|
|
|
126
129
|
### Database sizes and download times
|
|
127
130
|
|
|
@@ -153,7 +156,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
153
156
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
154
157
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
155
158
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
156
|
-
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
|
|
159
|
+
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
157
160
|
|
|
158
161
|
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
159
162
|
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
import logging
|
|
8
|
+
import re
|
|
8
9
|
import sys
|
|
9
10
|
import time
|
|
10
11
|
from pathlib import Path
|
|
@@ -195,6 +196,35 @@ def _format_from_path(output: Path, override: str | None) -> str:
|
|
|
195
196
|
)
|
|
196
197
|
|
|
197
198
|
|
|
199
|
+
_RSID_PATTERN = re.compile(r"^rs\d+$", re.IGNORECASE)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _parse_filter_file(path: Path) -> tuple[frozenset[str], frozenset[str]]:
|
|
203
|
+
r"""Parse a filter file into ``(gene_names, rsids)``.
|
|
204
|
+
|
|
205
|
+
Lines matching ``^rs\d+$`` (case-insensitive) are rsIDs. Everything
|
|
206
|
+
else is a gene name. Lines starting with ``#`` and blank lines are
|
|
207
|
+
ignored. Gene names starting with ``RS`` (e.g., RSPO1, RSF1) are
|
|
208
|
+
correctly classified as gene names, not rsIDs.
|
|
209
|
+
|
|
210
|
+
Input is case-tolerant; output is canonical: rsIDs are normalized to
|
|
211
|
+
lowercase (``rs1801133``), gene names to uppercase (``MTHFR``). The
|
|
212
|
+
filter recorded in JSON output therefore looks identical regardless
|
|
213
|
+
of how the user typed the entries in the filter file.
|
|
214
|
+
"""
|
|
215
|
+
genes: set[str] = set()
|
|
216
|
+
rsids: set[str] = set()
|
|
217
|
+
for raw in path.read_text().splitlines():
|
|
218
|
+
line = raw.strip()
|
|
219
|
+
if not line or line.startswith("#"):
|
|
220
|
+
continue
|
|
221
|
+
if _RSID_PATTERN.match(line):
|
|
222
|
+
rsids.add(line.lower())
|
|
223
|
+
else:
|
|
224
|
+
genes.add(line.upper())
|
|
225
|
+
return frozenset(genes), frozenset(rsids)
|
|
226
|
+
|
|
227
|
+
|
|
198
228
|
def _run_analysis_command(
|
|
199
229
|
file_path: Path,
|
|
200
230
|
fmt: str | None,
|
|
@@ -204,6 +234,7 @@ def _run_analysis_command(
|
|
|
204
234
|
min_magnitude: float,
|
|
205
235
|
category: str | None,
|
|
206
236
|
genes: frozenset[str] | None,
|
|
237
|
+
rsids: frozenset[str] | None = None,
|
|
207
238
|
build: str | None = None,
|
|
208
239
|
include_benign: bool = False,
|
|
209
240
|
gwas_min_magnitude: float | None = None,
|
|
@@ -214,6 +245,7 @@ def _run_analysis_command(
|
|
|
214
245
|
no_update: bool = False,
|
|
215
246
|
no_gnomad: bool = False,
|
|
216
247
|
no_alphamissense: bool = False,
|
|
248
|
+
no_cadd: bool = False,
|
|
217
249
|
) -> None:
|
|
218
250
|
resolved = resolve_data_dir(data_dir)
|
|
219
251
|
if not no_update:
|
|
@@ -256,12 +288,13 @@ def _run_analysis_command(
|
|
|
256
288
|
ready = [a for a in ready if a.name != "alphamissense"]
|
|
257
289
|
|
|
258
290
|
cadd_annotator = None
|
|
259
|
-
|
|
291
|
+
if not no_cadd:
|
|
292
|
+
from allelix.annotators.cadd import CaddAnnotator
|
|
260
293
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
294
|
+
for a in ready:
|
|
295
|
+
if isinstance(a, CaddAnnotator):
|
|
296
|
+
cadd_annotator = a
|
|
297
|
+
break
|
|
265
298
|
ready = [a for a in ready if a.name != "cadd"]
|
|
266
299
|
|
|
267
300
|
if not_ready:
|
|
@@ -333,6 +366,7 @@ def _run_analysis_command(
|
|
|
333
366
|
min_magnitude=min_magnitude,
|
|
334
367
|
category=category,
|
|
335
368
|
genes=genes,
|
|
369
|
+
rsids=rsids,
|
|
336
370
|
source_min_magnitudes=source_floors,
|
|
337
371
|
)
|
|
338
372
|
from allelix.reports._pipeline import rollup_gwas_duplicates
|
|
@@ -354,6 +388,7 @@ def _run_analysis_command(
|
|
|
354
388
|
min_magnitude=min_magnitude,
|
|
355
389
|
category=category,
|
|
356
390
|
genes=genes,
|
|
391
|
+
rsids=rsids,
|
|
357
392
|
source_min_magnitudes=source_floors,
|
|
358
393
|
)
|
|
359
394
|
else:
|
|
@@ -371,6 +406,7 @@ def _run_analysis_command(
|
|
|
371
406
|
min_magnitude=min_magnitude,
|
|
372
407
|
category=category,
|
|
373
408
|
genes=genes,
|
|
409
|
+
rsids=rsids,
|
|
374
410
|
source_min_magnitudes=source_floors,
|
|
375
411
|
diff=diff_result,
|
|
376
412
|
high_value_no_calls=hv_dicts,
|
|
@@ -382,6 +418,7 @@ def _run_analysis_command(
|
|
|
382
418
|
min_magnitude=min_magnitude,
|
|
383
419
|
category=category,
|
|
384
420
|
genes=genes,
|
|
421
|
+
rsids=rsids,
|
|
385
422
|
source_min_magnitudes=source_floors,
|
|
386
423
|
diff=diff_result,
|
|
387
424
|
high_value_no_calls=hv_warning_lines,
|
|
@@ -558,6 +595,16 @@ _DIFF_OPT = click.option(
|
|
|
558
595
|
"Not a monitoring tool — use for version-to-version validation."
|
|
559
596
|
),
|
|
560
597
|
)
|
|
598
|
+
_FILTER_FILE_OPT = click.option(
|
|
599
|
+
"--filter-file",
|
|
600
|
+
type=click.Path(exists=True, dir_okay=False, path_type=Path),
|
|
601
|
+
default=None,
|
|
602
|
+
help=(
|
|
603
|
+
"Plain text file with rsIDs and/or gene names (one per line) to "
|
|
604
|
+
"filter the report. Lines matching '^rs\\d+$' are rsIDs; everything "
|
|
605
|
+
"else is a gene name. Comments (#) and blank lines are ignored."
|
|
606
|
+
),
|
|
607
|
+
)
|
|
561
608
|
_NO_UPDATE_OPT = click.option(
|
|
562
609
|
"--no-update",
|
|
563
610
|
is_flag=True,
|
|
@@ -576,6 +623,12 @@ _NO_ALPHAMISSENSE_OPT = click.option(
|
|
|
576
623
|
default=False,
|
|
577
624
|
help="Skip AlphaMissense variant pathogenicity enrichment.",
|
|
578
625
|
)
|
|
626
|
+
_NO_CADD_OPT = click.option(
|
|
627
|
+
"--no-cadd",
|
|
628
|
+
is_flag=True,
|
|
629
|
+
default=False,
|
|
630
|
+
help="Skip CADD deleteriousness score enrichment.",
|
|
631
|
+
)
|
|
579
632
|
_BUILD_OPT = click.option(
|
|
580
633
|
"--build",
|
|
581
634
|
type=click.Choice(["grch37", "grch38", "auto"], case_sensitive=False),
|
|
@@ -669,9 +722,11 @@ def _emit_build_diagnostics(result: object) -> None:
|
|
|
669
722
|
@_GWAS_ALL_OPT
|
|
670
723
|
@_EXCLUDE_SNPEDIA_OPT
|
|
671
724
|
@_DIFF_OPT
|
|
725
|
+
@_FILTER_FILE_OPT
|
|
672
726
|
@_NO_UPDATE_OPT
|
|
673
727
|
@_NO_GNOMAD_OPT
|
|
674
728
|
@_NO_ALPHAMISSENSE_OPT
|
|
729
|
+
@_NO_CADD_OPT
|
|
675
730
|
def analyze(
|
|
676
731
|
file_path: Path,
|
|
677
732
|
fmt: str | None,
|
|
@@ -687,11 +742,20 @@ def analyze(
|
|
|
687
742
|
gwas_all: bool,
|
|
688
743
|
exclude_snpedia: bool,
|
|
689
744
|
diff_path: Path | None,
|
|
745
|
+
filter_file: Path | None,
|
|
690
746
|
no_update: bool,
|
|
691
747
|
no_gnomad: bool,
|
|
692
748
|
no_alphamissense: bool,
|
|
749
|
+
no_cadd: bool,
|
|
693
750
|
) -> None:
|
|
694
751
|
"""Annotate a genotype file against all ready reference databases."""
|
|
752
|
+
filter_genes: frozenset[str] | None = None
|
|
753
|
+
filter_rsids: frozenset[str] | None = None
|
|
754
|
+
if filter_file is not None:
|
|
755
|
+
filter_genes, filter_rsids = _parse_filter_file(filter_file)
|
|
756
|
+
# Empty sets (file had only comments/blanks) still apply — they
|
|
757
|
+
# mean "match nothing", producing an empty report.
|
|
758
|
+
|
|
695
759
|
_run_analysis_command(
|
|
696
760
|
file_path=file_path,
|
|
697
761
|
fmt=fmt,
|
|
@@ -700,7 +764,8 @@ def analyze(
|
|
|
700
764
|
report_format=report_format,
|
|
701
765
|
min_magnitude=min_magnitude,
|
|
702
766
|
category=category,
|
|
703
|
-
genes=
|
|
767
|
+
genes=filter_genes,
|
|
768
|
+
rsids=filter_rsids,
|
|
704
769
|
build=_normalize_cli_build(build),
|
|
705
770
|
include_benign=include_benign,
|
|
706
771
|
gwas_min_magnitude=gwas_min_magnitude,
|
|
@@ -711,6 +776,7 @@ def analyze(
|
|
|
711
776
|
no_update=no_update,
|
|
712
777
|
no_gnomad=no_gnomad,
|
|
713
778
|
no_alphamissense=no_alphamissense,
|
|
779
|
+
no_cadd=no_cadd,
|
|
714
780
|
)
|
|
715
781
|
|
|
716
782
|
|
|
@@ -868,6 +934,7 @@ def compare(file1: Path, file2: Path, fmt1: str | None, fmt2: str | None) -> Non
|
|
|
868
934
|
@_NO_UPDATE_OPT
|
|
869
935
|
@_NO_GNOMAD_OPT
|
|
870
936
|
@_NO_ALPHAMISSENSE_OPT
|
|
937
|
+
@_NO_CADD_OPT
|
|
871
938
|
def methylation(
|
|
872
939
|
file_path: Path,
|
|
873
940
|
fmt: str | None,
|
|
@@ -886,6 +953,7 @@ def methylation(
|
|
|
886
953
|
no_update: bool,
|
|
887
954
|
no_gnomad: bool,
|
|
888
955
|
no_alphamissense: bool,
|
|
956
|
+
no_cadd: bool,
|
|
889
957
|
) -> None:
|
|
890
958
|
"""Methylation-pathway-focused report (MTHFR, MTR, MTRR, COMT, CBS, …)."""
|
|
891
959
|
excluded: set[str] = set()
|
|
@@ -912,6 +980,7 @@ def methylation(
|
|
|
912
980
|
no_update=no_update,
|
|
913
981
|
no_gnomad=no_gnomad,
|
|
914
982
|
no_alphamissense=no_alphamissense,
|
|
983
|
+
no_cadd=no_cadd,
|
|
915
984
|
)
|
|
916
985
|
|
|
917
986
|
|
|
@@ -933,6 +1002,7 @@ def methylation(
|
|
|
933
1002
|
@_NO_UPDATE_OPT
|
|
934
1003
|
@_NO_GNOMAD_OPT
|
|
935
1004
|
@_NO_ALPHAMISSENSE_OPT
|
|
1005
|
+
@_NO_CADD_OPT
|
|
936
1006
|
def pharmacogenomics(
|
|
937
1007
|
file_path: Path,
|
|
938
1008
|
fmt: str | None,
|
|
@@ -951,6 +1021,7 @@ def pharmacogenomics(
|
|
|
951
1021
|
no_update: bool,
|
|
952
1022
|
no_gnomad: bool,
|
|
953
1023
|
no_alphamissense: bool,
|
|
1024
|
+
no_cadd: bool,
|
|
954
1025
|
) -> None:
|
|
955
1026
|
"""Pharmacogenomics-focused report (annotations from PharmGKB-style sources)."""
|
|
956
1027
|
excluded: set[str] = set()
|
|
@@ -977,6 +1048,7 @@ def pharmacogenomics(
|
|
|
977
1048
|
no_update=no_update,
|
|
978
1049
|
no_gnomad=no_gnomad,
|
|
979
1050
|
no_alphamissense=no_alphamissense,
|
|
1051
|
+
no_cadd=no_cadd,
|
|
980
1052
|
)
|
|
981
1053
|
|
|
982
1054
|
|
|
@@ -105,6 +105,7 @@ class AnalysisResult:
|
|
|
105
105
|
min_magnitude: float = 0.0,
|
|
106
106
|
category: str | None = None,
|
|
107
107
|
genes: Iterable[str] | None = None,
|
|
108
|
+
rsids: Iterable[str] | None = None,
|
|
108
109
|
source_min_magnitudes: dict[str, float] | None = None,
|
|
109
110
|
) -> list[Annotation]:
|
|
110
111
|
"""Apply the standard filters and return a sorted list of annotations.
|
|
@@ -117,8 +118,14 @@ class AnalysisResult:
|
|
|
117
118
|
entry, that value IS the floor for that source — it can raise OR
|
|
118
119
|
lower the global ``min_magnitude``. Sources without an entry use
|
|
119
120
|
the global floor.
|
|
121
|
+
|
|
122
|
+
`genes` and `rsids` combine with OR: when either is provided, an
|
|
123
|
+
annotation passes if it matches the gene set OR the rsid set.
|
|
124
|
+
Empty collections (vs None) mean "match nothing" — an empty
|
|
125
|
+
filter file produces an empty report.
|
|
120
126
|
"""
|
|
121
|
-
gene_set = {g.upper() for g in genes} if genes else None
|
|
127
|
+
gene_set = {g.upper() for g in genes} if genes is not None else None
|
|
128
|
+
rsid_set = {r.lower() for r in rsids} if rsids is not None else None
|
|
122
129
|
out: list[Annotation] = []
|
|
123
130
|
for a in self.annotations:
|
|
124
131
|
if (
|
|
@@ -133,8 +140,11 @@ class AnalysisResult:
|
|
|
133
140
|
continue
|
|
134
141
|
if category is not None and a.category != category:
|
|
135
142
|
continue
|
|
136
|
-
if gene_set is not None
|
|
137
|
-
|
|
143
|
+
if gene_set is not None or rsid_set is not None:
|
|
144
|
+
gene_match = gene_set is not None and (a.gene or "").upper() in gene_set
|
|
145
|
+
rsid_match = rsid_set is not None and a.rsid.lower() in rsid_set
|
|
146
|
+
if not gene_match and not rsid_match:
|
|
147
|
+
continue
|
|
138
148
|
out.append(a)
|
|
139
149
|
out.sort(key=lambda a: (-a.magnitude, a.rsid))
|
|
140
150
|
return out
|
|
@@ -951,6 +951,7 @@ def render_html(
|
|
|
951
951
|
min_magnitude: float = 0.0,
|
|
952
952
|
category: str | None = None,
|
|
953
953
|
genes: Iterable[str] | None = None,
|
|
954
|
+
rsids: Iterable[str] | None = None,
|
|
954
955
|
source_min_magnitudes: dict[str, float] | None = None,
|
|
955
956
|
title: str = "Allelix Genotype Report",
|
|
956
957
|
diff: DiffResult | None = None,
|
|
@@ -961,6 +962,7 @@ def render_html(
|
|
|
961
962
|
min_magnitude=min_magnitude,
|
|
962
963
|
category=category,
|
|
963
964
|
genes=genes,
|
|
965
|
+
rsids=rsids,
|
|
964
966
|
source_min_magnitudes=source_min_magnitudes,
|
|
965
967
|
)
|
|
966
968
|
filtered = rollup_gwas_duplicates(filtered)
|
|
@@ -103,6 +103,7 @@ def render_json(
|
|
|
103
103
|
min_magnitude: float = 0.0,
|
|
104
104
|
category: str | None = None,
|
|
105
105
|
genes: Iterable[str] | None = None,
|
|
106
|
+
rsids: Iterable[str] | None = None,
|
|
106
107
|
source_min_magnitudes: dict[str, float] | None = None,
|
|
107
108
|
diff: DiffResult | None = None,
|
|
108
109
|
high_value_no_calls: list[dict[str, str]] | None = None,
|
|
@@ -112,6 +113,7 @@ def render_json(
|
|
|
112
113
|
min_magnitude=min_magnitude,
|
|
113
114
|
category=category,
|
|
114
115
|
genes=genes,
|
|
116
|
+
rsids=rsids,
|
|
115
117
|
source_min_magnitudes=source_min_magnitudes,
|
|
116
118
|
)
|
|
117
119
|
filtered = rollup_gwas_duplicates(filtered)
|
|
@@ -134,7 +136,8 @@ def render_json(
|
|
|
134
136
|
"filters": {
|
|
135
137
|
"min_magnitude": min_magnitude,
|
|
136
138
|
"category": category,
|
|
137
|
-
"genes": sorted(genes) if genes else None,
|
|
139
|
+
"genes": sorted(genes) if genes is not None else None,
|
|
140
|
+
"rsids": sorted(rsids) if rsids is not None else None,
|
|
138
141
|
},
|
|
139
142
|
"annotations": [_annotation_dict(a) for a in filtered],
|
|
140
143
|
}
|
|
@@ -27,6 +27,7 @@ def render_terminal(
|
|
|
27
27
|
min_magnitude: float = 0.0,
|
|
28
28
|
category: str | None = None,
|
|
29
29
|
genes: Iterable[str] | None = None,
|
|
30
|
+
rsids: Iterable[str] | None = None,
|
|
30
31
|
source_min_magnitudes: dict[str, float] | None = None,
|
|
31
32
|
) -> int:
|
|
32
33
|
"""Render an AnalysisResult as a Rich table. Returns annotation count.
|
|
@@ -38,6 +39,7 @@ def render_terminal(
|
|
|
38
39
|
min_magnitude=min_magnitude,
|
|
39
40
|
category=category,
|
|
40
41
|
genes=genes,
|
|
42
|
+
rsids=rsids,
|
|
41
43
|
source_min_magnitudes=source_min_magnitudes,
|
|
42
44
|
)
|
|
43
45
|
filtered = rollup_gwas_duplicates(filtered)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version: 1.8.3
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
5
|
Author-email: dial481 <dial481@users.noreply.github.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -44,8 +44,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
|
|
|
44
44
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
45
45
|
> commands, report diffing, persistent config with commercial-mode
|
|
46
46
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
47
|
-
> No regex on prose anywhere in production. **Latest: v1.8.3**
|
|
48
|
-
>
|
|
47
|
+
> No regex on prose anywhere in production. **Latest: v1.9.0** —
|
|
48
|
+
> `--filter-file` flag for custom-panel filtering on `analyze`.
|
|
49
49
|
> Release notes:
|
|
50
50
|
> [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
|
|
51
51
|
|
|
@@ -61,6 +61,9 @@ allelix db update
|
|
|
61
61
|
|
|
62
62
|
# Analyze a genotype file
|
|
63
63
|
allelix analyze your_genotype_file.txt --output report.html
|
|
64
|
+
|
|
65
|
+
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
66
|
+
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
64
67
|
```
|
|
65
68
|
|
|
66
69
|
Requires Python 3.11+. See [Development](#development) for source installs and running tests.
|
|
@@ -98,7 +101,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
98
101
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
99
102
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
100
103
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
101
|
-
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
104
|
+
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
102
105
|
|
|
103
106
|
### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
104
107
|
|
|
@@ -155,7 +158,7 @@ allelix config set license.commercial true
|
|
|
155
158
|
allelix config set license.cadd true
|
|
156
159
|
```
|
|
157
160
|
|
|
158
|
-
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
161
|
+
CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
|
|
159
162
|
|
|
160
163
|
### Database sizes and download times
|
|
161
164
|
|
|
@@ -187,7 +190,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
187
190
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
188
191
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
189
192
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
190
|
-
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
|
|
193
|
+
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
191
194
|
|
|
192
195
|
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
193
196
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "allelix"
|
|
7
|
-
version = "1.8.3"
|
|
7
|
+
version = "1.9.0"
|
|
8
8
|
description = "Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -1527,6 +1527,253 @@ class TestExcludeSnpedia:
|
|
|
1527
1527
|
assert captured["exclude_sources"] == frozenset({"snpedia", "gwas"})
|
|
1528
1528
|
|
|
1529
1529
|
|
|
1530
|
+
class TestNoCaddFlag:
|
|
1531
|
+
"""--no-cadd wires through to no_cadd on all three analysis commands."""
|
|
1532
|
+
|
|
1533
|
+
def test_analyze_passes_no_cadd(self, mock_mhg_path, monkeypatch):
|
|
1534
|
+
captured: dict = {}
|
|
1535
|
+
|
|
1536
|
+
def fake_run(**kwargs):
|
|
1537
|
+
captured.update(kwargs)
|
|
1538
|
+
|
|
1539
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1540
|
+
runner = CliRunner()
|
|
1541
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--no-cadd"])
|
|
1542
|
+
assert result.exit_code == 0, result.output
|
|
1543
|
+
assert captured["no_cadd"] is True
|
|
1544
|
+
|
|
1545
|
+
def test_analyze_default_no_cadd_false(self, mock_mhg_path, monkeypatch):
|
|
1546
|
+
captured: dict = {}
|
|
1547
|
+
|
|
1548
|
+
def fake_run(**kwargs):
|
|
1549
|
+
captured.update(kwargs)
|
|
1550
|
+
|
|
1551
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1552
|
+
runner = CliRunner()
|
|
1553
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path)])
|
|
1554
|
+
assert result.exit_code == 0, result.output
|
|
1555
|
+
assert captured["no_cadd"] is False
|
|
1556
|
+
|
|
1557
|
+
def test_methylation_passes_no_cadd(self, mock_mhg_path, monkeypatch):
|
|
1558
|
+
captured: dict = {}
|
|
1559
|
+
|
|
1560
|
+
def fake_run(**kwargs):
|
|
1561
|
+
captured.update(kwargs)
|
|
1562
|
+
|
|
1563
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1564
|
+
runner = CliRunner()
|
|
1565
|
+
result = runner.invoke(main, ["methylation", str(mock_mhg_path), "--no-cadd"])
|
|
1566
|
+
assert result.exit_code == 0, result.output
|
|
1567
|
+
assert captured["no_cadd"] is True
|
|
1568
|
+
|
|
1569
|
+
def test_pharmacogenomics_passes_no_cadd(self, mock_mhg_path, monkeypatch):
|
|
1570
|
+
captured: dict = {}
|
|
1571
|
+
|
|
1572
|
+
def fake_run(**kwargs):
|
|
1573
|
+
captured.update(kwargs)
|
|
1574
|
+
|
|
1575
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1576
|
+
runner = CliRunner()
|
|
1577
|
+
result = runner.invoke(main, ["pharmacogenomics", str(mock_mhg_path), "--no-cadd"])
|
|
1578
|
+
assert result.exit_code == 0, result.output
|
|
1579
|
+
assert captured["no_cadd"] is True
|
|
1580
|
+
|
|
1581
|
+
|
|
1582
|
+
class TestParseFilterFile:
|
|
1583
|
+
"""Unit tests for _parse_filter_file (parser classification)."""
|
|
1584
|
+
|
|
1585
|
+
def test_rsid_lowercase(self, tmp_path):
|
|
1586
|
+
from allelix.cli import _parse_filter_file
|
|
1587
|
+
|
|
1588
|
+
f = tmp_path / "filter.txt"
|
|
1589
|
+
f.write_text("rs1801133\n")
|
|
1590
|
+
genes, rsids = _parse_filter_file(f)
|
|
1591
|
+
assert genes == frozenset()
|
|
1592
|
+
assert rsids == frozenset({"rs1801133"})
|
|
1593
|
+
|
|
1594
|
+
def test_rsid_uppercase_normalized_to_lowercase(self, tmp_path):
|
|
1595
|
+
"""Input case-tolerant, output canonical: RS1801133 → rs1801133."""
|
|
1596
|
+
from allelix.cli import _parse_filter_file
|
|
1597
|
+
|
|
1598
|
+
f = tmp_path / "filter.txt"
|
|
1599
|
+
f.write_text("RS1801133\n")
|
|
1600
|
+
genes, rsids = _parse_filter_file(f)
|
|
1601
|
+
assert genes == frozenset()
|
|
1602
|
+
assert rsids == frozenset({"rs1801133"})
|
|
1603
|
+
|
|
1604
|
+
def test_gene_lowercase_normalized_to_uppercase(self, tmp_path):
|
|
1605
|
+
"""Input case-tolerant, output canonical: mthfr → MTHFR."""
|
|
1606
|
+
from allelix.cli import _parse_filter_file
|
|
1607
|
+
|
|
1608
|
+
f = tmp_path / "filter.txt"
|
|
1609
|
+
f.write_text("mthfr\n")
|
|
1610
|
+
genes, rsids = _parse_filter_file(f)
|
|
1611
|
+
assert genes == frozenset({"MTHFR"})
|
|
1612
|
+
assert rsids == frozenset()
|
|
1613
|
+
|
|
1614
|
+
def test_mixed_messy_case_normalized(self, tmp_path):
|
|
1615
|
+
"""End-to-end case-mixing across rsIDs and genes."""
|
|
1616
|
+
from allelix.cli import _parse_filter_file
|
|
1617
|
+
|
|
1618
|
+
f = tmp_path / "filter.txt"
|
|
1619
|
+
f.write_text("Rs1801133\ncomt\nRSPO1\nRS4680\nmThFr\n")
|
|
1620
|
+
genes, rsids = _parse_filter_file(f)
|
|
1621
|
+
assert genes == frozenset({"COMT", "RSPO1", "MTHFR"})
|
|
1622
|
+
assert rsids == frozenset({"rs1801133", "rs4680"})
|
|
1623
|
+
|
|
1624
|
+
def test_gene_only(self, tmp_path):
|
|
1625
|
+
from allelix.cli import _parse_filter_file
|
|
1626
|
+
|
|
1627
|
+
f = tmp_path / "filter.txt"
|
|
1628
|
+
f.write_text("MTHFR\n")
|
|
1629
|
+
genes, rsids = _parse_filter_file(f)
|
|
1630
|
+
assert genes == frozenset({"MTHFR"})
|
|
1631
|
+
assert rsids == frozenset()
|
|
1632
|
+
|
|
1633
|
+
def test_gene_starting_with_rs_prefix_is_gene_not_rsid(self, tmp_path):
|
|
1634
|
+
"""RSPO1, RSF1, RSC1A1 are real gene names — must not be classified as rsIDs."""
|
|
1635
|
+
from allelix.cli import _parse_filter_file
|
|
1636
|
+
|
|
1637
|
+
f = tmp_path / "filter.txt"
|
|
1638
|
+
f.write_text("RSPO1\nRSF1\nRSC1A1\n")
|
|
1639
|
+
genes, rsids = _parse_filter_file(f)
|
|
1640
|
+
assert genes == frozenset({"RSPO1", "RSF1", "RSC1A1"})
|
|
1641
|
+
assert rsids == frozenset()
|
|
1642
|
+
|
|
1643
|
+
def test_mixed(self, tmp_path):
|
|
1644
|
+
from allelix.cli import _parse_filter_file
|
|
1645
|
+
|
|
1646
|
+
f = tmp_path / "filter.txt"
|
|
1647
|
+
f.write_text("rs1801133\nMTHFR\nrs4680\nCOMT\n")
|
|
1648
|
+
genes, rsids = _parse_filter_file(f)
|
|
1649
|
+
assert genes == frozenset({"MTHFR", "COMT"})
|
|
1650
|
+
assert rsids == frozenset({"rs1801133", "rs4680"})
|
|
1651
|
+
|
|
1652
|
+
def test_comments_and_blanks_ignored(self, tmp_path):
|
|
1653
|
+
from allelix.cli import _parse_filter_file
|
|
1654
|
+
|
|
1655
|
+
f = tmp_path / "filter.txt"
|
|
1656
|
+
f.write_text("# this is a comment\n\nMTHFR\n\n# another\nrs1801133\n")
|
|
1657
|
+
genes, rsids = _parse_filter_file(f)
|
|
1658
|
+
assert genes == frozenset({"MTHFR"})
|
|
1659
|
+
assert rsids == frozenset({"rs1801133"})
|
|
1660
|
+
|
|
1661
|
+
def test_empty_file_returns_empty_sets(self, tmp_path):
|
|
1662
|
+
from allelix.cli import _parse_filter_file
|
|
1663
|
+
|
|
1664
|
+
f = tmp_path / "filter.txt"
|
|
1665
|
+
f.write_text("")
|
|
1666
|
+
genes, rsids = _parse_filter_file(f)
|
|
1667
|
+
assert genes == frozenset()
|
|
1668
|
+
assert rsids == frozenset()
|
|
1669
|
+
|
|
1670
|
+
def test_comments_only_returns_empty_sets(self, tmp_path):
|
|
1671
|
+
from allelix.cli import _parse_filter_file
|
|
1672
|
+
|
|
1673
|
+
f = tmp_path / "filter.txt"
|
|
1674
|
+
f.write_text("# only a comment\n# another\n\n")
|
|
1675
|
+
genes, rsids = _parse_filter_file(f)
|
|
1676
|
+
assert genes == frozenset()
|
|
1677
|
+
assert rsids == frozenset()
|
|
1678
|
+
|
|
1679
|
+
|
|
1680
|
+
class TestFilterFileOnAnalyze:
|
|
1681
|
+
"""--filter-file is only on analyze; threads through _run_analysis_command."""
|
|
1682
|
+
|
|
1683
|
+
def test_analyze_rsid_only(self, mock_mhg_path, tmp_path, monkeypatch):
|
|
1684
|
+
captured: dict = {}
|
|
1685
|
+
|
|
1686
|
+
def fake_run(**kwargs):
|
|
1687
|
+
captured.update(kwargs)
|
|
1688
|
+
|
|
1689
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1690
|
+
f = tmp_path / "filter.txt"
|
|
1691
|
+
f.write_text("rs1801133\n")
|
|
1692
|
+
runner = CliRunner()
|
|
1693
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
|
|
1694
|
+
assert result.exit_code == 0, result.output
|
|
1695
|
+
assert captured["genes"] == frozenset()
|
|
1696
|
+
assert captured["rsids"] == frozenset({"rs1801133"})
|
|
1697
|
+
|
|
1698
|
+
def test_analyze_gene_only(self, mock_mhg_path, tmp_path, monkeypatch):
|
|
1699
|
+
captured: dict = {}
|
|
1700
|
+
|
|
1701
|
+
def fake_run(**kwargs):
|
|
1702
|
+
captured.update(kwargs)
|
|
1703
|
+
|
|
1704
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1705
|
+
f = tmp_path / "filter.txt"
|
|
1706
|
+
f.write_text("MTHFR\n")
|
|
1707
|
+
runner = CliRunner()
|
|
1708
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
|
|
1709
|
+
assert result.exit_code == 0, result.output
|
|
1710
|
+
assert captured["genes"] == frozenset({"MTHFR"})
|
|
1711
|
+
assert captured["rsids"] == frozenset()
|
|
1712
|
+
|
|
1713
|
+
def test_analyze_mixed_or_combination(self, mock_mhg_path, tmp_path, monkeypatch):
|
|
1714
|
+
captured: dict = {}
|
|
1715
|
+
|
|
1716
|
+
def fake_run(**kwargs):
|
|
1717
|
+
captured.update(kwargs)
|
|
1718
|
+
|
|
1719
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1720
|
+
f = tmp_path / "filter.txt"
|
|
1721
|
+
f.write_text("rs1801133\nCOMT\n")
|
|
1722
|
+
runner = CliRunner()
|
|
1723
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
|
|
1724
|
+
assert result.exit_code == 0, result.output
|
|
1725
|
+
assert captured["genes"] == frozenset({"COMT"})
|
|
1726
|
+
assert captured["rsids"] == frozenset({"rs1801133"})
|
|
1727
|
+
|
|
1728
|
+
def test_analyze_empty_filter_passes_empty_sets(self, mock_mhg_path, tmp_path, monkeypatch):
|
|
1729
|
+
"""Empty filter file (only comments/blanks) threads empty frozensets through.
|
|
1730
|
+
|
|
1731
|
+
The empty-set → match-nothing semantic on AnalysisResult.filter()
|
|
1732
|
+
is covered by a direct unit test in tests/test_pipeline_filter.py;
|
|
1733
|
+
here we verify only that the CLI layer forwards empty frozensets,
|
|
1734
|
+
not None.
|
|
1735
|
+
"""
|
|
1736
|
+
captured: dict = {}
|
|
1737
|
+
|
|
1738
|
+
def fake_run(**kwargs):
|
|
1739
|
+
captured.update(kwargs)
|
|
1740
|
+
|
|
1741
|
+
monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
|
|
1742
|
+
f = tmp_path / "filter.txt"
|
|
1743
|
+
f.write_text("# only comments\n\n")
|
|
1744
|
+
runner = CliRunner()
|
|
1745
|
+
result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
|
|
1746
|
+
assert result.exit_code == 0, result.output
|
|
1747
|
+
assert captured["genes"] == frozenset()
|
|
1748
|
+
assert captured["rsids"] == frozenset()
|
|
1749
|
+
|
|
1750
|
+
def test_analyze_filter_file_nonexistent_path_errors(self, mock_mhg_path):
|
|
1751
|
+
runner = CliRunner()
|
|
1752
|
+
result = runner.invoke(
|
|
1753
|
+
main,
|
|
1754
|
+
["analyze", str(mock_mhg_path), "--filter-file", "/does/not/exist.txt"],
|
|
1755
|
+
)
|
|
1756
|
+
assert result.exit_code != 0
|
|
1757
|
+
|
|
1758
|
+
def test_methylation_does_not_have_filter_file(self, mock_mhg_path):
|
|
1759
|
+
runner = CliRunner()
|
|
1760
|
+
result = runner.invoke(
|
|
1761
|
+
main,
|
|
1762
|
+
["methylation", str(mock_mhg_path), "--filter-file", "/tmp/x.txt"],
|
|
1763
|
+
)
|
|
1764
|
+
assert result.exit_code != 0
|
|
1765
|
+
assert "no such option" in result.output.lower()
|
|
1766
|
+
|
|
1767
|
+
def test_pharmacogenomics_does_not_have_filter_file(self, mock_mhg_path):
|
|
1768
|
+
runner = CliRunner()
|
|
1769
|
+
result = runner.invoke(
|
|
1770
|
+
main,
|
|
1771
|
+
["pharmacogenomics", str(mock_mhg_path), "--filter-file", "/tmp/x.txt"],
|
|
1772
|
+
)
|
|
1773
|
+
assert result.exit_code != 0
|
|
1774
|
+
assert "no such option" in result.output.lower()
|
|
1775
|
+
|
|
1776
|
+
|
|
1530
1777
|
class TestHighValueNoCalls:
|
|
1531
1778
|
def test_stats_flags_dpyd_no_call(self, mock_mhg_path):
|
|
1532
1779
|
"""The MHG fixture has rs3918290 (DPYD) as a no-call; stats should flag it."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|