allelix 1.8.2__tar.gz → 1.8.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {allelix-1.8.2 → allelix-1.8.4}/PKG-INFO +17 -45
  2. {allelix-1.8.2 → allelix-1.8.4}/README.md +16 -44
  3. {allelix-1.8.2 → allelix-1.8.4}/allelix/cli.py +22 -5
  4. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/PKG-INFO +17 -45
  5. {allelix-1.8.2 → allelix-1.8.4}/pyproject.toml +1 -1
  6. {allelix-1.8.2 → allelix-1.8.4}/tests/test_cli.py +52 -0
  7. {allelix-1.8.2 → allelix-1.8.4}/LICENSE +0 -0
  8. {allelix-1.8.2 → allelix-1.8.4}/allelix/__init__.py +0 -0
  9. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/__init__.py +0 -0
  10. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/alphamissense.py +0 -0
  11. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/base.py +0 -0
  12. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/cadd.py +0 -0
  13. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/clinvar.py +0 -0
  14. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/gnomad.py +0 -0
  15. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/gwas.py +0 -0
  16. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/pharmgkb.py +0 -0
  17. {allelix-1.8.2 → allelix-1.8.4}/allelix/annotators/snpedia.py +0 -0
  18. {allelix-1.8.2 → allelix-1.8.4}/allelix/compare.py +0 -0
  19. {allelix-1.8.2 → allelix-1.8.4}/allelix/config.py +0 -0
  20. {allelix-1.8.2 → allelix-1.8.4}/allelix/data/__init__.py +0 -0
  21. {allelix-1.8.2 → allelix-1.8.4}/allelix/data/high_value_snps.yaml +0 -0
  22. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/__init__.py +0 -0
  23. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/_versions.py +0 -0
  24. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/alphamissense_loader.py +0 -0
  25. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/cadd_loader.py +0 -0
  26. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/cpic_loader.py +0 -0
  27. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/gnomad_loader.py +0 -0
  28. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/gwas_loader.py +0 -0
  29. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/loader_utils.py +0 -0
  30. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/manager.py +0 -0
  31. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/pharmgkb_loader.py +0 -0
  32. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/schema.py +0 -0
  33. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/snpedia_loader.py +0 -0
  34. {allelix-1.8.2 → allelix-1.8.4}/allelix/databases/snpedia_parser.py +0 -0
  35. {allelix-1.8.2 → allelix-1.8.4}/allelix/exporters/__init__.py +0 -0
  36. {allelix-1.8.2 → allelix-1.8.4}/allelix/exporters/plink.py +0 -0
  37. {allelix-1.8.2 → allelix-1.8.4}/allelix/models.py +0 -0
  38. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/__init__.py +0 -0
  39. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/_helpers.py +0 -0
  40. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/ancestrydna.py +0 -0
  41. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/base.py +0 -0
  42. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/ftdna.py +0 -0
  43. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/livingdna.py +0 -0
  44. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/myhappygenes.py +0 -0
  45. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/myheritage.py +0 -0
  46. {allelix-1.8.2 → allelix-1.8.4}/allelix/parsers/twentythreeandme.py +0 -0
  47. {allelix-1.8.2 → allelix-1.8.4}/allelix/py.typed +0 -0
  48. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/__init__.py +0 -0
  49. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/_pipeline.py +0 -0
  50. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/diff.py +0 -0
  51. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/high_value.py +0 -0
  52. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/html.py +0 -0
  53. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/json_report.py +0 -0
  54. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/methylation.py +0 -0
  55. {allelix-1.8.2 → allelix-1.8.4}/allelix/reports/terminal.py +0 -0
  56. {allelix-1.8.2 → allelix-1.8.4}/allelix/utils/__init__.py +0 -0
  57. {allelix-1.8.2 → allelix-1.8.4}/allelix/utils/allele.py +0 -0
  58. {allelix-1.8.2 → allelix-1.8.4}/allelix/utils/build_detect.py +0 -0
  59. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/SOURCES.txt +0 -0
  60. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/dependency_links.txt +0 -0
  61. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/entry_points.txt +0 -0
  62. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/requires.txt +0 -0
  63. {allelix-1.8.2 → allelix-1.8.4}/allelix.egg-info/top_level.txt +0 -0
  64. {allelix-1.8.2 → allelix-1.8.4}/setup.cfg +0 -0
  65. {allelix-1.8.2 → allelix-1.8.4}/tests/test_cli_helpers.py +0 -0
  66. {allelix-1.8.2 → allelix-1.8.4}/tests/test_compare.py +0 -0
  67. {allelix-1.8.2 → allelix-1.8.4}/tests/test_config.py +0 -0
  68. {allelix-1.8.2 → allelix-1.8.4}/tests/test_end_to_end.py +0 -0
  69. {allelix-1.8.2 → allelix-1.8.4}/tests/test_mock_data_invariants.py +0 -0
  70. {allelix-1.8.2 → allelix-1.8.4}/tests/test_models.py +0 -0
  71. {allelix-1.8.2 → allelix-1.8.4}/tests/test_registry.py +0 -0
  72. {allelix-1.8.2 → allelix-1.8.4}/tests/test_version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 1.8.2
3
+ Version: 1.8.4
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author-email: dial481 <dial481@users.noreply.github.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -44,55 +44,27 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
44
44
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
45
45
  > commands, report diffing, persistent config with commercial-mode
46
46
  > safety switch. Build auto-detection from position data (ADR-0021).
47
- > No regex on prose anywhere in production. **Latest: v1.8.2** — HTML
48
- > report redesign with dark mode, PLINK export, and automated PyPI
49
- > publishing. Release notes:
50
- > [`CHANGELOG.md`](CHANGELOG.md).
47
+ > No regex on prose anywhere in production. **Latest: v1.8.4** —
48
+ > `--no-cadd` flag for licensing exclusion parity.
49
+ > Release notes:
50
+ > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
51
51
 
52
52
  ## Quickstart
53
53
 
54
- Requires Python 3.11+.
55
-
56
54
  ```bash
57
- git clone https://github.com/dial481/allelix
58
- cd allelix
59
- python -m venv .venv
60
- source .venv/bin/activate
61
- pip install -e ".[dev]"
62
-
63
- # Generate a synthetic test fixture
64
- python tests/generate_mock_data.py
65
-
66
- # Show summary statistics for a genotype file
67
- allelix stats tests/fixtures/mock_myhappygenes.txt
55
+ pip install allelix
68
56
 
69
- # Download reference databases. First run downloads all sources (~15GB
70
- # on disk with gnomAD + AlphaMissense). Use --no-gnomad / --no-alphamissense
71
- # to skip the large enrichment databases. Re-runs skip unchanged sources.
57
+ # Download reference databases (~15GB with all sources).
58
+ # Use --no-gnomad / --no-alphamissense to skip the large ones.
72
59
  # CADD is opt-in: allelix db update --cadd
73
60
  allelix db update
74
- allelix db status # see what's cached
75
61
 
76
- # Analyze a genotype file against all ready databases
77
- allelix analyze tests/fixtures/mock_myhappygenes.txt --min-magnitude 5
78
-
79
- # Same data, focused subsets
80
- allelix methylation tests/fixtures/mock_myhappygenes.txt
81
- allelix pharmacogenomics tests/fixtures/mock_myhappygenes.txt
82
-
83
- # Compare two genotype files (coverage, concordance, strand-flip detection)
84
- allelix compare file1.txt file2.txt
85
-
86
- # Export to PLINK1 binary format (.bed/.bim/.fam) for plink2, ADMIXTURE, PRSice
87
- # Expect ~60% monomorphic markers (A2=0) — genotyping chips probe many
88
- # intronic/intergenic sites outside gnomAD's exome coverage.
89
- allelix export plink genotype_file.txt -o output_prefix --build grch37
90
-
91
- # Output to a self-contained HTML or JSON report
92
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.html
93
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.json
62
+ # Analyze a genotype file
63
+ allelix analyze your_genotype_file.txt --output report.html
94
64
  ```
95
65
 
66
+ Requires Python 3.11+. See [Development](#development) for source installs and running tests.
67
+
96
68
  ## Supported Formats
97
69
 
98
70
  | Format | Status | Notes |
@@ -126,7 +98,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
126
98
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
127
99
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
128
100
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
129
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
101
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
130
102
 
131
103
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
132
104
 
@@ -183,7 +155,7 @@ allelix config set license.commercial true
183
155
  allelix config set license.cadd true
184
156
  ```
185
157
 
186
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
158
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
187
159
 
188
160
  ### Database sizes and download times
189
161
 
@@ -215,7 +187,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
215
187
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
216
188
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
217
189
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
218
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
190
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
219
191
 
220
192
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
221
193
 
@@ -246,7 +218,7 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
246
218
 
247
219
  ## Architecture & Design Decisions
248
220
 
249
- The "why" behind major design choices lives in [`docs/adr/`](docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
221
+ The "why" behind major design choices lives in [`docs/adr/`](https://github.com/dial481/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
250
222
 
251
223
  Notable load-bearing ADRs:
252
224
 
@@ -256,7 +228,7 @@ Notable load-bearing ADRs:
256
228
  - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
257
229
  - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
258
230
 
259
- Release history: see [`CHANGELOG.md`](CHANGELOG.md).
231
+ Release history: see [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
260
232
 
261
233
  ## Development
262
234
 
@@ -10,55 +10,27 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
10
10
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
11
11
  > commands, report diffing, persistent config with commercial-mode
12
12
  > safety switch. Build auto-detection from position data (ADR-0021).
13
- > No regex on prose anywhere in production. **Latest: v1.8.2** — HTML
14
- > report redesign with dark mode, PLINK export, and automated PyPI
15
- > publishing. Release notes:
16
- > [`CHANGELOG.md`](CHANGELOG.md).
13
+ > No regex on prose anywhere in production. **Latest: v1.8.4** —
14
+ > `--no-cadd` flag for licensing exclusion parity.
15
+ > Release notes:
16
+ > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
17
17
 
18
18
  ## Quickstart
19
19
 
20
- Requires Python 3.11+.
21
-
22
20
  ```bash
23
- git clone https://github.com/dial481/allelix
24
- cd allelix
25
- python -m venv .venv
26
- source .venv/bin/activate
27
- pip install -e ".[dev]"
28
-
29
- # Generate a synthetic test fixture
30
- python tests/generate_mock_data.py
31
-
32
- # Show summary statistics for a genotype file
33
- allelix stats tests/fixtures/mock_myhappygenes.txt
21
+ pip install allelix
34
22
 
35
- # Download reference databases. First run downloads all sources (~15GB
36
- # on disk with gnomAD + AlphaMissense). Use --no-gnomad / --no-alphamissense
37
- # to skip the large enrichment databases. Re-runs skip unchanged sources.
23
+ # Download reference databases (~15GB with all sources).
24
+ # Use --no-gnomad / --no-alphamissense to skip the large ones.
38
25
  # CADD is opt-in: allelix db update --cadd
39
26
  allelix db update
40
- allelix db status # see what's cached
41
27
 
42
- # Analyze a genotype file against all ready databases
43
- allelix analyze tests/fixtures/mock_myhappygenes.txt --min-magnitude 5
44
-
45
- # Same data, focused subsets
46
- allelix methylation tests/fixtures/mock_myhappygenes.txt
47
- allelix pharmacogenomics tests/fixtures/mock_myhappygenes.txt
48
-
49
- # Compare two genotype files (coverage, concordance, strand-flip detection)
50
- allelix compare file1.txt file2.txt
51
-
52
- # Export to PLINK1 binary format (.bed/.bim/.fam) for plink2, ADMIXTURE, PRSice
53
- # Expect ~60% monomorphic markers (A2=0) — genotyping chips probe many
54
- # intronic/intergenic sites outside gnomAD's exome coverage.
55
- allelix export plink genotype_file.txt -o output_prefix --build grch37
56
-
57
- # Output to a self-contained HTML or JSON report
58
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.html
59
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.json
28
+ # Analyze a genotype file
29
+ allelix analyze your_genotype_file.txt --output report.html
60
30
  ```
61
31
 
32
+ Requires Python 3.11+. See [Development](#development) for source installs and running tests.
33
+
62
34
  ## Supported Formats
63
35
 
64
36
  | Format | Status | Notes |
@@ -92,7 +64,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
92
64
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
93
65
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
94
66
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
95
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
67
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
96
68
 
97
69
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
98
70
 
@@ -149,7 +121,7 @@ allelix config set license.commercial true
149
121
  allelix config set license.cadd true
150
122
  ```
151
123
 
152
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
124
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
153
125
 
154
126
  ### Database sizes and download times
155
127
 
@@ -181,7 +153,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
181
153
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
182
154
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
183
155
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
184
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
156
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
185
157
 
186
158
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
187
159
 
@@ -212,7 +184,7 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
212
184
 
213
185
  ## Architecture & Design Decisions
214
186
 
215
- The "why" behind major design choices lives in [`docs/adr/`](docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
187
+ The "why" behind major design choices lives in [`docs/adr/`](https://github.com/dial481/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
216
188
 
217
189
  Notable load-bearing ADRs:
218
190
 
@@ -222,7 +194,7 @@ Notable load-bearing ADRs:
222
194
  - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
223
195
  - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
224
196
 
225
- Release history: see [`CHANGELOG.md`](CHANGELOG.md).
197
+ Release history: see [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
226
198
 
227
199
  ## Development
228
200
 
@@ -214,6 +214,7 @@ def _run_analysis_command(
214
214
  no_update: bool = False,
215
215
  no_gnomad: bool = False,
216
216
  no_alphamissense: bool = False,
217
+ no_cadd: bool = False,
217
218
  ) -> None:
218
219
  resolved = resolve_data_dir(data_dir)
219
220
  if not no_update:
@@ -256,12 +257,13 @@ def _run_analysis_command(
256
257
  ready = [a for a in ready if a.name != "alphamissense"]
257
258
 
258
259
  cadd_annotator = None
259
- from allelix.annotators.cadd import CaddAnnotator
260
+ if not no_cadd:
261
+ from allelix.annotators.cadd import CaddAnnotator
260
262
 
261
- for a in ready:
262
- if isinstance(a, CaddAnnotator):
263
- cadd_annotator = a
264
- break
263
+ for a in ready:
264
+ if isinstance(a, CaddAnnotator):
265
+ cadd_annotator = a
266
+ break
265
267
  ready = [a for a in ready if a.name != "cadd"]
266
268
 
267
269
  if not_ready:
@@ -576,6 +578,12 @@ _NO_ALPHAMISSENSE_OPT = click.option(
576
578
  default=False,
577
579
  help="Skip AlphaMissense variant pathogenicity enrichment.",
578
580
  )
581
+ _NO_CADD_OPT = click.option(
582
+ "--no-cadd",
583
+ is_flag=True,
584
+ default=False,
585
+ help="Skip CADD deleteriousness score enrichment.",
586
+ )
579
587
  _BUILD_OPT = click.option(
580
588
  "--build",
581
589
  type=click.Choice(["grch37", "grch38", "auto"], case_sensitive=False),
@@ -672,6 +680,7 @@ def _emit_build_diagnostics(result: object) -> None:
672
680
  @_NO_UPDATE_OPT
673
681
  @_NO_GNOMAD_OPT
674
682
  @_NO_ALPHAMISSENSE_OPT
683
+ @_NO_CADD_OPT
675
684
  def analyze(
676
685
  file_path: Path,
677
686
  fmt: str | None,
@@ -690,6 +699,7 @@ def analyze(
690
699
  no_update: bool,
691
700
  no_gnomad: bool,
692
701
  no_alphamissense: bool,
702
+ no_cadd: bool,
693
703
  ) -> None:
694
704
  """Annotate a genotype file against all ready reference databases."""
695
705
  _run_analysis_command(
@@ -711,6 +721,7 @@ def analyze(
711
721
  no_update=no_update,
712
722
  no_gnomad=no_gnomad,
713
723
  no_alphamissense=no_alphamissense,
724
+ no_cadd=no_cadd,
714
725
  )
715
726
 
716
727
 
@@ -868,6 +879,7 @@ def compare(file1: Path, file2: Path, fmt1: str | None, fmt2: str | None) -> Non
868
879
  @_NO_UPDATE_OPT
869
880
  @_NO_GNOMAD_OPT
870
881
  @_NO_ALPHAMISSENSE_OPT
882
+ @_NO_CADD_OPT
871
883
  def methylation(
872
884
  file_path: Path,
873
885
  fmt: str | None,
@@ -886,6 +898,7 @@ def methylation(
886
898
  no_update: bool,
887
899
  no_gnomad: bool,
888
900
  no_alphamissense: bool,
901
+ no_cadd: bool,
889
902
  ) -> None:
890
903
  """Methylation-pathway-focused report (MTHFR, MTR, MTRR, COMT, CBS, …)."""
891
904
  excluded: set[str] = set()
@@ -912,6 +925,7 @@ def methylation(
912
925
  no_update=no_update,
913
926
  no_gnomad=no_gnomad,
914
927
  no_alphamissense=no_alphamissense,
928
+ no_cadd=no_cadd,
915
929
  )
916
930
 
917
931
 
@@ -933,6 +947,7 @@ def methylation(
933
947
  @_NO_UPDATE_OPT
934
948
  @_NO_GNOMAD_OPT
935
949
  @_NO_ALPHAMISSENSE_OPT
950
+ @_NO_CADD_OPT
936
951
  def pharmacogenomics(
937
952
  file_path: Path,
938
953
  fmt: str | None,
@@ -951,6 +966,7 @@ def pharmacogenomics(
951
966
  no_update: bool,
952
967
  no_gnomad: bool,
953
968
  no_alphamissense: bool,
969
+ no_cadd: bool,
954
970
  ) -> None:
955
971
  """Pharmacogenomics-focused report (annotations from PharmGKB-style sources)."""
956
972
  excluded: set[str] = set()
@@ -977,6 +993,7 @@ def pharmacogenomics(
977
993
  no_update=no_update,
978
994
  no_gnomad=no_gnomad,
979
995
  no_alphamissense=no_alphamissense,
996
+ no_cadd=no_cadd,
980
997
  )
981
998
 
982
999
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 1.8.2
3
+ Version: 1.8.4
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author-email: dial481 <dial481@users.noreply.github.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -44,55 +44,27 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
44
44
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
45
45
  > commands, report diffing, persistent config with commercial-mode
46
46
  > safety switch. Build auto-detection from position data (ADR-0021).
47
- > No regex on prose anywhere in production. **Latest: v1.8.2** — HTML
48
- > report redesign with dark mode, PLINK export, and automated PyPI
49
- > publishing. Release notes:
50
- > [`CHANGELOG.md`](CHANGELOG.md).
47
+ > No regex on prose anywhere in production. **Latest: v1.8.4** —
48
+ > `--no-cadd` flag for licensing exclusion parity.
49
+ > Release notes:
50
+ > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
51
51
 
52
52
  ## Quickstart
53
53
 
54
- Requires Python 3.11+.
55
-
56
54
  ```bash
57
- git clone https://github.com/dial481/allelix
58
- cd allelix
59
- python -m venv .venv
60
- source .venv/bin/activate
61
- pip install -e ".[dev]"
62
-
63
- # Generate a synthetic test fixture
64
- python tests/generate_mock_data.py
65
-
66
- # Show summary statistics for a genotype file
67
- allelix stats tests/fixtures/mock_myhappygenes.txt
55
+ pip install allelix
68
56
 
69
- # Download reference databases. First run downloads all sources (~15GB
70
- # on disk with gnomAD + AlphaMissense). Use --no-gnomad / --no-alphamissense
71
- # to skip the large enrichment databases. Re-runs skip unchanged sources.
57
+ # Download reference databases (~15GB with all sources).
58
+ # Use --no-gnomad / --no-alphamissense to skip the large ones.
72
59
  # CADD is opt-in: allelix db update --cadd
73
60
  allelix db update
74
- allelix db status # see what's cached
75
61
 
76
- # Analyze a genotype file against all ready databases
77
- allelix analyze tests/fixtures/mock_myhappygenes.txt --min-magnitude 5
78
-
79
- # Same data, focused subsets
80
- allelix methylation tests/fixtures/mock_myhappygenes.txt
81
- allelix pharmacogenomics tests/fixtures/mock_myhappygenes.txt
82
-
83
- # Compare two genotype files (coverage, concordance, strand-flip detection)
84
- allelix compare file1.txt file2.txt
85
-
86
- # Export to PLINK1 binary format (.bed/.bim/.fam) for plink2, ADMIXTURE, PRSice
87
- # Expect ~60% monomorphic markers (A2=0) — genotyping chips probe many
88
- # intronic/intergenic sites outside gnomAD's exome coverage.
89
- allelix export plink genotype_file.txt -o output_prefix --build grch37
90
-
91
- # Output to a self-contained HTML or JSON report
92
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.html
93
- allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.json
62
+ # Analyze a genotype file
63
+ allelix analyze your_genotype_file.txt --output report.html
94
64
  ```
95
65
 
66
+ Requires Python 3.11+. See [Development](#development) for source installs and running tests.
67
+
96
68
  ## Supported Formats
97
69
 
98
70
  | Format | Status | Notes |
@@ -126,7 +98,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
126
98
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
127
99
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
128
100
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
129
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
101
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
130
102
 
131
103
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
132
104
 
@@ -183,7 +155,7 @@ allelix config set license.commercial true
183
155
  allelix config set license.cadd true
184
156
  ```
185
157
 
186
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
158
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
187
159
 
188
160
  ### Database sizes and download times
189
161
 
@@ -215,7 +187,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
215
187
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
216
188
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
217
189
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
218
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
190
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
219
191
 
220
192
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
221
193
 
@@ -246,7 +218,7 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
246
218
 
247
219
  ## Architecture & Design Decisions
248
220
 
249
- The "why" behind major design choices lives in [`docs/adr/`](docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
221
+ The "why" behind major design choices lives in [`docs/adr/`](https://github.com/dial481/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
250
222
 
251
223
  Notable load-bearing ADRs:
252
224
 
@@ -256,7 +228,7 @@ Notable load-bearing ADRs:
256
228
  - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
257
229
  - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
258
230
 
259
- Release history: see [`CHANGELOG.md`](CHANGELOG.md).
231
+ Release history: see [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
260
232
 
261
233
  ## Development
262
234
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "allelix"
7
- version = "1.8.2"
7
+ version = "1.8.4"
8
8
  description = "Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -1527,6 +1527,58 @@ class TestExcludeSnpedia:
1527
1527
  assert captured["exclude_sources"] == frozenset({"snpedia", "gwas"})
1528
1528
 
1529
1529
 
1530
+ class TestNoCaddFlag:
1531
+ """--no-cadd wires through to no_cadd on all three analysis commands."""
1532
+
1533
+ def test_analyze_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1534
+ captured: dict = {}
1535
+
1536
+ def fake_run(**kwargs):
1537
+ captured.update(kwargs)
1538
+
1539
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1540
+ runner = CliRunner()
1541
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--no-cadd"])
1542
+ assert result.exit_code == 0, result.output
1543
+ assert captured["no_cadd"] is True
1544
+
1545
+ def test_analyze_default_no_cadd_false(self, mock_mhg_path, monkeypatch):
1546
+ captured: dict = {}
1547
+
1548
+ def fake_run(**kwargs):
1549
+ captured.update(kwargs)
1550
+
1551
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1552
+ runner = CliRunner()
1553
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path)])
1554
+ assert result.exit_code == 0, result.output
1555
+ assert captured["no_cadd"] is False
1556
+
1557
+ def test_methylation_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1558
+ captured: dict = {}
1559
+
1560
+ def fake_run(**kwargs):
1561
+ captured.update(kwargs)
1562
+
1563
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1564
+ runner = CliRunner()
1565
+ result = runner.invoke(main, ["methylation", str(mock_mhg_path), "--no-cadd"])
1566
+ assert result.exit_code == 0, result.output
1567
+ assert captured["no_cadd"] is True
1568
+
1569
+ def test_pharmacogenomics_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1570
+ captured: dict = {}
1571
+
1572
+ def fake_run(**kwargs):
1573
+ captured.update(kwargs)
1574
+
1575
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1576
+ runner = CliRunner()
1577
+ result = runner.invoke(main, ["pharmacogenomics", str(mock_mhg_path), "--no-cadd"])
1578
+ assert result.exit_code == 0, result.output
1579
+ assert captured["no_cadd"] is True
1580
+
1581
+
1530
1582
  class TestHighValueNoCalls:
1531
1583
  def test_stats_flags_dpyd_no_call(self, mock_mhg_path):
1532
1584
  """The MHG fixture has rs3918290 (DPYD) as a no-call; stats should flag it."""
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes