allelix 1.8.3.tar.gz → 1.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {allelix-1.8.3 → allelix-1.9.0}/PKG-INFO +9 -6
  2. {allelix-1.8.3 → allelix-1.9.0}/README.md +8 -5
  3. {allelix-1.8.3 → allelix-1.9.0}/allelix/cli.py +78 -6
  4. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/_pipeline.py +13 -3
  5. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/html.py +2 -0
  6. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/json_report.py +4 -1
  7. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/terminal.py +2 -0
  8. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/PKG-INFO +9 -6
  9. {allelix-1.8.3 → allelix-1.9.0}/pyproject.toml +1 -1
  10. {allelix-1.8.3 → allelix-1.9.0}/tests/test_cli.py +247 -0
  11. {allelix-1.8.3 → allelix-1.9.0}/LICENSE +0 -0
  12. {allelix-1.8.3 → allelix-1.9.0}/allelix/__init__.py +0 -0
  13. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/__init__.py +0 -0
  14. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/alphamissense.py +0 -0
  15. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/base.py +0 -0
  16. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/cadd.py +0 -0
  17. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/clinvar.py +0 -0
  18. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/gnomad.py +0 -0
  19. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/gwas.py +0 -0
  20. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/pharmgkb.py +0 -0
  21. {allelix-1.8.3 → allelix-1.9.0}/allelix/annotators/snpedia.py +0 -0
  22. {allelix-1.8.3 → allelix-1.9.0}/allelix/compare.py +0 -0
  23. {allelix-1.8.3 → allelix-1.9.0}/allelix/config.py +0 -0
  24. {allelix-1.8.3 → allelix-1.9.0}/allelix/data/__init__.py +0 -0
  25. {allelix-1.8.3 → allelix-1.9.0}/allelix/data/high_value_snps.yaml +0 -0
  26. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/__init__.py +0 -0
  27. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/_versions.py +0 -0
  28. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/alphamissense_loader.py +0 -0
  29. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/cadd_loader.py +0 -0
  30. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/cpic_loader.py +0 -0
  31. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/gnomad_loader.py +0 -0
  32. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/gwas_loader.py +0 -0
  33. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/loader_utils.py +0 -0
  34. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/manager.py +0 -0
  35. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/pharmgkb_loader.py +0 -0
  36. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/schema.py +0 -0
  37. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/snpedia_loader.py +0 -0
  38. {allelix-1.8.3 → allelix-1.9.0}/allelix/databases/snpedia_parser.py +0 -0
  39. {allelix-1.8.3 → allelix-1.9.0}/allelix/exporters/__init__.py +0 -0
  40. {allelix-1.8.3 → allelix-1.9.0}/allelix/exporters/plink.py +0 -0
  41. {allelix-1.8.3 → allelix-1.9.0}/allelix/models.py +0 -0
  42. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/__init__.py +0 -0
  43. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/_helpers.py +0 -0
  44. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/ancestrydna.py +0 -0
  45. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/base.py +0 -0
  46. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/ftdna.py +0 -0
  47. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/livingdna.py +0 -0
  48. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/myhappygenes.py +0 -0
  49. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/myheritage.py +0 -0
  50. {allelix-1.8.3 → allelix-1.9.0}/allelix/parsers/twentythreeandme.py +0 -0
  51. {allelix-1.8.3 → allelix-1.9.0}/allelix/py.typed +0 -0
  52. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/__init__.py +0 -0
  53. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/diff.py +0 -0
  54. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/high_value.py +0 -0
  55. {allelix-1.8.3 → allelix-1.9.0}/allelix/reports/methylation.py +0 -0
  56. {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/__init__.py +0 -0
  57. {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/allele.py +0 -0
  58. {allelix-1.8.3 → allelix-1.9.0}/allelix/utils/build_detect.py +0 -0
  59. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/SOURCES.txt +0 -0
  60. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/dependency_links.txt +0 -0
  61. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/entry_points.txt +0 -0
  62. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/requires.txt +0 -0
  63. {allelix-1.8.3 → allelix-1.9.0}/allelix.egg-info/top_level.txt +0 -0
  64. {allelix-1.8.3 → allelix-1.9.0}/setup.cfg +0 -0
  65. {allelix-1.8.3 → allelix-1.9.0}/tests/test_cli_helpers.py +0 -0
  66. {allelix-1.8.3 → allelix-1.9.0}/tests/test_compare.py +0 -0
  67. {allelix-1.8.3 → allelix-1.9.0}/tests/test_config.py +0 -0
  68. {allelix-1.8.3 → allelix-1.9.0}/tests/test_end_to_end.py +0 -0
  69. {allelix-1.8.3 → allelix-1.9.0}/tests/test_mock_data_invariants.py +0 -0
  70. {allelix-1.8.3 → allelix-1.9.0}/tests/test_models.py +0 -0
  71. {allelix-1.8.3 → allelix-1.9.0}/tests/test_registry.py +0 -0
  72. {allelix-1.8.3 → allelix-1.9.0}/tests/test_version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 1.8.3
3
+ Version: 1.9.0
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author-email: dial481 <dial481@users.noreply.github.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -44,8 +44,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
44
44
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
45
45
  > commands, report diffing, persistent config with commercial-mode
46
46
  > safety switch. Build auto-detection from position data (ADR-0021).
47
- > No regex on prose anywhere in production. **Latest: v1.8.3** —
48
- > pip install quickstart, workflow hardening, PyPI link fix.
47
+ > No regex on prose anywhere in production. **Latest: v1.9.0** —
48
+ > `--filter-file` flag for custom-panel filtering on `analyze`.
49
49
  > Release notes:
50
50
  > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
51
51
 
@@ -61,6 +61,9 @@ allelix db update
61
61
 
62
62
  # Analyze a genotype file
63
63
  allelix analyze your_genotype_file.txt --output report.html
64
+
65
+ # Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
66
+ allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
64
67
  ```
65
68
 
66
69
  Requires Python 3.11+. See [Development](#development) for source installs and running tests.
@@ -98,7 +101,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
98
101
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
99
102
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
100
103
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
101
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
104
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
102
105
 
103
106
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
104
107
 
@@ -155,7 +158,7 @@ allelix config set license.commercial true
155
158
  allelix config set license.cadd true
156
159
  ```
157
160
 
158
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
161
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
159
162
 
160
163
  ### Database sizes and download times
161
164
 
@@ -187,7 +190,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
187
190
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
188
191
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
189
192
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
190
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
193
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
191
194
 
192
195
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
193
196
 
@@ -10,8 +10,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
10
10
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
11
11
  > commands, report diffing, persistent config with commercial-mode
12
12
  > safety switch. Build auto-detection from position data (ADR-0021).
13
- > No regex on prose anywhere in production. **Latest: v1.8.3** —
14
- > pip install quickstart, workflow hardening, PyPI link fix.
13
+ > No regex on prose anywhere in production. **Latest: v1.9.0** —
14
+ > `--filter-file` flag for custom-panel filtering on `analyze`.
15
15
  > Release notes:
16
16
  > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
17
17
 
@@ -27,6 +27,9 @@ allelix db update
27
27
 
28
28
  # Analyze a genotype file
29
29
  allelix analyze your_genotype_file.txt --output report.html
30
+
31
+ # Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
32
+ allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
30
33
  ```
31
34
 
32
35
  Requires Python 3.11+. See [Development](#development) for source installs and running tests.
@@ -64,7 +67,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
64
67
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
65
68
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
66
69
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
67
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
70
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
68
71
 
69
72
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
70
73
 
@@ -121,7 +124,7 @@ allelix config set license.commercial true
121
124
  allelix config set license.cadd true
122
125
  ```
123
126
 
124
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
127
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
125
128
 
126
129
  ### Database sizes and download times
127
130
 
@@ -153,7 +156,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
153
156
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
154
157
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
155
158
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
156
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
159
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
157
160
 
158
161
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
159
162
 
@@ -5,6 +5,7 @@
5
5
  from __future__ import annotations
6
6
 
7
7
  import logging
8
+ import re
8
9
  import sys
9
10
  import time
10
11
  from pathlib import Path
@@ -195,6 +196,35 @@ def _format_from_path(output: Path, override: str | None) -> str:
195
196
  )
196
197
 
197
198
 
199
+ _RSID_PATTERN = re.compile(r"^rs\d+$", re.IGNORECASE)
200
+
201
+
202
+ def _parse_filter_file(path: Path) -> tuple[frozenset[str], frozenset[str]]:
203
+ r"""Parse a filter file into ``(gene_names, rsids)``.
204
+
205
+ Lines matching ``^rs\d+$`` (case-insensitive) are rsIDs. Everything
206
+ else is a gene name. Lines starting with ``#`` and blank lines are
207
+ ignored. Gene names starting with ``RS`` (e.g., RSPO1, RSF1) are
208
+ correctly classified as gene names, not rsIDs.
209
+
210
+ Input is case-tolerant; output is canonical: rsIDs are normalized to
211
+ lowercase (``rs1801133``), gene names to uppercase (``MTHFR``). The
212
+ filter recorded in JSON output therefore looks identical regardless
213
+ of how the user typed the entries in the filter file.
214
+ """
215
+ genes: set[str] = set()
216
+ rsids: set[str] = set()
217
+ for raw in path.read_text().splitlines():
218
+ line = raw.strip()
219
+ if not line or line.startswith("#"):
220
+ continue
221
+ if _RSID_PATTERN.match(line):
222
+ rsids.add(line.lower())
223
+ else:
224
+ genes.add(line.upper())
225
+ return frozenset(genes), frozenset(rsids)
226
+
227
+
198
228
  def _run_analysis_command(
199
229
  file_path: Path,
200
230
  fmt: str | None,
@@ -204,6 +234,7 @@ def _run_analysis_command(
204
234
  min_magnitude: float,
205
235
  category: str | None,
206
236
  genes: frozenset[str] | None,
237
+ rsids: frozenset[str] | None = None,
207
238
  build: str | None = None,
208
239
  include_benign: bool = False,
209
240
  gwas_min_magnitude: float | None = None,
@@ -214,6 +245,7 @@ def _run_analysis_command(
214
245
  no_update: bool = False,
215
246
  no_gnomad: bool = False,
216
247
  no_alphamissense: bool = False,
248
+ no_cadd: bool = False,
217
249
  ) -> None:
218
250
  resolved = resolve_data_dir(data_dir)
219
251
  if not no_update:
@@ -256,12 +288,13 @@ def _run_analysis_command(
256
288
  ready = [a for a in ready if a.name != "alphamissense"]
257
289
 
258
290
  cadd_annotator = None
259
- from allelix.annotators.cadd import CaddAnnotator
291
+ if not no_cadd:
292
+ from allelix.annotators.cadd import CaddAnnotator
260
293
 
261
- for a in ready:
262
- if isinstance(a, CaddAnnotator):
263
- cadd_annotator = a
264
- break
294
+ for a in ready:
295
+ if isinstance(a, CaddAnnotator):
296
+ cadd_annotator = a
297
+ break
265
298
  ready = [a for a in ready if a.name != "cadd"]
266
299
 
267
300
  if not_ready:
@@ -333,6 +366,7 @@ def _run_analysis_command(
333
366
  min_magnitude=min_magnitude,
334
367
  category=category,
335
368
  genes=genes,
369
+ rsids=rsids,
336
370
  source_min_magnitudes=source_floors,
337
371
  )
338
372
  from allelix.reports._pipeline import rollup_gwas_duplicates
@@ -354,6 +388,7 @@ def _run_analysis_command(
354
388
  min_magnitude=min_magnitude,
355
389
  category=category,
356
390
  genes=genes,
391
+ rsids=rsids,
357
392
  source_min_magnitudes=source_floors,
358
393
  )
359
394
  else:
@@ -371,6 +406,7 @@ def _run_analysis_command(
371
406
  min_magnitude=min_magnitude,
372
407
  category=category,
373
408
  genes=genes,
409
+ rsids=rsids,
374
410
  source_min_magnitudes=source_floors,
375
411
  diff=diff_result,
376
412
  high_value_no_calls=hv_dicts,
@@ -382,6 +418,7 @@ def _run_analysis_command(
382
418
  min_magnitude=min_magnitude,
383
419
  category=category,
384
420
  genes=genes,
421
+ rsids=rsids,
385
422
  source_min_magnitudes=source_floors,
386
423
  diff=diff_result,
387
424
  high_value_no_calls=hv_warning_lines,
@@ -558,6 +595,16 @@ _DIFF_OPT = click.option(
558
595
  "Not a monitoring tool — use for version-to-version validation."
559
596
  ),
560
597
  )
598
+ _FILTER_FILE_OPT = click.option(
599
+ "--filter-file",
600
+ type=click.Path(exists=True, dir_okay=False, path_type=Path),
601
+ default=None,
602
+ help=(
603
+ "Plain text file with rsIDs and/or gene names (one per line) to "
604
+ "filter the report. Lines matching '^rs\\d+$' are rsIDs; everything "
605
+ "else is a gene name. Comments (#) and blank lines are ignored."
606
+ ),
607
+ )
561
608
  _NO_UPDATE_OPT = click.option(
562
609
  "--no-update",
563
610
  is_flag=True,
@@ -576,6 +623,12 @@ _NO_ALPHAMISSENSE_OPT = click.option(
576
623
  default=False,
577
624
  help="Skip AlphaMissense variant pathogenicity enrichment.",
578
625
  )
626
+ _NO_CADD_OPT = click.option(
627
+ "--no-cadd",
628
+ is_flag=True,
629
+ default=False,
630
+ help="Skip CADD deleteriousness score enrichment.",
631
+ )
579
632
  _BUILD_OPT = click.option(
580
633
  "--build",
581
634
  type=click.Choice(["grch37", "grch38", "auto"], case_sensitive=False),
@@ -669,9 +722,11 @@ def _emit_build_diagnostics(result: object) -> None:
669
722
  @_GWAS_ALL_OPT
670
723
  @_EXCLUDE_SNPEDIA_OPT
671
724
  @_DIFF_OPT
725
+ @_FILTER_FILE_OPT
672
726
  @_NO_UPDATE_OPT
673
727
  @_NO_GNOMAD_OPT
674
728
  @_NO_ALPHAMISSENSE_OPT
729
+ @_NO_CADD_OPT
675
730
  def analyze(
676
731
  file_path: Path,
677
732
  fmt: str | None,
@@ -687,11 +742,20 @@ def analyze(
687
742
  gwas_all: bool,
688
743
  exclude_snpedia: bool,
689
744
  diff_path: Path | None,
745
+ filter_file: Path | None,
690
746
  no_update: bool,
691
747
  no_gnomad: bool,
692
748
  no_alphamissense: bool,
749
+ no_cadd: bool,
693
750
  ) -> None:
694
751
  """Annotate a genotype file against all ready reference databases."""
752
+ filter_genes: frozenset[str] | None = None
753
+ filter_rsids: frozenset[str] | None = None
754
+ if filter_file is not None:
755
+ filter_genes, filter_rsids = _parse_filter_file(filter_file)
756
+ # Empty sets (file had only comments/blanks) still apply — they
757
+ # mean "match nothing", producing an empty report.
758
+
695
759
  _run_analysis_command(
696
760
  file_path=file_path,
697
761
  fmt=fmt,
@@ -700,7 +764,8 @@ def analyze(
700
764
  report_format=report_format,
701
765
  min_magnitude=min_magnitude,
702
766
  category=category,
703
- genes=None,
767
+ genes=filter_genes,
768
+ rsids=filter_rsids,
704
769
  build=_normalize_cli_build(build),
705
770
  include_benign=include_benign,
706
771
  gwas_min_magnitude=gwas_min_magnitude,
@@ -711,6 +776,7 @@ def analyze(
711
776
  no_update=no_update,
712
777
  no_gnomad=no_gnomad,
713
778
  no_alphamissense=no_alphamissense,
779
+ no_cadd=no_cadd,
714
780
  )
715
781
 
716
782
 
@@ -868,6 +934,7 @@ def compare(file1: Path, file2: Path, fmt1: str | None, fmt2: str | None) -> Non
868
934
  @_NO_UPDATE_OPT
869
935
  @_NO_GNOMAD_OPT
870
936
  @_NO_ALPHAMISSENSE_OPT
937
+ @_NO_CADD_OPT
871
938
  def methylation(
872
939
  file_path: Path,
873
940
  fmt: str | None,
@@ -886,6 +953,7 @@ def methylation(
886
953
  no_update: bool,
887
954
  no_gnomad: bool,
888
955
  no_alphamissense: bool,
956
+ no_cadd: bool,
889
957
  ) -> None:
890
958
  """Methylation-pathway-focused report (MTHFR, MTR, MTRR, COMT, CBS, …)."""
891
959
  excluded: set[str] = set()
@@ -912,6 +980,7 @@ def methylation(
912
980
  no_update=no_update,
913
981
  no_gnomad=no_gnomad,
914
982
  no_alphamissense=no_alphamissense,
983
+ no_cadd=no_cadd,
915
984
  )
916
985
 
917
986
 
@@ -933,6 +1002,7 @@ def methylation(
933
1002
  @_NO_UPDATE_OPT
934
1003
  @_NO_GNOMAD_OPT
935
1004
  @_NO_ALPHAMISSENSE_OPT
1005
+ @_NO_CADD_OPT
936
1006
  def pharmacogenomics(
937
1007
  file_path: Path,
938
1008
  fmt: str | None,
@@ -951,6 +1021,7 @@ def pharmacogenomics(
951
1021
  no_update: bool,
952
1022
  no_gnomad: bool,
953
1023
  no_alphamissense: bool,
1024
+ no_cadd: bool,
954
1025
  ) -> None:
955
1026
  """Pharmacogenomics-focused report (annotations from PharmGKB-style sources)."""
956
1027
  excluded: set[str] = set()
@@ -977,6 +1048,7 @@ def pharmacogenomics(
977
1048
  no_update=no_update,
978
1049
  no_gnomad=no_gnomad,
979
1050
  no_alphamissense=no_alphamissense,
1051
+ no_cadd=no_cadd,
980
1052
  )
981
1053
 
982
1054
 
@@ -105,6 +105,7 @@ class AnalysisResult:
105
105
  min_magnitude: float = 0.0,
106
106
  category: str | None = None,
107
107
  genes: Iterable[str] | None = None,
108
+ rsids: Iterable[str] | None = None,
108
109
  source_min_magnitudes: dict[str, float] | None = None,
109
110
  ) -> list[Annotation]:
110
111
  """Apply the standard filters and return a sorted list of annotations.
@@ -117,8 +118,14 @@ class AnalysisResult:
117
118
  entry, that value IS the floor for that source — it can raise OR
118
119
  lower the global ``min_magnitude``. Sources without an entry use
119
120
  the global floor.
121
+
122
+ `genes` and `rsids` combine with OR: when either is provided, an
123
+ annotation passes if it matches the gene set OR the rsid set.
124
+ Empty collections (vs None) mean "match nothing" — an empty
125
+ filter file produces an empty report.
120
126
  """
121
- gene_set = {g.upper() for g in genes} if genes else None
127
+ gene_set = {g.upper() for g in genes} if genes is not None else None
128
+ rsid_set = {r.lower() for r in rsids} if rsids is not None else None
122
129
  out: list[Annotation] = []
123
130
  for a in self.annotations:
124
131
  if (
@@ -133,8 +140,11 @@ class AnalysisResult:
133
140
  continue
134
141
  if category is not None and a.category != category:
135
142
  continue
136
- if gene_set is not None and (a.gene or "").upper() not in gene_set:
137
- continue
143
+ if gene_set is not None or rsid_set is not None:
144
+ gene_match = gene_set is not None and (a.gene or "").upper() in gene_set
145
+ rsid_match = rsid_set is not None and a.rsid.lower() in rsid_set
146
+ if not gene_match and not rsid_match:
147
+ continue
138
148
  out.append(a)
139
149
  out.sort(key=lambda a: (-a.magnitude, a.rsid))
140
150
  return out
@@ -951,6 +951,7 @@ def render_html(
951
951
  min_magnitude: float = 0.0,
952
952
  category: str | None = None,
953
953
  genes: Iterable[str] | None = None,
954
+ rsids: Iterable[str] | None = None,
954
955
  source_min_magnitudes: dict[str, float] | None = None,
955
956
  title: str = "Allelix Genotype Report",
956
957
  diff: DiffResult | None = None,
@@ -961,6 +962,7 @@ def render_html(
961
962
  min_magnitude=min_magnitude,
962
963
  category=category,
963
964
  genes=genes,
965
+ rsids=rsids,
964
966
  source_min_magnitudes=source_min_magnitudes,
965
967
  )
966
968
  filtered = rollup_gwas_duplicates(filtered)
@@ -103,6 +103,7 @@ def render_json(
103
103
  min_magnitude: float = 0.0,
104
104
  category: str | None = None,
105
105
  genes: Iterable[str] | None = None,
106
+ rsids: Iterable[str] | None = None,
106
107
  source_min_magnitudes: dict[str, float] | None = None,
107
108
  diff: DiffResult | None = None,
108
109
  high_value_no_calls: list[dict[str, str]] | None = None,
@@ -112,6 +113,7 @@ def render_json(
112
113
  min_magnitude=min_magnitude,
113
114
  category=category,
114
115
  genes=genes,
116
+ rsids=rsids,
115
117
  source_min_magnitudes=source_min_magnitudes,
116
118
  )
117
119
  filtered = rollup_gwas_duplicates(filtered)
@@ -134,7 +136,8 @@ def render_json(
134
136
  "filters": {
135
137
  "min_magnitude": min_magnitude,
136
138
  "category": category,
137
- "genes": sorted(genes) if genes else None,
139
+ "genes": sorted(genes) if genes is not None else None,
140
+ "rsids": sorted(rsids) if rsids is not None else None,
138
141
  },
139
142
  "annotations": [_annotation_dict(a) for a in filtered],
140
143
  }
@@ -27,6 +27,7 @@ def render_terminal(
27
27
  min_magnitude: float = 0.0,
28
28
  category: str | None = None,
29
29
  genes: Iterable[str] | None = None,
30
+ rsids: Iterable[str] | None = None,
30
31
  source_min_magnitudes: dict[str, float] | None = None,
31
32
  ) -> int:
32
33
  """Render an AnalysisResult as a Rich table. Returns annotation count.
@@ -38,6 +39,7 @@ def render_terminal(
38
39
  min_magnitude=min_magnitude,
39
40
  category=category,
40
41
  genes=genes,
42
+ rsids=rsids,
41
43
  source_min_magnitudes=source_min_magnitudes,
42
44
  )
43
45
  filtered = rollup_gwas_duplicates(filtered)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 1.8.3
3
+ Version: 1.9.0
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author-email: dial481 <dial481@users.noreply.github.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -44,8 +44,8 @@ Open-source command-line toolkit for analyzing raw genotype files from consumer
44
44
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
45
45
  > commands, report diffing, persistent config with commercial-mode
46
46
  > safety switch. Build auto-detection from position data (ADR-0021).
47
- > No regex on prose anywhere in production. **Latest: v1.8.3** —
48
- > pip install quickstart, workflow hardening, PyPI link fix.
47
+ > No regex on prose anywhere in production. **Latest: v1.9.0** —
48
+ > `--filter-file` flag for custom-panel filtering on `analyze`.
49
49
  > Release notes:
50
50
  > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
51
51
 
@@ -61,6 +61,9 @@ allelix db update
61
61
 
62
62
  # Analyze a genotype file
63
63
  allelix analyze your_genotype_file.txt --output report.html
64
+
65
+ # Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
66
+ allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
64
67
  ```
65
68
 
66
69
  Requires Python 3.11+. See [Development](#development) for source installs and running tests.
@@ -98,7 +101,7 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
98
101
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
99
102
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
100
103
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
101
- | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
104
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
102
105
 
103
106
  ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
104
107
 
@@ -155,7 +158,7 @@ allelix config set license.commercial true
155
158
  allelix config set license.cadd true
156
159
  ```
157
160
 
158
- CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
161
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--no-cadd`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
159
162
 
160
163
  ### Database sizes and download times
161
164
 
@@ -187,7 +190,7 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
187
190
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
188
191
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
189
192
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
190
- | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
193
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
191
194
 
192
195
  **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
193
196
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "allelix"
7
- version = "1.8.3"
7
+ version = "1.9.0"
8
8
  description = "Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -1527,6 +1527,253 @@ class TestExcludeSnpedia:
1527
1527
  assert captured["exclude_sources"] == frozenset({"snpedia", "gwas"})
1528
1528
 
1529
1529
 
1530
+ class TestNoCaddFlag:
1531
+ """--no-cadd wires through to no_cadd on all three analysis commands."""
1532
+
1533
+ def test_analyze_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1534
+ captured: dict = {}
1535
+
1536
+ def fake_run(**kwargs):
1537
+ captured.update(kwargs)
1538
+
1539
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1540
+ runner = CliRunner()
1541
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--no-cadd"])
1542
+ assert result.exit_code == 0, result.output
1543
+ assert captured["no_cadd"] is True
1544
+
1545
+ def test_analyze_default_no_cadd_false(self, mock_mhg_path, monkeypatch):
1546
+ captured: dict = {}
1547
+
1548
+ def fake_run(**kwargs):
1549
+ captured.update(kwargs)
1550
+
1551
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1552
+ runner = CliRunner()
1553
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path)])
1554
+ assert result.exit_code == 0, result.output
1555
+ assert captured["no_cadd"] is False
1556
+
1557
+ def test_methylation_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1558
+ captured: dict = {}
1559
+
1560
+ def fake_run(**kwargs):
1561
+ captured.update(kwargs)
1562
+
1563
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1564
+ runner = CliRunner()
1565
+ result = runner.invoke(main, ["methylation", str(mock_mhg_path), "--no-cadd"])
1566
+ assert result.exit_code == 0, result.output
1567
+ assert captured["no_cadd"] is True
1568
+
1569
+ def test_pharmacogenomics_passes_no_cadd(self, mock_mhg_path, monkeypatch):
1570
+ captured: dict = {}
1571
+
1572
+ def fake_run(**kwargs):
1573
+ captured.update(kwargs)
1574
+
1575
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1576
+ runner = CliRunner()
1577
+ result = runner.invoke(main, ["pharmacogenomics", str(mock_mhg_path), "--no-cadd"])
1578
+ assert result.exit_code == 0, result.output
1579
+ assert captured["no_cadd"] is True
1580
+
1581
+
1582
+ class TestParseFilterFile:
1583
+ """Unit tests for _parse_filter_file (parser classification)."""
1584
+
1585
+ def test_rsid_lowercase(self, tmp_path):
1586
+ from allelix.cli import _parse_filter_file
1587
+
1588
+ f = tmp_path / "filter.txt"
1589
+ f.write_text("rs1801133\n")
1590
+ genes, rsids = _parse_filter_file(f)
1591
+ assert genes == frozenset()
1592
+ assert rsids == frozenset({"rs1801133"})
1593
+
1594
+ def test_rsid_uppercase_normalized_to_lowercase(self, tmp_path):
1595
+ """Input case-tolerant, output canonical: RS1801133 → rs1801133."""
1596
+ from allelix.cli import _parse_filter_file
1597
+
1598
+ f = tmp_path / "filter.txt"
1599
+ f.write_text("RS1801133\n")
1600
+ genes, rsids = _parse_filter_file(f)
1601
+ assert genes == frozenset()
1602
+ assert rsids == frozenset({"rs1801133"})
1603
+
1604
+ def test_gene_lowercase_normalized_to_uppercase(self, tmp_path):
1605
+ """Input case-tolerant, output canonical: mthfr → MTHFR."""
1606
+ from allelix.cli import _parse_filter_file
1607
+
1608
+ f = tmp_path / "filter.txt"
1609
+ f.write_text("mthfr\n")
1610
+ genes, rsids = _parse_filter_file(f)
1611
+ assert genes == frozenset({"MTHFR"})
1612
+ assert rsids == frozenset()
1613
+
1614
+ def test_mixed_messy_case_normalized(self, tmp_path):
1615
+ """End-to-end case-mixing across rsIDs and genes."""
1616
+ from allelix.cli import _parse_filter_file
1617
+
1618
+ f = tmp_path / "filter.txt"
1619
+ f.write_text("Rs1801133\ncomt\nRSPO1\nRS4680\nmThFr\n")
1620
+ genes, rsids = _parse_filter_file(f)
1621
+ assert genes == frozenset({"COMT", "RSPO1", "MTHFR"})
1622
+ assert rsids == frozenset({"rs1801133", "rs4680"})
1623
+
1624
+ def test_gene_only(self, tmp_path):
1625
+ from allelix.cli import _parse_filter_file
1626
+
1627
+ f = tmp_path / "filter.txt"
1628
+ f.write_text("MTHFR\n")
1629
+ genes, rsids = _parse_filter_file(f)
1630
+ assert genes == frozenset({"MTHFR"})
1631
+ assert rsids == frozenset()
1632
+
1633
+ def test_gene_starting_with_rs_prefix_is_gene_not_rsid(self, tmp_path):
1634
+ """RSPO1, RSF1, RSC1A1 are real gene names — must not be classified as rsIDs."""
1635
+ from allelix.cli import _parse_filter_file
1636
+
1637
+ f = tmp_path / "filter.txt"
1638
+ f.write_text("RSPO1\nRSF1\nRSC1A1\n")
1639
+ genes, rsids = _parse_filter_file(f)
1640
+ assert genes == frozenset({"RSPO1", "RSF1", "RSC1A1"})
1641
+ assert rsids == frozenset()
1642
+
1643
+ def test_mixed(self, tmp_path):
1644
+ from allelix.cli import _parse_filter_file
1645
+
1646
+ f = tmp_path / "filter.txt"
1647
+ f.write_text("rs1801133\nMTHFR\nrs4680\nCOMT\n")
1648
+ genes, rsids = _parse_filter_file(f)
1649
+ assert genes == frozenset({"MTHFR", "COMT"})
1650
+ assert rsids == frozenset({"rs1801133", "rs4680"})
1651
+
1652
+ def test_comments_and_blanks_ignored(self, tmp_path):
1653
+ from allelix.cli import _parse_filter_file
1654
+
1655
+ f = tmp_path / "filter.txt"
1656
+ f.write_text("# this is a comment\n\nMTHFR\n\n# another\nrs1801133\n")
1657
+ genes, rsids = _parse_filter_file(f)
1658
+ assert genes == frozenset({"MTHFR"})
1659
+ assert rsids == frozenset({"rs1801133"})
1660
+
1661
+ def test_empty_file_returns_empty_sets(self, tmp_path):
1662
+ from allelix.cli import _parse_filter_file
1663
+
1664
+ f = tmp_path / "filter.txt"
1665
+ f.write_text("")
1666
+ genes, rsids = _parse_filter_file(f)
1667
+ assert genes == frozenset()
1668
+ assert rsids == frozenset()
1669
+
1670
+ def test_comments_only_returns_empty_sets(self, tmp_path):
1671
+ from allelix.cli import _parse_filter_file
1672
+
1673
+ f = tmp_path / "filter.txt"
1674
+ f.write_text("# only a comment\n# another\n\n")
1675
+ genes, rsids = _parse_filter_file(f)
1676
+ assert genes == frozenset()
1677
+ assert rsids == frozenset()
1678
+
1679
+
1680
+ class TestFilterFileOnAnalyze:
1681
+ """--filter-file is only on analyze; threads through _run_analysis_command."""
1682
+
1683
+ def test_analyze_rsid_only(self, mock_mhg_path, tmp_path, monkeypatch):
1684
+ captured: dict = {}
1685
+
1686
+ def fake_run(**kwargs):
1687
+ captured.update(kwargs)
1688
+
1689
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1690
+ f = tmp_path / "filter.txt"
1691
+ f.write_text("rs1801133\n")
1692
+ runner = CliRunner()
1693
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
1694
+ assert result.exit_code == 0, result.output
1695
+ assert captured["genes"] == frozenset()
1696
+ assert captured["rsids"] == frozenset({"rs1801133"})
1697
+
1698
+ def test_analyze_gene_only(self, mock_mhg_path, tmp_path, monkeypatch):
1699
+ captured: dict = {}
1700
+
1701
+ def fake_run(**kwargs):
1702
+ captured.update(kwargs)
1703
+
1704
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1705
+ f = tmp_path / "filter.txt"
1706
+ f.write_text("MTHFR\n")
1707
+ runner = CliRunner()
1708
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
1709
+ assert result.exit_code == 0, result.output
1710
+ assert captured["genes"] == frozenset({"MTHFR"})
1711
+ assert captured["rsids"] == frozenset()
1712
+
1713
+ def test_analyze_mixed_or_combination(self, mock_mhg_path, tmp_path, monkeypatch):
1714
+ captured: dict = {}
1715
+
1716
+ def fake_run(**kwargs):
1717
+ captured.update(kwargs)
1718
+
1719
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1720
+ f = tmp_path / "filter.txt"
1721
+ f.write_text("rs1801133\nCOMT\n")
1722
+ runner = CliRunner()
1723
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
1724
+ assert result.exit_code == 0, result.output
1725
+ assert captured["genes"] == frozenset({"COMT"})
1726
+ assert captured["rsids"] == frozenset({"rs1801133"})
1727
+
1728
+ def test_analyze_empty_filter_passes_empty_sets(self, mock_mhg_path, tmp_path, monkeypatch):
1729
+ """Empty filter file (only comments/blanks) threads empty frozensets through.
1730
+
1731
+ The empty-set → match-nothing semantic on AnalysisResult.filter()
1732
+ is covered by a direct unit test in tests/test_pipeline_filter.py;
1733
+ here we verify only that the CLI layer forwards empty frozensets,
1734
+ not None.
1735
+ """
1736
+ captured: dict = {}
1737
+
1738
+ def fake_run(**kwargs):
1739
+ captured.update(kwargs)
1740
+
1741
+ monkeypatch.setattr("allelix.cli._run_analysis_command", fake_run)
1742
+ f = tmp_path / "filter.txt"
1743
+ f.write_text("# only comments\n\n")
1744
+ runner = CliRunner()
1745
+ result = runner.invoke(main, ["analyze", str(mock_mhg_path), "--filter-file", str(f)])
1746
+ assert result.exit_code == 0, result.output
1747
+ assert captured["genes"] == frozenset()
1748
+ assert captured["rsids"] == frozenset()
1749
+
1750
+ def test_analyze_filter_file_nonexistent_path_errors(self, mock_mhg_path):
1751
+ runner = CliRunner()
1752
+ result = runner.invoke(
1753
+ main,
1754
+ ["analyze", str(mock_mhg_path), "--filter-file", "/does/not/exist.txt"],
1755
+ )
1756
+ assert result.exit_code != 0
1757
+
1758
+ def test_methylation_does_not_have_filter_file(self, mock_mhg_path):
1759
+ runner = CliRunner()
1760
+ result = runner.invoke(
1761
+ main,
1762
+ ["methylation", str(mock_mhg_path), "--filter-file", "/tmp/x.txt"],
1763
+ )
1764
+ assert result.exit_code != 0
1765
+ assert "no such option" in result.output.lower()
1766
+
1767
+ def test_pharmacogenomics_does_not_have_filter_file(self, mock_mhg_path):
1768
+ runner = CliRunner()
1769
+ result = runner.invoke(
1770
+ main,
1771
+ ["pharmacogenomics", str(mock_mhg_path), "--filter-file", "/tmp/x.txt"],
1772
+ )
1773
+ assert result.exit_code != 0
1774
+ assert "no such option" in result.output.lower()
1775
+
1776
+
1530
1777
  class TestHighValueNoCalls:
1531
1778
  def test_stats_flags_dpyd_no_call(self, mock_mhg_path):
1532
1779
  """The MHG fixture has rs3918290 (DPYD) as a no-call; stats should flag it."""
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes