polygenic-pgx 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. polygenic_pgx-2.5.0/LICENSE +1 -0
  2. polygenic_pgx-2.5.0/PKG-INFO +43 -0
  3. polygenic_pgx-2.5.0/README.md +604 -0
  4. polygenic_pgx-2.5.0/polygenic/__init__.py +0 -0
  5. polygenic_pgx-2.5.0/polygenic/__main__.py +12 -0
  6. polygenic_pgx-2.5.0/polygenic/data/__init__.py +0 -0
  7. polygenic_pgx-2.5.0/polygenic/data/csv_accessor.py +143 -0
  8. polygenic_pgx-2.5.0/polygenic/data/data_accessor.py +109 -0
  9. polygenic_pgx-2.5.0/polygenic/data/gwas.py +638 -0
  10. polygenic_pgx-2.5.0/polygenic/data/mobigen_utils.py +70 -0
  11. polygenic_pgx-2.5.0/polygenic/data/polars_frame.py +128 -0
  12. polygenic_pgx-2.5.0/polygenic/data/snp_data.py +23 -0
  13. polygenic_pgx-2.5.0/polygenic/data/vcf_accessor.py +220 -0
  14. polygenic_pgx-2.5.0/polygenic/data/vcf_record.py +213 -0
  15. polygenic_pgx-2.5.0/polygenic/error/__init__.py +0 -0
  16. polygenic_pgx-2.5.0/polygenic/error/polygenic_exception.py +3 -0
  17. polygenic_pgx-2.5.0/polygenic/model/__init__.py +0 -0
  18. polygenic_pgx-2.5.0/polygenic/model/model.py +799 -0
  19. polygenic_pgx-2.5.0/polygenic/model/utils.py +140 -0
  20. polygenic_pgx-2.5.0/polygenic/pgstk.py +212 -0
  21. polygenic_pgx-2.5.0/polygenic/resources/chromsizes/hg38.chrom.sizes +26 -0
  22. polygenic_pgx-2.5.0/polygenic/rsidx/__init__.py +38 -0
  23. polygenic_pgx-2.5.0/polygenic/rsidx/__main__.py +31 -0
  24. polygenic_pgx-2.5.0/polygenic/rsidx/_version.py +520 -0
  25. polygenic_pgx-2.5.0/polygenic/rsidx/cli.py +83 -0
  26. polygenic_pgx-2.5.0/polygenic/rsidx/index.py +69 -0
  27. polygenic_pgx-2.5.0/polygenic/rsidx/search.py +63 -0
  28. polygenic_pgx-2.5.0/polygenic/tools/__init__.py +6 -0
  29. polygenic_pgx-2.5.0/polygenic/tools/data/__init__.py +0 -0
  30. polygenic_pgx-2.5.0/polygenic/tools/data/chromsizes.py +111 -0
  31. polygenic_pgx-2.5.0/polygenic/tools/data/colors.py +49 -0
  32. polygenic_pgx-2.5.0/polygenic/tools/gwasfilecreate.py +22 -0
  33. polygenic_pgx-2.5.0/polygenic/tools/modelbiobankuk.py +196 -0
  34. polygenic_pgx-2.5.0/polygenic/tools/modelgwasfile.py +14 -0
  35. polygenic_pgx-2.5.0/polygenic/tools/modelpgscat.py +164 -0
  36. polygenic_pgx-2.5.0/polygenic/tools/pgscompute.py +262 -0
  37. polygenic_pgx-2.5.0/polygenic/tools/plotmanhattan.py +101 -0
  38. polygenic_pgx-2.5.0/polygenic/tools/tsvtovcf.py +24 -0
  39. polygenic_pgx-2.5.0/polygenic/tools/utils.py +562 -0
  40. polygenic_pgx-2.5.0/polygenic/tools/vcfimpute.py +263 -0
  41. polygenic_pgx-2.5.0/polygenic/tools/vcfindex.py +5 -0
  42. polygenic_pgx-2.5.0/polygenic/tools/vcfstatbaf.py +98 -0
  43. polygenic_pgx-2.5.0/polygenic/tools/vcfstatzygosity.py +75 -0
  44. polygenic_pgx-2.5.0/polygenic/version.py +3 -0
  45. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/PKG-INFO +43 -0
  46. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/SOURCES.txt +51 -0
  47. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/dependency_links.txt +1 -0
  48. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/entry_points.txt +4 -0
  49. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/requires.txt +18 -0
  50. polygenic_pgx-2.5.0/polygenic_pgx.egg-info/top_level.txt +1 -0
  51. polygenic_pgx-2.5.0/pyproject.toml +5 -0
  52. polygenic_pgx-2.5.0/setup.cfg +4 -0
  53. polygenic_pgx-2.5.0/setup.py +74 -0
@@ -0,0 +1 @@
1
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,43 @@
1
+ Metadata-Version: 2.4
2
+ Name: polygenic-pgx
3
+ Version: 2.5.0
4
+ Summary: Polygenic score toolkit
5
+ Home-page: https://github.com/marpiech/polygenic
6
+ Author: Marcin Piechota, Wojciech Galan
7
+ Author-email: piechota@intelliseq.com
8
+ License: Intelliseq dual licenses this package. For commercial use, please contact [contact @ intelliseq.com](mailto:contact@intelliseq.com). For non-commercial use, this license permits use of the software only by government agencies, schools, universities, non-profit organizations or individuals on projects that do not receive external funding other than government research grants and contracts. Any other use requires a commercial license. For the full license, please see [LICENSE.md](https://github.com/intelliseq/polygenic/blob/master/LICENSE.md), in this source repository.
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: License :: Free for non-commercial use
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Operating System :: Unix
13
+ Classifier: Operating System :: MacOS
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Python: >=3.8
16
+ License-File: LICENSE
17
+ Requires-Dist: numpy
18
+ Requires-Dist: scipy
19
+ Requires-Dist: progressbar2
20
+ Requires-Dist: python-utils
21
+ Requires-Dist: pysam
22
+ Requires-Dist: pytabix
23
+ Requires-Dist: pandas
24
+ Requires-Dist: plotly
25
+ Requires-Dist: kaleido
26
+ Requires-Dist: DotMap
27
+ Requires-Dist: pyyaml
28
+ Requires-Dist: tqdm
29
+ Requires-Dist: plotnine
30
+ Requires-Dist: kaleido
31
+ Requires-Dist: polars
32
+ Requires-Dist: importlib-resources
33
+ Requires-Dist: logdecorator
34
+ Requires-Dist: pyarrow
35
+ Dynamic: author
36
+ Dynamic: author-email
37
+ Dynamic: classifier
38
+ Dynamic: home-page
39
+ Dynamic: license
40
+ Dynamic: license-file
41
+ Dynamic: requires-dist
42
+ Dynamic: requires-python
43
+ Dynamic: summary
@@ -0,0 +1,604 @@
1
+ # polygenic - the polygenic scores toolkit
2
+
3
+ ## Basic info
4
+ [![PyPI pyversions](https://img.shields.io/pypi/pyversions/polygenic.svg)](https://pypi.python.org/pypi/polygenic/)
5
+ [![PyPI](https://img.shields.io/pypi/v/polygenic.svg)](https://pypi.python.org/pypi/polygenic)
6
+ ![Maintainer](https://img.shields.io/badge/maintainer-marpiech-blue)
7
+
8
+ ## Downloads
9
+ - pip [![PyPI download month](https://img.shields.io/pypi/dm/polygenic.svg)](https://pypi.python.org/pypi/polygenic/)
10
+ - docker with data [![Docker](https://img.shields.io/docker/pulls/marpiech/polygenictk.svg)](https://hub.docker.com/repository/docker/marpiech/polygenictk)
11
+ - docker without data [![Docker](https://img.shields.io/docker/pulls/intelliseq/polygenic.svg)](https://hub.docker.com/repository/docker/intelliseq/polygenic)
12
+
13
+ ## Index
14
+ * [Summary](#summary)
15
+ * [Diplotyping Algorithm](#diplotyping-algorithm)
16
+ * [Installation](#installation)
17
+ * [With pip](#with-pip)
18
+ * [With conda](#with-conda)
19
+ * [With docker](#with-docker)
20
+ * [Quick start guide](#quick-start-guide)
21
+ * [Manual](#manual)
22
+ * [Tools](#tools)
23
+ * [pgs-compute](#pgs-compute)
24
+ * [pgs-build](#pgs_build)
25
+ * [pgs-validate](#pgs_validate)
26
+ * [vcf-index](#vcf_index)
27
+ * [vcf-validate](#vcf_validate)
28
+ * [vcf-stat](#vcf_stat)
29
+ * [model-biobankuk](#model_biobankuk)
30
+ * [model-pgscat](#model_pgscat)
31
+ * [model-gbe](#model_gbe)
32
+ * [model-pharmvar](#model_pharmvar)
33
+ * [Docker images](#docker_images)
34
+ * [Building models](#building-models-in-yml)
35
+ * [Example models](#example-models)
36
+ * [Usecases](#usecases)
37
+ * [pgx](#pgx)
38
+ * [License](#license)
39
+ * [Updates](#updates)
40
+
41
+ ## Summary
42
+ Polygenic is a toolkit for a wide range of polygenic scores analysis tasks. The most important use cases include computing scores for samples in vcf files, building scores for GWAS results or fetching scores from repositories.
43
+
44
+ ## Diplotyping Algorithm
45
+
46
+ We begin by reading individual genetic variants (genotypes) from the patient's VCF file, where each genotype carries two alleles — one per chromosome. The system supports phasing using a custom reference panel to resolve which alleles sit on the same chromosome; when phased data is available, the algorithm preserves this linkage information, while for unphased data both chromosomes are treated symmetrically. We define haplotypes as specific combinations of co-inherited variants that form recognized gene versions, such as pharmacogenomic star alleles, where each haplotype definition distinguishes core defining variants (weighted at 1.0) from supportive sub-lineage variants (weighted at 0.05). The algorithm scores every candidate haplotype against the patient's alleles, then selects the best-matching haplotype for the first chromosome — keeping all candidates within a 2% margin of the top score. The matched alleles are then "claimed" by that haplotype, and the remaining unmatched alleles (leftovers) are passed into a second round, where up to 100 candidate haplotypes are re-scored against only those residual alleles to identify the second haplotype. Each first/second haplotype pair is ranked by combined match percentage and filtered by total missing data, producing the final diplotype call — e.g., CYP2D6 \*1/\*4. Every variant in the result carries a source label — direct genotyping, LD proxy, imputation flag from VCF, allele-frequency-based imputation, reference, or missing — enabling granular quality control and full traceability of the diplotype call.
47
+
48
+ ### Missing variant handling (`--ref-fallback`)
49
+
50
+ By default, variants called `./.` in the input VCF are treated as **missing** (`source: "missing"`) — they are subtracted from both the numerator and the denominator of the haplotype match score, so they neither support nor penalize any candidate. This matches the approach taken by [PharmCAT's Named Allele Matcher](https://pharmcat.clinpgx.org/methods/NamedAlleleMatcher-101/), which explicitly drops missing positions rather than imputing them: *"If the sample data has missing positions that are required by a named allele definition, the position will be dropped from consideration."*
51
+
52
+ Passing `--ref-fallback` restores the legacy behavior of filling `./.` with homozygous reference (`source: "reference"`). This is almost always the **wrong** default for pharmacogenomic panels. The canonical example is **CYP2C19**: the variant rs3758581 at chr10:94842866 has GRCh38 reference `A`, but `G` is the major allele in every population and PharmVar v5 defines every "real" star allele (\*1, \*2, \*17 …) as requiring `G` at this position, reserving CYP2C19\*38 (= old \*1.001) for the minority carrying the reference `A` (see [PharmVar GeneFocus: CYP2C19](https://pmc.ncbi.nlm.nih.gov/articles/PMC7769975/)). If rs3758581 is not on the panel and the code fills it with the reference, every sample drifts toward \*38/\*38. Default off avoids this trap. Enable `--ref-fallback` only when your panel's `./.` genuinely means "0/0 and I chose not to spell it out" — for example when re-running test fixtures authored under the old default.
53
+
54
+ ### Match-confidence gate (`--top-n`)
55
+
56
+ After scoring all candidate haplotype pairs, the algorithm returns a `haplotype_id` when **either** of these holds:
57
+ - the best pair's per-chromosome `max_percent_match` is `≥ 50%` (high-confidence call), **or**
58
+ - the total scored-candidate pool size is `≤ top_n` (default **15**) — a small, non-ambiguous candidate space is itself a confidence signal.
59
+
60
+ Otherwise `haplotype_id` is `None` (the caller refuses to guess). This is more conservative than [PharmCAT](https://pharmcat.clinpgx.org/methods/NamedAlleleMatcher-101/), which always returns the top-scoring diplotype with deterministic tie-breaking, and roughly comparable to [Aldy 4](https://pmc.ncbi.nlm.nih.gov/articles/PMC9977157/), which reports "no more than three diplotypes" when it cannot fully disambiguate. Tune `--top-n` higher (laxer) or lower (stricter) depending on how much ambiguity is acceptable downstream. Set `--top-n 0` to rely solely on the 50% threshold.
61
+
62
+ ### CYP2C19*38 worked example
63
+
64
+ On a clinical panel that does **not** assay rs3758581, running `pgs-compute` with the CYP2C19 PharmVar model on four real samples (three carrying non-reference CYP2C19 variants, one all-reference) produces these calls:
65
+
66
+ | Sample | Non-ref CYP2C19 variants | Default (no `--ref-fallback`) | Legacy (`--ref-fallback`) |
67
+ |---|---|---|---|
68
+ | Sample A | none | `*1.001/*1.001` (≡ \*1/\*1) | `*1.001/*1.001` |
69
+ | Sample B | rs12769205, rs4244285 (het) | `*2/*1.001` ✓ | `*1.001/*1.001` ✗ |
70
+ | Sample C | rs12248560 (hom) | `*17/*17` ✓ | `*1.001/*1.001` ✗ |
71
+ | Sample D | rs12769205, rs4244285 (het) | `*2/*1.001` ✓ | `*1.001/*1.001` ✗ |
72
+
73
+ With the legacy `--ref-fallback` on, the missing rs3758581 becomes `A/A` (genomic reference), which does not match the `G` required by \*1, \*2, \*17 etc., so every sample collapses to the empty-haplotype \*1.001 (equivalent to \*38 in PharmVar v5 terms). With the new default, rs3758581 stays missing, the scoring ignores it, and the panel's actual informative variants drive the call.
74
+
75
+ ## Installation
76
+ ### With pip
77
+ #### Install for user account
78
+ ```
79
+ python3 -m pip install --upgrade polygenic
80
+ ```
81
+ #### Install globally
82
+ ```
83
+ sudo -H python3 -m pip install polygenic
84
+ ```
85
+ ### With conda
86
+ Run conda image
87
+ ```
88
+ docker run -it conda/miniconda3 /bin/bash
89
+ ```
90
+ Create python3.8 environment and install polygenic
91
+ ```
92
+ yes | conda create --name py38 python=3.8
93
+ eval "$(conda shell.bash hook)"
94
+ conda activate py38
95
+ ### should be 3.8
96
+ python --version
97
+
98
+ ### gcc is missing to build pytabix
99
+ apt -qq update
100
+ apt -y install build-essential tabix
101
+
102
+ pip install polygenic
103
+ ```
104
+ ### With docker
105
+ #### Large image with all data included
106
+ ```
107
+ docker run intelliseq:polygenictk:2.1.0 *command*
108
+ ```
109
+ #### Thin image with just polygenic package installed
110
+ ```
111
+ docker run intelliseq/polygenic:2.1.0 *command*
112
+ ```
113
+ ## Quick start guide
114
+ ```
115
+ mkdir polygenic && cd polygenic # create working directory
116
+ wget https://downloads.intelliseq.com/public/polygenic/gbe-INI78-bone-density.yml # download model
117
+ wget https://downloads.intelliseq.com/public/polygenic/illu_merged-imputed.vcf.gz # download genotypes
118
+ wget https://downloads.intelliseq.com/public/polygenic/illu_merged-imputed.vcf.gz.tbi # download position index
119
+ wget https://downloads.intelliseq.com/public/polygenic/illu_merged-imputed.vcf.gz.idx.db # download rsid index
120
+ docker run -v $(pwd):/data intelliseq/polygenic:latest --vcf /data/illu_merged-imputed.vcf.gz --model /data/gbe-INI78-bone-density.yml --output-directory /data # compute model
121
+ ```
122
+ ## Manual
123
+ ### Tools
124
+ #### pgs-compute
125
+ ```
126
+ usage: pgstk [-h] -i VCF [-m MODEL [MODEL ...]] [-p PARAMETERS] [-s SAMPLE_NAME] [-o OUTPUT_DIRECTORY] [-n OUTPUT_NAME_APPENDIX] [-l LOG_FILE] [--af AF] [--af-field AF_FIELD]
127
+ [-v] [--print]
128
+
129
+ pgs-compute computes polygenic scores for genotyped sample in vcf format
130
+
131
+ optional arguments:
132
+ -h, --help show this help message and exit
133
+ -i, --vcf VCF vcf.gz file with genotypes
134
+ -m, --model MODEL [MODEL ...]
135
+ path to .yml model (can be specified multiple times with space as separator)
136
+ -p, --parameters PARAMETERS
137
+ parameters json (to be used in formula models)
138
+ -s, --sample-name SAMPLE_NAME
139
+ sample name in vcf.gz to calculate
140
+ -o, --output-directory OUTPUT_DIRECTORY
141
+ output directory
142
+ -n, --output-name-appendix OUTPUT_NAME_APPENDIX
143
+ appendix for output file names
144
+ -l, --log-file LOG_FILE
145
+ path to log file
146
+ --af AF vcf file containing allele freq data
147
+ --af-field AF_FIELD name of the INFO field to be used as allele frequency
148
+ -v, --version show program's version number and exit
149
+ --print Print output to stdout
150
+ ```
151
+ ### Arguments
152
+ #### Required
153
+ - `--vcf` vcf.gz file with genotypes (tabix index should be available)
154
+ - `--model` path to model file
155
+ #### Optional
156
+ - `--log-file` path to log file
157
+ - `--output-directory` directory for result jsons
158
+ - `--population` population code
159
+ - `--models_path` path to a directory containing models
160
+ - `--af` an indexed vcf.gz file containing allele freq data
161
+ - `--version` prints version of package
162
+
163
+ ## Building models in yml
164
+
165
+ Index:
166
+ [Model structure](#model-structure)
167
+ [Model types](#model-types)
168
+ [Parameters](#parameters)
169
+
170
+
171
+ ### Model structure
172
+ ##### Core structure
173
+ Models have two properties: `model` and `description`. `model` is a specification of the computation to be performed and `description` is additional information to be included in the result.
174
+ ```
175
+ model:
176
+ description:
177
+ ```
178
+ ##### Object keys
179
+ Each object that is not a collection has a set of predefined keys (required or optional) that can be used for computation. For example, the `diplotype_model` object has a required `diplotypes` key.
180
+ ```
181
+ diplotype_model:
182
+ diplotypes:
183
+ ```
184
+ The computation is first delegated to the objects specified by the keys and then aggregated by the top-level object itself.
185
+ ##### Collections
186
+ There is a special category of objects that don't have predefined keys but are collections. Each key within a collection becomes an element of that collection. Collections are easy to recognize because they are named in plural form, like `diplotypes` or `variants`. Each element of a collection is defined as a singular object of the collection's type. For example, each key in the `variants` collection becomes an object of the `variant` type.
187
+ ```
188
+ variants:
189
+ rs7041: {diplotype: C/C}
190
+ rs4588: {diplotype: T/T}
191
+ ```
192
+ ##### Variants
193
+ Variants can be identified by rsid. The variant value is computed based on the information provided: `diplotype` or `effect_allele`.
194
+ Accepted sets of fields are:
195
+ - diplotypes
196
+ - `diplotype`
197
+ - `symbol`
198
+ - score
199
+ - `effect_allele`
200
+ - `effect_size`
201
+ - `symbol`
202
+
203
+ ### Model types
204
+ Four types of models are currently implemented:
205
+ - `score_model`
206
+ - `diplotype_model`
207
+ - `haplotype_model`
208
+ - `formula_model`
209
+ The type of model can be specified at the top of yml structure or within the `model` field.
210
+ ##### Specification of model type at the top of yml structure
211
+ ```
212
+ diplotype_model:
213
+ description:
214
+ ```
215
+ ##### Specification of model type within the `model` field
216
+ ```
217
+ model:
218
+ diplotype_model:
219
+ description:
220
+ ```
221
+ ### Parameters
222
+ External parameters can be used in a `formula_model` through the `@parameters` keyword.
223
+ Example parameters file in `.json` format:
224
+ ```
225
+ {"sex": "F"}
226
+ ```
227
+ Path to file can be provided as argument to polygenic tool:
228
+ ```
229
+ --parameters /path/to/parameters.json
230
+ ```
231
+ Example of use of parameters in the `formula_model`:
232
+ ```
233
+ formula_model:
234
+ formula:
235
+ value: "@female.score_model.value if @parameters.sex == 'F' else @male.score_model.value"
236
+ male:
237
+ score_model:
238
+ variants:
239
+ ...
240
+ female:
241
+ score_model:
242
+ variants:
243
+ ```
244
+ ## Example models
245
+ ### Example diplotype model
246
+ This example diplotype model is based on [Randolph 2014](https://pubmed.ncbi.nlm.nih.gov/24447085/).
247
+ ```
248
+ diplotype_model:
249
+ diplotypes:
250
+ 1/1:
251
+ variants:
252
+ rs7041: {diplotype: C/C}
253
+ rs4588: {diplotype: T/T}
254
+ 1/1s:
255
+ variants:
256
+ rs7041: {diplotype: C/C}
257
+ rs4588: {diplotype: T/G}
258
+ 1/1f:
259
+ variants:
260
+ rs7041: {diplotype: C/A}
261
+ rs4588: {diplotype: T/G}
262
+ 1/2:
263
+ variants:
264
+ rs7041: {diplotype: C/A}
265
+ rs4588: {diplotype: T/T}
266
+ 1s/1s:
267
+ variants:
268
+ rs7041: {diplotype: C/C}
269
+ rs4588: {diplotype: G/G}
270
+ 1s/1f:
271
+ variants:
272
+ rs7041: {diplotype: C/A}
273
+ rs4588: {diplotype: G/G}
274
+ 1s/2:
275
+ variants:
276
+ rs7041: {diplotype: C/A}
277
+ rs4588: {diplotype: G/T}
278
+ 1f/1f:
279
+ variants:
280
+ rs7041: {diplotype: A/A}
281
+ rs4588: {diplotype: G/G}
282
+ 1f/2:
283
+ variants:
284
+ rs7041: {diplotype: A/A}
285
+ rs4588: {diplotype: G/T}
286
+ 2/2:
287
+ variants:
288
+ rs7041: {diplotype: A/A}
289
+ rs4588: {diplotype: T/T}
290
+ description:
291
+ pmid: 24447085
292
+ genes: [GC]
293
+ result_diplotype_choice:
294
+ 1/1: Moderate
295
+ 1/1s: High
296
+ 1/1f: High
297
+ 1/2: Low
298
+ 1s/1s: Very high
299
+ 1s/1f: Very high
300
+ 1s/2: Moderate
301
+ 1f/1f: Very high
302
+ 1f/2: Moderate
303
+ 2/2: Very low
304
+ ```
305
+
306
+ ### Example haplotype model
307
+
308
+ Haplotype model can be used for HLA and PGx.
309
+ To define haplotype models a list of alleles is required (called `variants` in this case, to be consistent with other types of models). Each allele has an associated list of defining mutations (alternative SNV alleles) identified by gnomAD ID along with `ref`, `alt` and `effect_allele` properties. One star allele should be empty (containing only reference SNV alleles). The algorithm will utilise any phasing information in the vcf.
310
+
311
+ ```
312
+ haplotype_model:
313
+ variants:
314
+ CYP2D6*1.001:
315
+ CYP2D6*1.002:
316
+ 22-42126963-C-T: {ref: "C", alt: "T", effect_allele: "T"}
317
+ CYP2D6*1.003:
318
+ 22-42128813-G-A: {ref: "G", alt: "A", effect_allele: "A"}
319
+ CYP2D6*1.004:
320
+ 22-42128216-G-T: {ref: "G", alt: "T", effect_allele: "T"}
321
+ CYP2D6*1.005:
322
+ 22-42128922-A-G: {ref: "A", alt: "G", effect_allele: "G"}
323
+ CYP2D6*1.006:
324
+ 22-42129726-A-C: {ref: "A", alt: "C", effect_allele: "C"}
325
+ 22-42129950-A-C: {ref: "A", alt: "C", effect_allele: "C"}
326
+ 22-42130482-C-A: {ref: "C", alt: "A", effect_allele: "A"}
327
+ ```
328
+
329
+ For copy-number star alleles (CYP2D6 `*5`/`*1xN`, CYP2C19 `*36`/`*37`) the model gains a
330
+ `copy_number:` block and `structural:` haplotypes — see **[docs/pgx-cnv.md](docs/pgx-cnv.md)**
331
+ for the VCF contract and YAML schema.
332
+
333
+ ### Example score model with categories rescaling
334
+ ```
335
+ score_model:
336
+ variants:
337
+ rs10012: {effect_allele: G, effect_size: 0.369215857410143}
338
+ rs1014971: {effect_allele: T, effect_size: 0.075546961392531}
339
+ rs10936599: {effect_allele: C, effect_size: 0.086359830674748}
340
+ rs11892031: {effect_allele: C, effect_size: -0.552841968657781}
341
+ rs1495741: {effect_allele: A, effect_size: 0.05307844348342}
342
+ rs17674580: {effect_allele: C, effect_size: 0.187520720836463}
343
+ rs2294008: {effect_allele: T, effect_size: 0.08278537031645}
344
+ rs798766: {effect_allele: T, effect_size: 0.093421685162235}
345
+ rs9642880: {effect_allele: G, effect_size: 0.093421685162235}
346
+ categories:
347
+ High risk: {from: 1.371624087, to: 2.581880425, scale_from: 2, scale_to: 3}
348
+ Potential risk: {from: 1.169616034, to: 1.371624087, scale_from: 1, scale_to: 2}
349
+ Average risk: {from: -0.346748358, to: 1.169616034, scale_from: 0, scale_to: 1}
350
+ Low risk: {from: -1.657132197, to: -0.346748358, scale_from: -1, scale_to: 0}
351
+ description:
352
+ about:
353
+ genes: []
354
+ result_statement_choice:
355
+ Average risk: Avg
356
+ Potential risk: Pot
357
+ High risk: Hig
358
+ Low risk: Low
359
+ science_behind_the_test:
360
+ test_type: Polygenic Risk Score
361
+ trait: Breast cancer
362
+ trait_authors:
363
+ - taken from the PGS catalog
364
+ trait_copyright: Intelliseq all rights reserved
365
+ trait_explained: None
366
+ trait_heritability: None
367
+ trait_pgs_id: PGS000001
368
+ trait_pmids:
369
+ - 25855707
370
+ trait_snp_heritability: None
371
+ trait_title: Breast_Cancer
372
+ trait_version: 1.0
373
+ what_you_can_do_choice:
374
+ Average risk:
375
+ High risk:
376
+ Low risk:
377
+ what_your_result_means_choice:
378
+ Average risk:
379
+ High risk:
380
+ Low risk:
381
+ ```
382
+
383
+ #### Example Formula Model
384
+ ```
385
+ formula_model:
386
+ formula:
387
+ brownexp: "math.exp(@brown.score_model.value - 2.0769)"
388
+ redexp: "math.exp(@red.score_model.value - 6.3953)"
389
+ blackexp: "math.exp(@black.score_model.value - 2.4029)"
390
+ sumexp: "@brownexp + @redexp + @blackexp"
391
+ brown_prob: "@brownexp / (1 + @sumexp)"
392
+ red_prob: "@redexp / (1 + @sumexp)"
393
+ black_prob: "@blackexp / (1 + @sumexp)"
394
+ blonde_prob: "1 - (@brown_prob + @red_prob + @black_prob)"
395
+ brown:
396
+ score_model:
397
+ variants:
398
+ rs796296176: {effect_allele: CA, effect_size: 1.2522}
399
+ rs11547464: {effect_allele: A, effect_size: -0.61155}
400
+ rs885479: {effect_allele: T, effect_size: 0.2937}
401
+ rs1805008: {effect_allele: T, effect_size: -0.50143}
402
+ rs1805005: {effect_allele: T, effect_size: 0.21172}
403
+ rs1805006: {effect_allele: A, effect_size: 1.9293}
404
+ rs1805007: {effect_allele: T, effect_size: -0.32318}
405
+ rs1805009: {effect_allele: C, effect_size: 0.60861}
406
+ rs1805009: {effect_allele: A, effect_size: 0.25624}
407
+ rs2228479: {effect_allele: A, effect_size: -0.054143}
408
+ rs1110400: {effect_allele: C, effect_size: -0.56315}
409
+ rs28777: {effect_allele: C, effect_size: 0.52168}
410
+ rs16891982: {effect_allele: C, effect_size: 0.75284}
411
+ rs12821256: {effect_allele: G, effect_size: -0.34957}
412
+ rs4959270: {effect_allele: A, effect_size: -0.19171}
413
+ rs12203592: {effect_allele: T, effect_size: 1.6475}
414
+ rs1042602: {effect_allele: T, effect_size: 0.16092}
415
+ rs1800407: {effect_allele: A, effect_size: -0.19111}
416
+ rs2402130: {effect_allele: G, effect_size: 0.35821}
417
+ rs12913832: {effect_allele: T, effect_size: 1.214}
418
+ rs2378249: {effect_allele: C, effect_size: 0.12669}
419
+ rs683: {effect_allele: C, effect_size: 0.21172}
420
+ red:
421
+ score_model:
422
+ variants:
423
+ rs796296176: {effect_allele: CA, effect_size: 25.508}
424
+ rs11547464: {effect_allele: A, effect_size: 2.5381}
425
+ rs885479: {effect_allele: T, effect_size: -0.20889}
426
+ rs1805008: {effect_allele: T, effect_size: 2.801}
427
+ rs1805005: {effect_allele: T, effect_size: 0.93493}
428
+ rs1805006: {effect_allele: A, effect_size: 3.65}
429
+ rs1805007: {effect_allele: T, effect_size: 3.4408}
430
+ rs1805009: {effect_allele: C, effect_size: 4.5868}
431
+ rs1805009: {effect_allele: A, effect_size: 22.107}
432
+ rs2228479: {effect_allele: A, effect_size: 0.62307}
433
+ rs1110400: {effect_allele: C, effect_size: 1.4453}
434
+ rs28777: {effect_allele: C, effect_size: 0.70401}
435
+ rs16891982: {effect_allele: C, effect_size: -0.41869}
436
+ rs12821256: {effect_allele: G, effect_size: -0.57964}
437
+ rs4959270: {effect_allele: A, effect_size: 0.24861}
438
+ rs12203592: {effect_allele: T, effect_size: 0.90233}
439
+ rs1042602: {effect_allele: T, effect_size: 0.45003}
440
+ rs1800407: {effect_allele: A, effect_size: -0.27606}
441
+ rs2402130: {effect_allele: G, effect_size: 0.28313}
442
+ rs12913832: {effect_allele: T, effect_size: -0.093776}
443
+ rs2378249: {effect_allele: C, effect_size: 0.76634}
444
+ rs683: {effect_allele: C, effect_size: -0.053427}
445
+ black:
446
+ score_model:
447
+ variants:
448
+ rs796296176: {effect_allele: CA, effect_size: 2.732}
449
+ rs11547464: {effect_allele: A, effect_size: -16.969}
450
+ rs885479: {effect_allele: T, effect_size: 0.39983}
451
+ rs1805008: {effect_allele: T, effect_size: -0.86062}
452
+ rs1805005: {effect_allele: T, effect_size: -0.0029013}
453
+ rs1805006: {effect_allele: A, effect_size: -16.088}
454
+ rs1805007: {effect_allele: T, effect_size: -1.3757}
455
+ rs1805009: {effect_allele: C, effect_size: 0.060631}
456
+ rs1805009: {effect_allele: A, effect_size: 3.9824}
457
+ rs2228479: {effect_allele: A, effect_size: 0.17012}
458
+ rs1110400: {effect_allele: C, effect_size: 0.29143}
459
+ rs28777: {effect_allele: C, effect_size: 0.82228}
460
+ rs16891982: {effect_allele: C, effect_size: 1.1617}
461
+ rs12821256: {effect_allele: G, effect_size: -0.89824}
462
+ rs4959270: {effect_allele: A, effect_size: -0.36359}
463
+ rs12203592: {effect_allele: T, effect_size: 1.997}
464
+ rs1042602: {effect_allele: T, effect_size: 0.065432}
465
+ rs1800407: {effect_allele: A, effect_size: -0.49601}
466
+ rs2402130: {effect_allele: G, effect_size: 0.26536}
467
+ rs12913832: {effect_allele: T, effect_size: 1.9391}
468
+ rs2378249: {effect_allele: C, effect_size: -0.089509}
469
+ rs683: {effect_allele: C, effect_size: 0.15796}
470
+ description:
471
+ name: HirisPlex
472
+
473
+ ```
474
+
475
+ ### Description
476
+ ### Model keys glossary
477
+ - `model` - generic model that can aggregate results of other model types
478
+ - `diplotype_model`
479
+ Required keys:
480
+ - `diplotypes`
481
+ - `description` - all properties to be included in the final results
482
+
483
+ ### Usecases
484
+
485
+ #### PGX
486
+
487
+ ```
488
+ python3 -m pip install polygenic
489
+ pgstk pgs-compute --vcf [PATH_TO_VCF_GZ] --model cyp2d6-pharmvar.yml --print | jq .haplotype_model.haplotypes.match
490
+ ```
491
+
492
+ ## License
493
+ Proprietary (contact@intelliseq.pl)
494
+
495
+ ## Updates
496
+ ### 2.5.0
497
+ - FEATURE: copy-number (CNV) star-allele calling for CYP2D6 (`*5`, `*1xN`) and CYP2C19 (`*36`/`*37`). `polygenic` consumes copy number resolved by an upstream caller (symbolic `<DEL>`/`<DUP>` ALT + range + `FORMAT/CN` + phase) and emits non-diploid calls (`*5/*5`, `*1/*5`, `*1/*4x2`). It does no depth/ratio math — allele-specific duplication side is read from phase; unphased input yields `*a/*b (CNx)` + `allele_specific_unphased`. See [docs/pgx-cnv.md](docs/pgx-cnv.md).
498
+ - FEATURE: `haplotype_model` gains a `copy_number:` block, `structural:` haplotype definitions (`scope: whole|partial`), and a CYP2D6-only `multiplication:` block.
499
+ - TEST: structural VcfRecord unit tests and CNV deletion/duplication/regression integration tests (`test_vcfrecord_structural.py`, `test_haplotypemodel_cnv.py`).
500
+ ### 2.4.0
501
+ - FEATURE: `--ref-fallback` flag (opt-in; default off). `./.` genotypes now stay `source="missing"` by default — matches PharmCAT behavior and fixes spurious CYP2C19*38 calls on panels that don't assay rs3758581.
502
+ - FEATURE: `--top-n` flag (default 15). Haplotype caller returns a call when either the match ≥50% OR the scored candidate pool is ≤top_n, enabling calls on sparse panels without ambiguity risk.
503
+ - BUG: ScoreModel.compute no longer KeyErrors on `adjusted_score` when a variant has no `af` field (model.py:549).
504
+ - BUG: Diplotype QC now correctly counts variant sources (previously miscounted every variant as "missing" due to structural mismatch in compute_qc).
505
+ - BUG: ScoreModel handles models without `args` section (previously AttributeError on `self.get("args").get("prevalence")`).
506
+ - TEST: Added PGx truth-set integration tests against 1000 Genomes phased GRCh38 slices (NA12878, NA18507, NA19240 × {CYP2C19, CYP2C9, CYP2B6, CYP2D6, SLCO1B1}); rebuildable via `scripts/build_pgx_fixtures.py`.
507
+ - TEST: Added PharmVar-named haplotype tests and NA18507 CYP2B6/SLCO1B1 regression tests.
508
+ ### 2.3.17
509
+ - BUG: resolved bug with low weight for missing genotypes
510
+ ### 2.3.16
511
+ - BUG: resolved bug with weight of genotypes in haplotypes
512
+ ### 2.3.15
513
+ - BUG: resolved bug with not enough haplotypes to check
514
+ ### 2.3.14
515
+ - BUG: resolved bug with wrong leftover genotypes
516
+ ### 2.3.12
517
+ - FEATURE: added gene names to genotypes if available
518
+ ### 2.3.11
519
+ - BUG: resolved bug with wrong genotype sources counts
520
+ ### 2.3.10
521
+ - BUG: resolved bug with missing genotype sources counts
522
+ ### 2.3.9
523
+ - FEATURE: add reference as a genotyping source
524
+ ### 2.3.8
525
+ - BUG: resolved bugs inside mobigen wdl task
526
+ ### 2.3.7
527
+ - FEATURE: added ldproxy imputation source
528
+ ### 2.3.6
529
+ - BUG: resolved bug with missing polars package after installation
530
+ ### 2.3.5
531
+ - BUG: resolved bug with 'type' object is not subscriptable running pgstk
532
+ ### 2.3.4
533
+ - BUG: resolved bug where model does not provide `to` or `from` category fields
534
+ ### 2.3.3
535
+ - BUG: resolved bug with missing pyarrow package after installation
536
+ ### 2.3.2
537
+ - BUG: renamed jpg to jpeg outputs from vcfstat
538
+ ### 2.3.1
539
+ - BUG: resolved bug with missing importlib-resources package after installation
540
+ ### 2.3.0
541
+ - FEATURE: added vcf stat tool for zygosities
542
+ - FEATURE: added vcf stat tool for baf computation
543
+ ### 2.2.15
544
+ - UPDATE: updated parsing for new version of pan biobankuk
545
+ - DEV: updated numpy version to 1.23.4
546
+ ### 2.2.14
547
+ - FEATURE: added module for ldproxy imputing
548
+ - FEATURE: added option for merging output as an array instead of dictionary in pgs-compute
549
+ ### 2.2.13
550
+ - BUG: resolved bug with missing score in haplotype model
551
+ - DEV: cleaned up test resources
552
+ ### 2.2.12
553
+ - BUG: resolved bug with empty argument in executable
554
+ ### 2.2.11
555
+ - BUG: resolved bug with naming of multiple models in one file
556
+ ### 2.2.10
557
+ - DOC: improved diplotype model documentation
558
+ ### 2.2.9
559
+ - BUG: missing effect allele in diplotype models
560
+ ### 2.2.8
561
+ - BUG: imputed source is based on IMP tag in the INFO field or GT:DS in format field
562
+ ### 2.2.7
563
+ - BUG: repaired bug with missing math library in eval
564
+ ### 2.2.6
565
+ - FEATURE: added qc to model results
566
+ ### 2.2.5
567
+ - ENHANCEMENT: libraries updates
568
+ ### 2.2.0
569
+ - ENHANCEMENT: better computing of haplotype models. First one haplotype is identified and further the second haplotype is identified from leftover genotypes
570
+ - ENHANCEMENT: moved argparse from tools to pgstk
571
+ ### 2.1.10
572
+ - BUG: resolved bug with wrong plink.clumped path in clumping
573
+ ### 2.1.9
574
+ - BUG: resolved bug with missing index in biobankuk model
575
+ ### 2.1.8
576
+ - BUG: resolved bug with biobankuk model for codenames with special characters
577
+ ### 2.1.7
578
+ - BUG: resolved bug with haplotype model where none of haplotypes matched genotype. Most probable genotype is provided
579
+ ### 2.1.6
580
+ - DOC: added docker badges
581
+ - FEATURE: added possibility to output all pgs results in one json file `--merge-outputs`
582
+ - FEATURE: added category to diplotype model
583
+ - FEATURE: added caching in genotyping module
584
+ ### 2.1.5
585
+ - BUG: biobankuk model output files now contain only alphanumeric characters
586
+ - BUG: biobankuk model code names with special characters are now being downloaded
587
+ ### 2.1.4
588
+ - FEATURE: added model_name and sample_name to description
589
+ ### 2.1.3
590
+ - FEATURE: added support for multiple models in pgs-compute
591
+ - FEATURE: added missing variants count to haplotype in haplotype model
592
+ - BUG: id field in haplotype model
593
+ ### 2.1.2
594
+ - FEATURE: allow gnomadid for variant in yml models
595
+ - FEATURE: added printing output option in pgs-compute
596
+ ### 2.1.1
597
+ - BUG: resolved NoneType bug with empty haplotype
598
+ ### 2.1.0
599
+ - FEATURE: haplotype model now works with phased data
600
+ ### 2.0.0
601
+ - FEATURE: switched to yaml model definitions
602
+ - FEATURE: implemented formula, score, haplotype and diplotype model types
603
+ - FEATURE: added gene symbols to description
604
+ - DEVOPS: prepared docker image with resources for building models
File without changes
@@ -0,0 +1,12 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Entrypoint module
5
+ """
6
+
7
+ import sys
8
+
9
+ from polygenic.pgstk import main
10
+
11
+ if __name__ == "__main__":
12
+ main(sys.argv[1:])
File without changes