pycmplot 0.1.9__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycmplot-0.1.9 → pycmplot-0.2.1}/LICENSE +1 -1
- pycmplot-0.2.1/PKG-INFO +231 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/README.md +8 -2
- {pycmplot-0.1.9 → pycmplot-0.2.1}/docs/conf.py +1 -1
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/__init__.py +1 -1
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/_core.py +42 -23
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/annotation.py +48 -45
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/cli.py +38 -16
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/constants.py +2 -2
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/io.py +115 -51
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/liftover.py +8 -8
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/plotting/circular.py +49 -40
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/plotting/linear.py +247 -46
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/resources.py +6 -6
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/stats.py +6 -6
- pycmplot-0.2.1/pycmplot.egg-info/PKG-INFO +231 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/entry_points.txt +0 -1
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/requires.txt +7 -0
- pycmplot-0.2.1/pycmplot.egg-info/top_level.txt +3 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pyproject.toml +8 -2
- {pycmplot-0.1.9 → pycmplot-0.2.1}/setup.cfg +2 -2
- pycmplot-0.1.9/PKG-INFO +0 -14
- pycmplot-0.1.9/pycmplot.egg-info/PKG-INFO +0 -14
- pycmplot-0.1.9/pycmplot.egg-info/top_level.txt +0 -1
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/hg19ToHg38.over.chain +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/SOURCES.txt +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/dependency_links.txt +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docs/conf.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_annotation.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_core_cli.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_io.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_liftover.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_plotting.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_resources_constants.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_stats.py +0 -0
- {pycmplot-0.1.9 → pycmplot-0.2.1}/setup.py +0 -0
pycmplot-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycmplot
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
|
|
5
|
+
Author: Kevin Esoh
|
|
6
|
+
Author-email: Kevin Esoh <kesohku1@jh.edu>
|
|
7
|
+
License-Expression: CC-BY-NC-SA-4.0
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas>=1.5
|
|
12
|
+
Requires-Dist: numpy>=1.23
|
|
13
|
+
Requires-Dist: matplotlib>=3.6
|
|
14
|
+
Requires-Dist: pillow>=9.0
|
|
15
|
+
Requires-Dist: pycirclize>=0.6
|
|
16
|
+
Requires-Dist: natsort>=8.0
|
|
17
|
+
Requires-Dist: adjustText>=0.8
|
|
18
|
+
Requires-Dist: pyliftover>=0.4
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest; extra == "dev"
|
|
21
|
+
Requires-Dist: black; extra == "dev"
|
|
22
|
+
Requires-Dist: ruff; extra == "dev"
|
|
23
|
+
Requires-Dist: towncrier; extra == "dev"
|
|
24
|
+
Requires-Dist: sphinx; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# pycmplot
|
|
28
|
+
|
|
29
|
+
Multi-track **circular** and **linear** Manhattan plot generation for GWAS summary statistics.
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
|
|
33
|
+
| PACKAGE FOR CIRCULAR AND LINEAR MANHATTAN PLOTTING |
|
|
34
|
+
| Kevin Esoh, 2026 |
|
|
35
|
+
| kesohku1@jh.edu |
|
|
36
|
+
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
This package will take any number of per SNP/variant summary statistics, be it GWAS,
|
|
40
|
+
selection scans (e.g. iHS, EHH, FST), etc., and generate Manhattan plots. If given a single
|
|
41
|
+
file, a single one-track Manhattan plot will be generated. Multiple files will result in
|
|
42
|
+
the generation of a multi-track stacked Manhattan plot.
|
|
43
|
+
|
|
44
|
+
In the process, the package will generate a **hits summary table** for variants with p-value
|
|
45
|
+
(or whatever statistic for significance is used) below the user-specified significance threshold.
|
|
46
|
+
This hits summary table will contain annotated gene names, in addition to other annotations, that
|
|
47
|
+
would then be used to annotate the plots.
|
|
48
|
+
|
|
49
|
+
Importantly, the package allows for conversion of hg19 genomic coordinates to hg38 coordinates.
|
|
50
|
+
This ensures that summary stats obtained using different imputation panels, for instance, can be
|
|
51
|
+
processed in the same run. That is, users can simply concatenate multiple summary stats files together,
|
|
52
|
+
such as those for the same trait but analysed using different imputation panels. Users only need to
|
|
53
|
+
add a new column specifying the genome build (hg19 or hg38) of the variants. Then the `--build_column`
|
|
54
|
+
option of the package should be used to indicate the column and then the package will liftover all
|
|
55
|
+
positions in hg19 to hg38, ensuring that hits table generation and plotting are done with one unified
|
|
56
|
+
coordinate system.
|
|
57
|
+
|
|
58
|
+
A key functionality of the package is its ability to auto-detect certain columns if omitted on the
|
|
59
|
+
command-line or python API:
|
|
60
|
+
- Chromosome column: `-chr, --chrom_column` or omitted
|
|
61
|
+
- Basepair position column: `-pos, --pos_column` or omitted
|
|
62
|
+
- SNP or Marker ID column: `-snp, --snp_column` or omitted
|
|
63
|
+
- P-value (or whatever value) column: `-p, --pval_column` or omitted
|
|
64
|
+
- Build version column: `-b, --build_column` or omitted
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
Candidate names for each of the columns are shown below.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# Resolve column names
|
|
71
|
+
chr_candidates = [chrom, 'CHR', 'CHROM', 'Chromosome', '#CHROM', '#CHR', 'Chrom', 'chrom', 'chr', 'chromosome', '#chr', '#chrom']
|
|
72
|
+
pos_candidates = [pos, 'BP', 'POS', 'bp', 'pos', 'Basepair']
|
|
73
|
+
snp_candidates = [snp, 'SNP', 'RSID', 'rsID', 'MarkerName', 'MarkerID', 'Predictor', 'Marker', 'SNPID', 'ID']
|
|
74
|
+
pvl_candidates = [pcol, 'P', 'P-value', 'Wald_P', 'pvalue', 'p_val', 'pval']
|
|
75
|
+
bld_candidates = [build, 'BUILD', 'Genome', 'Genome_Build', 'Genome-build']
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
> NB: Upper- and lower-case forms of the candidates are also considered, so each candidate expands to three variants.
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
Since GWAS summary stats files can be very large, to improve speed and memory efficiency, it is
|
|
82
|
+
**highly recommended** to use `-tp, --trim_pval` with a value to exclude variants with p-value above a
|
|
83
|
+
certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
|
|
84
|
+
|
|
85
|
+
A potentially useful application is **comparative visualization** of results from multiple imputation panels,
|
|
86
|
+
multiple populations, or multiple traits to observe shared genetic architecture.
|
|
87
|
+
|
|
88
|
+
Read more in the package documentation page: https://pycmplot.readthedocs.io/en/latest/
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Installation
|
|
93
|
+
|
|
94
|
+
### From PyPI
|
|
95
|
+
```bash
|
|
96
|
+
pip install pycmplot
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
### From GitHub
|
|
101
|
+
```bash
|
|
102
|
+
git clone https://github.com/esohkevin/pycmplot.git
|
|
103
|
+
|
|
104
|
+
cd pycmplot
|
|
105
|
+
|
|
106
|
+
pip install -e .
|
|
107
|
+
|
|
108
|
+
# or
|
|
109
|
+
|
|
110
|
+
pip install -e . --break-system-packages
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
### Use python virtual environment if local installation is not possible
|
|
115
|
+
```bash
|
|
116
|
+
python -m venv ~/bin/pycmplot
|
|
117
|
+
|
|
118
|
+
source ~/bin/pycmplot/bin/activate
|
|
119
|
+
|
|
120
|
+
pip install --upgrade pip setuptools wheel
|
|
121
|
+
|
|
122
|
+
# then follow any of the installation steps above
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
### Test the installation
|
|
127
|
+
```bash
|
|
128
|
+
pycmplot -h
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Dependencies
|
|
132
|
+
|
|
133
|
+
| Package | Purpose |
|
|
134
|
+
|---------|---------|
|
|
135
|
+
| pandas, numpy | Data loading & statistics |
|
|
136
|
+
| matplotlib | Plotting backend |
|
|
137
|
+
| pycirclize | Circular (Circos-style) tracks |
|
|
138
|
+
| natsort | Natural chromosome sorting |
|
|
139
|
+
| adjustText | Label collision avoidance |
|
|
140
|
+
| pyliftover | hg19 to hg38 coordinate conversion |
|
|
141
|
+
| Pillow | Image utilities |
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
## Command-line usage
|
|
147
|
+
|
|
148
|
+
### Linear Manhattan (default)
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
pycmplot \
|
|
152
|
+
--sum_stats HbF.tsv.gz,MCV.txt.gz,MCH.tsv.gz \
|
|
153
|
+
--labels HbF,MCV,MCH \
|
|
154
|
+
--logp \
|
|
155
|
+
--signif_line \
|
|
156
|
+
--highlight \
|
|
157
|
+
--annotate GENE \
|
|
158
|
+
--output_dir ./results \
|
|
159
|
+
--output_format png \
|
|
160
|
+
--dpi 300
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Circular Manhattan
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
pycmplot \
|
|
167
|
+
--sum_stats HbF.tsv.gz,MCV.tsv.gz \
|
|
168
|
+
--labels HbF,MCV \
|
|
169
|
+
--mode cm \
|
|
170
|
+
--trim_pval 0.01 \
|
|
171
|
+
--logp \
|
|
172
|
+
--signif_threshold \
|
|
173
|
+
--plot_title "RBC Traits" \
|
|
174
|
+
--output_dir ./results
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Key options
|
|
178
|
+
|
|
179
|
+
| Flag | Description | Default |
|
|
180
|
+
|------|-------------|---------|
|
|
181
|
+
| `-s, --sum_stats` | Comma-separated sumstats files | **required** |
|
|
182
|
+
| `-l, --labels` | Comma-separated track labels | **required** |
|
|
183
|
+
| `-b, --build` | Comma-separated genome builds of sumstats | off |
|
|
184
|
+
| `-bc, --build_column` | Genome build column name (containing hg18/hg19/hg38) | off |
|
|
185
|
+
| `-m, --mode` | `lm` linear or `cm` circular | `lm` |
|
|
186
|
+
| `-qq, --qq_plot` | Also generate a QQ-plot | off (coming soon...) |
|
|
187
|
+
| `--logp` | Plot -log10(p) | off |
|
|
188
|
+
| `-sig, --signif_threshold` | Genome-wide significance threshold | off (auto 0.05/N) |
|
|
189
|
+
| `-sigl, --signif_line` | Value for genome-wide significance line if different from `-sig` | 5e-8 |
|
|
190
|
+
| `-sug, --suggest_threshold` | Threshold for suggestive signals | off |
|
|
191
|
+
| `-hl, --highlight` | Highlight significant loci | off |
|
|
192
|
+
| `-a, --annotate` | Annotate with `snp`, `gene`, or any column in `hits_table` | `snp` |
|
|
193
|
+
| `-tp, --trim_pval` | Trim variants above this p-value for speed | off |
|
|
194
|
+
| `-st, --sort_track` | Sort tracks by `label` or `chrom_len` | input order |
|
|
195
|
+
| `-od, --output_dir` | Output directory | `.` |
|
|
196
|
+
| `-of, --output_format` | Output format (`png`, `pdf`, `svg`, `jpg`) | `png` |
|
|
197
|
+
|
|
198
|
+
Run `pycmplot -h` for the full option list.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Python API
|
|
203
|
+
|
|
204
|
+
A demonstration of how to use the python API is provided in this notebook: https://github.com/esohkevin/pycmplot/blob/main/pycmplot_python_api.ipynb
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Package structure
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
pycmplot/
|
|
213
|
+
├── pyproject.toml
|
|
214
|
+
├── setup.py
|
|
215
|
+
├── setup.cfg
|
|
216
|
+
├── README.md
|
|
217
|
+
└── pycmplot/
|
|
218
|
+
├── __init__.py # public API exports
|
|
219
|
+
├── __main__.py # python -m pycmplot
|
|
220
|
+
├── _core.py # main() orchestration
|
|
221
|
+
├── cli.py # argparse definitions
|
|
222
|
+
├── constants.py # chromosome lengths, biotype weights
|
|
223
|
+
├── resources.py # external resource path config
|
|
224
|
+
├── io.py # sumstat loading, delimiter detection
|
|
225
|
+
├── stats.py # get_lead_snps, get_highlight_snps
|
|
226
|
+
├── liftover.py # lazy hg19→hg38 liftover
|
|
227
|
+
├── annotation.py # nearest-gene annotation, hits table
|
|
228
|
+
└── plotting/
|
|
229
|
+
├── __init__.py
|
|
230
|
+
├── linear.py # plot_linear
|
|
231
|
+
└── circular.py # plot_circular, compute_track_radii_dict
|
|
@@ -49,6 +49,9 @@ pvl_candidates = [pcol, 'P', 'P-value', 'Wald_P', 'pvalue', 'p_val', 'pval']
|
|
|
49
49
|
bld_candidates = [build, 'BUILD', 'Genome', 'Genome_Build', 'Genome-build']
|
|
50
50
|
```
|
|
51
51
|
|
|
52
|
+
> NB: Upper- and lower-case forms of the candidates are also considered, so each candidate expands to three variants.
|
|
53
|
+
|
|
54
|
+
|
|
52
55
|
Since GWAS summary stats files can be very large, to improve speed and memory efficiency, it is
|
|
53
56
|
**highly recommended** to use `-tp, --trim_pval` with a value to exclude variants with p-value above a
|
|
54
57
|
certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
|
|
@@ -56,6 +59,8 @@ certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
|
|
|
56
59
|
A potentially useful application is **comparative visualization** of results from multiple imputation panels,
|
|
57
60
|
multiple populations, or multiple traits to observe shared genetic architecture.
|
|
58
61
|
|
|
62
|
+
Read more in the package documentation page: https://pycmplot.readthedocs.io/en/latest/
|
|
63
|
+
|
|
59
64
|
---
|
|
60
65
|
|
|
61
66
|
## Installation
|
|
@@ -149,7 +154,8 @@ pycmplot \
|
|
|
149
154
|
|------|-------------|---------|
|
|
150
155
|
| `-s, --sum_stats` | Comma-separated sumstats files | **required** |
|
|
151
156
|
| `-l, --labels` | Comma-separated track labels | **required** |
|
|
152
|
-
| `-b, --
|
|
157
|
+
| `-b, --build` | Comma-separated genome builds of sumstats | off |
|
|
158
|
+
| `-bc, --build_column` | Genome build column name (containing hg18/hg19/hg38) | off |
|
|
153
159
|
| `-m, --mode` | `lm` linear or `cm` circular | `lm` |
|
|
154
160
|
| `-qq, --qq_plot` | Also generate a QQ-plot | off (coming soon...) |
|
|
155
161
|
| `--logp` | Plot -log10(p) | off |
|
|
@@ -157,7 +163,7 @@ pycmplot \
|
|
|
157
163
|
| `-sigl, --signif_line` | Value for genome-wide significance line if different from `-sig` | 5e-8 |
|
|
158
164
|
| `-sug, --suggest_threshold` | Threshold for suggestive signals | off |
|
|
159
165
|
| `-hl, --highlight` | Highlight significant loci | off |
|
|
160
|
-
| `-a, --annotate` | Annotate with `
|
|
166
|
+
| `-a, --annotate` | Annotate with `snp`, `gene`, or any column in `hits_table` | `snp` |
|
|
161
167
|
| `-tp, --trim_pval` | Trim variants above this p-value for speed | off |
|
|
162
168
|
| `-st, --sort_track` | Sort tracks by `label` or `chrom_len` | input order |
|
|
163
169
|
| `-od, --output_dir` | Output directory | `.` |
|
|
@@ -12,7 +12,7 @@ sys.path.insert(0, os.path.abspath(".."))
|
|
|
12
12
|
project = "pycmplot"
|
|
13
13
|
copyright = "2026, Kevin Esoh"
|
|
14
14
|
author = "Kevin Esoh"
|
|
15
|
-
release = "0.1
|
|
15
|
+
release = "0.2.1" # update to match PyPI version
|
|
16
16
|
|
|
17
17
|
# -- General configuration -----------------------------------------------------
|
|
18
18
|
extensions = [
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
CORE_MODULE =
|
|
3
|
+
CORE_MODULE = """
|
|
4
4
|
pycmplot._core
|
|
5
5
|
==============
|
|
6
6
|
|
|
@@ -12,7 +12,7 @@ work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`, and
|
|
|
12
12
|
All imports are deferred inside :func:`main` so that
|
|
13
13
|
``import pycmplot`` remains fast regardless of the size of the dependency
|
|
14
14
|
tree.
|
|
15
|
-
"""
|
|
15
|
+
"""
|
|
16
16
|
|
|
17
17
|
import logging
|
|
18
18
|
import warnings
|
|
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def main() -> None:
|
|
29
|
-
MAIN =
|
|
29
|
+
MAIN = """Orchestrate the full pycmplot pipeline from the command line.
|
|
30
30
|
|
|
31
31
|
This function is registered as the ``pycmplot`` console-script entry point
|
|
32
32
|
in ``pyproject.toml`` / ``setup.cfg``. It performs the following steps in
|
|
@@ -75,7 +75,7 @@ def main() -> None:
|
|
|
75
75
|
Linear Manhattan plotter called for ``--mode lm`` (default).
|
|
76
76
|
pycmplot.plotting.circular.plot_circular :
|
|
77
77
|
Circular Manhattan plotter called for ``--mode cm``.
|
|
78
|
-
"""
|
|
78
|
+
"""
|
|
79
79
|
|
|
80
80
|
# ------------------------------------------------------------------
|
|
81
81
|
# Deferred imports so ``import pycmplot`` remains fast
|
|
@@ -105,7 +105,8 @@ def main() -> None:
|
|
|
105
105
|
chrom_arg = args.chrom_column
|
|
106
106
|
pos_arg = args.pos_column
|
|
107
107
|
snp_arg = args.snp_column
|
|
108
|
-
build_arg = args.
|
|
108
|
+
build_arg = args.build
|
|
109
|
+
buildc_arg = args.build_column
|
|
109
110
|
labels_raw = args.labels
|
|
110
111
|
pcol_arg = args.pval_column
|
|
111
112
|
logp = args.logp
|
|
@@ -123,13 +124,13 @@ def main() -> None:
|
|
|
123
124
|
point_size = args.point_size
|
|
124
125
|
highlight = args.highlight
|
|
125
126
|
highlight_thresh = args.highlight_thresh
|
|
126
|
-
|
|
127
|
+
highlight_color = args.highlight_color
|
|
127
128
|
highlight_line = args.highlight_line
|
|
128
|
-
|
|
129
|
+
highlight_line_color = args.highlight_line_color
|
|
129
130
|
colors_raw = args.colors
|
|
130
|
-
r_min = args.
|
|
131
|
-
r_max = args.
|
|
132
|
-
pad = args.
|
|
131
|
+
r_min = args.min_radius
|
|
132
|
+
r_max = args.max_radius
|
|
133
|
+
pad = args.circular_track_spacing
|
|
133
134
|
output_format = args.output_format
|
|
134
135
|
output_dir = args.output_dir
|
|
135
136
|
dpi = args.dpi
|
|
@@ -142,18 +143,20 @@ def main() -> None:
|
|
|
142
143
|
|
|
143
144
|
|
|
144
145
|
# ------------------------------------------------------------------
|
|
145
|
-
# Sumstat, labels, colours, track heights str to list
|
|
146
|
+
# Sumstat, labels, colours, track heights [build] str to list
|
|
146
147
|
# ------------------------------------------------------------------
|
|
147
148
|
(
|
|
148
149
|
sum_stats,
|
|
149
150
|
labels,
|
|
150
151
|
colors,
|
|
151
|
-
t_heights
|
|
152
|
+
t_heights,
|
|
153
|
+
builds
|
|
152
154
|
) = strip_comma_separated_input_streams(
|
|
153
155
|
sum_stats = sum_stats_raw,
|
|
154
156
|
labels = labels_raw,
|
|
155
157
|
colors_raw = colors_raw,
|
|
156
158
|
track_heights = track_heights,
|
|
159
|
+
builds = build_arg if build_arg else None,
|
|
157
160
|
)
|
|
158
161
|
|
|
159
162
|
# ------------------------------------------------------------------
|
|
@@ -182,7 +185,8 @@ def main() -> None:
|
|
|
182
185
|
pos = pos_arg,
|
|
183
186
|
snp = snp_arg,
|
|
184
187
|
pcol = pcol_arg,
|
|
185
|
-
|
|
188
|
+
buildc = buildc_arg,
|
|
189
|
+
build = builds
|
|
186
190
|
)
|
|
187
191
|
|
|
188
192
|
# ------------------------------------------------------------------
|
|
@@ -212,6 +216,19 @@ def main() -> None:
|
|
|
212
216
|
resources=resources,
|
|
213
217
|
)
|
|
214
218
|
|
|
219
|
+
# ------------------------------------------------------------------
|
|
220
|
+
# ANNOTATE BY
|
|
221
|
+
# ------------------------------------------------------------------
|
|
222
|
+
if annotate:
|
|
223
|
+
if str(annotate).upper() == "GENE":
|
|
224
|
+
label_col = 'top_gene'
|
|
225
|
+
elif str(annotate).upper() == "SNP":
|
|
226
|
+
label_col = 'SNP'
|
|
227
|
+
else:
|
|
228
|
+
label_col = annotate
|
|
229
|
+
|
|
230
|
+
logger.info(f"Anotate by: {label_col}")
|
|
231
|
+
|
|
215
232
|
# ------------------------------------------------------------------
|
|
216
233
|
# CIRCULAR MANHATTAN
|
|
217
234
|
# ------------------------------------------------------------------
|
|
@@ -224,15 +241,16 @@ def main() -> None:
|
|
|
224
241
|
signif_lines = signif_lines,
|
|
225
242
|
highlight = highlight,
|
|
226
243
|
highlight_thresh = highlight_thresh,
|
|
227
|
-
|
|
244
|
+
highlight_color = highlight_color,
|
|
228
245
|
highlight_line = highlight_line,
|
|
229
|
-
|
|
246
|
+
highlight_line_color = highlight_line_color,
|
|
230
247
|
colors = colors,
|
|
231
248
|
chrom_label_side = chrom_label_side,
|
|
232
249
|
chrom_label_size = chrom_label_size,
|
|
233
250
|
track_label_size = track_label_size,
|
|
234
251
|
track_label_orientation = track_label_orientation,
|
|
235
252
|
annotate = annotate,
|
|
253
|
+
label_col = label_col if annotate else None,
|
|
236
254
|
annotation_size = annotation_size,
|
|
237
255
|
hits_table = hits_table,
|
|
238
256
|
sector_sizes = merged_assoc_sector_sizes,
|
|
@@ -253,24 +271,25 @@ def main() -> None:
|
|
|
253
271
|
else:
|
|
254
272
|
logger.info("Generating LINEAR MANHATTAN Plot ...")
|
|
255
273
|
plot_linear(
|
|
256
|
-
sumstats_loaded
|
|
257
|
-
track_heights
|
|
274
|
+
sumstats_loaded=sumstats_loaded,
|
|
275
|
+
track_heights=t_heights,
|
|
258
276
|
trim_pval=trim_pval,
|
|
259
277
|
logp=True if logp else False,
|
|
260
278
|
point_size=point_size,
|
|
261
279
|
highlight=highlight,
|
|
262
280
|
highlight_thresh=highlight_thresh,
|
|
263
|
-
|
|
264
|
-
highlight_line
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
281
|
+
highlight_color=highlight_color,
|
|
282
|
+
highlight_line=highlight_line,
|
|
283
|
+
highlight_line_color=highlight_line_color,
|
|
284
|
+
annotate=annotate,
|
|
285
|
+
hits_table=hits_table if not hits_table.empty else None,
|
|
286
|
+
label_col=label_col if annotate else None,
|
|
268
287
|
chr_spacing=chr_spacing,
|
|
269
288
|
linear_track_spacing=linear_track_spacing,
|
|
270
289
|
colors=colors,
|
|
271
290
|
signif_lines=signif_lines,
|
|
272
291
|
plot_title=plot_title,
|
|
273
|
-
no_track_labels
|
|
292
|
+
no_track_labels=no_track_labels,
|
|
274
293
|
dpi=dpi,
|
|
275
294
|
output_format=output_format,
|
|
276
295
|
output_dir=output_dir,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
MODULE_DOCSTRING =
|
|
3
|
+
MODULE_DOCSTRING = """
|
|
4
4
|
pycmplot.annotation
|
|
5
5
|
====================
|
|
6
6
|
|
|
@@ -20,7 +20,7 @@ Annotation relies on a bundled Ensembl gene-info TSV (hg38 or hg19). The
|
|
|
20
20
|
file is resolved through :class:`~pycmplot.resources.ResourceConfig`; custom
|
|
21
21
|
paths can be supplied via the ``PYCMPLOT_GENEINFO_HG38`` /
|
|
22
22
|
``PYCMPLOT_GENEINFO_HG19`` environment variables.
|
|
23
|
-
"""
|
|
23
|
+
"""
|
|
24
24
|
|
|
25
25
|
import bisect
|
|
26
26
|
import logging
|
|
@@ -41,7 +41,7 @@ logger = logging.getLogger(__name__)
|
|
|
41
41
|
# ---------------------------------------------------------------------------
|
|
42
42
|
|
|
43
43
|
def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
|
|
44
|
-
BUILD_GENES_DICT =
|
|
44
|
+
BUILD_GENES_DICT = """Build a chromosome-keyed interval dictionary with sorted start positions.
|
|
45
45
|
|
|
46
46
|
Pre-processes the gene reference DataFrame into a structure that supports
|
|
47
47
|
efficient O(log N) binary-search lookup of genes near a query position.
|
|
@@ -67,7 +67,7 @@ def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
|
|
|
67
67
|
-----
|
|
68
68
|
This function is called once per :func:`get_hits_summary_table` invocation;
|
|
69
69
|
the result is passed to :func:`_annotate_variant` for each lead SNP.
|
|
70
|
-
"""
|
|
70
|
+
"""
|
|
71
71
|
|
|
72
72
|
genes_df = genes_df.sort_values(["CHR", "START"])
|
|
73
73
|
genes_dict: dict = {}
|
|
@@ -98,7 +98,7 @@ def _annotate_variant(
|
|
|
98
98
|
window: int = 500_000,
|
|
99
99
|
promoter_window: int = 2_000,
|
|
100
100
|
) -> dict:
|
|
101
|
-
ANNOTATE_VARIANT =
|
|
101
|
+
ANNOTATE_VARIANT = """Return strand-aware nearest-gene annotation for a single variant.
|
|
102
102
|
|
|
103
103
|
Searches the pre-built *genes_dict* within *window* bp of *pos* on
|
|
104
104
|
*chrom*. Reports the nearest upstream and downstream genes (relative to
|
|
@@ -138,7 +138,7 @@ def _annotate_variant(
|
|
|
138
138
|
within *promoter_window* bp upstream of any TSS.
|
|
139
139
|
* ``gene_density`` (int) – number of genes with any overlap in the
|
|
140
140
|
search window.
|
|
141
|
-
"""
|
|
141
|
+
"""
|
|
142
142
|
|
|
143
143
|
_empty = {
|
|
144
144
|
"genic": False,
|
|
@@ -238,7 +238,7 @@ def _annotate_and_prioritize_variant(
|
|
|
238
238
|
promoter_window: int = 2_000,
|
|
239
239
|
biotype_weights: Optional[dict] = None,
|
|
240
240
|
) -> Optional[dict]:
|
|
241
|
-
ANNOTATE_PRIORITIZE =
|
|
241
|
+
ANNOTATE_PRIORITIZE = """Score and rank candidate genes for a single variant using a composite
|
|
242
242
|
priority metric.
|
|
243
243
|
|
|
244
244
|
Builds a candidate gene set within *window* bp of *pos* on *chrom*, then
|
|
@@ -287,7 +287,7 @@ def _annotate_and_prioritize_variant(
|
|
|
287
287
|
For intergenic variants, ``top_gene`` contains the two nearest flanking
|
|
288
288
|
gene symbols joined by ``'-'`` (e.g. ``'HBB-HBD'``) and ``biotype``
|
|
289
289
|
is set to ``'intergenic'``.
|
|
290
|
-
"""
|
|
290
|
+
"""
|
|
291
291
|
|
|
292
292
|
if biotype_weights is None:
|
|
293
293
|
biotype_weights = BIOTYPE_WEIGHTS
|
|
@@ -386,7 +386,7 @@ def _annotate_and_prioritize_variant(
|
|
|
386
386
|
# ---------------------------------------------------------------------------
|
|
387
387
|
|
|
388
388
|
def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
|
|
389
|
-
CLUMP_BY_DISTANCE =
|
|
389
|
+
CLUMP_BY_DISTANCE = """Reduce a lead-SNP table to one representative SNP per locus.
|
|
390
390
|
|
|
391
391
|
Applies greedy distance-based clumping within each chromosome group,
|
|
392
392
|
starting from the most significant SNP (lowest ``P`` or highest ``logP``).
|
|
@@ -406,7 +406,7 @@ def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
|
|
|
406
406
|
pandas.DataFrame
|
|
407
407
|
Deduplicated locus representatives sorted by chromosome and position
|
|
408
408
|
(natural sort order).
|
|
409
|
-
"""
|
|
409
|
+
"""
|
|
410
410
|
|
|
411
411
|
window = window_kb * 1000
|
|
412
412
|
clumped: list[pd.Series] = []
|
|
@@ -438,7 +438,7 @@ def get_hits_summary_table(
|
|
|
438
438
|
table_out: Optional[str] = None,
|
|
439
439
|
resources: Optional[ResourceConfig] = None,
|
|
440
440
|
) -> pd.DataFrame:
|
|
441
|
-
GET_HITS_SUMMARY_TABLE =
|
|
441
|
+
GET_HITS_SUMMARY_TABLE = """Annotate lead SNPs with nearest genes and write the locus summary table.
|
|
442
442
|
|
|
443
443
|
For each lead SNP in *leads_df*, runs two complementary annotation passes:
|
|
444
444
|
|
|
@@ -528,51 +528,54 @@ def get_hits_summary_table(
|
|
|
528
528
|
SNP CHR POS top_gene biotype
|
|
529
529
|
0 rs123456 2 60718043 BCL11A protein_coding
|
|
530
530
|
1 rs789012 11 5246696 HBB protein_coding
|
|
531
|
-
"""
|
|
531
|
+
"""
|
|
532
532
|
|
|
533
533
|
if resources is None:
|
|
534
534
|
resources = default_resources
|
|
535
535
|
|
|
536
536
|
# Choose gene info file based on build
|
|
537
|
-
if
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
537
|
+
if 'BUILD' in leads_df.columns:
|
|
538
|
+
if "OLD_POS" not in leads_df.columns and list(set(leads_df["BUILD"])) == ["hg19"]:
|
|
539
|
+
geneinfo_path = resources.require("geneinfo_hg19")
|
|
540
|
+
else:
|
|
541
|
+
geneinfo_path = resources.require("geneinfo_hg38")
|
|
541
542
|
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
543
|
+
logger.info("Loading gene info from: %s", geneinfo_path)
|
|
544
|
+
geneinfo = pd.read_csv(geneinfo_path, header=0, sep="\t")
|
|
545
|
+
genes_dict = _build_genes_dict(geneinfo)
|
|
545
546
|
|
|
546
|
-
|
|
547
|
-
|
|
547
|
+
window = window_kb * 1_000
|
|
548
|
+
records: list[dict] = []
|
|
548
549
|
|
|
549
550
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
551
|
+
logger.info("Annotating lead variants and generating hits summary table ...")
|
|
552
|
+
for _, row in leads_df.iterrows():
|
|
553
|
+
annotation = _annotate_variant(
|
|
554
|
+
chrom=row["CHR"],
|
|
555
|
+
pos=row["POS"],
|
|
556
|
+
genes_dict=genes_dict,
|
|
557
|
+
window=window,
|
|
558
|
+
)
|
|
559
|
+
prioritized = _annotate_and_prioritize_variant(
|
|
560
|
+
chrom=row["CHR"],
|
|
561
|
+
pos=row["POS"],
|
|
562
|
+
genes_df=geneinfo,
|
|
563
|
+
lead_snps_df=leads_df,
|
|
564
|
+
window=window,
|
|
565
|
+
)
|
|
565
566
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
567
|
+
record = {
|
|
568
|
+
**(row.to_dict()),
|
|
569
|
+
**(annotation if annotation is not None else {}),
|
|
570
|
+
**(prioritized if prioritized is not None else {}),
|
|
571
|
+
}
|
|
572
|
+
records.append(record)
|
|
572
573
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
574
|
+
locus_table = pd.DataFrame(records).sort_values(
|
|
575
|
+
["CHR", "POS"], key=natsort.natsort_keygen()
|
|
576
|
+
)
|
|
577
|
+
else:
|
|
578
|
+
locus_table = leads_df
|
|
576
579
|
|
|
577
580
|
if table_out is not None:
|
|
578
581
|
locus_table.to_csv(table_out, index=False, sep="\t", na_rep="None")
|