pycmplot 0.1.9__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {pycmplot-0.1.9 → pycmplot-0.2.1}/LICENSE +1 -1
  2. pycmplot-0.2.1/PKG-INFO +231 -0
  3. {pycmplot-0.1.9 → pycmplot-0.2.1}/README.md +8 -2
  4. {pycmplot-0.1.9 → pycmplot-0.2.1}/docs/conf.py +1 -1
  5. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/__init__.py +1 -1
  6. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/_core.py +42 -23
  7. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/annotation.py +48 -45
  8. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/cli.py +38 -16
  9. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/constants.py +2 -2
  10. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/io.py +115 -51
  11. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/liftover.py +8 -8
  12. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/plotting/circular.py +49 -40
  13. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/plotting/linear.py +247 -46
  14. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/resources.py +6 -6
  15. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/stats.py +6 -6
  16. pycmplot-0.2.1/pycmplot.egg-info/PKG-INFO +231 -0
  17. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/entry_points.txt +0 -1
  18. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/requires.txt +7 -0
  19. pycmplot-0.2.1/pycmplot.egg-info/top_level.txt +3 -0
  20. {pycmplot-0.1.9 → pycmplot-0.2.1}/pyproject.toml +8 -2
  21. {pycmplot-0.1.9 → pycmplot-0.2.1}/setup.cfg +2 -2
  22. pycmplot-0.1.9/PKG-INFO +0 -14
  23. pycmplot-0.1.9/pycmplot.egg-info/PKG-INFO +0 -14
  24. pycmplot-0.1.9/pycmplot.egg-info/top_level.txt +0 -1
  25. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
  26. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
  27. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot/data/hg19ToHg38.over.chain +0 -0
  28. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/SOURCES.txt +0 -0
  29. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot.egg-info/dependency_links.txt +0 -0
  30. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docs/conf.py +0 -0
  31. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_annotation.py +0 -0
  32. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_core_cli.py +0 -0
  33. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_io.py +0 -0
  34. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_liftover.py +0 -0
  35. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_plotting.py +0 -0
  36. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_resources_constants.py +0 -0
  37. {pycmplot-0.1.9 → pycmplot-0.2.1}/pycmplot_docs/docstrings_stats.py +0 -0
  38. {pycmplot-0.1.9 → pycmplot-0.2.1}/setup.py +0 -0
@@ -1,4 +1,4 @@
1
- CC BY-NC-SA 4.0 License
1
+ CC-BY-NC-SA-4.0 License
2
2
 
3
3
  Copyright (c) 2026 Kevin Esoh
4
4
 
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycmplot
3
+ Version: 0.2.1
4
+ Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
5
+ Author: Kevin Esoh
6
+ Author-email: Kevin Esoh <kesohku1@jh.edu>
7
+ License-Expression: CC-BY-NC-SA-4.0
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: pandas>=1.5
12
+ Requires-Dist: numpy>=1.23
13
+ Requires-Dist: matplotlib>=3.6
14
+ Requires-Dist: pillow>=9.0
15
+ Requires-Dist: pycirclize>=0.6
16
+ Requires-Dist: natsort>=8.0
17
+ Requires-Dist: adjustText>=0.8
18
+ Requires-Dist: pyliftover>=0.4
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest; extra == "dev"
21
+ Requires-Dist: black; extra == "dev"
22
+ Requires-Dist: ruff; extra == "dev"
23
+ Requires-Dist: towncrier; extra == "dev"
24
+ Requires-Dist: sphinx; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # pycmplot
28
+
29
+ Multi-track **circular** and **linear** Manhattan plot generation for GWAS summary statistics.
30
+
31
+ ```
32
+ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
33
+ | PACKAGE FOR CIRCULAR AND LINEAR MANHATTAN PLOTTING |
34
+ | Kevin Esoh, 2026 |
35
+ | kesohku1@jh.edu |
36
+ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
37
+ ```
38
+
39
+ This package will take any number of per SNP/variant summary statistics, be it GWAS,
40
+ selection scans (e.g. iHS, EHH, FST), etc., and generate Manhattan plots. If given a single
41
+ file, a single one-track Manhattan plot will be generated. Multiple files will result in
42
+ the generation of a multi-track stacked Manhattan plot.
43
+
44
+ In the process, the package will generate a **hits summary table** for variants with p-value
45
+ (or whatever statistic for significance is used) below the user-specified significance threshold.
46
+ This hits summary table will contain annotated gene names, in addition to other annotations, that
47
+ would then be used to annotate the plots.
48
+
49
+ Importantly, the package allows for conversion of hg19 genomic coordinates to hg38 coordinates.
50
+ This ensures that summary stats obtained using different imputation panels, for instance, can be
51
+ processed in the same run. That is, users can simply concatenate multiple summary stats files together,
52
+ such as those for the same trait but analysed using different imputation panels. Users only need to
53
+ add a new column specifying the genome build (hg19 or hg38) of the variants. Then the `--build_column`
54
+ option of the package should be used to indicate the column and then the package will liftover all
55
+ positions in hg19 to hg38 ensuring that hits table generation and plotting are done with one unified
56
+ coordinate system.
57
+
58
+ A key functionality of the package is its ability to auto-detect certain columns if omitted on the
59
+ command-line or python API:
60
+ - Chromosome column: `-chr, --chrom_column` or omitted
61
+ - Basepair position column: `-pos, --pos_column` or omitted
62
+ - SNP or Marker ID column: `-snp, --snp_column` or omitted
63
+ - P-value (or whatever value) column: `-p, --pval_column` or omitted
64
+ - Build version column: `-b, --build_column` or omitted
65
+
66
+
67
+ Candidate names for each of the columns are shown below.
68
+
69
+ ```python
70
+ # Resolve column names
71
+ chr_candidates = [chrom, 'CHR', 'CHROM', 'Chromosome', '#CHROM', '#CHR', 'Chrom', 'chrom', 'chr', 'chromosome', '#chr', '#chrom']
72
+ pos_candidates = [pos, 'BP', 'POS', 'bp', 'pos', 'Basepair']
73
+ snp_candidates = [snp, 'SNP', 'RSID', 'rsID', 'MarkerName', 'MarkerID', 'Predictor', 'Marker', 'SNPID', 'ID']
74
+ pvl_candidates = [pcol, 'P', 'P-value', 'Wald_P', 'pvalue', 'p_val', 'pval']
75
+ bld_candidates = [build, 'BUILD', 'Genome', 'Genome_Build', 'Genome-build']
76
+ ```
77
+
78
+ > NB: The upper-case and lower-case forms of each candidate are also considered, so each candidate is effectively expanded into three variants.
79
+
80
+
81
+ Since GWAS summary stats files can be very large, to improve speed and memory efficiency, it is
82
+ **highly recommended** to use `-tp, --trim_pval` with a value to exclude variants with p-value above a
83
+ certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
84
+
85
+ A potentially useful application is **comparative visualization** of results from multiple imputation panels,
86
+ multiple populations, or multiple traits to observe shared genetic architecture.
87
+
88
+ Read more in the package documentation page: https://pycmplot.readthedocs.io/en/latest/
89
+
90
+ ---
91
+
92
+ ## Installation
93
+
94
+ ### From PyPI
95
+ ```bash
96
+ pip install pycmplot
97
+ ```
98
+
99
+
100
+ ### From GitHub
101
+ ```bash
102
+ git clone https://github.com/esohkevin/pycmplot.git
103
+
104
+ cd pycmplot
105
+
106
+ pip install -e .
107
+
108
+ # or
109
+
110
+ pip install -e . --break-system-packages
111
+ ```
112
+
113
+
114
+ ### Use python virtual environment if local installation is not possible
115
+ ```bash
116
+ python -m venv ~/bin/pycmplot
117
+
118
+ source ~/bin/pycmplot/bin/activate
119
+
120
+ pip install --upgrade pip setuptools wheel
121
+
122
+ # then follow any of the installation steps above
123
+ ```
124
+
125
+
126
+ ### Test the installation
127
+ ```bash
128
+ pycmplot -h
129
+ ```
130
+
131
+ ### Dependencies
132
+
133
+ | Package | Purpose |
134
+ |---------|---------|
135
+ | pandas, numpy | Data loading & statistics |
136
+ | matplotlib | Plotting backend |
137
+ | pycirclize | Circular (Circos-style) tracks |
138
+ | natsort | Natural chromosome sorting |
139
+ | adjustText | Label collision avoidance |
140
+ | pyliftover | hg19 to hg38 coordinate conversion |
141
+ | Pillow | Image utilities |
142
+
143
+ ---
144
+
145
+
146
+ ## Command-line usage
147
+
148
+ ### Linear Manhattan (default)
149
+
150
+ ```bash
151
+ pycmplot \
152
+ --sum_stats HbF.tsv.gz,MCV.txt.gz,MCH.tsv.gz \
153
+ --labels HbF,MCV,MCH \
154
+ --logp \
155
+ --signif_line \
156
+ --highlight \
157
+ --annotate GENE \
158
+ --output_dir ./results \
159
+ --output_format png \
160
+ --dpi 300
161
+ ```
162
+
163
+ ### Circular Manhattan
164
+
165
+ ```bash
166
+ pycmplot \
167
+ --sum_stats HbF.tsv.gz,MCV.tsv.gz \
168
+ --labels HbF,MCV \
169
+ --mode cm \
170
+ --trim_pval 0.01 \
171
+ --logp \
172
+ --signif_threshold \
173
+ --plot_title "RBC Traits" \
174
+ --output_dir ./results
175
+ ```
176
+
177
+ ### Key options
178
+
179
+ | Flag | Description | Default |
180
+ |------|-------------|---------|
181
+ | `-s, --sum_stats` | Comma-separated sumstats files | **required** |
182
+ | `-l, --labels` | Comma-separated track labels | **required** |
183
+ | `-b, --build` | Comma-separated genome builds of sumstats | off |
184
+ | `-bc, --build_column` | Genome build column name (containing hg18/hg19/hg38) | off |
185
+ | `-m, --mode` | `lm` linear or `cm` circular | `lm` |
186
+ | `-qq, --qq_plot` | Also generate a QQ-plot | off (coming soon...) |
187
+ | `--logp` | Plot -log10(p) | off |
188
+ | `-sig, --signif_threshold` | Genome-wide significance threshold | off (auto 0.05/N) |
189
+ | `-sigl, --signif_line` | Value for genome-wide significance line if different from `-sig` | 5e-8 |
190
+ | `-sug, --suggest_threshold` | Threshold for suggestive signals | off |
191
+ | `-hl, --highlight` | Highlight significant loci | off |
192
+ | `-a, --annotate` | Annotate with `snp`, `gene`, or any column in `hits_table` | `snp` |
193
+ | `-tp, --trim_pval` | Trim variants above this p-value for speed | off |
194
+ | `-st, --sort_track` | Sort tracks by `label` or `chrom_len` | input order |
195
+ | `-od, --output_dir` | Output directory | `.` |
196
+ | `-of, --output_format` | Output format (`png`, `pdf`, `svg`, `jpg`) | `png` |
197
+
198
+ Run `pycmplot -h` for the full option list.
199
+
200
+ ---
201
+
202
+ ## Python API
203
+
204
+ A demonstration of how to use the python API is provided in this notebook: https://github.com/esohkevin/pycmplot/blob/main/pycmplot_python_api.ipynb
205
+
206
+
207
+ ---
208
+
209
+ ## Package structure
210
+
211
+ ```
212
+ pycmplot/
213
+ ├── pyproject.toml
214
+ ├── setup.py
215
+ ├── setup.cfg
216
+ ├── README.md
217
+ └── pycmplot/
218
+ ├── __init__.py # public API exports
219
+ ├── __main__.py # python -m pycmplot
220
+ ├── _core.py # main() orchestration
221
+ ├── cli.py # argparse definitions
222
+ ├── constants.py # chromosome lengths, biotype weights
223
+ ├── resources.py # external resource path config
224
+ ├── io.py # sumstat loading, delimiter detection
225
+ ├── stats.py # get_lead_snps, get_highlight_snps
226
+ ├── liftover.py # lazy hg19→hg38 liftover
227
+ ├── annotation.py # nearest-gene annotation, hits table
228
+ └── plotting/
229
+ ├── __init__.py
230
+ ├── linear.py # plot_linear
231
+ └── circular.py # plot_circular, compute_track_radii_dict
@@ -49,6 +49,9 @@ pvl_candidates = [pcol, 'P', 'P-value', 'Wald_P', 'pvalue', 'p_val', 'pval']
49
49
  bld_candidates = [build, 'BUILD', 'Genome', 'Genome_Build', 'Genome-build']
50
50
  ```
51
51
 
52
+ > NB: The upper-case and lower-case forms of each candidate are also considered, so each candidate is effectively expanded into three variants.
53
+
54
+
52
55
  Since GWAS summary stats files can be very large, to improve speed and memory efficiency, it is
53
56
  **highly recommended** to use `-tp, --trim_pval` with a value to exclude variants with p-value above a
54
57
  certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
@@ -56,6 +59,8 @@ certain threshold, e.g. `0.01 (1e-2)` or `0.001 (1e-3)`.
56
59
  A potentially useful application is **comparative visualization** of results from multiple imputation panels,
57
60
  multiple populations, or multiple traits to observe shared genetic architecture.
58
61
 
62
+ Read more in the package documentation page: https://pycmplot.readthedocs.io/en/latest/
63
+
59
64
  ---
60
65
 
61
66
  ## Installation
@@ -149,7 +154,8 @@ pycmplot \
149
154
  |------|-------------|---------|
150
155
  | `-s, --sum_stats` | Comma-separated sumstats files | **required** |
151
156
  | `-l, --labels` | Comma-separated track labels | **required** |
152
- | `-b, --build_column` | Genome build column name (containing hg18/hg19/hg38) | **required** |
157
+ | `-b, --build` | Comma-separated genome builds of sumstats | off |
158
+ | `-bc, --build_column` | Genome build column name (containing hg18/hg19/hg38) | off |
153
159
  | `-m, --mode` | `lm` linear or `cm` circular | `lm` |
154
160
  | `-qq, --qq_plot` | Also generate a QQ-plot | off (coming soon...) |
155
161
  | `--logp` | Plot -log10(p) | off |
@@ -157,7 +163,7 @@ pycmplot \
157
163
  | `-sigl, --signif_line` | Value for genome-wide significance line if different from `-sig` | 5e-8 |
158
164
  | `-sug, --suggest_threshold` | Threshold for suggestive signals | off |
159
165
  | `-hl, --highlight` | Highlight significant loci | off |
160
- | `-a, --annotate` | Annotate with `SNP` or `GENE` | `SNP` |
166
+ | `-a, --annotate` | Annotate with `snp`, `gene`, or any column in `hits_table` | `snp` |
161
167
  | `-tp, --trim_pval` | Trim variants above this p-value for speed | off |
162
168
  | `-st, --sort_track` | Sort tracks by `label` or `chrom_len` | input order |
163
169
  | `-od, --output_dir` | Output directory | `.` |
@@ -12,7 +12,7 @@ sys.path.insert(0, os.path.abspath(".."))
12
12
  project = "pycmplot"
13
13
  copyright = "2026, Kevin Esoh"
14
14
  author = "Kevin Esoh"
15
- release = "0.1.9" # update to match your PyPI version
15
+ release = "0.2.1" # update to match PyPI version
16
16
 
17
17
  # -- General configuration -----------------------------------------------------
18
18
  extensions = [
@@ -42,4 +42,4 @@ __all__ = [
42
42
  "ResourceConfig",
43
43
  ]
44
44
 
45
- __version__ = "0.1.9"
45
+ __version__ = "0.2.1"
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- CORE_MODULE = '''"""
3
+ CORE_MODULE = """
4
4
  pycmplot._core
5
5
  ==============
6
6
 
@@ -12,7 +12,7 @@ work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`, and
12
12
  All imports are deferred inside :func:`main` so that
13
13
  ``import pycmplot`` remains fast regardless of the size of the dependency
14
14
  tree.
15
- """'''
15
+ """
16
16
 
17
17
  import logging
18
18
  import warnings
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
26
26
 
27
27
 
28
28
  def main() -> None:
29
- MAIN = '''"""Orchestrate the full pycmplot pipeline from the command line.
29
+ MAIN = """Orchestrate the full pycmplot pipeline from the command line.
30
30
 
31
31
  This function is registered as the ``pycmplot`` console-script entry point
32
32
  in ``pyproject.toml`` / ``setup.cfg``. It performs the following steps in
@@ -75,7 +75,7 @@ def main() -> None:
75
75
  Linear Manhattan plotter called for ``--mode lm`` (default).
76
76
  pycmplot.plotting.circular.plot_circular :
77
77
  Circular Manhattan plotter called for ``--mode cm``.
78
- """'''
78
+ """
79
79
 
80
80
  # ------------------------------------------------------------------
81
81
  # Deferred imports so ``import pycmplot`` remains fast
@@ -105,7 +105,8 @@ def main() -> None:
105
105
  chrom_arg = args.chrom_column
106
106
  pos_arg = args.pos_column
107
107
  snp_arg = args.snp_column
108
- build_arg = args.build_column
108
+ build_arg = args.build
109
+ buildc_arg = args.build_column
109
110
  labels_raw = args.labels
110
111
  pcol_arg = args.pval_column
111
112
  logp = args.logp
@@ -123,13 +124,13 @@ def main() -> None:
123
124
  point_size = args.point_size
124
125
  highlight = args.highlight
125
126
  highlight_thresh = args.highlight_thresh
126
- highight_color = args.highight_color
127
+ highlight_color = args.highlight_color
127
128
  highlight_line = args.highlight_line
128
- highight_line_color = args.highight_line_color
129
+ highlight_line_color = args.highlight_line_color
129
130
  colors_raw = args.colors
130
- r_min = args.r_min
131
- r_max = args.r_max
132
- pad = args.pad
131
+ r_min = args.min_radius
132
+ r_max = args.max_radius
133
+ pad = args.circular_track_spacing
133
134
  output_format = args.output_format
134
135
  output_dir = args.output_dir
135
136
  dpi = args.dpi
@@ -142,18 +143,20 @@ def main() -> None:
142
143
 
143
144
 
144
145
  # ------------------------------------------------------------------
145
- # Sumstat, labels, colours, track heights str to list
146
+ # Sumstat, labels, colours, track heights [build] str to list
146
147
  # ------------------------------------------------------------------
147
148
  (
148
149
  sum_stats,
149
150
  labels,
150
151
  colors,
151
- t_heights
152
+ t_heights,
153
+ builds
152
154
  ) = strip_comma_separated_input_streams(
153
155
  sum_stats = sum_stats_raw,
154
156
  labels = labels_raw,
155
157
  colors_raw = colors_raw,
156
158
  track_heights = track_heights,
159
+ builds = build_arg if build_arg else None,
157
160
  )
158
161
 
159
162
  # ------------------------------------------------------------------
@@ -182,7 +185,8 @@ def main() -> None:
182
185
  pos = pos_arg,
183
186
  snp = snp_arg,
184
187
  pcol = pcol_arg,
185
- build = build_arg
188
+ buildc = buildc_arg,
189
+ build = builds
186
190
  )
187
191
 
188
192
  # ------------------------------------------------------------------
@@ -212,6 +216,19 @@ def main() -> None:
212
216
  resources=resources,
213
217
  )
214
218
 
219
+ # ------------------------------------------------------------------
220
+ # ANNOTATE BY
221
+ # ------------------------------------------------------------------
222
+ if annotate:
223
+ if str(annotate).upper() == "GENE":
224
+ label_col = 'top_gene'
225
+ elif str(annotate).upper() == "SNP":
226
+ label_col = 'SNP'
227
+ else:
228
+ label_col = annotate
229
+
230
+ logger.info(f"Anotate by: {label_col}")
231
+
215
232
  # ------------------------------------------------------------------
216
233
  # CIRCULAR MANHATTAN
217
234
  # ------------------------------------------------------------------
@@ -224,15 +241,16 @@ def main() -> None:
224
241
  signif_lines = signif_lines,
225
242
  highlight = highlight,
226
243
  highlight_thresh = highlight_thresh,
227
- highight_color = highight_color,
244
+ highlight_color = highlight_color,
228
245
  highlight_line = highlight_line,
229
- highight_line_color = highight_line_color,
246
+ highlight_line_color = highlight_line_color,
230
247
  colors = colors,
231
248
  chrom_label_side = chrom_label_side,
232
249
  chrom_label_size = chrom_label_size,
233
250
  track_label_size = track_label_size,
234
251
  track_label_orientation = track_label_orientation,
235
252
  annotate = annotate,
253
+ label_col = label_col if annotate else None,
236
254
  annotation_size = annotation_size,
237
255
  hits_table = hits_table,
238
256
  sector_sizes = merged_assoc_sector_sizes,
@@ -253,24 +271,25 @@ def main() -> None:
253
271
  else:
254
272
  logger.info("Generating LINEAR MANHATTAN Plot ...")
255
273
  plot_linear(
256
- sumstats_loaded = sumstats_loaded,
257
- track_heights = t_heights,
274
+ sumstats_loaded=sumstats_loaded,
275
+ track_heights=t_heights,
258
276
  trim_pval=trim_pval,
259
277
  logp=True if logp else False,
260
278
  point_size=point_size,
261
279
  highlight=highlight,
262
280
  highlight_thresh=highlight_thresh,
263
- highight_color = highight_color,
264
- highlight_line = highlight_line,
265
- highight_line_color = highight_line_color,
266
- annot_df=hits_table if not hits_table.empty else None,
267
- label_col="top_gene",
281
+ highlight_color=highlight_color,
282
+ highlight_line=highlight_line,
283
+ highlight_line_color=highlight_line_color,
284
+ annotate=annotate,
285
+ hits_table=hits_table if not hits_table.empty else None,
286
+ label_col=label_col if annotate else None,
268
287
  chr_spacing=chr_spacing,
269
288
  linear_track_spacing=linear_track_spacing,
270
289
  colors=colors,
271
290
  signif_lines=signif_lines,
272
291
  plot_title=plot_title,
273
- no_track_labels = no_track_labels,
292
+ no_track_labels=no_track_labels,
274
293
  dpi=dpi,
275
294
  output_format=output_format,
276
295
  output_dir=output_dir,
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- MODULE_DOCSTRING = '''"""
3
+ MODULE_DOCSTRING = """
4
4
  pycmplot.annotation
5
5
  ====================
6
6
 
@@ -20,7 +20,7 @@ Annotation relies on a bundled Ensembl gene-info TSV (hg38 or hg19). The
20
20
  file is resolved through :class:`~pycmplot.resources.ResourceConfig`; custom
21
21
  paths can be supplied via the ``PYCMPLOT_GENEINFO_HG38`` /
22
22
  ``PYCMPLOT_GENEINFO_HG19`` environment variables.
23
- """'''
23
+ """
24
24
 
25
25
  import bisect
26
26
  import logging
@@ -41,7 +41,7 @@ logger = logging.getLogger(__name__)
41
41
  # ---------------------------------------------------------------------------
42
42
 
43
43
  def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
44
- BUILD_GENES_DICT = '''"""Build a chromosome-keyed interval dictionary with sorted start positions.
44
+ BUILD_GENES_DICT = """Build a chromosome-keyed interval dictionary with sorted start positions.
45
45
 
46
46
  Pre-processes the gene reference DataFrame into a structure that supports
47
47
  efficient O(log N) binary-search lookup of genes near a query position.
@@ -67,7 +67,7 @@ def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
67
67
  -----
68
68
  This function is called once per :func:`get_hits_summary_table` invocation;
69
69
  the result is passed to :func:`_annotate_variant` for each lead SNP.
70
- """'''
70
+ """
71
71
 
72
72
  genes_df = genes_df.sort_values(["CHR", "START"])
73
73
  genes_dict: dict = {}
@@ -98,7 +98,7 @@ def _annotate_variant(
98
98
  window: int = 500_000,
99
99
  promoter_window: int = 2_000,
100
100
  ) -> dict:
101
- ANNOTATE_VARIANT = '''"""Return strand-aware nearest-gene annotation for a single variant.
101
+ ANNOTATE_VARIANT = """Return strand-aware nearest-gene annotation for a single variant.
102
102
 
103
103
  Searches the pre-built *genes_dict* within *window* bp of *pos* on
104
104
  *chrom*. Reports the nearest upstream and downstream genes (relative to
@@ -138,7 +138,7 @@ def _annotate_variant(
138
138
  within *promoter_window* bp upstream of any TSS.
139
139
  * ``gene_density`` (int) – number of genes with any overlap in the
140
140
  search window.
141
- """'''
141
+ """
142
142
 
143
143
  _empty = {
144
144
  "genic": False,
@@ -238,7 +238,7 @@ def _annotate_and_prioritize_variant(
238
238
  promoter_window: int = 2_000,
239
239
  biotype_weights: Optional[dict] = None,
240
240
  ) -> Optional[dict]:
241
- ANNOTATE_PRIORITIZE = '''"""Score and rank candidate genes for a single variant using a composite
241
+ ANNOTATE_PRIORITIZE = """Score and rank candidate genes for a single variant using a composite
242
242
  priority metric.
243
243
 
244
244
  Builds a candidate gene set within *window* bp of *pos* on *chrom*, then
@@ -287,7 +287,7 @@ def _annotate_and_prioritize_variant(
287
287
  For intergenic variants, ``top_gene`` contains the two nearest flanking
288
288
  gene symbols joined by ``'-'`` (e.g. ``'HBB-HBD'``) and ``biotype``
289
289
  is set to ``'intergenic'``.
290
- """'''
290
+ """
291
291
 
292
292
  if biotype_weights is None:
293
293
  biotype_weights = BIOTYPE_WEIGHTS
@@ -386,7 +386,7 @@ def _annotate_and_prioritize_variant(
386
386
  # ---------------------------------------------------------------------------
387
387
 
388
388
  def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
389
- CLUMP_BY_DISTANCE = '''"""Reduce a lead-SNP table to one representative SNP per locus.
389
+ CLUMP_BY_DISTANCE = """Reduce a lead-SNP table to one representative SNP per locus.
390
390
 
391
391
  Applies greedy distance-based clumping within each chromosome group,
392
392
  starting from the most significant SNP (lowest ``P`` or highest ``logP``).
@@ -406,7 +406,7 @@ def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
406
406
  pandas.DataFrame
407
407
  Deduplicated locus representatives sorted by chromosome and position
408
408
  (natural sort order).
409
- """'''
409
+ """
410
410
 
411
411
  window = window_kb * 1000
412
412
  clumped: list[pd.Series] = []
@@ -438,7 +438,7 @@ def get_hits_summary_table(
438
438
  table_out: Optional[str] = None,
439
439
  resources: Optional[ResourceConfig] = None,
440
440
  ) -> pd.DataFrame:
441
- GET_HITS_SUMMARY_TABLE = '''"""Annotate lead SNPs with nearest genes and write the locus summary table.
441
+ GET_HITS_SUMMARY_TABLE = """Annotate lead SNPs with nearest genes and write the locus summary table.
442
442
 
443
443
  For each lead SNP in *leads_df*, runs two complementary annotation passes:
444
444
 
@@ -528,51 +528,54 @@ def get_hits_summary_table(
528
528
  SNP CHR POS top_gene biotype
529
529
  0 rs123456 2 60718043 BCL11A protein_coding
530
530
  1 rs789012 11 5246696 HBB protein_coding
531
- """'''
531
+ """
532
532
 
533
533
  if resources is None:
534
534
  resources = default_resources
535
535
 
536
536
  # Choose gene info file based on build
537
- if "OLD_POS" not in leads_df.columns and list(set(leads_df["BUILD"])) == ["hg19"]:
538
- geneinfo_path = resources.require("geneinfo_hg19")
539
- else:
540
- geneinfo_path = resources.require("geneinfo_hg38")
537
+ if 'BUILD' in leads_df.columns:
538
+ if "OLD_POS" not in leads_df.columns and list(set(leads_df["BUILD"])) == ["hg19"]:
539
+ geneinfo_path = resources.require("geneinfo_hg19")
540
+ else:
541
+ geneinfo_path = resources.require("geneinfo_hg38")
541
542
 
542
- logger.info("Loading gene info from: %s", geneinfo_path)
543
- geneinfo = pd.read_csv(geneinfo_path, header=0, sep="\t")
544
- genes_dict = _build_genes_dict(geneinfo)
543
+ logger.info("Loading gene info from: %s", geneinfo_path)
544
+ geneinfo = pd.read_csv(geneinfo_path, header=0, sep="\t")
545
+ genes_dict = _build_genes_dict(geneinfo)
545
546
 
546
- window = window_kb * 1_000
547
- records: list[dict] = []
547
+ window = window_kb * 1_000
548
+ records: list[dict] = []
548
549
 
549
550
 
550
- logger.info("Annotating lead variants and generating hits summary table ...")
551
- for _, row in leads_df.iterrows():
552
- annotation = _annotate_variant(
553
- chrom=row["CHR"],
554
- pos=row["POS"],
555
- genes_dict=genes_dict,
556
- window=window,
557
- )
558
- prioritized = _annotate_and_prioritize_variant(
559
- chrom=row["CHR"],
560
- pos=row["POS"],
561
- genes_df=geneinfo,
562
- lead_snps_df=leads_df,
563
- window=window,
564
- )
551
+ logger.info("Annotating lead variants and generating hits summary table ...")
552
+ for _, row in leads_df.iterrows():
553
+ annotation = _annotate_variant(
554
+ chrom=row["CHR"],
555
+ pos=row["POS"],
556
+ genes_dict=genes_dict,
557
+ window=window,
558
+ )
559
+ prioritized = _annotate_and_prioritize_variant(
560
+ chrom=row["CHR"],
561
+ pos=row["POS"],
562
+ genes_df=geneinfo,
563
+ lead_snps_df=leads_df,
564
+ window=window,
565
+ )
565
566
 
566
- record = {
567
- **(row.to_dict()),
568
- **(annotation if annotation is not None else {}),
569
- **(prioritized if prioritized is not None else {}),
570
- }
571
- records.append(record)
567
+ record = {
568
+ **(row.to_dict()),
569
+ **(annotation if annotation is not None else {}),
570
+ **(prioritized if prioritized is not None else {}),
571
+ }
572
+ records.append(record)
572
573
 
573
- locus_table = pd.DataFrame(records).sort_values(
574
- ["CHR", "POS"], key=natsort.natsort_keygen()
575
- )
574
+ locus_table = pd.DataFrame(records).sort_values(
575
+ ["CHR", "POS"], key=natsort.natsort_keygen()
576
+ )
577
+ else:
578
+ locus_table = leads_df
576
579
 
577
580
  if table_out is not None:
578
581
  locus_table.to_csv(table_out, index=False, sep="\t", na_rep="None")