pycmplot 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycmplot-0.2.6/LICENSE +21 -0
- {pycmplot-0.2.4/pycmplot.egg-info → pycmplot-0.2.6}/PKG-INFO +10 -2
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/__init__.py +7 -2
- pycmplot-0.2.6/pycmplot/__main__.py +6 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/_core.py +43 -50
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/annotation.py +59 -38
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/cli.py +166 -229
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/constants.py +1 -1
- pycmplot-0.2.6/pycmplot/data/hg18ToHg38.over.chain.gz +0 -0
- pycmplot-0.2.6/pycmplot/data/hg19ToHg38.over.chain.gz +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/io.py +214 -124
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/liftover.py +108 -26
- pycmplot-0.2.6/pycmplot/plotting/__init__.py +31 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/plotting/circular.py +31 -32
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/plotting/linear.py +87 -39
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/plotting/qq.py +50 -3
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/resources.py +44 -36
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/stats.py +6 -6
- {pycmplot-0.2.4 → pycmplot-0.2.6/pycmplot.egg-info}/PKG-INFO +10 -2
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot.egg-info/SOURCES.txt +5 -11
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot.egg-info/top_level.txt +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pyproject.toml +11 -3
- {pycmplot-0.2.4 → pycmplot-0.2.6}/setup.cfg +1 -1
- pycmplot-0.2.4/LICENSE +0 -441
- pycmplot-0.2.4/docs/conf.py +0 -91
- pycmplot-0.2.4/pycmplot/data/hg19ToHg38.over.chain +0 -56506
- pycmplot-0.2.4/pycmplot_docs/docs/conf.py +0 -91
- pycmplot-0.2.4/pycmplot_docs/docstrings_annotation.py +0 -289
- pycmplot-0.2.4/pycmplot_docs/docstrings_core_cli.py +0 -347
- pycmplot-0.2.4/pycmplot_docs/docstrings_io.py +0 -468
- pycmplot-0.2.4/pycmplot_docs/docstrings_liftover.py +0 -156
- pycmplot-0.2.4/pycmplot_docs/docstrings_plotting.py +0 -587
- pycmplot-0.2.4/pycmplot_docs/docstrings_resources_constants.py +0 -170
- pycmplot-0.2.4/pycmplot_docs/docstrings_stats.py +0 -135
- {pycmplot-0.2.4 → pycmplot-0.2.6}/README.md +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot.egg-info/dependency_links.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot.egg-info/entry_points.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/pycmplot.egg-info/requires.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.6}/setup.py +0 -0
pycmplot-0.2.6/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kevin Esoh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycmplot
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
|
|
5
5
|
Author: Kevin Esoh
|
|
6
6
|
Author-email: Kevin Esoh <kesohku1@jh.edu>
|
|
7
|
-
License-Expression:
|
|
7
|
+
License-Expression: MIT
|
|
8
8
|
Project-URL: Homepage, https://github.com/esohkevin/pycmplot
|
|
9
9
|
Project-URL: Issues, https://github.com/esohkevin/pycmplot/issues
|
|
10
10
|
Project-URL: Docs, https://pycmplot.readthedocs.io/en/latest/
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
16
|
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
13
21
|
Requires-Python: >=3.9
|
|
14
22
|
Description-Content-Type: text/markdown
|
|
15
23
|
License-File: LICENSE
|
|
@@ -12,7 +12,7 @@ Command-line::
|
|
|
12
12
|
Python API::
|
|
13
13
|
|
|
14
14
|
from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
|
|
15
|
-
from pycmplot.plotting import plot_linear, plot_circular
|
|
15
|
+
from pycmplot.plotting import plot_linear, plot_circular, plot_qq_single, plot_qq_separate, plot_qq_overlay, plot_qq_combined
|
|
16
16
|
from pycmplot.stats import get_lead_snps
|
|
17
17
|
from pycmplot.annotation import get_hits_summary_table
|
|
18
18
|
|
|
@@ -22,6 +22,7 @@ Public surface
|
|
|
22
22
|
|
|
23
23
|
from pycmplot.plotting.linear import plot_linear
|
|
24
24
|
from pycmplot.plotting.circular import plot_circular, compute_track_radii_dict
|
|
25
|
+
from pycmplot.plotting.qq import plot_qq_single, plot_qq_separate, plot_qq_overlay, plot_qq_combined
|
|
25
26
|
from pycmplot.stats import get_lead_snps, get_highlight_snps
|
|
26
27
|
from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
|
|
27
28
|
from pycmplot.annotation import get_hits_summary_table
|
|
@@ -31,6 +32,10 @@ from pycmplot.resources import ResourceConfig
|
|
|
31
32
|
__all__ = [
|
|
32
33
|
"plot_linear",
|
|
33
34
|
"plot_circular",
|
|
35
|
+
"plot_qq_single",
|
|
36
|
+
"plot_qq_separate",
|
|
37
|
+
"plot_qq_overlay",
|
|
38
|
+
"plot_qq_combined",
|
|
34
39
|
"compute_track_radii_dict",
|
|
35
40
|
"get_lead_snps",
|
|
36
41
|
"get_highlight_snps",
|
|
@@ -42,4 +47,4 @@ __all__ = [
|
|
|
42
47
|
"ResourceConfig",
|
|
43
48
|
]
|
|
44
49
|
|
|
45
|
-
__version__ = "0.2.4"
|
|
50
|
+
__version__ = "0.2.6"
|
|
@@ -1,62 +1,62 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
CORE_MODULE = """
|
|
1
|
+
"""
|
|
4
2
|
pycmplot._core
|
|
5
3
|
==============
|
|
6
4
|
|
|
7
5
|
Main entry point that orchestrates CLI argument parsing, data loading, and
|
|
8
6
|
plot dispatch. This module is intentionally thin: it delegates all heavy
|
|
9
|
-
work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`,
|
|
10
|
-
:mod:`pycmplot.plotting.circular`.
|
|
7
|
+
work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`,
|
|
8
|
+
:mod:`pycmplot.plotting.circular`, and :mod:`pycmplot.plotting.qq`.
|
|
11
9
|
|
|
12
10
|
All imports are deferred inside :func:`main` so that
|
|
13
11
|
``import pycmplot`` remains fast regardless of the size of the dependency
|
|
14
12
|
tree.
|
|
15
13
|
"""
|
|
16
14
|
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
17
|
import logging
|
|
18
18
|
import warnings
|
|
19
|
+
import sys
|
|
19
20
|
|
|
20
21
|
# Suppress noisy font-manager warnings before any matplotlib import
|
|
21
22
|
logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)
|
|
22
23
|
warnings.filterwarnings("ignore")
|
|
23
24
|
|
|
24
|
-
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
|
25
|
+
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s", stream=sys.stdout)
|
|
25
26
|
logger = logging.getLogger(__name__)
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
def main() -> None:
|
|
29
|
-
|
|
30
|
+
"""Orchestrate the full pycmplot pipeline from the command line.
|
|
30
31
|
|
|
31
32
|
This function is registered as the ``pycmplot`` console-script entry point
|
|
32
33
|
in ``pyproject.toml`` / ``setup.cfg``. It performs the following steps in
|
|
33
34
|
order:
|
|
34
35
|
|
|
35
36
|
1. **Parse CLI arguments** via :func:`~pycmplot.cli.get_arguments`.
|
|
36
|
-
2. **Parse comma-separated inputs** (files, labels, colours, track heights
|
|
37
|
-
|
|
38
|
-
|
|
37
|
+
2. **Parse comma-separated inputs** (files, labels, colours, track heights,
|
|
38
|
+
builds) into Python lists via
|
|
39
|
+
:func:`~pycmplot.io.strip_comma_separated_input_streams`.
|
|
39
40
|
3. **Construct output paths** (plot image and locus summary table TSV) via
|
|
40
|
-
|
|
41
|
+
:func:`~pycmplot.io.get_output_paths`.
|
|
41
42
|
4. **Resolve column names** for every input file via
|
|
42
|
-
|
|
43
|
+
:func:`~pycmplot.io.prep_pycmplot_input_info`.
|
|
43
44
|
5. **Load data** — reads summary statistics, normalises chromosome names,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
6. **Dispatch
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
All input is taken from ``sys.argv`` via :mod:`argparse`.
|
|
45
|
+
runs hg19 → hg38 liftover if needed, extracts lead SNPs, generates the
|
|
46
|
+
hits summary table, and computes merged Circos sector sizes via
|
|
47
|
+
:func:`~pycmplot.io.get_sumstats_and_merged_sector_list`.
|
|
48
|
+
6. **Dispatch Manhattan plot** — calls
|
|
49
|
+
:func:`~pycmplot.plotting.circular.plot_circular` when ``--mode cm``,
|
|
50
|
+
or :func:`~pycmplot.plotting.linear.plot_linear` otherwise.
|
|
51
|
+
7. **Optional QQ plot** — when ``--qq_plot`` is set, dispatches to one of
|
|
52
|
+
:func:`~pycmplot.plotting.qq.plot_qq_combined` (default),
|
|
53
|
+
:func:`~pycmplot.plotting.qq.plot_qq_separate` (``--qq_separate``), or
|
|
54
|
+
:func:`~pycmplot.plotting.qq.plot_qq_overlay` (``--qq_overlay``).
|
|
55
55
|
|
|
56
56
|
Returns
|
|
57
57
|
-------
|
|
58
58
|
None
|
|
59
|
-
Saves the plot image and locus summary table to the directory
|
|
59
|
+
Saves the plot image(s) and locus summary table to the directory
|
|
60
60
|
specified by ``--output_dir``.
|
|
61
61
|
|
|
62
62
|
Raises
|
|
@@ -94,6 +94,7 @@ def main() -> None:
|
|
|
94
94
|
from pycmplot.plotting.circular import plot_circular
|
|
95
95
|
from pycmplot.plotting.qq import plot_qq_combined, plot_qq_separate, plot_qq_overlay
|
|
96
96
|
from pycmplot.resources import ResourceConfig
|
|
97
|
+
from pycmplot.annotation import get_annotation_column
|
|
97
98
|
|
|
98
99
|
# ------------------------------------------------------------------
|
|
99
100
|
# Parse CLI
|
|
@@ -147,7 +148,9 @@ def main() -> None:
|
|
|
147
148
|
track_heights = args.track_heights
|
|
148
149
|
linear_track_spacing = args.linear_track_spacing
|
|
149
150
|
no_track_labels = args.no_track_labels
|
|
151
|
+
ylabel = args.ylabel
|
|
150
152
|
chr_spacing = args.chr_spacing
|
|
153
|
+
figure_size = args.figure_size
|
|
151
154
|
|
|
152
155
|
|
|
153
156
|
# ------------------------------------------------------------------
|
|
@@ -194,8 +197,8 @@ def main() -> None:
|
|
|
194
197
|
pos = pos_arg,
|
|
195
198
|
snp = snp_arg,
|
|
196
199
|
pcol = pcol_arg,
|
|
197
|
-
|
|
198
|
-
|
|
200
|
+
build_column = buildc_arg,
|
|
201
|
+
build_list = builds
|
|
199
202
|
)
|
|
200
203
|
|
|
201
204
|
# ------------------------------------------------------------------
|
|
@@ -206,38 +209,25 @@ def main() -> None:
|
|
|
206
209
|
# ------------------------------------------------------------------
|
|
207
210
|
# Load data, compute sectors, get hits table
|
|
208
211
|
# ------------------------------------------------------------------
|
|
209
|
-
(
|
|
210
|
-
merged_assoc_sector_sizes,
|
|
211
|
-
sumstats_loaded,
|
|
212
|
-
hits_table,
|
|
213
|
-
signif_lines,
|
|
214
|
-
pval_dict,
|
|
215
|
-
) = get_sumstats_and_merged_sector_list(
|
|
212
|
+
pycmplot_dict = get_sumstats_and_merged_sector_list(
|
|
216
213
|
sum_stats=sum_stats,
|
|
217
214
|
labels=labels,
|
|
218
215
|
trim_pval=trim_pval,
|
|
219
216
|
logp=logp,
|
|
220
217
|
file_info=sumstats_hdr_dic,
|
|
221
218
|
sort_tracks=sort_track,
|
|
222
|
-
table_out=
|
|
219
|
+
table_out=plt_base,
|
|
223
220
|
signif_threshold=signif_threshold,
|
|
224
221
|
signif_line=signif_line,
|
|
225
222
|
suggest_threshold=suggest_threshold,
|
|
226
223
|
resources=resources,
|
|
227
224
|
)
|
|
228
225
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
if str(annotate).upper() == "GENE" and 'top_gene' in hits_table.columns:
|
|
235
|
-
label_col = 'top_gene'
|
|
236
|
-
elif label_col in hits_table.columns:
|
|
237
|
-
label_col = annotate
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
logger.info(f"Anotate by: {label_col}")
|
|
226
|
+
merged_assoc_sector_sizes = pycmplot_dict["sectors"]
|
|
227
|
+
sumstats_loaded = pycmplot_dict["dfs"]
|
|
228
|
+
hits_table = pycmplot_dict["annot"]
|
|
229
|
+
signif_lines = pycmplot_dict["lines"]
|
|
230
|
+
pval_dict = pycmplot_dict["pvals"]
|
|
241
231
|
|
|
242
232
|
# ------------------------------------------------------------------
|
|
243
233
|
# CIRCULAR MANHATTAN
|
|
@@ -260,7 +250,6 @@ def main() -> None:
|
|
|
260
250
|
track_label_size = track_label_size,
|
|
261
251
|
track_label_orientation = track_label_orientation,
|
|
262
252
|
annotate = annotate,
|
|
263
|
-
label_col = label_col if annotate else None,
|
|
264
253
|
annotation_size = annotation_size,
|
|
265
254
|
hits_table = hits_table,
|
|
266
255
|
sector_sizes = merged_assoc_sector_sizes,
|
|
@@ -280,6 +269,9 @@ def main() -> None:
|
|
|
280
269
|
# ------------------------------------------------------------------
|
|
281
270
|
else:
|
|
282
271
|
logger.info("Generating LINEAR MANHATTAN Plot ...")
|
|
272
|
+
fsize = figure_size.strip(" ").split(",")
|
|
273
|
+
fsize = [int(v) for v in fsize]
|
|
274
|
+
logger.info(f"FIGURE SIZE: {fsize}")
|
|
283
275
|
plot_linear(
|
|
284
276
|
sumstats_loaded=sumstats_loaded,
|
|
285
277
|
track_heights=t_heights,
|
|
@@ -291,19 +283,19 @@ def main() -> None:
|
|
|
291
283
|
highlight_color=highlight_color,
|
|
292
284
|
highlight_line=highlight_line,
|
|
293
285
|
highlight_line_color=highlight_line_color,
|
|
294
|
-
annotate=annotate,
|
|
286
|
+
annotate=annotate,
|
|
295
287
|
hits_table=hits_table if not hits_table.empty else None,
|
|
296
|
-
label_col=label_col if annotate else None,
|
|
297
288
|
chr_spacing=chr_spacing,
|
|
298
289
|
linear_track_spacing=linear_track_spacing,
|
|
299
290
|
colors=colors,
|
|
300
291
|
signif_lines=signif_lines,
|
|
301
292
|
plot_title=plot_title,
|
|
302
293
|
no_track_labels=no_track_labels,
|
|
294
|
+
ylabel=ylabel,
|
|
303
295
|
dpi=dpi,
|
|
304
296
|
output_format=output_format,
|
|
305
297
|
output_dir=output_dir,
|
|
306
|
-
figsize=
|
|
298
|
+
figsize=fsize
|
|
307
299
|
)
|
|
308
300
|
|
|
309
301
|
# ------------------------------------------------------------------
|
|
@@ -316,6 +308,7 @@ def main() -> None:
|
|
|
316
308
|
if qq_separate:
|
|
317
309
|
plot_qq_separate(
|
|
318
310
|
pval_dict=pval_dict,
|
|
311
|
+
base_name=plot_title,
|
|
319
312
|
thin=qq_thin,
|
|
320
313
|
thin_below=thin_below,
|
|
321
314
|
max_points=qq_max_points,
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
MODULE_DOCSTRING = """
|
|
1
|
+
"""
|
|
4
2
|
pycmplot.annotation
|
|
5
3
|
====================
|
|
6
4
|
|
|
@@ -22,6 +20,8 @@ paths can be supplied via the ``PYCMPLOT_GENEINFO_HG38`` /
|
|
|
22
20
|
``PYCMPLOT_GENEINFO_HG19`` environment variables.
|
|
23
21
|
"""
|
|
24
22
|
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
25
|
import bisect
|
|
26
26
|
import logging
|
|
27
27
|
from typing import Optional
|
|
@@ -41,7 +41,7 @@ logger = logging.getLogger(__name__)
|
|
|
41
41
|
# ---------------------------------------------------------------------------
|
|
42
42
|
|
|
43
43
|
def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
|
|
44
|
-
|
|
44
|
+
"""Build a chromosome-keyed interval dictionary with sorted start positions.
|
|
45
45
|
|
|
46
46
|
Pre-processes the gene reference DataFrame into a structure that supports
|
|
47
47
|
efficient O(log N) binary-search lookup of genes near a query position.
|
|
@@ -98,7 +98,7 @@ def _annotate_variant(
|
|
|
98
98
|
window: int = 500_000,
|
|
99
99
|
promoter_window: int = 2_000,
|
|
100
100
|
) -> dict:
|
|
101
|
-
|
|
101
|
+
"""Return strand-aware nearest-gene annotation for a single variant.
|
|
102
102
|
|
|
103
103
|
Searches the pre-built *genes_dict* within *window* bp of *pos* on
|
|
104
104
|
*chrom*. Reports the nearest upstream and downstream genes (relative to
|
|
@@ -238,8 +238,7 @@ def _annotate_and_prioritize_variant(
|
|
|
238
238
|
promoter_window: int = 2_000,
|
|
239
239
|
biotype_weights: Optional[dict] = None,
|
|
240
240
|
) -> Optional[dict]:
|
|
241
|
-
|
|
242
|
-
priority metric.
|
|
241
|
+
"""Score and rank candidate genes for a single variant using a composite priority metric.
|
|
243
242
|
|
|
244
243
|
Builds a candidate gene set within *window* bp of *pos* on *chrom*, then
|
|
245
244
|
scores each candidate on four additive components:
|
|
@@ -386,7 +385,7 @@ def _annotate_and_prioritize_variant(
|
|
|
386
385
|
# ---------------------------------------------------------------------------
|
|
387
386
|
|
|
388
387
|
def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
|
|
389
|
-
|
|
388
|
+
"""Reduce a lead-SNP table to one representative SNP per locus.
|
|
390
389
|
|
|
391
390
|
Applies greedy distance-based clumping within each chromosome group,
|
|
392
391
|
starting from the most significant SNP (lowest ``P`` or highest ``logP``).
|
|
@@ -438,7 +437,7 @@ def get_hits_summary_table(
|
|
|
438
437
|
table_out: Optional[str] = None,
|
|
439
438
|
resources: Optional[ResourceConfig] = None,
|
|
440
439
|
) -> pd.DataFrame:
|
|
441
|
-
|
|
440
|
+
"""Annotate lead SNPs with nearest genes and write the locus summary table.
|
|
442
441
|
|
|
443
442
|
For each lead SNP in *leads_df*, runs two complementary annotation passes:
|
|
444
443
|
|
|
@@ -475,33 +474,21 @@ def get_hits_summary_table(
|
|
|
475
474
|
Clumped locus summary table. Contains all columns from *leads_df*
|
|
476
475
|
plus annotation fields from both passes, including:
|
|
477
476
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
- Distance to ``nearest_downstream_gene`` in bp
|
|
494
|
-
* - ``promoter_upstream_flag``
|
|
495
|
-
- ``True`` when the SNP is within 2 kb upstream of a TSS
|
|
496
|
-
* - ``gene_density``
|
|
497
|
-
- Number of genes within the search window
|
|
498
|
-
* - ``top_gene``
|
|
499
|
-
- Top-priority gene from the scoring pass
|
|
500
|
-
* - ``biotype``
|
|
501
|
-
- Ensembl biotype of ``top_gene`` (``'intergenic'`` when no
|
|
502
|
-
genic overlap)
|
|
503
|
-
* - ``priority_score``
|
|
504
|
-
- Composite priority score (genic hits only)
|
|
477
|
+
- ``genic`` — ``True`` when the lead SNP overlaps a gene body.
|
|
478
|
+
- ``nearest_upstream_gene`` — nearest upstream gene symbol
|
|
479
|
+
(strand-aware).
|
|
480
|
+
- ``upstream_distance`` — distance to ``nearest_upstream_gene`` in bp.
|
|
481
|
+
- ``nearest_downstream_gene`` — nearest downstream gene symbol
|
|
482
|
+
(strand-aware).
|
|
483
|
+
- ``downstream_distance`` — distance to ``nearest_downstream_gene`` in
|
|
484
|
+
bp.
|
|
485
|
+
- ``promoter_upstream_flag`` — ``True`` when the SNP is within 2 kb
|
|
486
|
+
upstream of a TSS.
|
|
487
|
+
- ``gene_density`` — number of genes within the search window.
|
|
488
|
+
- ``top_gene`` — top-priority gene from the scoring pass.
|
|
489
|
+
- ``biotype`` — Ensembl biotype of ``top_gene`` (``'intergenic'`` when
|
|
490
|
+
no genic overlap).
|
|
491
|
+
- ``priority_score`` — composite priority score (genic hits only).
|
|
505
492
|
|
|
506
493
|
Notes
|
|
507
494
|
-----
|
|
@@ -578,7 +565,41 @@ def get_hits_summary_table(
|
|
|
578
565
|
locus_table = leads_df
|
|
579
566
|
|
|
580
567
|
if table_out is not None:
|
|
581
|
-
|
|
582
|
-
|
|
568
|
+
outpath = table_out.replace(" ", "_").lower() + '.tsv'
|
|
569
|
+
locus_table.to_csv(outpath, index=False, sep="\t", na_rep="None")
|
|
570
|
+
logger.info("Locus summary written to: %s", outpath)
|
|
583
571
|
|
|
584
572
|
return _clump_by_distance(locus_table, window_kb=window_kb)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def get_annotation_column(
    annotate: Optional[str] = None,
    hits_table: Optional[pd.DataFrame] = None,
    label_col: Optional[str] = None,
) -> Optional[str]:
    """Choose the hits-table column used to label annotated SNPs.

    The column is resolved in this order:

    1. *label_col*, when it is given and present in *hits_table*.
    2. *annotate*, when it names a column of *hits_table*.
    3. When *annotate* is ``"GENE"`` (case-insensitive):
       ``"nearest_upstream_gene"`` if the ``genic`` flag is truthy, otherwise
       ``"top_gene"``.
    4. ``"SNP"`` as a logged fallback when none of the above applies.

    Parameters
    ----------
    annotate : str, optional
        Requested annotation mode or column name. A falsy value disables
        annotation.
    hits_table : pandas.DataFrame, optional
        Locus summary table whose columns are searched.
    label_col : str, optional
        Explicit column name that takes precedence over *annotate*.

    Returns
    -------
    str or None
        Name of the column to annotate by, or ``None`` when annotation is
        disabled or there is no non-empty hits table.
    """
    # Nothing to label: annotation switched off, or no hits to annotate.
    if not annotate or hits_table is None or hits_table.empty:
        return None

    columns = hits_table.columns
    label_clm = None

    if label_col is not None and label_col in columns:
        label_clm = label_col
    elif annotate in columns:
        label_clm = annotate
    elif str(annotate).upper() == "GENE" and "genic" in columns:
        # NOTE(review): one column name is returned for the whole table, so
        # the last row's ``genic`` flag decides (this matches the outcome of
        # the previous per-row loop). Confirm whether per-row labels were
        # actually intended.
        try:
            is_genic = bool(hits_table["genic"].iloc[-1])
        except (TypeError, ValueError):
            # Ambiguous truth value (e.g. pd.NA or duplicated 'genic'
            # columns): leave label_clm unset so the fallback below applies.
            pass
        else:
            if is_genic:
                label_clm = "nearest_upstream_gene"
                logger.info("%s", "'POS' is genic")
            else:
                label_clm = "top_gene"
                logger.info("%s", "'POS' is not genic")

    if label_clm is None:
        # Always bind a usable column instead of failing with an unbound
        # local when the requested annotation cannot be resolved.
        logger.warning(
            "Annotation columns '%s' and '%s' not found in hits table: %s; "
            "falling back to 'SNP'.", annotate, label_col, hits_table.columns.values,
        )
        label_clm = 'SNP'

    logger.info("Annotating by: %s", label_clm)

    return label_clm
|