pycmplot 0.2.4__tar.gz → 0.2.5__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {pycmplot-0.2.4/pycmplot.egg-info → pycmplot-0.2.5}/PKG-INFO +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/__init__.py +7 -2
- pycmplot-0.2.5/pycmplot/__main__.py +6 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/_core.py +43 -38
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/annotation.py +26 -38
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/cli.py +141 -224
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/constants.py +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/data/hg19ToHg38.over.chain +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/io.py +170 -120
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/liftover.py +27 -12
- pycmplot-0.2.5/pycmplot/plotting/__init__.py +31 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/plotting/circular.py +19 -14
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/plotting/linear.py +9 -9
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/plotting/qq.py +50 -3
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/resources.py +26 -31
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/stats.py +6 -6
- {pycmplot-0.2.4 → pycmplot-0.2.5/pycmplot.egg-info}/PKG-INFO +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot.egg-info/SOURCES.txt +3 -10
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot.egg-info/top_level.txt +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pyproject.toml +1 -1
- {pycmplot-0.2.4 → pycmplot-0.2.5}/setup.cfg +1 -1
- pycmplot-0.2.4/docs/conf.py +0 -91
- pycmplot-0.2.4/pycmplot_docs/docs/conf.py +0 -91
- pycmplot-0.2.4/pycmplot_docs/docstrings_annotation.py +0 -289
- pycmplot-0.2.4/pycmplot_docs/docstrings_core_cli.py +0 -347
- pycmplot-0.2.4/pycmplot_docs/docstrings_io.py +0 -468
- pycmplot-0.2.4/pycmplot_docs/docstrings_liftover.py +0 -156
- pycmplot-0.2.4/pycmplot_docs/docstrings_plotting.py +0 -587
- pycmplot-0.2.4/pycmplot_docs/docstrings_resources_constants.py +0 -170
- pycmplot-0.2.4/pycmplot_docs/docstrings_stats.py +0 -135
- {pycmplot-0.2.4 → pycmplot-0.2.5}/LICENSE +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/README.md +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot.egg-info/dependency_links.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot.egg-info/entry_points.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/pycmplot.egg-info/requires.txt +0 -0
- {pycmplot-0.2.4 → pycmplot-0.2.5}/setup.py +0 -0
|
@@ -12,7 +12,7 @@ Command-line::
|
|
|
12
12
|
Python API::
|
|
13
13
|
|
|
14
14
|
from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
|
|
15
|
-
from pycmplot.plotting import plot_linear, plot_circular
|
|
15
|
+
from pycmplot.plotting import plot_linear, plot_circular, plot_qq_single, plot_qq_separate, plot_qq_overlay, plot_qq_combined
|
|
16
16
|
from pycmplot.stats import get_lead_snps
|
|
17
17
|
from pycmplot.annotation import get_hits_summary_table
|
|
18
18
|
|
|
@@ -22,6 +22,7 @@ Public surface
|
|
|
22
22
|
|
|
23
23
|
from pycmplot.plotting.linear import plot_linear
|
|
24
24
|
from pycmplot.plotting.circular import plot_circular, compute_track_radii_dict
|
|
25
|
+
from pycmplot.plotting.qq import plot_qq_single, plot_qq_separate, plot_qq_overlay, plot_qq_combined
|
|
25
26
|
from pycmplot.stats import get_lead_snps, get_highlight_snps
|
|
26
27
|
from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
|
|
27
28
|
from pycmplot.annotation import get_hits_summary_table
|
|
@@ -31,6 +32,10 @@ from pycmplot.resources import ResourceConfig
|
|
|
31
32
|
__all__ = [
|
|
32
33
|
"plot_linear",
|
|
33
34
|
"plot_circular",
|
|
35
|
+
"plot_qq_single",
|
|
36
|
+
"plot_qq_separate",
|
|
37
|
+
"plot_qq_overlay",
|
|
38
|
+
"plot_qq_combined",
|
|
34
39
|
"compute_track_radii_dict",
|
|
35
40
|
"get_lead_snps",
|
|
36
41
|
"get_highlight_snps",
|
|
@@ -42,4 +47,4 @@ __all__ = [
|
|
|
42
47
|
"ResourceConfig",
|
|
43
48
|
]
|
|
44
49
|
|
|
45
|
-
__version__ = "0.2.4"
|
|
50
|
+
__version__ = "0.2.5"
|
|
@@ -1,62 +1,62 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
CORE_MODULE = """
|
|
1
|
+
"""
|
|
4
2
|
pycmplot._core
|
|
5
3
|
==============
|
|
6
4
|
|
|
7
5
|
Main entry point that orchestrates CLI argument parsing, data loading, and
|
|
8
6
|
plot dispatch. This module is intentionally thin: it delegates all heavy
|
|
9
|
-
work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`,
|
|
10
|
-
:mod:`pycmplot.plotting.circular`.
|
|
7
|
+
work to :mod:`pycmplot.io`, :mod:`pycmplot.plotting.linear`,
|
|
8
|
+
:mod:`pycmplot.plotting.circular`, and :mod:`pycmplot.plotting.qq`.
|
|
11
9
|
|
|
12
10
|
All imports are deferred inside :func:`main` so that
|
|
13
11
|
``import pycmplot`` remains fast regardless of the size of the dependency
|
|
14
12
|
tree.
|
|
15
13
|
"""
|
|
16
14
|
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
17
|
import logging
|
|
18
18
|
import warnings
|
|
19
|
+
import sys
|
|
19
20
|
|
|
20
21
|
# Suppress noisy font-manager warnings before any matplotlib import
|
|
21
22
|
logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)
|
|
22
23
|
warnings.filterwarnings("ignore")
|
|
23
24
|
|
|
24
|
-
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
|
25
|
+
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s", stream=sys.stdout)
|
|
25
26
|
logger = logging.getLogger(__name__)
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
def main() -> None:
|
|
29
|
-
|
|
30
|
+
"""Orchestrate the full pycmplot pipeline from the command line.
|
|
30
31
|
|
|
31
32
|
This function is registered as the ``pycmplot`` console-script entry point
|
|
32
33
|
in ``pyproject.toml`` / ``setup.cfg``. It performs the following steps in
|
|
33
34
|
order:
|
|
34
35
|
|
|
35
36
|
1. **Parse CLI arguments** via :func:`~pycmplot.cli.get_arguments`.
|
|
36
|
-
2. **Parse comma-separated inputs** (files, labels, colours, track heights
|
|
37
|
-
|
|
38
|
-
|
|
37
|
+
2. **Parse comma-separated inputs** (files, labels, colours, track heights,
|
|
38
|
+
builds) into Python lists via
|
|
39
|
+
:func:`~pycmplot.io.strip_comma_separated_input_streams`.
|
|
39
40
|
3. **Construct output paths** (plot image and locus summary table TSV) via
|
|
40
|
-
|
|
41
|
+
:func:`~pycmplot.io.get_output_paths`.
|
|
41
42
|
4. **Resolve column names** for every input file via
|
|
42
|
-
|
|
43
|
+
:func:`~pycmplot.io.prep_pycmplot_input_info`.
|
|
43
44
|
5. **Load data** — reads summary statistics, normalises chromosome names,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
6. **Dispatch
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
All input is taken from ``sys.argv`` via :mod:`argparse`.
|
|
45
|
+
runs hg19 → hg38 liftover if needed, extracts lead SNPs, generates the
|
|
46
|
+
hits summary table, and computes merged Circos sector sizes via
|
|
47
|
+
:func:`~pycmplot.io.get_sumstats_and_merged_sector_list`.
|
|
48
|
+
6. **Dispatch Manhattan plot** — calls
|
|
49
|
+
:func:`~pycmplot.plotting.circular.plot_circular` when ``--mode cm``,
|
|
50
|
+
or :func:`~pycmplot.plotting.linear.plot_linear` otherwise.
|
|
51
|
+
7. **Optional QQ plot** — when ``--qq_plot`` is set, dispatches to one of
|
|
52
|
+
:func:`~pycmplot.plotting.qq.plot_qq_combined` (default),
|
|
53
|
+
:func:`~pycmplot.plotting.qq.plot_qq_separate` (``--qq_separate``), or
|
|
54
|
+
:func:`~pycmplot.plotting.qq.plot_qq_overlay` (``--qq_overlay``).
|
|
55
55
|
|
|
56
56
|
Returns
|
|
57
57
|
-------
|
|
58
58
|
None
|
|
59
|
-
Saves the plot image and locus summary table to the directory
|
|
59
|
+
Saves the plot image(s) and locus summary table to the directory
|
|
60
60
|
specified by ``--output_dir``.
|
|
61
61
|
|
|
62
62
|
Raises
|
|
@@ -194,8 +194,8 @@ def main() -> None:
|
|
|
194
194
|
pos = pos_arg,
|
|
195
195
|
snp = snp_arg,
|
|
196
196
|
pcol = pcol_arg,
|
|
197
|
-
|
|
198
|
-
|
|
197
|
+
build_column = buildc_arg,
|
|
198
|
+
build_list = builds
|
|
199
199
|
)
|
|
200
200
|
|
|
201
201
|
# ------------------------------------------------------------------
|
|
@@ -206,38 +206,42 @@ def main() -> None:
|
|
|
206
206
|
# ------------------------------------------------------------------
|
|
207
207
|
# Load data, compute sectors, get hits table
|
|
208
208
|
# ------------------------------------------------------------------
|
|
209
|
-
(
|
|
210
|
-
merged_assoc_sector_sizes,
|
|
211
|
-
sumstats_loaded,
|
|
212
|
-
hits_table,
|
|
213
|
-
signif_lines,
|
|
214
|
-
pval_dict,
|
|
215
|
-
) = get_sumstats_and_merged_sector_list(
|
|
209
|
+
pycmplot_dict = get_sumstats_and_merged_sector_list(
|
|
216
210
|
sum_stats=sum_stats,
|
|
217
211
|
labels=labels,
|
|
218
212
|
trim_pval=trim_pval,
|
|
219
213
|
logp=logp,
|
|
220
214
|
file_info=sumstats_hdr_dic,
|
|
221
215
|
sort_tracks=sort_track,
|
|
222
|
-
table_out=
|
|
216
|
+
table_out=plt_base,
|
|
223
217
|
signif_threshold=signif_threshold,
|
|
224
218
|
signif_line=signif_line,
|
|
225
219
|
suggest_threshold=suggest_threshold,
|
|
226
220
|
resources=resources,
|
|
227
221
|
)
|
|
228
222
|
|
|
223
|
+
merged_assoc_sector_sizes = pycmplot_dict["sectors"]
|
|
224
|
+
sumstats_loaded = pycmplot_dict["dfs"]
|
|
225
|
+
hits_table = pycmplot_dict["annot"]
|
|
226
|
+
signif_lines = pycmplot_dict["lines"]
|
|
227
|
+
pval_dict = pycmplot_dict["pvals"]
|
|
228
|
+
|
|
229
229
|
# ------------------------------------------------------------------
|
|
230
230
|
# ANNOTATE BY
|
|
231
231
|
# ------------------------------------------------------------------
|
|
232
232
|
label_col = 'SNP'
|
|
233
|
-
if annotate:
|
|
233
|
+
if annotate and not hits_table.empty:
|
|
234
234
|
if str(annotate).upper() == "GENE" and 'top_gene' in hits_table.columns:
|
|
235
235
|
label_col = 'top_gene'
|
|
236
|
-
elif
|
|
236
|
+
elif annotate in hits_table.columns:
|
|
237
237
|
label_col = annotate
|
|
238
|
-
|
|
238
|
+
else:
|
|
239
|
+
logger.warning(
|
|
240
|
+
"Annotation column '%s' not found in hits table; "
|
|
241
|
+
"falling back to 'SNP'.", annotate,
|
|
242
|
+
)
|
|
239
243
|
|
|
240
|
-
logger.info(
|
|
244
|
+
logger.info("Annotate by: %s", label_col)
|
|
241
245
|
|
|
242
246
|
# ------------------------------------------------------------------
|
|
243
247
|
# CIRCULAR MANHATTAN
|
|
@@ -316,6 +320,7 @@ def main() -> None:
|
|
|
316
320
|
if qq_separate:
|
|
317
321
|
plot_qq_separate(
|
|
318
322
|
pval_dict=pval_dict,
|
|
323
|
+
base_name=plot_title,
|
|
319
324
|
thin=qq_thin,
|
|
320
325
|
thin_below=thin_below,
|
|
321
326
|
max_points=qq_max_points,
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
MODULE_DOCSTRING = """
|
|
1
|
+
"""
|
|
4
2
|
pycmplot.annotation
|
|
5
3
|
====================
|
|
6
4
|
|
|
@@ -22,6 +20,8 @@ paths can be supplied via the ``PYCMPLOT_GENEINFO_HG38`` /
|
|
|
22
20
|
``PYCMPLOT_GENEINFO_HG19`` environment variables.
|
|
23
21
|
"""
|
|
24
22
|
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
25
|
import bisect
|
|
26
26
|
import logging
|
|
27
27
|
from typing import Optional
|
|
@@ -41,7 +41,7 @@ logger = logging.getLogger(__name__)
|
|
|
41
41
|
# ---------------------------------------------------------------------------
|
|
42
42
|
|
|
43
43
|
def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
|
|
44
|
-
|
|
44
|
+
"""Build a chromosome-keyed interval dictionary with sorted start positions.
|
|
45
45
|
|
|
46
46
|
Pre-processes the gene reference DataFrame into a structure that supports
|
|
47
47
|
efficient O(log N) binary-search lookup of genes near a query position.
|
|
@@ -98,7 +98,7 @@ def _annotate_variant(
|
|
|
98
98
|
window: int = 500_000,
|
|
99
99
|
promoter_window: int = 2_000,
|
|
100
100
|
) -> dict:
|
|
101
|
-
|
|
101
|
+
"""Return strand-aware nearest-gene annotation for a single variant.
|
|
102
102
|
|
|
103
103
|
Searches the pre-built *genes_dict* within *window* bp of *pos* on
|
|
104
104
|
*chrom*. Reports the nearest upstream and downstream genes (relative to
|
|
@@ -238,8 +238,7 @@ def _annotate_and_prioritize_variant(
|
|
|
238
238
|
promoter_window: int = 2_000,
|
|
239
239
|
biotype_weights: Optional[dict] = None,
|
|
240
240
|
) -> Optional[dict]:
|
|
241
|
-
|
|
242
|
-
priority metric.
|
|
241
|
+
"""Score and rank candidate genes for a single variant using a composite priority metric.
|
|
243
242
|
|
|
244
243
|
Builds a candidate gene set within *window* bp of *pos* on *chrom*, then
|
|
245
244
|
scores each candidate on four additive components:
|
|
@@ -386,7 +385,7 @@ def _annotate_and_prioritize_variant(
|
|
|
386
385
|
# ---------------------------------------------------------------------------
|
|
387
386
|
|
|
388
387
|
def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
|
|
389
|
-
|
|
388
|
+
"""Reduce a lead-SNP table to one representative SNP per locus.
|
|
390
389
|
|
|
391
390
|
Applies greedy distance-based clumping within each chromosome group,
|
|
392
391
|
starting from the most significant SNP (lowest ``P`` or highest ``logP``).
|
|
@@ -438,7 +437,7 @@ def get_hits_summary_table(
|
|
|
438
437
|
table_out: Optional[str] = None,
|
|
439
438
|
resources: Optional[ResourceConfig] = None,
|
|
440
439
|
) -> pd.DataFrame:
|
|
441
|
-
|
|
440
|
+
"""Annotate lead SNPs with nearest genes and write the locus summary table.
|
|
442
441
|
|
|
443
442
|
For each lead SNP in *leads_df*, runs two complementary annotation passes:
|
|
444
443
|
|
|
@@ -475,33 +474,21 @@ def get_hits_summary_table(
|
|
|
475
474
|
Clumped locus summary table. Contains all columns from *leads_df*
|
|
476
475
|
plus annotation fields from both passes, including:
|
|
477
476
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
- Distance to ``nearest_downstream_gene`` in bp
|
|
494
|
-
* - ``promoter_upstream_flag``
|
|
495
|
-
- ``True`` when the SNP is within 2 kb upstream of a TSS
|
|
496
|
-
* - ``gene_density``
|
|
497
|
-
- Number of genes within the search window
|
|
498
|
-
* - ``top_gene``
|
|
499
|
-
- Top-priority gene from the scoring pass
|
|
500
|
-
* - ``biotype``
|
|
501
|
-
- Ensembl biotype of ``top_gene`` (``'intergenic'`` when no
|
|
502
|
-
genic overlap)
|
|
503
|
-
* - ``priority_score``
|
|
504
|
-
- Composite priority score (genic hits only)
|
|
477
|
+
- ``genic`` — ``True`` when the lead SNP overlaps a gene body.
|
|
478
|
+
- ``nearest_upstream_gene`` — nearest upstream gene symbol
|
|
479
|
+
(strand-aware).
|
|
480
|
+
- ``upstream_distance`` — distance to ``nearest_upstream_gene`` in bp.
|
|
481
|
+
- ``nearest_downstream_gene`` — nearest downstream gene symbol
|
|
482
|
+
(strand-aware).
|
|
483
|
+
- ``downstream_distance`` — distance to ``nearest_downstream_gene`` in
|
|
484
|
+
bp.
|
|
485
|
+
- ``promoter_upstream_flag`` — ``True`` when the SNP is within 2 kb
|
|
486
|
+
upstream of a TSS.
|
|
487
|
+
- ``gene_density`` — number of genes within the search window.
|
|
488
|
+
- ``top_gene`` — top-priority gene from the scoring pass.
|
|
489
|
+
- ``biotype`` — Ensembl biotype of ``top_gene`` (``'intergenic'`` when
|
|
490
|
+
no genic overlap).
|
|
491
|
+
- ``priority_score`` — composite priority score (genic hits only).
|
|
505
492
|
|
|
506
493
|
Notes
|
|
507
494
|
-----
|
|
@@ -578,7 +565,8 @@ def get_hits_summary_table(
|
|
|
578
565
|
locus_table = leads_df
|
|
579
566
|
|
|
580
567
|
if table_out is not None:
|
|
581
|
-
|
|
582
|
-
|
|
568
|
+
outpath = table_out.replace(" ", "_").lower() + '.tsv'
|
|
569
|
+
locus_table.to_csv(outpath, index=False, sep="\t", na_rep="None")
|
|
570
|
+
logger.info("Locus summary written to: %s", outpath)
|
|
583
571
|
|
|
584
572
|
return _clump_by_distance(locus_table, window_kb=window_kb)
|