pycmplot 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycmplot-0.2.6/LICENSE +21 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/PKG-INFO +10 -2
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/__init__.py +1 -1
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/_core.py +9 -21
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/annotation.py +33 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/cli.py +26 -6
- pycmplot-0.2.6/pycmplot/data/hg18ToHg38.over.chain.gz +0 -0
- pycmplot-0.2.6/pycmplot/data/hg19ToHg38.over.chain.gz +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/io.py +50 -10
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/liftover.py +82 -15
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/plotting/circular.py +12 -18
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/plotting/linear.py +78 -30
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/resources.py +20 -7
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/PKG-INFO +10 -2
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/SOURCES.txt +2 -1
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pyproject.toml +11 -3
- {pycmplot-0.2.5 → pycmplot-0.2.6}/setup.cfg +1 -1
- pycmplot-0.2.5/LICENSE +0 -441
- pycmplot-0.2.5/pycmplot/data/hg19ToHg38.over.chain +0 -56506
- {pycmplot-0.2.5 → pycmplot-0.2.6}/README.md +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/__main__.py +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/constants.py +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/plotting/__init__.py +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/plotting/qq.py +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/stats.py +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/dependency_links.txt +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/entry_points.txt +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/requires.txt +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot.egg-info/top_level.txt +0 -0
- {pycmplot-0.2.5 → pycmplot-0.2.6}/setup.py +0 -0
pycmplot-0.2.6/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kevin Esoh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycmplot
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
|
|
5
5
|
Author: Kevin Esoh
|
|
6
6
|
Author-email: Kevin Esoh <kesohku1@jh.edu>
|
|
7
|
-
License-Expression:
|
|
7
|
+
License-Expression: MIT
|
|
8
8
|
Project-URL: Homepage, https://github.com/esohkevin/pycmplot
|
|
9
9
|
Project-URL: Issues, https://github.com/esohkevin/pycmplot/issues
|
|
10
10
|
Project-URL: Docs, https://pycmplot.readthedocs.io/en/latest/
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
16
|
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
13
21
|
Requires-Python: >=3.9
|
|
14
22
|
Description-Content-Type: text/markdown
|
|
15
23
|
License-File: LICENSE
|
|
@@ -94,6 +94,7 @@ def main() -> None:
|
|
|
94
94
|
from pycmplot.plotting.circular import plot_circular
|
|
95
95
|
from pycmplot.plotting.qq import plot_qq_combined, plot_qq_separate, plot_qq_overlay
|
|
96
96
|
from pycmplot.resources import ResourceConfig
|
|
97
|
+
from pycmplot.annotation import get_annotation_column
|
|
97
98
|
|
|
98
99
|
# ------------------------------------------------------------------
|
|
99
100
|
# Parse CLI
|
|
@@ -147,7 +148,9 @@ def main() -> None:
|
|
|
147
148
|
track_heights = args.track_heights
|
|
148
149
|
linear_track_spacing = args.linear_track_spacing
|
|
149
150
|
no_track_labels = args.no_track_labels
|
|
151
|
+
ylabel = args.ylabel
|
|
150
152
|
chr_spacing = args.chr_spacing
|
|
153
|
+
figure_size = args.figure_size
|
|
151
154
|
|
|
152
155
|
|
|
153
156
|
# ------------------------------------------------------------------
|
|
@@ -226,23 +229,6 @@ def main() -> None:
|
|
|
226
229
|
signif_lines = pycmplot_dict["lines"]
|
|
227
230
|
pval_dict = pycmplot_dict["pvals"]
|
|
228
231
|
|
|
229
|
-
# ------------------------------------------------------------------
|
|
230
|
-
# ANNOTATE BY
|
|
231
|
-
# ------------------------------------------------------------------
|
|
232
|
-
label_col = 'SNP'
|
|
233
|
-
if annotate and not hits_table.empty:
|
|
234
|
-
if str(annotate).upper() == "GENE" and 'top_gene' in hits_table.columns:
|
|
235
|
-
label_col = 'top_gene'
|
|
236
|
-
elif annotate in hits_table.columns:
|
|
237
|
-
label_col = annotate
|
|
238
|
-
else:
|
|
239
|
-
logger.warning(
|
|
240
|
-
"Annotation column '%s' not found in hits table; "
|
|
241
|
-
"falling back to 'SNP'.", annotate,
|
|
242
|
-
)
|
|
243
|
-
|
|
244
|
-
logger.info("Annotate by: %s", label_col)
|
|
245
|
-
|
|
246
232
|
# ------------------------------------------------------------------
|
|
247
233
|
# CIRCULAR MANHATTAN
|
|
248
234
|
# ------------------------------------------------------------------
|
|
@@ -264,7 +250,6 @@ def main() -> None:
|
|
|
264
250
|
track_label_size = track_label_size,
|
|
265
251
|
track_label_orientation = track_label_orientation,
|
|
266
252
|
annotate = annotate,
|
|
267
|
-
label_col = label_col if annotate else None,
|
|
268
253
|
annotation_size = annotation_size,
|
|
269
254
|
hits_table = hits_table,
|
|
270
255
|
sector_sizes = merged_assoc_sector_sizes,
|
|
@@ -284,6 +269,9 @@ def main() -> None:
|
|
|
284
269
|
# ------------------------------------------------------------------
|
|
285
270
|
else:
|
|
286
271
|
logger.info("Generating LINEAR MANHATTAN Plot ...")
|
|
272
|
+
fsize = figure_size.strip(" ").split(",")
|
|
273
|
+
fsize = [int(v) for v in fsize]
|
|
274
|
+
logger.info(f"FIGURE SIZE: {fsize}")
|
|
287
275
|
plot_linear(
|
|
288
276
|
sumstats_loaded=sumstats_loaded,
|
|
289
277
|
track_heights=t_heights,
|
|
@@ -295,19 +283,19 @@ def main() -> None:
|
|
|
295
283
|
highlight_color=highlight_color,
|
|
296
284
|
highlight_line=highlight_line,
|
|
297
285
|
highlight_line_color=highlight_line_color,
|
|
298
|
-
annotate=annotate,
|
|
286
|
+
annotate=annotate,
|
|
299
287
|
hits_table=hits_table if not hits_table.empty else None,
|
|
300
|
-
label_col=label_col if annotate else None,
|
|
301
288
|
chr_spacing=chr_spacing,
|
|
302
289
|
linear_track_spacing=linear_track_spacing,
|
|
303
290
|
colors=colors,
|
|
304
291
|
signif_lines=signif_lines,
|
|
305
292
|
plot_title=plot_title,
|
|
306
293
|
no_track_labels=no_track_labels,
|
|
294
|
+
ylabel=ylabel,
|
|
307
295
|
dpi=dpi,
|
|
308
296
|
output_format=output_format,
|
|
309
297
|
output_dir=output_dir,
|
|
310
|
-
figsize=
|
|
298
|
+
figsize=fsize
|
|
311
299
|
)
|
|
312
300
|
|
|
313
301
|
# ------------------------------------------------------------------
|
|
@@ -570,3 +570,36 @@ def get_hits_summary_table(
|
|
|
570
570
|
logger.info("Locus summary written to: %s", outpath)
|
|
571
571
|
|
|
572
572
|
return _clump_by_distance(locus_table, window_kb=window_kb)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def get_annotation_column(
    annotate: str = None,
    hits_table: pd.DataFrame = None,
    label_col: str = None,
):
    """Resolve which column of *hits_table* supplies annotation labels.

    Resolution order:

    1. *label_col*, when given and present in the table.
    2. *annotate*, when it names a column of the table.
    3. When *annotate* is ``"GENE"`` (case-insensitive), a gene column
       chosen from the ``genic`` flag of the hits.
    4. ``'SNP'`` in every other case.

    Parameters
    ----------
    annotate : str or None
        Requested annotation mode or column name.
    hits_table : pandas.DataFrame or None
        Table of hits to annotate.
    label_col : str or None
        Explicit label column; takes precedence over *annotate*.

    Returns
    -------
    str
        Name of the column to read labels from. Always defined: falls
        back to ``'SNP'`` when nothing else can be resolved, instead of
        raising ``UnboundLocalError`` / ``AttributeError``.
    """
    # Fallback used on every path that cannot resolve a better column.
    label_clm = 'SNP'

    if annotate and hits_table is not None and not hits_table.empty:
        if label_col is not None and label_col in hits_table.columns:
            label_clm = label_col
        elif annotate in hits_table.columns:
            label_clm = annotate
        elif str(annotate).upper() == "GENE":
            # NOTE(review): a single column name is returned for the whole
            # table, so the value derived from the LAST row wins. Confirm
            # this is intended rather than a per-row label choice.
            for _, row in hits_table.iterrows():
                try:
                    is_genic = bool(row["genic"])
                except (KeyError, TypeError, ValueError):
                    logger.warning(
                        "Annotation columns '%s' and '%s' not found in hits table: %s; "
                        "falling back to 'SNP'.", annotate, label_col, hits_table.columns.values,
                    )
                    label_clm = 'SNP'
                    continue
                if is_genic:
                    label_clm = "nearest_upstream_gene"
                    label_msg = "'POS' is genic"
                else:
                    label_clm = "top_gene"
                    label_msg = "'POS' is not genic"
                logger.info("%s", label_msg)
        else:
            # Unknown annotation request: warn and keep the 'SNP' default
            # rather than leaving the result undefined.
            logger.warning(
                "Annotation column '%s' not found in hits table; "
                "falling back to 'SNP'.", annotate,
            )

    logger.info("Annotating by: %s", label_clm)

    return label_clm
|
|
@@ -63,7 +63,7 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
63
63
|
File delimiter name; auto-detected when ``None``.
|
|
64
64
|
``build_column`` : str or None
|
|
65
65
|
Column name containing per-variant genome-build values
|
|
66
|
-
(``hg19`` / ``hg38``).
|
|
66
|
+
(``hg18`` / ``hg19`` / ``hg38``).
|
|
67
67
|
``build`` : str or None
|
|
68
68
|
Comma-separated list of genome builds per summary statistics file,
|
|
69
69
|
in the same order as ``sum_stats``. Alternative to ``build_column``.
|
|
@@ -138,6 +138,10 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
138
138
|
Track sort order.
|
|
139
139
|
``no_track_labels`` : bool
|
|
140
140
|
Suppress track label rendering when ``True``.
|
|
141
|
+
``ylabel`` : str or None
|
|
142
|
+
Shared y-axis label for linear Manhattan plots. Override the
|
|
143
|
+
default (``"-log₁₀(p-value)"`` or the p-value column name) for
|
|
144
|
+
non-p-value statistics such as ``"iHS"`` or ``"F_ST"``.
|
|
141
145
|
``plot_title`` : str
|
|
142
146
|
Plot title and output file stem. Default ``'MyCMplot'``.
|
|
143
147
|
``plot_title_size`` : float
|
|
@@ -252,11 +256,14 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
252
256
|
opt.add_argument(
|
|
253
257
|
"-b","--build", default=None, required=False, type=str, metavar='str',
|
|
254
258
|
help=
|
|
255
|
-
"""Comma-
|
|
256
|
-
in the same order as sumstats files.
|
|
259
|
+
"""Comma-separated list of genome build of summary stats file(s) listed
|
|
260
|
+
in the same order as sumstats files. Accepted values: hg18, hg19, hg38.
|
|
261
|
+
E.g. hg19,hg38,hg38,hg18 means:
|
|
257
262
|
file1.txt.gz --> hg19
|
|
258
263
|
file2.txt.gz --> hg38
|
|
259
|
-
file3.tsv --> hg38
|
|
264
|
+
file3.tsv --> hg38
|
|
265
|
+
file4.tsv --> hg18 ... etc
|
|
266
|
+
hg18 and hg19 coordinates are lifted to hg38 before plotting.
|
|
260
267
|
"""
|
|
261
268
|
)
|
|
262
269
|
opt.add_argument(
|
|
@@ -365,7 +372,17 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
365
372
|
"Exclude track labels from plot. (default: False)"
|
|
366
373
|
),
|
|
367
374
|
action="store_true"
|
|
368
|
-
)
|
|
375
|
+
)
|
|
376
|
+
opt.add_argument(
|
|
377
|
+
"-yl", "--ylabel",
|
|
378
|
+
default=None, type=str, metavar="str",
|
|
379
|
+
help=(
|
|
380
|
+
"Shared y-axis label for linear Manhattan plots (left margin). "
|
|
381
|
+
"Useful for non-p-value statistics such as iHS, F_ST or "
|
|
382
|
+
"XP-EHH (e.g. --ylabel 'iHS'). Defaults to '-log10(p-value)' "
|
|
383
|
+
"when --logp is set, otherwise the p-value column name."
|
|
384
|
+
)
|
|
385
|
+
)
|
|
369
386
|
opt.add_argument(
|
|
370
387
|
"-plt", "--plot_title", default="MyCMplot", type=str, metavar="str",
|
|
371
388
|
help="Plot plot_title / output file stem."
|
|
@@ -439,7 +456,10 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
439
456
|
"-t_space", "--linear_track_spacing", default=0.10, type=float, metavar="float",
|
|
440
457
|
help="Space between linear tracks (default: 0.10)."
|
|
441
458
|
)
|
|
442
|
-
|
|
459
|
+
lio.add_argument(
|
|
460
|
+
"-figsize", "--figure_size", default='10,4', required=False, type=str, metavar="str",
|
|
461
|
+
help="Linear plot figure size (default: 10,4 for width,height)."
|
|
462
|
+
)
|
|
443
463
|
opt.add_argument(
|
|
444
464
|
"-h", "--help", action="help",
|
|
445
465
|
help="Show this help message and exit."
|
|
Binary file
|
|
Binary file
|
|
@@ -644,11 +644,11 @@ def prep_pycmplot_input_info(
|
|
|
644
644
|
bcol: "BUILD",
|
|
645
645
|
}
|
|
646
646
|
col_dtypes = {
|
|
647
|
-
chrom_col:
|
|
647
|
+
chrom_col: 'category',
|
|
648
648
|
pos_col: object,
|
|
649
649
|
snp_col: str,
|
|
650
650
|
pcol_col: float,
|
|
651
|
-
bcol:
|
|
651
|
+
bcol: 'category',
|
|
652
652
|
}
|
|
653
653
|
sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]
|
|
654
654
|
|
|
@@ -662,7 +662,7 @@ def prep_pycmplot_input_info(
|
|
|
662
662
|
pcol_col: "P",
|
|
663
663
|
}
|
|
664
664
|
col_dtypes = {
|
|
665
|
-
chrom_col:
|
|
665
|
+
chrom_col: 'category',
|
|
666
666
|
pos_col: object,
|
|
667
667
|
snp_col: str,
|
|
668
668
|
pcol_col: float,
|
|
@@ -681,18 +681,26 @@ def prep_pycmplot_input_info(
|
|
|
681
681
|
pcol_col: "P",
|
|
682
682
|
}
|
|
683
683
|
col_dtypes = {
|
|
684
|
-
chrom_col:
|
|
684
|
+
chrom_col: 'category',
|
|
685
685
|
pos_col: object,
|
|
686
686
|
snp_col: str,
|
|
687
687
|
pcol_col: float,
|
|
688
688
|
}
|
|
689
689
|
sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]
|
|
690
690
|
|
|
691
|
-
|
|
691
|
+
def _has_build_info(info: list) -> bool:
|
|
692
|
+
"""A file has build info when either (a) its header had a build
|
|
693
|
+
column (which is stored as a fifth entry in ``old_cols``), or
|
|
694
|
+
(b) a per-file build was supplied via ``--build`` (stored as a
|
|
695
|
+
fifth entry in the top-level list)."""
|
|
696
|
+
old_cols = info[0]
|
|
697
|
+
return len(old_cols) == 5 or len(info) == 5
|
|
698
|
+
|
|
699
|
+
if not any(_has_build_info(info) for info in sumstats_hdr_dic.values()):
|
|
692
700
|
# Neither build column nor --build was available for any file
|
|
693
701
|
logger.warning(
|
|
694
702
|
"No build column or --build values detected. Summary stats will "
|
|
695
|
-
"be plotted in their
|
|
703
|
+
"be plotted in their native coordinate systems. If your data "
|
|
696
704
|
"are in different coordinate systems, combining them in one plot "
|
|
697
705
|
"is not advisable, especially if ``--annotate`` is set!"
|
|
698
706
|
)
|
|
@@ -713,6 +721,23 @@ def _merge_min_max_lists(dicts: list[dict]) -> dict:
|
|
|
713
721
|
return {k: [min(v), max(v)] for k, v in temp.items()}
|
|
714
722
|
|
|
715
723
|
|
|
724
|
+
# ---------------------------------------------------------------------------
|
|
725
|
+
# Memory usage
|
|
726
|
+
# ---------------------------------------------------------------------------
|
|
727
|
+
def _get_memory_usage(mem_df: int):
|
|
728
|
+
if mem_df > 1e6:
|
|
729
|
+
df_mem = mem_df / 1e9
|
|
730
|
+
unit = 'GB'
|
|
731
|
+
else:
|
|
732
|
+
df_mem = mem_df / 1e6
|
|
733
|
+
unit = 'MB'
|
|
734
|
+
if df_mem < 1:
|
|
735
|
+
df_mem = df_mem * 100
|
|
736
|
+
unit = 'MB'
|
|
737
|
+
|
|
738
|
+
return f"{df_mem:.3g} {unit}"
|
|
739
|
+
|
|
740
|
+
|
|
716
741
|
# ---------------------------------------------------------------------------
|
|
717
742
|
# Main loader
|
|
718
743
|
# ---------------------------------------------------------------------------
|
|
@@ -874,20 +899,27 @@ def get_sumstats_and_merged_sector_list(
|
|
|
874
899
|
).rename(columns=sumstat_newcols)
|
|
875
900
|
|
|
876
901
|
df["POS"] = pd.to_numeric(df["POS"], errors="coerce").astype("Int64").dropna()
|
|
902
|
+
pre_trim_mem = _get_memory_usage(df.memory_usage(deep=True).sum())
|
|
903
|
+
pre_trim_vars = len(df.index)
|
|
904
|
+
logger.info("Loaded %s variants from summary stat file, using %s of memory", pre_trim_vars, pre_trim_mem)
|
|
877
905
|
|
|
878
906
|
# Get dict of p-values for qq-plotting before applying trim_pval
|
|
879
907
|
logger.info("Extracting raw p-values for QQ-plotting ...")
|
|
880
|
-
pval_dict[label] = df["P"].dropna().astype(
|
|
908
|
+
pval_dict[label] = df["P"].dropna().astype(float).values
|
|
881
909
|
|
|
882
910
|
|
|
883
911
|
# Add build column if not exist and build supplied
|
|
884
912
|
if build:
|
|
885
913
|
df['BUILD'] = build
|
|
914
|
+
df['BUILD'] = df['BUILD'].astype('category')
|
|
886
915
|
|
|
887
916
|
# Trim insignificant variants for faster plotting
|
|
888
917
|
if trim_pval:
|
|
889
918
|
logger.info("Excluding variants with p-value less than %s to speed up Manhattan plotting ...", trim_pval)
|
|
890
919
|
df = df[df["P"].astype(float) <= float(trim_pval)]
|
|
920
|
+
post_trim_mem = _get_memory_usage(df.memory_usage(deep=True).sum())
|
|
921
|
+
post_trim_vars = len(df.index)
|
|
922
|
+
logger.info("%s variants remain after trimming, using %s of memory", post_trim_vars, post_trim_mem)
|
|
891
923
|
else:
|
|
892
924
|
df = df[df["P"].astype(float) <= 1]
|
|
893
925
|
|
|
@@ -911,9 +943,16 @@ def get_sumstats_and_merged_sector_list(
|
|
|
911
943
|
n_chroms = len(df["CHR"].unique()) - 1
|
|
912
944
|
sumstats_loaded[label] = [df, n_chroms]
|
|
913
945
|
|
|
914
|
-
# Liftover hg19 data if needed
|
|
915
|
-
if "BUILD" in df.columns and
|
|
916
|
-
|
|
946
|
+
# Liftover hg18/hg19 data if needed
|
|
947
|
+
if "BUILD" in df.columns and (
|
|
948
|
+
"hg19" in df["BUILD"].unique() or "hg18" in df["BUILD"].unique()
|
|
949
|
+
):
|
|
950
|
+
builds_present = sorted(
|
|
951
|
+
b for b in df["BUILD"].unique() if b in {"hg18", "hg19"}
|
|
952
|
+
)
|
|
953
|
+
logger.info(
|
|
954
|
+
"Converting %s coordinates to hg38 ...", "/".join(builds_present)
|
|
955
|
+
)
|
|
917
956
|
sumstats_loaded[label][0] = liftover_position(df, resources=resources)
|
|
918
957
|
|
|
919
958
|
# Lead SNPs
|
|
@@ -1002,6 +1041,7 @@ def get_sumstats_and_merged_sector_list(
|
|
|
1002
1041
|
assoc_sector_sizes_list: list[dict] = []
|
|
1003
1042
|
min_dic_val = None
|
|
1004
1043
|
|
|
1044
|
+
logger.info("Computing per-sumstat sector sizes (chrom → [min_pos, max_pos])")
|
|
1005
1045
|
for df, _n in sumstats_loaded.values():
|
|
1006
1046
|
assoc = df[~(df["CHR"].str.len() > 2)].copy()
|
|
1007
1047
|
assoc["POS"] = assoc["POS"].fillna(0).astype(int)
|
|
@@ -2,23 +2,34 @@
|
|
|
2
2
|
pycmplot.liftover
|
|
3
3
|
=================
|
|
4
4
|
|
|
5
|
-
Genome coordinate liftover utilities (hg19 → hg38).
|
|
5
|
+
Genome coordinate liftover utilities (hg18 → hg38 and hg19 → hg38).
|
|
6
6
|
|
|
7
|
-
The :class:`pyliftover.LiftOver`
|
|
8
|
-
created on first use and cached in a module-level dictionary, so
|
|
9
|
-
this module never triggers a file-not-found error even if the
|
|
10
|
-
not been configured yet.
|
|
7
|
+
The :class:`pyliftover.LiftOver` objects are initialised **lazily** — they
|
|
8
|
+
are created on first use and cached in a module-level dictionary, so
|
|
9
|
+
importing this module never triggers a file-not-found error even if the
|
|
10
|
+
chain files have not been configured yet.
|
|
11
|
+
|
|
12
|
+
Supported conversions
|
|
13
|
+
---------------------
|
|
14
|
+
pycmplot harmonises input coordinates to GRCh38. Two source assemblies are
|
|
15
|
+
supported:
|
|
16
|
+
|
|
17
|
+
* ``hg19`` / GRCh37 → GRCh38 (default, bundled chain file)
|
|
18
|
+
* ``hg18`` / NCBI36 → GRCh38 (bundled chain file; used when input rows
|
|
19
|
+
carry a ``hg18`` build label)
|
|
11
20
|
|
|
12
21
|
Resource configuration
|
|
13
22
|
----------------------
|
|
14
|
-
|
|
15
|
-
:class:`~pycmplot.resources.ResourceConfig`. By default,
|
|
16
|
-
|
|
17
|
-
|
|
23
|
+
Chain file paths are resolved through
|
|
24
|
+
:class:`~pycmplot.resources.ResourceConfig`. By default, bundled chain
|
|
25
|
+
files are used (``pycmplot/data/hg19ToHg38.over.chain.gz`` and
|
|
26
|
+
``pycmplot/data/hg18ToHg38.over.chain.gz``). They can be overridden by
|
|
27
|
+
setting the environment variables:
|
|
18
28
|
|
|
19
29
|
.. code-block:: bash
|
|
20
30
|
|
|
21
|
-
export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain
|
|
31
|
+
export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain.gz
|
|
32
|
+
export PYCMPLOT_CHAIN_HG18_HG38=/path/to/hg18ToHg38.over.chain.gz
|
|
22
33
|
"""
|
|
23
34
|
|
|
24
35
|
from __future__ import annotations
|
|
@@ -135,17 +146,71 @@ def liftover_hg19_to_hg38(
|
|
|
135
146
|
return new_pos
|
|
136
147
|
|
|
137
148
|
|
|
149
|
+
def liftover_hg18_to_hg38(
    chrom: str,
    pos: int,
    resources: Optional[ResourceConfig] = None,
) -> Optional[int]:
    """Map one hg18 (NCBI36) position onto hg38.

    The chain file registered under ``"chain_hg18_hg38"`` in *resources*
    is looked up and handed to ``_get_liftover`` to obtain the converter.
    The first mapping the converter reports is used.

    Parameters
    ----------
    chrom : str
        Chromosome name without the ``'chr'`` prefix (e.g. ``'1'``,
        ``'X'``); the prefix is prepended here before querying.
    pos : int
        hg18 position, passed to the converter unchanged.
    resources : ResourceConfig, optional
        Resource configuration; ``default_resources`` is used when
        ``None``.

    Returns
    -------
    int or None
        The hg38 position of the first reported mapping, or ``None``
        when the converter returns no mapping.

    See Also
    --------
    liftover_hg19_to_hg38 :
        Equivalent helper for hg19 coordinates.
    liftover_position :
        Applies the per-row helpers to a whole DataFrame.
    """
    config = default_resources if resources is None else resources

    converter = _get_liftover(config.require("chain_hg18_hg38"))
    mappings = converter.convert_coordinate(f"chr{chrom}", pos)

    if mappings:
        # Each mapping is (chromosome, position, strand, score); only the
        # position of the first one is returned.
        _chrom, lifted_pos, _strand, _score = mappings[0]
        return lifted_pos
    return None
|
|
199
|
+
|
|
200
|
+
|
|
138
201
|
def liftover_position(
|
|
139
202
|
df: pd.DataFrame,
|
|
140
203
|
hg38_chr_limits: dict = None,
|
|
141
204
|
resources: Optional[ResourceConfig] = None,
|
|
142
205
|
) -> pd.DataFrame:
|
|
143
|
-
"""Liftover all hg19 rows in *df*
|
|
206
|
+
"""Liftover all hg18/hg19 rows in *df* to hg38 coordinates.
|
|
144
207
|
|
|
145
|
-
Iterates over every row in *df* and
|
|
146
|
-
for rows whose ``BUILD`` column equals
|
|
147
|
-
|
|
148
|
-
|
|
208
|
+
Iterates over every row in *df* and dispatches to
|
|
209
|
+
:func:`liftover_hg19_to_hg38` for rows whose ``BUILD`` column equals
|
|
210
|
+
``'hg19'`` or to :func:`liftover_hg18_to_hg38` for rows whose ``BUILD``
|
|
211
|
+
column equals ``'hg18'``. Rows with any other build value are passed
|
|
212
|
+
through unchanged. Rows for which liftover returns ``None`` or ``0``
|
|
213
|
+
(unmappable positions) are silently dropped.
|
|
149
214
|
|
|
150
215
|
Two provenance columns are added to the returned DataFrame so that the
|
|
151
216
|
original coordinates remain accessible:
|
|
@@ -207,6 +272,8 @@ def liftover_position(
|
|
|
207
272
|
for chrom, pos, build in zip(df["CHR"], df["POS"], df["BUILD"]):
|
|
208
273
|
if build == "hg19":
|
|
209
274
|
new_positions.append(liftover_hg19_to_hg38(chrom, pos, resources))
|
|
275
|
+
elif build == "hg18":
|
|
276
|
+
new_positions.append(liftover_hg18_to_hg38(chrom, pos, resources))
|
|
210
277
|
else:
|
|
211
278
|
new_positions.append(pos)
|
|
212
279
|
|
|
@@ -29,6 +29,7 @@ import pandas as pd
|
|
|
29
29
|
|
|
30
30
|
from pycmplot.io import get_output_paths
|
|
31
31
|
from pycmplot.stats import get_highlight_snps
|
|
32
|
+
from pycmplot.annotation import get_annotation_column
|
|
32
33
|
|
|
33
34
|
logger = logging.getLogger(__name__)
|
|
34
35
|
|
|
@@ -622,25 +623,18 @@ def plot_circular(
|
|
|
622
623
|
# Circular: gene/SNP annotations
|
|
623
624
|
# ------------------------------------------------------------------
|
|
624
625
|
if annotate and not hits_table.empty:
|
|
626
|
+
label_col = get_annotation_column(
|
|
627
|
+
annotate = annotate,
|
|
628
|
+
hits_table=hits_table,
|
|
629
|
+
label_col=label_col,
|
|
630
|
+
)
|
|
631
|
+
if label_col == 'SNP':
|
|
632
|
+
fstyle = "normal"
|
|
633
|
+
else:
|
|
634
|
+
fstyle = "italic"
|
|
635
|
+
|
|
625
636
|
for i, (_, row) in enumerate(hits_table.iterrows()):
|
|
626
|
-
label = row[
|
|
627
|
-
fstyle = "normal"
|
|
628
|
-
if label_col:
|
|
629
|
-
label_col = str(label_col)
|
|
630
|
-
try:
|
|
631
|
-
if label_col == "GENE":
|
|
632
|
-
if row["genic"]:
|
|
633
|
-
label = row["nearest_upstream_gene"]
|
|
634
|
-
else:
|
|
635
|
-
label = row["top_gene"]
|
|
636
|
-
fstyle = "italic"
|
|
637
|
-
elif label_col != "SNP":
|
|
638
|
-
label = row[label_col]
|
|
639
|
-
fstyle = "italic"
|
|
640
|
-
except Exception:
|
|
641
|
-
logger.info("'SNP' column is used for annotation since '%s' column could not be resolved in hits table.", label_col)
|
|
642
|
-
pass
|
|
643
|
-
|
|
637
|
+
label = row[label_col]
|
|
644
638
|
for sector in circos.sectors:
|
|
645
639
|
if str(row["CHR"]) == sector.name:
|
|
646
640
|
a_track = sector.add_track(annotation_track_radius)
|