pycmplot 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {pycmplot-0.1.6/pycmplot.egg-info → pycmplot-0.1.7}/PKG-INFO +1 -1
  2. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/__init__.py +4 -2
  3. pycmplot-0.1.7/pycmplot/_core.py +218 -0
  4. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/cli.py +7 -0
  5. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/io.py +203 -19
  6. pycmplot-0.1.7/pycmplot/plotting/circular.py +489 -0
  7. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/plotting/linear.py +125 -42
  8. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/stats.py +4 -4
  9. {pycmplot-0.1.6 → pycmplot-0.1.7/pycmplot.egg-info}/PKG-INFO +1 -1
  10. {pycmplot-0.1.6 → pycmplot-0.1.7}/pyproject.toml +1 -1
  11. {pycmplot-0.1.6 → pycmplot-0.1.7}/setup.cfg +1 -1
  12. pycmplot-0.1.6/pycmplot/_core.py +0 -419
  13. pycmplot-0.1.6/pycmplot/plotting/circular.py +0 -261
  14. {pycmplot-0.1.6 → pycmplot-0.1.7}/LICENSE +0 -0
  15. {pycmplot-0.1.6 → pycmplot-0.1.7}/LICENSE.mit +0 -0
  16. {pycmplot-0.1.6 → pycmplot-0.1.7}/README.md +0 -0
  17. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/annotation.py +0 -0
  18. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/constants.py +0 -0
  19. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
  20. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
  21. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/data/hg19ToHg38.over.chain +0 -0
  22. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/liftover.py +0 -0
  23. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot/resources.py +0 -0
  24. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot.egg-info/SOURCES.txt +0 -0
  25. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot.egg-info/dependency_links.txt +0 -0
  26. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot.egg-info/entry_points.txt +0 -0
  27. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot.egg-info/requires.txt +0 -0
  28. {pycmplot-0.1.6 → pycmplot-0.1.7}/pycmplot.egg-info/top_level.txt +0 -0
  29. {pycmplot-0.1.6 → pycmplot-0.1.7}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycmplot
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
5
5
  Author: Kevin Esoh
6
6
  Author-email: Kevin Esoh <kesohku1@jh.edu>
@@ -11,6 +11,7 @@ Command-line::
11
11
 
12
12
  Python API::
13
13
 
14
+ from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
14
15
  from pycmplot.plotting import plot_linear, plot_circular
15
16
  from pycmplot.stats import get_lead_snps
16
17
  from pycmplot.annotation import get_hits_summary_table
@@ -22,7 +23,7 @@ Public surface
22
23
  from pycmplot.plotting.linear import plot_linear
23
24
  from pycmplot.plotting.circular import plot_circular, compute_track_radii_dict
24
25
  from pycmplot.stats import get_lead_snps, get_highlight_snps
25
- from pycmplot.io import get_sumstats_and_merged_sector_list
26
+ from pycmplot.io import prep_pycmplot_input_info, get_sumstats_and_merged_sector_list
26
27
  from pycmplot.annotation import get_hits_summary_table
27
28
  from pycmplot.constants import hg38_chr_lengths, BIOTYPE_WEIGHTS
28
29
  from pycmplot.resources import ResourceConfig
@@ -33,6 +34,7 @@ __all__ = [
33
34
  "compute_track_radii_dict",
34
35
  "get_lead_snps",
35
36
  "get_highlight_snps",
37
+ "prep_pycmplot_input_info",
36
38
  "get_sumstats_and_merged_sector_list",
37
39
  "get_hits_summary_table",
38
40
  "hg38_chr_lengths",
@@ -40,4 +42,4 @@ __all__ = [
40
42
  "ResourceConfig",
41
43
  ]
42
44
 
43
- __version__ = "0.1.0"
45
+ __version__ = "0.1.7"
@@ -0,0 +1,218 @@
1
+ """
2
+ pycmplot._core
3
+ ==============
4
+ Main entry point — orchestrates CLI parsing, data loading, and plotting.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import warnings
11
+
12
+ # Suppress noisy font-manager warnings before any matplotlib import
13
+ logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)
14
+ warnings.filterwarnings("ignore")
15
+
16
+ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def main() -> None:
    """CLI entry point — ``pycmplot`` console script.

    Parses the command line, normalises the comma-separated inputs, resolves
    the column names of every summary-statistics file, loads the data and
    finally draws either a circular (``--mode cm``) or a linear Manhattan
    plot (any other mode).
    """

    # ------------------------------------------------------------------
    # Deferred imports so ``import pycmplot`` remains fast
    # ------------------------------------------------------------------
    from pycmplot.cli import get_arguments, DESCMSG
    from pycmplot.io import (
        get_sumstats_and_merged_sector_list,
        prep_pycmplot_input_info,
        get_output_paths,
        strip_comma_separated_input_streams,
    )
    from pycmplot.plotting.linear import plot_linear
    from pycmplot.plotting.circular import plot_circular
    from pycmplot.resources import ResourceConfig

    # ------------------------------------------------------------------
    # Parse CLI
    # ------------------------------------------------------------------
    args = get_arguments(DESCMSG)
    print(DESCMSG)

    # ------------------------------------------------------------------
    # Sumstat, labels, colours, track heights str to list
    # ------------------------------------------------------------------
    sum_stats, labels, colors, t_heights = strip_comma_separated_input_streams(
        sum_stats=args.sum_stats,
        labels=args.labels,
        colors_raw=args.colors,
        track_heights=args.track_heights,
    )

    # ------------------------------------------------------------------
    # Output paths (the plot path itself is rebuilt by the plotting
    # functions from output_dir/output_format, so only the table path
    # is needed here)
    # ------------------------------------------------------------------
    _, table_out = get_output_paths(
        labels,
        mode=args.mode,
        logp=args.logp,
        output_dir=args.output_dir,
        plot_title=args.plot_title,
        output_format=args.output_format,
    )

    # ------------------------------------------------------------------
    # Resolve column names
    # ------------------------------------------------------------------
    sumstats_hdr_dic = prep_pycmplot_input_info(
        sum_stats=sum_stats,
        labels=labels,
        delim=args.delim,
        chrom=args.chrom_column,
        pos=args.pos_column,
        snp=args.snp_column,
        pcol=args.pval_column,
        build=args.build_column,
    )

    # ------------------------------------------------------------------
    # ResourceConfig — picks up environment variables automatically
    # ------------------------------------------------------------------
    resources = ResourceConfig()

    # ------------------------------------------------------------------
    # Load data, compute sectors, get hits table
    # ------------------------------------------------------------------
    (
        merged_assoc_sector_sizes,
        sumstats_loaded,
        hits_table,
        signif_lines,
    ) = get_sumstats_and_merged_sector_list(
        sum_stats=sum_stats,
        labels=labels,
        trim_pval=args.trim_pval,
        logp=args.logp,
        file_info=sumstats_hdr_dic,
        sort_tracks=args.sort_track,
        table_out=table_out,
        signif_threshold=args.signif_threshold,
        signif_line=args.signif_line,
        suggest_threshold=args.suggest_threshold,
        resources=resources,
    )

    # ------------------------------------------------------------------
    # CIRCULAR MANHATTAN
    # ------------------------------------------------------------------
    if args.mode.upper() == "CM":
        logger.info("Generating CIRCULAR MANHATTAN Plot ...")
        plot_circular(
            sumstats_loaded=sumstats_loaded,
            logp=args.logp,
            signif_line=args.signif_line,
            signif_lines=signif_lines,
            highlight=args.highlight,
            highlight_thresh=args.highlight_thresh,
            highlight_line=args.highlight_line,
            colors=colors,
            chrom_label_side=args.chrom_label_side,
            chrom_label_size=args.chrom_label_size,
            track_label_size=args.track_label_size,
            track_label_orientation=args.track_label_orientation,
            annotate=args.annotate,
            annotation_size=args.annotation_size,
            hits_table=hits_table,
            sector_sizes=merged_assoc_sector_sizes,
            pad=args.pad,
            r_min=args.r_min,
            r_max=args.r_max,
            plot_title=args.plot_title,
            plot_title_size=args.plot_title_size,
            no_track_labels=args.no_track_labels,
            dpi=args.dpi,
            output_format=args.output_format,
            output_dir=args.output_dir,
        )
        return

    # ------------------------------------------------------------------
    # LINEAR MANHATTAN (every mode other than "cm")
    # ------------------------------------------------------------------
    logger.info("Generating LINEAR MANHATTAN Plot ...")

    # Only hand an annotation table to the plot when there is something to
    # annotate; guard against a missing table as well as an empty one.
    has_hits = hits_table is not None and not hits_table.empty

    plot_linear(
        sumstats_loaded=sumstats_loaded,
        track_heights=t_heights,
        trim_pval=args.trim_pval,
        logp=bool(args.logp),
        point_size=args.point_size,
        highlight=args.highlight,
        highlight_thresh=args.highlight_thresh,
        annot_df=hits_table if has_hits else None,
        label_col="top_gene",
        chr_spacing=args.chr_spacing,
        track_spacing=args.track_spacing,
        colors=colors,
        signif_lines=signif_lines,
        plot_title=args.plot_title,
        no_track_labels=args.no_track_labels,
        dpi=args.dpi,
        output_format=args.output_format,
        output_dir=args.output_dir,
        figsize=(15, 9),
    )
215
+
216
+
217
+ if __name__ == "__main__":
218
+ main()
@@ -147,6 +147,13 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
147
147
  const="chrom_len", default=None, type=str, #metavar="str",
148
148
  help="Sort tracks by chromosome count or label."
149
149
  )
150
+ opt.add_argument(
151
+ "-ntl", "--no_track_labels",
152
+ help=(
153
+ "Exclude track labels from plot. (default: False)"
154
+ ),
155
+ action="store_true"
156
+ )
150
157
  opt.add_argument(
151
158
  "-plt", "--plot_title", default="MyCMplot", type=str, metavar="str",
152
159
  help="Plot plot_title / output file stem."
@@ -8,6 +8,8 @@ from __future__ import annotations
8
8
 
9
9
  import csv
10
10
  import gzip
11
+ import sys
12
+ import re
11
13
  import logging
12
14
  from collections import defaultdict
13
15
  from pathlib import Path
@@ -109,6 +111,198 @@ def get_file_header(
109
111
  return list(hdr)
110
112
 
111
113
 
114
+
115
def strip_comma_separated_input_streams(
    sum_stats,
    labels,
    colors_raw = 'steelblue,grey',
    track_heights = None,
):
    """Split the comma-separated CLI strings into clean Python lists.

    Parameters
    ----------
    sum_stats:
        Comma-separated summary-statistics file paths.
    labels:
        Comma-separated track labels, one per file in *sum_stats*.
    colors_raw:
        Comma-separated colour names.
    track_heights:
        Comma-separated relative track heights, or ``None`` when no heights
        were supplied.

    Returns
    -------
    tuple
        ``(sum_stats, labels, colors, t_heights)`` where the first three are
        lists of whitespace-stripped strings and ``t_heights`` is a list of
        floats (``None`` if *track_heights* was ``None``).

    Raises
    ------
    SystemExit
        If the number of files differs from the number of labels.
    ValueError
        If a track height cannot be converted to ``float``.
    """

    def _split(raw):
        # One place for the "strip, split on comma, strip each item" rule.
        return [item.strip() for item in raw.strip().split(",")]

    sum_stats = _split(sum_stats)
    labels = _split(labels)

    # Compare the number of *items*; comparing the raw strings would compare
    # character counts and reject almost every valid input.
    if len(sum_stats) != len(labels):
        sys.exit(
            "Error: number of summary stats files and labels must match.\n"
            f" Files: {sum_stats}\n"
            f" Labels: {labels}"
        )

    colors = _split(colors_raw)

    # The signature allows track_heights=None, so do not call .strip() on it.
    if track_heights is None:
        t_heights = None
    else:
        t_heights = [float(x) for x in _split(track_heights)]

    return sum_stats, labels, colors, t_heights
147
+
148
+
149
+ # ------------------------------------------------------------------
150
+ # Random string for output paths
151
+ # ------------------------------------------------------------------
152
def generate_random_string(length: int, alphabet: Optional[str] = None) -> str:
    """Return a random string of *length* characters.

    Parameters
    ----------
    length:
        Number of characters to draw. Zero (or a negative value) yields an
        empty string.
    alphabet:
        Characters to draw from. Defaults to ASCII letters and digits.

    Notes
    -----
    Uses :mod:`random`, which is fine for file-name stems but must not be
    used for anything security-sensitive.
    """
    import random
    import string

    # Default pool: uppercase, lowercase, and digits
    characters = alphabet if alphabet else string.ascii_letters + string.digits
    # random.choices picks multiple characters with replacement
    return ''.join(random.choices(characters, k=length))
159
+
160
+
161
+ # ------------------------------------------------------------------
162
+ # Output paths
163
+ # ------------------------------------------------------------------
164
def get_output_paths(
    labels,
    mode: Optional[str] = 'lm',
    logp: bool = False,
    output_dir: Optional[str] = '.',
    plot_title: Optional[str] = None,
    output_format: Optional[str] = 'png'
):
    """Build the plot and locus-summary-table output paths.

    The output directory is created if it does not exist.

    Parameters
    ----------
    labels:
        Track labels; joined with ``_`` into the file stem.
    mode:
        Plot mode tag, lower-cased into the stem (``'lm'`` when ``None``).
    logp:
        Selects the ``_logp`` suffix when true, ``_pval`` otherwise.
    output_dir:
        Directory for the outputs (current directory when ``None``).
    plot_title:
        Title used as the leading part of the stem. Every character other
        than ASCII letters, digits and whitespace is dropped, and each
        whitespace character becomes ``_``. A random 10-character stem is
        used when no usable title remains.
    output_format:
        Plot file extension, lower-cased (``'png'`` when ``None``).

    Returns
    -------
    tuple of str
        ``(plt_name, table_out)``.
    """
    out_path = Path(output_dir or ".").resolve()
    out_path.mkdir(parents=True, exist_ok=True)

    pltitle = ""
    if plot_title:
        pltitle = re.sub(r"[^a-zA-Z0-9\s]", "", plot_title)
        # Replace *every* whitespace character, not only the plain space,
        # so tabs/newlines never end up inside a file name.
        pltitle = re.sub(r"\s", "_", pltitle)
    if not pltitle:
        # No title, or a title made only of stripped characters.
        pltitle = generate_random_string(10)

    label_part = "_".join(labels)
    mode_tag = (mode or "lm").lower()
    extension = (output_format or "png").lower()

    plt_base = str(out_path / f"{pltitle}_{label_part}_{mode_tag}")
    suffix = "_logp" if logp else "_pval"

    plt_name = f"{plt_base}{suffix}.{extension}"
    table_out = f"{plt_base}{suffix}_locus_summary_table.tsv"

    return plt_name, table_out
191
+
192
+
193
+
194
+ # ---------------------------------------------------------------------------
195
+ # input formatter
196
+ # ---------------------------------------------------------------------------
197
def prep_pycmplot_input_info(
    sum_stats: list[str],
    labels: list[str],
    delim: Optional[str] = None,
    chrom: Optional[str] = None,
    pos: Optional[str] = None,
    snp: Optional[str] = None,
    pcol: Optional[str] = None,
    build: Optional[str] = None
):
    """Resolve the delimiter and the column names of every input file.

    Parameters
    ----------
    sum_stats:
        List of file paths to GWAS summary statistics (possibly gzip-compressed).
    labels:
        Track labels in the same order as *sum_stats*.
    delim:
        File delimiter (autodetected per file if omitted).
    chrom:
        Chromosome column
    pos:
        Position column
    snp:
        SNP or Marker ID column
    pcol:
        P-value column
    build:
        Build version column

    Each explicit column name takes precedence when it is present in a
    file's header; otherwise the first header column matching one of the
    well-known names for that role is used.

    Returns
    -------
    dict
        Maps each label to ``[old_columns, column_dtypes, new_columns, delim]``
        where *old_columns* lists the resolved source column names,
        *column_dtypes* maps them to the dtype to read them with,
        *new_columns* maps them to the canonical names
        ``CHR``/``POS``/``SNP``/``P``/``BUILD`` and *delim* is the delimiter
        used for that file.

    Raises
    ------
    SystemExit
        If any of the five columns cannot be found in a file's header.
    """
    # ------------------------------------------------------------------
    # Resolve delimiter
    # ------------------------------------------------------------------
    sep = resolve_delimiter(delim) if delim else None  # None: autodetect per file

    # ------------------------------------------------------------------
    # Column roles: (description, explicit name, well-known names).
    # The name sets are built once here rather than once per header
    # element inside the loop.
    # ------------------------------------------------------------------
    column_roles = (
        ("chromosome", chrom, frozenset({
            "CHR", "CHROM", "Chromosome", "#CHROM", "#CHR",
            "Chrom", "chrom", "chr", "chromosome", "#chr", "#chrom",
        })),
        ("position", pos, frozenset({"BP", "POS", "bp", "pos", "Basepair"})),
        ("SNP/marker ID", snp, frozenset({
            "SNP", "RSID", "rsID", "MarkerName", "MarkerID",
            "Predictor", "Marker", "SNPID", "ID",
        })),
        ("p-value", pcol, frozenset({
            "P", "P-value", "Wald_P", "pvalue", "p_val", "pval",
        })),
        ("genome build", build, frozenset({
            "BUILD", "Genome", "Genome_Build", "Genome-build",
        })),
    )

    # ------------------------------------------------------------------
    # Resolve column names per file
    # ------------------------------------------------------------------
    sumstats_hdr_dic: dict = {}

    for name, fpath in zip(labels, sum_stats):
        if sep:
            file_sep, dialect = sep, None
        else:
            file_sep, dialect = detect_delimiter(fpath, sample_size=5_000)

        hdr = get_file_header(fpath, delim=file_sep, dialect=dialect)

        resolved = []
        for role, explicit, known_names in column_roles:
            if explicit and explicit in hdr:
                # The caller's choice wins over any auto-detected column.
                resolved.append(explicit)
                continue

            match = next((c for c in hdr if c in known_names), None)
            if match is None:
                sys.exit(
                    f"Error: could not find a required column in {fpath}.\n"
                    f" Header: {hdr}\n"
                    f" Details: no {role} column found "
                    f"(requested: {explicit!r}; accepted names: {sorted(known_names)})"
                )
            resolved.append(match)

        chrom_col, pos_col, snp_col, pval_col, build_col = resolved

        old_cols = [chrom_col, pos_col, snp_col, pval_col, build_col]
        new_cols = {
            chrom_col: "CHR",
            pos_col: "POS",
            snp_col: "SNP",
            pval_col: "P",
            build_col: "BUILD",
        }
        col_dtypes = {
            chrom_col: str,
            pos_col: object,
            snp_col: str,
            pval_col: float,
            build_col: str,
        }

        sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]

    return sumstats_hdr_dic
304
+
305
+
112
306
  # ---------------------------------------------------------------------------
113
307
  # Sector-size helpers
114
308
  # ---------------------------------------------------------------------------
@@ -134,8 +328,6 @@ def get_sumstats_and_merged_sector_list(
134
328
  file_info: Optional[dict] = None,
135
329
  sort_tracks: Optional[str] = "chrom_len",
136
330
  table_out: Optional[str] = None,
137
- highlight: bool = False,
138
- highlight_thresh: float = 5e-8,
139
331
  signif_threshold: Optional[float] = None,
140
332
  signif_line: Optional[float] = None,
141
333
  suggest_threshold: Optional[float] = None,
@@ -156,8 +348,8 @@ def get_sumstats_and_merged_sector_list(
156
348
  ``'label'`` — sort tracks alphabetically by label.
157
349
  ``'chrom_len'`` — sort by number of chromosomes (default).
158
350
  ``None`` — preserve input order.
159
- highlight:
160
- Whether to flag loci for highlighting.
351
+ signif_threshold:
352
+ Threshold of significance to create hits table.
161
353
  resources:
162
354
  :class:`~pycmplot.resources.ResourceConfig` instance.
163
355
 
@@ -225,21 +417,13 @@ def get_sumstats_and_merged_sector_list(
225
417
  logger.info("Converting hg19 coordinates to hg38 ...")
226
418
  sumstats_loaded[label][0] = liftover_position(df, resources=resources)
227
419
 
228
- # Lead SNPs / highlight SNPs
229
- if highlight:
230
- logger.info("Extracting variants to highlight ...")
231
- sumstats_loaded[label][0], leads = get_highlight_snps(
232
- df=sumstats_loaded[label][0],
233
- window=2_000_000,
234
- highlight_thresh=highlight_thresh,
235
- logp=True,
236
- )
237
- else:
238
- leads = get_lead_snps(
239
- df=sumstats_loaded[label][0],
240
- highlight_thresh=signif_threshold or 5e-8,
241
- logp=True,
242
- )
420
+ # Lead SNPs
421
+ logger.info("Extracting variants to highlight ...")
422
+ leads = get_lead_snps(
423
+ df=sumstats_loaded[label][0],
424
+ signif_threshold=signif_threshold or 5e-8,
425
+ logp=True,
426
+ )
243
427
 
244
428
  all_lead_snps.append(leads)
245
429