graphpop-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphpop_cli/__init__.py +2 -0
- graphpop_cli/cli.py +161 -0
- graphpop_cli/commands/__init__.py +1 -0
- graphpop_cli/commands/aggregate.py +206 -0
- graphpop_cli/commands/batch.py +155 -0
- graphpop_cli/commands/compare.py +118 -0
- graphpop_cli/commands/config_cmd.py +117 -0
- graphpop_cli/commands/converge.py +156 -0
- graphpop_cli/commands/db.py +188 -0
- graphpop_cli/commands/divergence.py +37 -0
- graphpop_cli/commands/diversity.py +36 -0
- graphpop_cli/commands/dump.py +210 -0
- graphpop_cli/commands/export_bed.py +170 -0
- graphpop_cli/commands/export_windows.py +91 -0
- graphpop_cli/commands/extract.py +271 -0
- graphpop_cli/commands/filter_results.py +165 -0
- graphpop_cli/commands/garud_h.py +30 -0
- graphpop_cli/commands/genome_scan.py +41 -0
- graphpop_cli/commands/ihs.py +29 -0
- graphpop_cli/commands/import_data.py +266 -0
- graphpop_cli/commands/inventory.py +160 -0
- graphpop_cli/commands/joint_sfs.py +38 -0
- graphpop_cli/commands/ld.py +35 -0
- graphpop_cli/commands/lookup.py +207 -0
- graphpop_cli/commands/neighbors.py +175 -0
- graphpop_cli/commands/nsl.py +29 -0
- graphpop_cli/commands/plot.py +1066 -0
- graphpop_cli/commands/pop_summary.py +30 -0
- graphpop_cli/commands/query.py +15 -0
- graphpop_cli/commands/rank_genes.py +177 -0
- graphpop_cli/commands/report.py +264 -0
- graphpop_cli/commands/roh.py +30 -0
- graphpop_cli/commands/run_all.py +276 -0
- graphpop_cli/commands/server.py +98 -0
- graphpop_cli/commands/setup.py +299 -0
- graphpop_cli/commands/sfs.py +38 -0
- graphpop_cli/commands/validate.py +167 -0
- graphpop_cli/commands/xpehh.py +31 -0
- graphpop_cli/config.py +57 -0
- graphpop_cli/connection.py +52 -0
- graphpop_cli/formatters.py +81 -0
- graphpop_cli-0.1.0.dist-info/METADATA +73 -0
- graphpop_cli-0.1.0.dist-info/RECORD +46 -0
- graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
- graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
- graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1066 @@
|
|
|
1
|
+
"""graphpop plot — generate standard population genomics figures from TSV results."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import csv
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ..cli import pass_ctx
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import matplotlib
|
|
14
|
+
matplotlib.use("Agg")
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
import numpy as np
|
|
17
|
+
HAS_MPL = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HAS_MPL = False
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from scipy.cluster.hierarchy import linkage, dendrogram
|
|
23
|
+
from scipy.spatial.distance import squareform
|
|
24
|
+
HAS_SCIPY = True
|
|
25
|
+
except ImportError:
|
|
26
|
+
HAS_SCIPY = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Nature-style settings
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Eight-colour, colour-blind-friendly palette (the "Wong" palette commonly
# used for Nature Methods figures). Plots index into it cyclically.
WONG_PALETTE = [
    "#0072B2",  # blue
    "#E69F00",  # orange
    "#009E73",  # bluish green
    "#D55E00",  # vermillion
    "#56B4E9",  # sky blue
    "#CC79A7",  # reddish purple
    "#F0E442",  # yellow
    "#000000",  # black
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _apply_style():
    """Push the module's Nature Methods-like defaults into matplotlib's rcParams."""
    typography = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],
        "font.size": 7,
        "axes.titlesize": 8,
        "axes.labelsize": 7,
        "xtick.labelsize": 6,
        "ytick.labelsize": 6,
        "legend.fontsize": 6,
    }
    strokes = {
        "axes.linewidth": 0.6,
        "xtick.major.width": 0.6,
        "ytick.major.width": 0.6,
        "xtick.direction": "out",
        "ytick.direction": "out",
        "lines.linewidth": 1.0,
    }
    frame = {
        # Only the left and bottom spines are drawn.
        "axes.spines.top": False,
        "axes.spines.right": False,
        "figure.facecolor": "white",
        "axes.facecolor": "white",
        "savefig.facecolor": "white",
    }
    export = {
        # Type 42 (TrueType) fonts are embedded in PDF output.
        "pdf.fonttype": 42,
    }
    plt.rcParams.update({**typography, **strokes, **frame, **export})
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _check_matplotlib():
    """Exit with status 1 and an install hint when the plotting imports failed.

    Does nothing when ``HAS_MPL`` is true.
    """
    if HAS_MPL:
        return

    message = (
        "Error: matplotlib is required for graphpop plot.\n"
        "Install with: pip install matplotlib numpy"
    )
    click.echo(message, err=True)
    raise SystemExit(1)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _read_tsv(path: str) -> list[dict]:
    """Read a tab-separated file into a list of row dictionaries.

    Lines that start with ``#`` are treated as comments and removed before
    parsing; the first remaining line supplies the column names.

    Args:
        path: Location of the TSV file.

    Returns:
        One ``dict`` per data row, keyed by the header names. The list is
        empty when the file contains no data rows.
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires. "utf-8-sig" reads plain UTF-8 and also strips a
    # leading byte-order mark, so parsing no longer depends on the locale.
    with open(path, encoding="utf-8-sig", newline="") as handle:
        data_lines = (line for line in handle if not line.startswith("#"))
        # The generator reads from the open file, so the reader must be
        # consumed before the ``with`` block closes it.
        return list(csv.DictReader(data_lines, delimiter="\t"))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_tsv_dir(directory: str, pattern: str = "*.tsv") -> list[dict]:
    """Concatenate the rows of every file in *directory* matching *pattern*.

    Files are visited in sorted path order, and each one is parsed with
    ``_read_tsv``.
    """
    tsv_paths = sorted(Path(directory).glob(pattern))
    return [row for tsv_path in tsv_paths for row in _read_tsv(str(tsv_path))]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _save_fig(fig, output: str, dpi: int = 300):
    """Write *fig* to *output*, report the path, and release the figure.

    Args:
        fig: The matplotlib figure to save.
        output: Destination path, passed straight to ``fig.savefig``.
        dpi: Resolution passed to ``fig.savefig``.
    """
    try:
        fig.savefig(output, dpi=dpi, bbox_inches="tight", facecolor="white")
        click.echo(f"Saved: {output}")
    finally:
        # Close the figure even when saving fails (bad path, unsupported
        # extension, ...) so pyplot does not keep it alive.
        plt.close(fig)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
# Plot group
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
@click.group()
def plot():
    """Generate standard population genomics figures from TSV results.

    \b
    Plot types:
      diversity-bar   Per-population diversity ranking
      fst-heatmap     Pairwise Fst matrix heatmap
      manhattan       Genome-wide statistic scan
      pinpis          piN/piS ratios across populations
      sfs-plot        Site frequency spectrum
      roh-landscape   Per-population FROH distribution
      gene-zoom       Multi-track regional plot (queries the graph)
      pop-tree        Population tree from pairwise Fst
    """
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# diversity-bar
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
@plot.command("diversity-bar")
@click.argument("input_dir", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--stat", default="pi", help="Statistic to plot (pi, theta_w, tajima_d, fis)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2, help="Figure width in inches")
@click.option("--height", type=float, default=3.5, help="Figure height in inches")
def diversity_bar(input_dir, output, stat, title, width, height):
    """Plot per-population diversity as a horizontal bar chart.

    INPUT_DIR should contain per-population TSV files from graphpop diversity
    or graphpop run-all (e.g., results/diversity/).

    \b
    Examples:
      graphpop plot diversity-bar results/diversity/ -o fig_diversity.png
      graphpop plot diversity-bar results/diversity/ --stat tajima_d -o fig_tajima.png
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv_dir(input_dir)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Group the statistic by population. Rows where it is absent, non-numeric
    # or non-finite are skipped rather than counted as 0, which would pull the
    # per-population mean towards zero.
    pop_vals = {}
    skipped = 0
    for r in rows:
        try:
            val = float(r.get(stat))
        except (TypeError, ValueError):
            skipped += 1
            continue
        if not np.isfinite(val):
            skipped += 1
            continue
        pop = r.get("population") or r.get("file_pop") or "unknown"
        pop_vals.setdefault(pop, []).append(val)

    if not pop_vals:
        click.echo(f"No numeric '{stat}' values found. Check --stat and the TSV columns.",
                   err=True)
        return
    if skipped:
        click.echo(f"Warning: skipped {skipped} row(s) without a numeric '{stat}' value.",
                   err=True)

    # One mean per population (across all of its rows), sorted ascending.
    pop_means = {p: float(np.mean(v)) for p, v in pop_vals.items()}
    pops = sorted(pop_means, key=pop_means.get)
    means = [pop_means[p] for p in pops]
    colors = [WONG_PALETTE[i % len(WONG_PALETTE)] for i in range(len(pops))]

    fig, ax = plt.subplots(figsize=(width, height))
    y = range(len(pops))
    ax.barh(y, means, color=colors, height=0.7, edgecolor="none")
    ax.set_yticks(y)
    ax.set_yticklabels(pops)
    ax.set_xlabel(stat.replace("_", " ").title() if stat != "pi" else "Nucleotide diversity (π)")
    ax.set_title(title or f"Per-population {stat}", fontweight="bold")

    # Put each value label just beyond the end of its bar. The gap scales
    # with the largest magnitude, and the side follows the sign so labels on
    # negative bars (e.g. Tajima's D) do not overlap them.
    pad = max(abs(m) for m in means) * 0.01
    for i, v in enumerate(means):
        if v >= 0:
            ax.text(v + pad, i, f"{v:.4f}", va="center", ha="left", fontsize=5)
        else:
            ax.text(v - pad, i, f"{v:.4f}", va="center", ha="right", fontsize=5)

    fig.tight_layout()
    _save_fig(fig, output)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
# fst-heatmap
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
@plot.command("fst-heatmap")
@click.argument("input_dir", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file")
@click.option("--stat", default="fst_wc", help="Fst statistic (fst_hudson, fst_wc)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2)
@click.option("--height", type=float, default=6.0)
def fst_heatmap(input_dir, output, stat, title, width, height):
    """Plot pairwise Fst as a heatmap matrix.

    INPUT_DIR should contain pairwise TSV files from graphpop divergence
    or graphpop run-all (e.g., results/divergence/).

    \b
    Examples:
      graphpop plot fst-heatmap results/divergence/ -o fig_fst.png
      graphpop plot fst-heatmap results/divergence/ --stat fst_hudson -o fig_fst_hudson.pdf
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv_dir(input_dir)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Collect values per unordered population pair, so rows written as (A, B)
    # and (B, A) are averaged together instead of overwriting each other.
    pair_vals = {}
    for r in rows:
        p1 = r.get("pop1") or ""
        p2 = r.get("pop2") or ""
        if not p1 or not p2:
            # Rows without both population columns cannot be placed in the
            # matrix (population names are not inferred from file names).
            continue
        try:
            val = float(r.get(stat))
        except (TypeError, ValueError):
            # Missing or non-numeric statistic: skip instead of recording 0.
            continue
        if not np.isfinite(val):
            continue
        pair_vals.setdefault(tuple(sorted((p1, p2))), []).append(val)

    if not pair_vals:
        click.echo("No pairwise data found. Check TSV format.", err=True)
        return

    # Get unique populations
    all_pops = sorted({p for pair in pair_vals for p in pair})
    n = len(all_pops)
    pop_idx = {p: i for i, p in enumerate(all_pops)}

    # Pairs with no data stay NaN (drawn blank) rather than looking like
    # Fst = 0. The diagonal is each population against itself.
    matrix = np.full((n, n), np.nan)
    np.fill_diagonal(matrix, 0.0)
    for (p1, p2), vals in pair_vals.items():
        i, j = pop_idx[p1], pop_idx[p2]
        mean_val = np.mean(vals)
        matrix[i, j] = mean_val
        matrix[j, i] = mean_val

    fig, ax = plt.subplots(figsize=(width, height))
    im = ax.imshow(matrix, cmap="YlOrRd", aspect="equal")

    ax.set_xticks(range(n))
    ax.set_yticks(range(n))
    ax.set_xticklabels(all_pops, rotation=45, ha="right")
    ax.set_yticklabels(all_pops)

    # Annotate cells if small matrix
    if n <= 15:
        # Loop-invariant: the text switches to white above 60% of the maximum.
        contrast_cutoff = np.nanmax(matrix) * 0.6
        for i in range(n):
            for j in range(n):
                val = matrix[i, j]
                if i == j or np.isnan(val):
                    continue
                color = "white" if val > contrast_cutoff else "black"
                ax.text(j, i, f"{val:.3f}", ha="center", va="center",
                        fontsize=4, color=color)

    fig.colorbar(im, ax=ax, shrink=0.8, label=stat.replace("_", " "))
    ax.set_title(title or f"Pairwise {stat}", fontweight="bold")
    fig.tight_layout()
    _save_fig(fig, output)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# ---------------------------------------------------------------------------
|
|
253
|
+
# manhattan
|
|
254
|
+
# ---------------------------------------------------------------------------
|
|
255
|
+
@plot.command("manhattan")
@click.argument("input_file", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file")
@click.option("--stat", default="ihs", help="Statistic column name")
@click.option("--threshold", type=float, help="Significance threshold line")
@click.option("--abs-value/--raw-value", default=True, help="Plot absolute values")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2)
@click.option("--height", type=float, default=3.0)
def manhattan(input_file, output, stat, threshold, abs_value, title, width, height):
    """Plot a Manhattan plot of per-variant or per-window statistics.

    INPUT_FILE should be a TSV with columns: pos (or start) and the statistic.
    For multi-chromosome input, include a chr column.

    \b
    Examples:
      graphpop plot manhattan ihs_results.tsv --stat ihs --threshold 2.5 -o fig_ihs.png
      graphpop plot manhattan windows.tsv --stat fst --threshold 0.5 -o fig_fst_scan.png
      graphpop plot manhattan xpehh.tsv --stat xpehh --raw-value -o fig_xpehh.pdf
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv(input_file)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Extract positions and values. Rows without a usable position or
    # statistic are skipped instead of being plotted at 0.
    positions = []
    values = []
    chroms = []
    skipped = 0
    for r in rows:
        raw_pos = r.get("pos") or r.get("start")
        raw_val = r.get(stat)
        if raw_val in (None, ""):
            # Fall back to the unstandardised column when the main one is absent.
            raw_val = r.get(f"{stat}_unstd")
        try:
            # int(float(...)) also accepts positions written as "12345.0".
            pos = int(float(raw_pos))
            val = float(raw_val)
        except (TypeError, ValueError, OverflowError):
            skipped += 1
            continue
        if not np.isfinite(val):
            skipped += 1
            continue
        positions.append(pos)
        values.append(abs(val) if abs_value else val)
        chroms.append(r.get("chr") or r.get("chromosome") or "")

    if not values:
        click.echo(f"No numeric '{stat}' values found. Check --stat and the TSV columns.",
                   err=True)
        return
    if skipped:
        click.echo(f"Warning: skipped {skipped} row(s) without a numeric position/'{stat}'.",
                   err=True)

    fig, ax = plt.subplots(figsize=(width, height))

    # Order chromosome names by (length, text) so "chr2" sorts before "chr10".
    unique_chrs = sorted(set(chroms), key=lambda c: (len(c), c))
    if len(unique_chrs) > 1:
        # Alternate two palette colours between neighbouring chromosomes.
        chr_colors = {c: WONG_PALETTE[i % 2] for i, c in enumerate(unique_chrs)}

        # Find the (min, max) position of each chromosome in one pass instead
        # of rescanning every point once per chromosome.
        extents = {}
        for p, c in zip(positions, chroms):
            lo, hi = extents.get(c, (p, p))
            extents[c] = (min(lo, p), max(hi, p))

        # Lay chromosomes end to end, with a gap of 5% of each one's maximum.
        chr_offsets = {}
        offset = 0
        for c in unique_chrs:
            chr_offsets[c] = offset
            offset += extents[c][1] + extents[c][1] * 0.05

        adj_pos = [p + chr_offsets[c] for p, c in zip(positions, chroms)]
        colors = [chr_colors[c] for c in chroms]
        ax.scatter(adj_pos, values, c=colors, s=1, alpha=0.5, rasterized=True)

        # Chromosome labels go on the x axis as ticks at each chromosome's
        # midpoint. Text placed at a data-space y value would land inside the
        # data whenever the statistic is negative (--raw-value).
        tick_pos = [chr_offsets[c] + (extents[c][0] + extents[c][1]) / 2 for c in unique_chrs]
        tick_labels = [c.replace("chr", "").replace("Chr", "") for c in unique_chrs]
        ax.set_xticks(tick_pos)
        ax.set_xticklabels(tick_labels, fontsize=4)
        ax.set_xlabel("Chromosome")
    else:
        ax.scatter(positions, values, c=WONG_PALETTE[0], s=1, alpha=0.5, rasterized=True)
        ax.set_xlabel(f"Position on {unique_chrs[0] or 'chromosome'} (bp)")

    if threshold is not None:
        ax.axhline(threshold, color="#D55E00", linestyle="--", linewidth=0.8, alpha=0.7)

    ylabel = f"|{stat}|" if abs_value else stat
    ax.set_ylabel(ylabel)
    ax.set_title(title or f"Manhattan plot: {stat}", fontweight="bold")
    fig.tight_layout()
    _save_fig(fig, output)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ---------------------------------------------------------------------------
|
|
340
|
+
# pinpis
|
|
341
|
+
# ---------------------------------------------------------------------------
|
|
342
|
+
@plot.command("pinpis")
@click.argument("input_file", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2)
@click.option("--height", type=float, default=4.0)
def pinpis(input_file, output, title, width, height):
    """Plot piN/piS ratios across populations.

    INPUT_FILE should be a TSV with columns: population, piN_piS (or piN and piS
    columns to compute the ratio).

    \b
    Generate input:
      # For each population, compute piN and piS:
      graphpop diversity chr1 1 43270923 POP --consequence missense_variant -o piN.tsv
      graphpop diversity chr1 1 43270923 POP --consequence synonymous_variant -o piS.tsv
      # Combine into a single TSV with columns: population, piN_piS

    \b
    Examples:
      graphpop plot pinpis pinpis_ratios.tsv -o fig_pinpis.png
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv(input_file)
    if not rows:
        click.echo("No data found.", err=True)
        return

    def _first_value(row, names):
        """Return the first non-empty value among *names* in *row*, else None."""
        for name in names:
            value = row.get(name)
            if value not in (None, ""):
                return value
        return None

    # Use a precomputed ratio column when present; otherwise derive the ratio
    # from separate piN and piS columns. Rows that yield no finite ratio are
    # skipped rather than plotted as 0.
    pops = []
    ratios = []
    skipped = 0
    for r in rows:
        pop = r.get("population") or r.get("pop") or ""
        raw_ratio = _first_value(r, ("piN_piS", "pinpis", "ratio"))
        try:
            if raw_ratio is not None:
                ratio = float(raw_ratio)
            else:
                piN = float(_first_value(r, ("piN", "pi_N", "pi_missense")))
                piS = float(_first_value(r, ("piS", "pi_S", "pi_synonymous")))
                if piS <= 0:
                    # The ratio is undefined without synonymous diversity.
                    raise ValueError("piS must be positive")
                ratio = piN / piS
        except (TypeError, ValueError):
            skipped += 1
            continue
        if not np.isfinite(ratio):
            skipped += 1
            continue
        pops.append(pop)
        ratios.append(ratio)

    if not ratios:
        click.echo("No piN/piS ratios could be read. Check the TSV columns.", err=True)
        return
    if skipped:
        click.echo(f"Warning: skipped {skipped} row(s) without a usable piN/piS ratio.",
                   err=True)

    # Sort by ratio
    sorted_pairs = sorted(zip(pops, ratios), key=lambda x: x[1])
    pops = [p for p, _ in sorted_pairs]
    ratios = [r for _, r in sorted_pairs]

    fig, ax = plt.subplots(figsize=(width, height))
    # Bars above the reference line at 1 get the contrasting colour.
    colors = [WONG_PALETTE[0] if r <= 1.0 else WONG_PALETTE[3] for r in ratios]
    ax.barh(range(len(pops)), ratios, color=colors, height=0.7, edgecolor="none")

    ax.axvline(1.0, color="black", linestyle="--", linewidth=0.8, alpha=0.5,
               label="Neutral expectation (πN/πS = 1)")
    ax.set_yticks(range(len(pops)))
    ax.set_yticklabels(pops)
    ax.set_xlabel("πN/πS ratio")
    ax.set_title(title or "Cost of domestication: πN/πS across populations", fontweight="bold")

    # Value labels sit just past the bar ends; the gap scales with the data.
    pad = max(ratios) * 0.01
    for i, v in enumerate(ratios):
        ax.text(v + pad, i, f"{v:.3f}", va="center", fontsize=5)

    ax.legend(fontsize=5, loc="lower right")
    fig.tight_layout()
    _save_fig(fig, output)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
# ---------------------------------------------------------------------------
|
|
412
|
+
# sfs-plot
|
|
413
|
+
# ---------------------------------------------------------------------------
|
|
414
|
+
@plot.command("sfs-plot")
@click.argument("input_file", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file")
@click.option("--title", help="Figure title")
@click.option("--log-scale/--linear", default=False, help="Use log scale for y-axis")
@click.option("--width", type=float, default=5.0)
@click.option("--height", type=float, default=3.5)
def sfs_plot(input_file, output, title, log_scale, width, height):
    """Plot a site frequency spectrum.

    INPUT_FILE should be a TSV from graphpop sfs with an 'sfs' column
    containing comma-separated counts.

    \b
    Examples:
      graphpop sfs chr22 1 51304566 EUR -o sfs.tsv
      graphpop plot sfs-plot sfs.tsv -o fig_sfs.png
      graphpop plot sfs-plot sfs.tsv --log-scale -o fig_sfs_log.png
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv(input_file)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Only the first data row is plotted.
    if len(rows) > 1:
        click.echo(f"Warning: {len(rows)} rows found; plotting the first one only.", err=True)

    # Tolerate a bracketed list ("[1, 2, 3]") as well as bare "1,2,3".
    sfs_str = (rows[0].get("sfs") or "").strip().strip("[]")
    try:
        # float() rather than int() so non-integer counts are accepted too.
        counts = [float(x) for x in sfs_str.split(",") if x.strip()]
    except ValueError:
        click.echo("Error: the 'sfs' column must hold comma-separated numbers.", err=True)
        raise SystemExit(1)

    if not counts:
        # Without this guard an empty figure would be saved silently.
        click.echo("No 'sfs' values found. Check TSV format.", err=True)
        return

    fig, ax = plt.subplots(figsize=(width, height))
    x = range(len(counts))
    ax.bar(x, counts, color=WONG_PALETTE[0], edgecolor="none", width=0.8)

    if log_scale:
        ax.set_yscale("log")
        ax.set_ylabel("Count (log scale)")
    else:
        ax.set_ylabel("Count")

    ax.set_xlabel("Allele count")
    ax.set_title(title or "Site frequency spectrum", fontweight="bold")

    # Tick the first bin, the quartile bins and the last bin. The set removes
    # the duplicates that occur for very short spectra.
    if len(counts) > 2:
        n_bins = len(counts)
        ax.set_xticks(sorted({0, n_bins // 4, n_bins // 2, 3 * n_bins // 4, n_bins - 1}))

    fig.tight_layout()
    _save_fig(fig, output)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# ---------------------------------------------------------------------------
|
|
467
|
+
# roh-landscape
|
|
468
|
+
# ---------------------------------------------------------------------------
|
|
469
|
+
@plot.command("roh-landscape")
@click.argument("input_dir", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2)
@click.option("--height", type=float, default=4.0)
def roh_landscape(input_dir, output, title, width, height):
    """Plot per-population FROH distribution as violin/box plots.

    INPUT_DIR should contain per-population ROH TSV files from graphpop roh
    or graphpop run-all (e.g., results/roh/).

    \b
    Examples:
      graphpop plot roh-landscape results/roh/ -o fig_roh.png
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv_dir(input_dir)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Group FROH by population. Rows without a finite numeric 'froh' are
    # skipped rather than entered as 0, which would distort the distributions.
    pop_froh = {}
    skipped = 0
    for r in rows:
        try:
            froh = float(r.get("froh"))
        except (TypeError, ValueError):
            skipped += 1
            continue
        if not np.isfinite(froh):
            skipped += 1
            continue
        pop = r.get("population") or r.get("file_pop") or "unknown"
        pop_froh.setdefault(pop, []).append(froh)

    if not pop_froh:
        click.echo("No numeric 'froh' values found. Check TSV format.", err=True)
        return
    if skipped:
        click.echo(f"Warning: skipped {skipped} row(s) without a numeric 'froh' value.",
                   err=True)

    # Order populations by median FROH, lowest first.
    medians = {p: float(np.median(v)) for p, v in pop_froh.items()}
    pops = sorted(pop_froh, key=medians.get)
    data = [pop_froh[p] for p in pops]

    fig, ax = plt.subplots(figsize=(width, height))
    parts = ax.violinplot(data, positions=range(len(pops)), showmedians=True,
                          showextrema=False)

    for i, body in enumerate(parts["bodies"]):
        body.set_facecolor(WONG_PALETTE[i % len(WONG_PALETTE)])
        body.set_alpha(0.7)
        body.set_edgecolor("none")
    parts["cmedians"].set_color("black")
    parts["cmedians"].set_linewidth(1.0)

    # Add mean markers
    means = [np.mean(d) for d in data]
    ax.scatter(range(len(pops)), means, color="black", s=15, zorder=3, marker="D")

    ax.set_xticks(range(len(pops)))
    ax.set_xticklabels(pops, rotation=45, ha="right")
    ax.set_ylabel("FROH (fraction of genome in ROH)")
    ax.set_title(title or "Inbreeding landscape: FROH by population", fontweight="bold")

    fig.tight_layout()
    _save_fig(fig, output)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# ---------------------------------------------------------------------------
|
|
528
|
+
# gene-zoom
|
|
529
|
+
# ---------------------------------------------------------------------------
|
|
530
|
+
@plot.command("gene-zoom")
@click.argument("target")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("--pop2", help="Second population (for Fst track)")
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2, help="Figure width in inches")
@click.option("--height", type=float, default=6.0, help="Figure height in inches")
@pass_ctx
def gene_zoom(ctx, target, population, pop2, output, title, width, height):
    """Multi-track regional plot for a gene or genomic region.

    TARGET is either a gene name (e.g., KCNE1) or a region in chr:start-end
    format (e.g., chr6:9000000-9600000). The command resolves gene names to
    coordinates via the Gene node in the graph.

    \b
    Tracks (top to bottom):
      1. Fst (from GenomicWindow or per-variant)
      2. |iHS| (from Variant properties)
      3. Gene model (from HAS_CONSEQUENCE edges)

    \b
    Examples:
      graphpop plot gene-zoom KCNE1 --pop EUR -o fig_kcne1.png
      graphpop plot gene-zoom chr6:9000000-9600000 --pop GJ-tmp -o fig_hd1.png
      graphpop plot gene-zoom GW5 --pop GJ-tmp --pop2 GJ-trop -o fig_gw5.pdf
    """
    _check_matplotlib()
    _apply_style()

    # NOTE(review): --pop2 is accepted but none of the queries below use it;
    # the Fst track is filtered by --pop only. Confirm the intended schema
    # before wiring it in.

    # --- Resolve target to chr, start, end ---
    region_match = re.match(r'^(chr\w+|Chr\w+):(\d+)-(\d+)$', target)
    if region_match:
        chrom = region_match.group(1)
        reg_start = int(region_match.group(2))
        reg_end = int(region_match.group(3))
        if reg_end < reg_start:
            # An inverted interval would match nothing and flip the x axis.
            click.echo(f"Invalid region '{target}': end is smaller than start.", err=True)
            raise SystemExit(1)
        region_label = f"{chrom}:{reg_start}-{reg_end}"
    else:
        # Resolve gene name
        recs = ctx.run(
            "MATCH (g:Gene) "
            "WHERE g.symbol = $target OR g.geneId = $target "
            "RETURN g.chr AS chr, g.start AS start, g.end AS end, "
            "g.symbol AS symbol LIMIT 1",
            {"target": target},
        )
        if not recs:
            click.echo(f"Gene '{target}' not found in the graph.", err=True)
            raise SystemExit(1)
        gene = recs[0]
        chrom = gene["chr"]
        gene_start = gene["start"] or 0
        gene_end = gene["end"] or 0
        # Pad 20% of the gene length on each side for context, at least 10 kb.
        pad = max((gene_end - gene_start) * 0.2, 10000)
        reg_start = max(0, int(gene_start - pad))
        reg_end = int(gene_end + pad)
        region_label = f"{gene['symbol']} ({chrom}:{reg_start}-{reg_end})"

    click.echo(f"Region: {region_label}", err=True)

    # --- Query Fst from GenomicWindow ---
    fst_pos = []
    fst_vals = []
    fst_query = (
        "MATCH (w:GenomicWindow) "
        "WHERE w.chr = $chrom AND w.population = $population "
        "AND w.start >= $reg_start AND w.end <= $reg_end "
        "RETURN w.start AS start, w.end AS end, "
        "w.fst AS fst "
        "ORDER BY w.start"
    )
    region_params = {
        "chrom": chrom, "population": population,
        "reg_start": reg_start, "reg_end": reg_end,
    }
    try:
        fst_recs = ctx.run(fst_query, region_params)
        for r in fst_recs:
            # Each window is plotted at its midpoint.
            mid = ((r["start"] or 0) + (r["end"] or 0)) / 2
            val = r.get("fst")
            if val is not None:
                fst_pos.append(mid)
                fst_vals.append(float(val))
    except SystemExit:
        # Presumably ctx.run reports query failures via SystemExit (TODO
        # confirm); the track is then left empty instead of aborting.
        click.echo("Warning: no GenomicWindow Fst data.", err=True)

    # --- Query |iHS| from Variant nodes ---
    # The property name embeds the population, which may contain characters
    # such as '-' (e.g. "GJ-tmp"). Unquoted, `v.ihs_GJ-tmp` is not a single
    # property reference and the query fails, so backtick-quote the name and
    # double any embedded backticks. This also stops the user-supplied value
    # from altering the query.
    ihs_prop = "`" + f"ihs_{population}".replace("`", "``") + "`"
    ihs_pos = []
    ihs_vals = []
    ihs_query = (
        f"MATCH (v:Variant) "
        f"WHERE v.chr = $chrom AND v.pos >= $reg_start AND v.pos <= $reg_end "
        f"AND v.{ihs_prop} IS NOT NULL "
        f"RETURN v.pos AS pos, v.{ihs_prop} AS ihs "
        f"ORDER BY v.pos"
    )
    try:
        ihs_recs = ctx.run(ihs_query, region_params)
        for r in ihs_recs:
            ihs_pos.append(r["pos"])
            ihs_vals.append(abs(float(r["ihs"])))
    except SystemExit:
        click.echo("Warning: no iHS data.", err=True)

    # --- Query gene models ---
    gene_query = (
        "MATCH (v:Variant)-[hc:HAS_CONSEQUENCE]->(g:Gene) "
        "WHERE v.chr = $chrom AND v.pos >= $reg_start AND v.pos <= $reg_end "
        "RETURN DISTINCT g.symbol AS gene, g.start AS start, g.end AS end "
        "ORDER BY g.start"
    )
    gene_recs = ctx.run(gene_query, region_params)

    # --- Build figure ---
    fig, axes = plt.subplots(3, 1, figsize=(width, height), sharex=True,
                             gridspec_kw={"height_ratios": [2, 2, 1]})

    # Track 1: Fst
    ax_fst = axes[0]
    if fst_pos:
        ax_fst.fill_between(fst_pos, fst_vals, alpha=0.3, color=WONG_PALETTE[0])
        ax_fst.plot(fst_pos, fst_vals, color=WONG_PALETTE[0], linewidth=0.8)
    ax_fst.set_ylabel("Fst")
    ax_fst.set_title(title or f"Gene zoom: {region_label}", fontweight="bold")

    # Track 2: |iHS|
    ax_ihs = axes[1]
    if ihs_pos:
        ax_ihs.scatter(ihs_pos, ihs_vals, s=2, color=WONG_PALETTE[3],
                       alpha=0.6, rasterized=True)
        # Threshold line at 2.0
        ax_ihs.axhline(2.0, color="grey", linestyle="--", linewidth=0.6, alpha=0.5)
    ax_ihs.set_ylabel("|iHS|")

    # Track 3: Gene models
    ax_gene = axes[2]
    if gene_recs:
        y_pos = 0.5
        for i, model in enumerate(gene_recs):
            # Genes without coordinates fall back to the full region span.
            g_start = model.get("start") or reg_start
            g_end = model.get("end") or reg_end
            color = WONG_PALETTE[i % len(WONG_PALETTE)]
            ax_gene.barh(y_pos, g_end - g_start, left=g_start, height=0.3,
                         color=color, edgecolor="black", linewidth=0.3)
            mid = (g_start + g_end) / 2
            ax_gene.text(mid, y_pos + 0.25, model.get("gene") or "",
                         ha="center", va="bottom", fontsize=5, style="italic")
            # Stack each gene on its own row so overlapping genes stay visible.
            y_pos += 0.5
    ax_gene.set_ylabel("Genes")
    ax_gene.set_yticks([])
    ax_gene.set_xlabel(f"Position on {chrom} (bp)")
    ax_gene.set_xlim(reg_start, reg_end)

    # Add vertical lines at peak iHS positions (within 10% of the maximum).
    if ihs_vals:
        top_ihs = max(ihs_vals)
        peak_thresh = top_ihs * 0.9 if top_ihs > 0 else 999
        for pos, val in zip(ihs_pos, ihs_vals):
            if val >= peak_thresh:
                for ax in axes:
                    ax.axvline(pos, color=WONG_PALETTE[3], linestyle=":",
                               linewidth=0.5, alpha=0.4)

    fig.tight_layout()
    _save_fig(fig, output)
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
# ---------------------------------------------------------------------------
|
|
699
|
+
# pop-tree
|
|
700
|
+
# ---------------------------------------------------------------------------
|
|
701
|
+
@plot.command("pop-tree")
@click.argument("input_dir", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--method", default="upgma", type=click.Choice(["upgma", "nj"]),
              help="Tree method: upgma (default) or nj")
@click.option("--stat", default="fst_wc", help="Fst statistic column (fst_wc or fst_hudson)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2, help="Figure width in inches")
@click.option("--height", type=float, default=5.0, help="Figure height in inches")
def pop_tree(input_dir, output, method, stat, title, width, height):
    """Build a UPGMA or neighbor-joining tree from pairwise Fst data.

    INPUT_DIR should contain pairwise divergence TSV files (same format as
    fst-heatmap input, from graphpop divergence or graphpop run-all).

    Uses scipy.cluster.hierarchy for UPGMA clustering. Neighbor-joining is
    approximated via the 'weighted' linkage method.

    Rows whose --stat column is missing or not a finite number are ignored.

    \b
    Examples:
      graphpop plot pop-tree results/divergence/ -o fig_tree.png
      graphpop plot pop-tree results/divergence/ --method nj --stat fst_hudson -o fig_nj.png
    """
    _check_matplotlib()
    _apply_style()

    if not HAS_SCIPY:
        click.echo(
            "Error: scipy is required for pop-tree.\n"
            "Install with: pip install scipy",
            err=True,
        )
        raise SystemExit(1)

    rows = _read_tsv_dir(input_dir)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Collect every usable value per population pair.  The key is the sorted
    # pair so that (A, B) and (B, A) rows are averaged together instead of the
    # later one overwriting the earlier one in the symmetric matrix.
    pair_vals = {}
    for r in rows:
        p1 = r.get("pop1", "")
        p2 = r.get("pop2", "")
        # A self-pair carries no distance information between populations.
        if not p1 or not p2 or p1 == p2:
            continue
        try:
            val = float(r.get(stat))
        except (TypeError, ValueError):
            # Missing column, empty cell or non-numeric text: skip the row
            # rather than silently counting it as Fst = 0.
            continue
        if not np.isfinite(val):
            continue
        pair_vals.setdefault(tuple(sorted((p1, p2))), []).append(val)

    if not pair_vals:
        click.echo(f"No pairwise data found. Check TSV format and --stat={stat}.", err=True)
        return

    # Every key holds two distinct populations, so n >= 2 from here on.
    all_pops = sorted({p for pair in pair_vals for p in pair})
    n = len(all_pops)
    pop_idx = {p: i for i, p in enumerate(all_pops)}

    expected_pairs = n * (n - 1) // 2
    if len(pair_vals) < expected_pairs:
        click.echo(
            f"Warning: {expected_pairs - len(pair_vals)} population pair(s) have no "
            f"{stat} values and are treated as distance 0.",
            err=True,
        )

    # Symmetric distance matrix of mean Fst per pair.
    matrix = np.zeros((n, n))
    for (p1, p2), vals in pair_vals.items():
        i, j = pop_idx[p1], pop_idx[p2]
        mean_val = max(0.0, float(np.mean(vals)))  # Clamp negative Fst to 0
        matrix[i, j] = mean_val
        matrix[j, i] = mean_val

    # Convert to condensed distance form for scipy
    dist_condensed = squareform(matrix, checks=False)

    # "average" linkage is UPGMA; "weighted" (WPGMA) stands in for NJ.
    linkage_method = "average" if method == "upgma" else "weighted"
    Z = linkage(dist_condensed, method=linkage_method)

    # Plot dendrogram
    fig, ax = plt.subplots(figsize=(width, height))
    dendrogram(
        Z,
        labels=all_pops,
        ax=ax,
        leaf_rotation=0,
        orientation="left",
        leaf_font_size=7,
        color_threshold=0,
        above_threshold_color=WONG_PALETTE[0],
    )

    ax.set_xlabel(f"Genetic distance ({stat.replace('_', ' ')})")
    tree_label = "UPGMA" if method == "upgma" else "Neighbor-joining"
    ax.set_title(title or f"Population tree ({tree_label}, {stat})", fontweight="bold")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    fig.tight_layout()
    _save_fig(fig, output)
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
# ---------------------------------------------------------------------------
|
|
802
|
+
# chromosome — multi-track chromosome view
|
|
803
|
+
# ---------------------------------------------------------------------------
|
|
804
|
+
@plot.command("chromosome")
@click.option("--chr", "chrom", required=True, help="Chromosome name (e.g., chr22)")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("--stats", default="fst,ihs",
              help="Comma-separated statistics to plot (e.g., fst,ihs,pi,tajima_d,xpehh)")
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2, help="Figure width in inches")
@click.option("--height", type=float, default=2.0,
              help="Height per track in inches (total = n_tracks * height)")
@pass_ctx
def chromosome(ctx, chrom, population, stats, output, title, width, height):
    """Multi-track chromosome view of population statistics.

    Draws stacked tracks for each requested statistic along the chromosome.
    Window-level stats (fst, pi, theta_w, tajima_d) are queried from
    GenomicWindow nodes.
    Variant-level stats (ihs, xpehh) are queried from Variant nodes.

    \b
    Examples:
      graphpop plot chromosome --chr chr22 --pop EUR --stats fst,ihs,pi -o fig_chr22.png
      graphpop plot chromosome --chr Chr01 --pop GJ-tmp --stats fst,pi -o fig_chr01.png
    """
    _check_matplotlib()
    _apply_style()

    stat_list = [s.strip() for s in stats.split(",") if s.strip()]
    if not stat_list:
        click.echo("No statistics specified.", err=True)
        raise SystemExit(1)

    window_stats = {"fst", "pi", "theta_w", "tajima_d"}
    variant_stats = {"ihs", "xpehh"}

    n_tracks = len(stat_list)
    fig, axes = plt.subplots(n_tracks, 1, figsize=(width, height * n_tracks),
                             sharex=True, squeeze=False)
    axes = axes.flatten()

    def _placeholder(target_ax, message, text_color):
        # Centered note for a track that has nothing to draw.
        target_ax.text(0.5, 0.5, message, transform=target_ax.transAxes,
                       ha="center", va="center", fontsize=7, color=text_color)

    for idx, stat in enumerate(stat_list):
        ax = axes[idx]
        color = WONG_PALETTE[idx % len(WONG_PALETTE)]
        # Classify and query with the same lower-cased name; the original
        # spelling is kept only for display.
        stat_key = stat.lower()

        if stat_key in window_stats:
            # stat_key comes from the fixed whitelist above, so it is safe to
            # splice into the query as a property name.
            query = (
                "MATCH (w:GenomicWindow) "
                "WHERE w.chr = $chrom AND w.population = $population "
                f"AND w.{stat_key} IS NOT NULL "
                f"RETURN w.start AS start, w.end AS end, w.{stat_key} AS value "
                "ORDER BY w.start"
            )
            recs = ctx.run(query, {"chrom": chrom, "population": population})
            if recs:
                # Plot each window at its midpoint.
                positions = [((r["start"] or 0) + (r["end"] or 0)) / 2 for r in recs]
                values = [float(r["value"]) for r in recs]
                ax.fill_between(positions, values, alpha=0.3, color=color)
                ax.plot(positions, values, color=color, linewidth=0.6)
            else:
                _placeholder(ax, "No data", "grey")

        elif stat_key in variant_stats:
            # The property name embeds the user-supplied population, which may
            # contain characters such as "-" that are not valid in a bare
            # Cypher identifier.  Backtick-quote it, doubling any embedded
            # backticks, so it is always read as a single property name.
            prop_name = f"{stat_key}_{population}".replace("`", "``")
            prop = f"`{prop_name}`"
            query = (
                "MATCH (v:Variant) "
                f"WHERE v.chr = $chrom AND v.{prop} IS NOT NULL "
                f"RETURN v.pos AS pos, v.{prop} AS value "
                "ORDER BY v.pos"
            )
            recs = ctx.run(query, {"chrom": chrom})
            if recs:
                positions = [r["pos"] for r in recs]
                # Variant-level scores are shown as magnitudes.
                values = [abs(float(r["value"])) for r in recs]
                ax.scatter(positions, values, s=1, alpha=0.4, color=color,
                           rasterized=True)
            else:
                _placeholder(ax, "No data", "grey")
        else:
            _placeholder(ax, f"Unknown stat: {stat}", "red")

        label = f"|{stat}|" if stat_key in variant_stats else stat
        ax.set_ylabel(label, fontsize=7)

        # Alternating background
        if idx % 2 == 1:
            ax.set_facecolor("#f8f8f8")

    axes[-1].set_xlabel(f"Position on {chrom} (bp)")
    axes[0].set_title(
        title or f"Chromosome view: {chrom} ({population})", fontweight="bold"
    )

    fig.tight_layout()
    _save_fig(fig, output)
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
# ---------------------------------------------------------------------------
|
|
906
|
+
# pca-scatter
|
|
907
|
+
# ---------------------------------------------------------------------------
|
|
908
|
+
@plot.command("pca-scatter")
@click.argument("input_file", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--color-by", "color_by", default="population",
              help="Column name to color points by (default: population)")
@click.option("--pc", "pc_axes", default="1,2",
              help="Which PCs to plot, comma-separated (default: 1,2)")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=5.0, help="Figure width in inches")
@click.option("--height", type=float, default=5.0, help="Figure height in inches")
def pca_scatter(input_file, output, color_by, pc_axes, title, width, height):
    """PCA scatter plot from a TSV with PC columns.

    INPUT_FILE should be a TSV with columns like pc1, pc2 (or PC1, PC2) and
    a grouping column (default: population) for coloring.

    Rows whose PC values are missing or non-numeric are skipped.

    \b
    Examples:
      graphpop plot pca-scatter pca_results.tsv -o fig_pca.png
      graphpop plot pca-scatter pca.tsv --color-by superpopulation --pc 1,3 -o pca13.png
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv(input_file)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # Parse PC axes; a wrong item count surfaces as ValueError on unpacking.
    try:
        pc_a, pc_b = [int(x.strip()) for x in pc_axes.split(",")]
    except ValueError:
        click.echo("Invalid --pc format. Use e.g. '1,2'.", err=True)
        raise SystemExit(1)

    # Find PC column names (case-insensitive)
    sample_keys = list(rows[0].keys())
    pc_col_a = _find_pc_col(sample_keys, pc_a)
    pc_col_b = _find_pc_col(sample_keys, pc_b)
    if not pc_col_a or not pc_col_b:
        click.echo(
            f"Could not find PC{pc_a} and PC{pc_b} columns in: {sample_keys}",
            err=True,
        )
        raise SystemExit(1)

    # Group coordinates by the colour column.  Rows with unusable PC values
    # are skipped (as the heatmap command tolerates non-numeric cells) instead
    # of aborting the whole plot with a traceback.
    groups = {}
    skipped = 0
    for r in rows:
        try:
            x = float(r[pc_col_a])
            y = float(r[pc_col_b])
        except (KeyError, TypeError, ValueError):
            skipped += 1
            continue
        # Missing or empty labels share one bucket so they still get a legend
        # entry and never mix None with str when sorting.
        group = r.get(color_by) or "unknown"
        xs, ys = groups.setdefault(group, ([], []))
        xs.append(x)
        ys.append(y)

    if skipped:
        click.echo(
            f"Warning: skipped {skipped} row(s) with missing or non-numeric PC values.",
            err=True,
        )
    if not groups:
        click.echo(
            f"No numeric values found in columns {pc_col_a} and {pc_col_b}.",
            err=True,
        )
        return

    fig, ax = plt.subplots(figsize=(width, height))
    ordered = sorted(groups.items(), key=lambda item: item[0])
    for i, (group_name, (xs, ys)) in enumerate(ordered):
        color = WONG_PALETTE[i % len(WONG_PALETTE)]
        ax.scatter(xs, ys, s=8, alpha=0.7, color=color, label=group_name,
                   edgecolors="none")

    ax.set_xlabel(f"PC{pc_a}")
    ax.set_ylabel(f"PC{pc_b}")
    ax.set_title(title or f"PCA: PC{pc_a} vs PC{pc_b}", fontweight="bold")

    # Legend outside if many groups
    if len(groups) > 8:
        ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=5,
                  markerscale=1.5, frameon=False)
    else:
        ax.legend(fontsize=6, markerscale=1.5, frameon=False)

    fig.tight_layout()
    _save_fig(fig, output)
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def _find_pc_col(keys: list[str], pc_num: int) -> str | None:
|
|
987
|
+
"""Find the column name for a given PC number (case-insensitive)."""
|
|
988
|
+
candidates = [f"pc{pc_num}", f"PC{pc_num}", f"Pc{pc_num}",
|
|
989
|
+
f"pc_{pc_num}", f"PC_{pc_num}"]
|
|
990
|
+
for c in candidates:
|
|
991
|
+
if c in keys:
|
|
992
|
+
return c
|
|
993
|
+
# Fallback: partial match
|
|
994
|
+
for k in keys:
|
|
995
|
+
if k.lower().replace("_", "") == f"pc{pc_num}":
|
|
996
|
+
return k
|
|
997
|
+
return None
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
# ---------------------------------------------------------------------------
|
|
1001
|
+
# heatmap — general-purpose heatmap from a matrix TSV
|
|
1002
|
+
# ---------------------------------------------------------------------------
|
|
1003
|
+
@plot.command("heatmap")
@click.argument("input_file", type=click.Path(exists=True))
@click.option("-o", "--output", required=True, help="Output figure file (PNG/PDF)")
@click.option("--cmap", default="viridis", help="Matplotlib colormap (default: viridis)")
@click.option("--annotate", is_flag=True, help="Add numeric values to cells")
@click.option("--title", help="Figure title")
@click.option("--width", type=float, default=7.2, help="Figure width in inches")
@click.option("--height", type=float, default=6.0, help="Figure height in inches")
def heatmap(input_file, output, cmap, annotate, title, width, height):
    """General-purpose heatmap from a matrix TSV.

    INPUT_FILE should be a TSV where the first column contains row labels
    and the header row contains column labels. All other cells are numeric.

    \b
    Examples:
      graphpop plot heatmap matrix.tsv -o fig_heatmap.png --cmap YlOrRd --annotate
      graphpop plot heatmap fst_matrix.tsv -o fig_fst.png --title "Pairwise Fst"
    """
    _check_matplotlib()
    _apply_style()

    rows = _read_tsv(input_file)
    if not rows:
        click.echo("No data found.", err=True)
        return

    # First column = row labels, rest = numeric matrix
    col_keys = list(rows[0].keys())
    label_col = col_keys[0]
    value_cols = col_keys[1:]

    row_labels = [r[label_col] for r in rows]
    matrix = np.zeros((len(rows), len(value_cols)))
    for i, r in enumerate(rows):
        for j, c in enumerate(value_cols):
            try:
                matrix[i, j] = float(r[c])
            except (ValueError, TypeError):
                # Non-numeric or missing cell: leave it blank in the image.
                matrix[i, j] = np.nan

    fig, ax = plt.subplots(figsize=(width, height))
    im = ax.imshow(matrix, cmap=cmap, aspect="auto")

    ax.set_xticks(range(len(value_cols)))
    ax.set_yticks(range(len(row_labels)))
    ax.set_xticklabels(value_cols, rotation=45, ha="right")
    ax.set_yticklabels(row_labels)

    if annotate:
        for (i, j), val in np.ndenumerate(matrix):
            if np.isnan(val):
                continue
            # Choose the label colour from the cell's rendered colour rather
            # than from the raw value: colormaps differ in which end is dark
            # (viridis is brightest at the top), and data may be negative.
            red, green, blue, _alpha = im.cmap(im.norm(val))
            luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
            text_color = "white" if luminance < 0.5 else "black"
            ax.text(j, i, f"{val:.3g}", ha="center", va="center",
                    fontsize=5, color=text_color)

    fig.colorbar(im, ax=ax, shrink=0.8)
    ax.set_title(title or "Heatmap", fontweight="bold")
    fig.tight_layout()
    _save_fig(fig, output)
|