graphpop-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphpop_cli/__init__.py +2 -0
- graphpop_cli/cli.py +161 -0
- graphpop_cli/commands/__init__.py +1 -0
- graphpop_cli/commands/aggregate.py +206 -0
- graphpop_cli/commands/batch.py +155 -0
- graphpop_cli/commands/compare.py +118 -0
- graphpop_cli/commands/config_cmd.py +117 -0
- graphpop_cli/commands/converge.py +156 -0
- graphpop_cli/commands/db.py +188 -0
- graphpop_cli/commands/divergence.py +37 -0
- graphpop_cli/commands/diversity.py +36 -0
- graphpop_cli/commands/dump.py +210 -0
- graphpop_cli/commands/export_bed.py +170 -0
- graphpop_cli/commands/export_windows.py +91 -0
- graphpop_cli/commands/extract.py +271 -0
- graphpop_cli/commands/filter_results.py +165 -0
- graphpop_cli/commands/garud_h.py +30 -0
- graphpop_cli/commands/genome_scan.py +41 -0
- graphpop_cli/commands/ihs.py +29 -0
- graphpop_cli/commands/import_data.py +266 -0
- graphpop_cli/commands/inventory.py +160 -0
- graphpop_cli/commands/joint_sfs.py +38 -0
- graphpop_cli/commands/ld.py +35 -0
- graphpop_cli/commands/lookup.py +207 -0
- graphpop_cli/commands/neighbors.py +175 -0
- graphpop_cli/commands/nsl.py +29 -0
- graphpop_cli/commands/plot.py +1066 -0
- graphpop_cli/commands/pop_summary.py +30 -0
- graphpop_cli/commands/query.py +15 -0
- graphpop_cli/commands/rank_genes.py +177 -0
- graphpop_cli/commands/report.py +264 -0
- graphpop_cli/commands/roh.py +30 -0
- graphpop_cli/commands/run_all.py +276 -0
- graphpop_cli/commands/server.py +98 -0
- graphpop_cli/commands/setup.py +299 -0
- graphpop_cli/commands/sfs.py +38 -0
- graphpop_cli/commands/validate.py +167 -0
- graphpop_cli/commands/xpehh.py +31 -0
- graphpop_cli/config.py +57 -0
- graphpop_cli/connection.py +52 -0
- graphpop_cli/formatters.py +81 -0
- graphpop_cli-0.1.0.dist-info/METADATA +73 -0
- graphpop_cli-0.1.0.dist-info/RECORD +46 -0
- graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
- graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
- graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""graphpop extract — extract variants, samples, and genotypes from the graph."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from ..cli import pass_ctx
|
|
7
|
+
from ..formatters import format_output
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Container group only: the real work lives in the subcommands registered on
# it below. The docstring doubles as the --help text, so it is kept verbatim.
@click.group()
def extract():
    """Extract data from the graph: variants, samples, or genotypes.

    \b
    Subcommands:
      variants   Query Variant nodes with flexible filters
      samples    Query Sample nodes for a population
      genotypes  Extract sample x variant dosage matrix for a region

    \b
    Examples:
      graphpop extract variants --chr chr22 --pop EUR --consequence missense_variant -o variants.tsv
      graphpop extract samples --pop EUR -o samples.tsv
      graphpop extract genotypes --chr chr22 --start 16000000 --end 17000000 --pop EUR -o geno.tsv
    """
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _quote_identifier(name):
    """Return ``name`` as a backtick-quoted Cypher identifier.

    Embedded backticks are doubled so the value is always read as one
    property name or column alias. This keeps names such as ``af_GJ-tmp``
    valid and stops user-supplied text from escaping the identifier.
    """
    return "`" + name.replace("`", "``") + "`"


@extract.command("variants")
@click.option("--chr", "chromosome", help="Chromosome filter")
@click.option("--start", type=int, help="Region start position")
@click.option("--end", type=int, help="Region end position")
@click.option("--pop", "population", help="Population name (for AF lookup)")
@click.option("--min-af", type=float, help="Minimum allele frequency")
@click.option("--max-af", type=float, help="Maximum allele frequency")
@click.option("--consequence", help="VEP consequence type (e.g., missense_variant)")
@click.option("--pathway", help="Pathway name (substring match)")
@click.option("--gene", help="Gene symbol or ID")
@click.option("--fields", default="variantId,pos,ref,alt,af",
              help="Comma-separated fields to return (default: variantId,pos,ref,alt,af)")
@click.option("--limit", type=int, default=10000, help="Maximum rows (default: 10000)")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def extract_variants(ctx, chromosome, start, end, population, min_af, max_af,
                     consequence, pathway, gene, fields, limit, output_path, fmt):
    """Query Variant nodes with optional filters.

    Builds Cypher dynamically based on provided filters. Use --fields to
    select which variant properties to return.

    \b
    Examples:
      graphpop extract variants --chr chr22 --pop EUR --consequence missense_variant -o variants.tsv
      graphpop extract variants --chr chr22 --start 16000000 --end 17000000 --fields pos,ref,alt,af,fst,ihs -o region.tsv
      graphpop extract variants --gene KCNE1 --pop EUR -o kcne1_variants.tsv
    """
    # Blank entries ("pos,,ref" or a trailing comma) would render as
    # "v. AS " in the RETURN clause, so they are dropped up front.
    field_list = [name.strip() for name in fields.split(",") if name.strip()]
    if not field_list:
        raise click.UsageError("--fields must name at least one field.")

    params = {"limit": limit}
    where_parts = []

    # Build the MATCH pattern in one place. Gene and pathway filters both
    # travel through the Gene node; whenever that node is part of the pattern
    # the consequence is read from the HAS_CONSEQUENCE relationship, exactly as
    # the gene+consequence case always did. Previously --consequence combined
    # with --pathway (and no --gene) appended a second HAS_CONSEQUENCE hop
    # after the first one, producing a path that started from the wrong node.
    via_gene = bool(gene or pathway)
    consequence_expr = None
    if via_gene:
        if consequence:
            match_clause = "MATCH (v:Variant)-[hc_rel:HAS_CONSEQUENCE]->(g:Gene)"
            consequence_expr = "hc_rel.consequence"
        else:
            match_clause = "MATCH (v:Variant)-[:HAS_CONSEQUENCE]->(g:Gene)"
        if pathway:
            match_clause += "-[:IN_PATHWAY]->(pw:Pathway)"
    elif consequence:
        # NOTE(review): with only --consequence the value is read from the
        # target node, not the relationship — confirm against the schema.
        match_clause = "MATCH (v:Variant)-[:HAS_CONSEQUENCE]->(hc)"
        consequence_expr = "hc.consequence"
    else:
        match_clause = "MATCH (v:Variant)"

    if consequence:
        where_parts.append(f"{consequence_expr} = $consequence")
        params["consequence"] = consequence
    if gene:
        where_parts.append("(g.symbol = $gene OR g.geneId = $gene)")
        params["gene"] = gene
    if pathway:
        where_parts.append("pw.name CONTAINS $pathway")
        params["pathway"] = pathway

    if chromosome:
        where_parts.append("v.chr = $chromosome")
        params["chromosome"] = chromosome
    if start is not None:
        where_parts.append("v.pos >= $start")
        params["start"] = start
    if end is not None:
        where_parts.append("v.pos <= $end")
        params["end"] = end

    # AF filtering: the population's index in pop_ids selects the entry of af.
    if min_af is not None or max_af is not None:
        if population:
            params["population"] = population
            if min_af is not None:
                where_parts.append(
                    "ANY(i IN range(0, size(v.pop_ids)-1) "
                    "WHERE v.pop_ids[i] = $population AND v.af[i] >= $min_af)"
                )
                params["min_af"] = min_af
            if max_af is not None:
                where_parts.append(
                    "ANY(i IN range(0, size(v.pop_ids)-1) "
                    "WHERE v.pop_ids[i] = $population AND v.af[i] <= $max_af)"
                )
                params["max_af"] = max_af
        else:
            # Without a population there is no af entry to compare against;
            # say so instead of silently returning unfiltered rows.
            click.echo("Warning: --min-af/--max-af need --pop to select a "
                       "population; allele-frequency filter ignored.", err=True)

    # Build RETURN clause from requested fields.
    return_cols = []
    for name in field_list:
        if name == "af" and population:
            params["population"] = population
            # The alias embeds the population name, so it must be quoted.
            return_cols.append(
                "[i IN range(0, size(v.pop_ids)-1) "
                "WHERE v.pop_ids[i] = $population | v.af[i]][0] AS "
                + _quote_identifier("af_" + population)
            )
        elif name in ("gene", "gene_symbol") and via_gene:
            return_cols.append("g.symbol AS gene")
        elif name == "consequence" and consequence:
            return_cols.append(f"{consequence_expr} AS consequence")
        else:
            # Arbitrary property names come straight from the command line.
            quoted = _quote_identifier(name)
            return_cols.append(f"v.{quoted} AS {quoted}")
    # A repeated field would yield two identically named result columns.
    return_cols = list(dict.fromkeys(return_cols))

    where_str = " AND ".join(where_parts) if where_parts else "true"
    # NOTE(review): ORDER BY v.pos follows RETURN DISTINCT; confirm this still
    # works on the target server when --fields omits pos.
    cypher = (
        f"{match_clause} "
        f"WHERE {where_str} "
        f"RETURN DISTINCT {', '.join(return_cols)} "
        f"ORDER BY v.pos LIMIT $limit"
    )

    records = ctx.run(cypher, params)
    if not records:
        click.echo("No variants found with given filters.", err=True)
        return

    click.echo(f"Found {len(records)} variants.", err=True)
    format_output(records, output_path, fmt, "extract variants",
                  {"chr": chromosome, "start": start, "end": end,
                   "pop": population, "consequence": consequence,
                   "pathway": pathway, "gene": gene})
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@extract.command("samples")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def extract_samples(ctx, population, output_path, fmt):
    """Query Sample nodes for a population.

    Returns sampleId, population, and packed_index for each sample. If
    population-level summary stats (e.g., FROH) are available, they are
    included.

    \b
    Examples:
      graphpop extract samples --pop EUR -o samples.tsv
      graphpop extract samples --pop GJ-tmp --format json
    """
    # The population name is the only input and is always sent as a query
    # parameter. The OPTIONAL MATCH keeps sample rows even when no matching
    # Population node exists; the pop_* columns are then null.
    query_params = {"population": population}
    sample_query = """
        MATCH (s:Sample)
        WHERE s.population = $population
        OPTIONAL MATCH (p:Population {name: $population})
        RETURN s.sampleId AS sampleId,
               s.population AS population,
               s.packed_index AS packed_index,
               s.froh AS froh,
               p.n_samples AS pop_n_samples,
               p.mean_froh AS pop_mean_froh
        ORDER BY s.packed_index
    """

    rows = ctx.run(sample_query, query_params)
    if rows:
        click.echo(f"Found {len(rows)} samples for {population}.", err=True)
        format_output(rows, output_path, fmt, "extract samples",
                      {"population": population})
    else:
        click.echo(f"No samples found for population '{population}'.", err=True)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@extract.command("genotypes")
@click.option("--chr", "chromosome", required=True, help="Chromosome")
@click.option("--start", type=int, required=True, help="Region start position")
@click.option("--end", type=int, required=True, help="Region end position")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("--format-gt", "gt_format", default="dosage",
              type=click.Choice(["dosage", "gt", "raw"]),
              help="Output format: dosage (0/1/2), gt (0/0, 0/1, 1/1), or raw (hex gt_packed)")
@click.option("--limit", type=int, default=1000, help="Maximum variants (default: 1000)")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def extract_genotypes(ctx, chromosome, start, end, population, gt_format,
                      limit, output_path, fmt):
    """Extract genotype data for a region and population.

    Queries CARRIES edges between Sample and Variant nodes to build a
    sample x variant matrix. For large regions, consider using --limit
    to cap the number of variants.

    Note: gt_packed decoding requires bit operations. With --format-gt raw,
    the raw gt_packed byte array is returned as a hex string per variant.
    With dosage or gt mode, individual CARRIES edges are queried instead.

    \b
    Examples:
      graphpop extract genotypes --chr chr22 --start 16000000 --end 17000000 --pop EUR -o geno.tsv
      graphpop extract genotypes --chr chr22 --start 16000000 --end 17000000 --pop EUR --format-gt raw -o geno_raw.tsv
    """
    params = {
        "chromosome": chromosome,
        "start": start,
        "end": end,
        "population": population,
        "limit": limit,
    }

    # Each mode contributes a query plus the two status messages; running the
    # query and reporting the result is shared below.
    if gt_format == "raw":
        # One row per variant: v.gt_packed is returned as stored, alongside
        # the allele frequency entry for the requested population.
        cypher = (
            "MATCH (v:Variant) "
            "WHERE v.chr = $chromosome AND v.pos >= $start AND v.pos <= $end "
            "RETURN v.variantId AS variantId, v.pos AS pos, v.ref AS ref, v.alt AS alt, "
            "v.gt_packed AS gt_packed_hex, "
            "[i IN range(0, size(v.pop_ids)-1) "
            "WHERE v.pop_ids[i] = $population | v.af[i]][0] AS af "
            "ORDER BY v.pos LIMIT $limit"
        )
        nothing_found = "No variants found in region."
        found_what = "variants (raw gt_packed mode)"
    else:
        # One row per CARRIES edge (sample x variant).
        if gt_format == "dosage":
            genotype_expr = "c.gt"
        else:
            genotype_expr = "CASE c.gt WHEN 1 THEN '0/1' WHEN 2 THEN '1/1' ELSE '0/0' END"
        # Rows are per edge, not per variant, so the row cap is --limit x 100.
        params["carries_limit"] = limit * 100
        cypher = (
            "MATCH (s:Sample)-[c:CARRIES]->(v:Variant) "
            "WHERE s.population = $population "
            "AND v.chr = $chromosome AND v.pos >= $start AND v.pos <= $end "
            "RETURN s.sampleId AS sampleId, v.variantId AS variantId, "
            f"v.pos AS pos, {genotype_expr} AS genotype "
            "ORDER BY v.pos, s.sampleId "
            "LIMIT $carries_limit"
        )
        nothing_found = ("No genotype data found. CARRIES edges may not exist "
                         "for this region/population.")
        found_what = "genotype entries"

    records = ctx.run(cypher, params)
    if not records:
        click.echo(nothing_found, err=True)
        return

    click.echo(f"Found {len(records)} {found_what}.", err=True)
    format_output(records, output_path, fmt, "extract genotypes",
                  {"chr": chromosome, "start": start, "end": end,
                   "pop": population, "format": gt_format})
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""graphpop filter — query persisted results with annotation filters."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from ..cli import pass_ctx
|
|
7
|
+
from ..formatters import format_output
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _quote_identifier(name):
    """Return ``name`` as a backtick-quoted Cypher identifier.

    Statistic property names embed the population (e.g. ``nsl_GJ-tmp``).
    Unquoted, the hyphen would be parsed as a subtraction; quoting also stops
    the value from escaping the identifier. Embedded backticks are doubled.
    """
    return "`" + name.replace("`", "``") + "`"


@click.command("filter")
@click.argument("statistic", type=click.Choice([
    "ihs", "xpehh", "nsl", "fst", "pi", "tajima_d", "h12",
]))
@click.argument("chr")
@click.argument("population")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--consequence", help="Filter by VEP consequence type (e.g., missense_variant)")
@click.option("--pathway", help="Filter by pathway name")
@click.option("--gene", help="Filter by gene name or ID")
@click.option("--min-score", type=float, help="Minimum absolute score")
@click.option("--max-score", type=float, help="Maximum absolute score")
@click.option("--pop2", help="Second population (for xpehh)")
@click.option("--limit", type=int, default=10000, help="Maximum rows (default: 10000)")
@pass_ctx
def filter_results(ctx, statistic, chr, population, output_path, fmt,
                   consequence, pathway, gene, min_score, max_score, pop2, limit):
    """Query persisted statistics with annotation-based filters.

    This command retrieves already-computed statistics (iHS, XP-EHH, nSL, etc.)
    from graph nodes and filters them by functional annotation. It is the
    recommended way to perform conditioned analysis for haplotype-based
    statistics, which must be computed genome-wide first and then filtered.

    \b
    Workflow:
      1. Compute statistics: graphpop ihs chr1 EUR --persist
      2. Filter by annotation: graphpop filter ihs chr1 EUR --consequence missense_variant

    \b
    Examples:
      graphpop filter ihs chr1 EUR --consequence missense_variant -o ihs_missense.tsv
      graphpop filter xpehh chr1 EUR --pop2 AFR --pathway "Cardiac repolarization"
      graphpop filter nsl chr1 GJ-tmp --gene GW5 --min-score 2.0
      graphpop filter h12 chr1 GJ-tmp --consequence missense_variant
    """
    # Resolve the node property holding this statistic. Both names start as
    # None so every path below leaves them defined (xpehh without --pop2 used
    # to leave prop_unstd unbound and crash when the RETURN clause was built).
    prop = None
    prop_unstd = None
    if statistic == "xpehh":
        if pop2:
            prop = f"xpehh_{population}_{pop2}"
            prop_unstd = f"xpehh_unstd_{population}_{pop2}"
        else:
            # No single property can be named without the second population,
            # so no score column or score filter is added in this case.
            click.echo("Warning: --pop2 not specified; will search for any XP-EHH involving this population.", err=True)
    elif statistic in ("ihs", "nsl"):
        prop = f"{statistic}_{population}"
        prop_unstd = f"{statistic}_unstd_{population}"
    else:
        # Window-level statistics (fst, pi, tajima_d, h12) use their own name.
        prop = statistic

    # Build Cypher query
    params: dict = {"chr": chr, "population": population, "limit": limit}

    if statistic in ("ihs", "xpehh", "nsl"):
        # Per-variant statistics stored on Variant nodes
        score_expr = f"v.{_quote_identifier(prop)}" if prop else None
        where_parts = ["v.chr = $chr"]
        if score_expr:
            where_parts.append(f"{score_expr} IS NOT NULL")

        # Annotation joins. Every pattern is anchored at the variant: the first
        # extends the initial MATCH, later ones get their own MATCH clause.
        # Concatenating them into a single path made each additional filter
        # start from the previous filter's end node instead of from v.
        annotation_patterns = []
        if consequence:
            annotation_patterns.append("-[:HAS_CONSEQUENCE]->(hc)")
            where_parts.append("hc.consequence = $consequence")
            params["consequence"] = consequence
        if pathway:
            annotation_patterns.append(
                "-[:HAS_CONSEQUENCE]->(:Gene)-[:IN_PATHWAY]->(pw:Pathway)")
            where_parts.append("pw.name CONTAINS $pathway")
            params["pathway"] = pathway
        if gene:
            annotation_patterns.append("-[:HAS_CONSEQUENCE]->(g:Gene)")
            where_parts.append("(g.geneId = $gene OR g.symbol = $gene)")
            params["gene"] = gene

        match_clause = "MATCH (v:Variant)"
        if annotation_patterns:
            first_pattern, *other_patterns = annotation_patterns
            match_clause += first_pattern
            match_clause += "".join(f" MATCH (v){p}" for p in other_patterns)

        # Thresholds travel as query parameters, not as formatted text.
        if min_score is not None and score_expr:
            where_parts.append(f"abs({score_expr}) >= $min_score")
            params["min_score"] = min_score
        if max_score is not None and score_expr:
            where_parts.append(f"abs({score_expr}) <= $max_score")
            params["max_score"] = max_score

        return_cols = [
            "v.variantId AS variant_id",
            "v.pos AS pos",
        ]
        if score_expr:
            # `statistic` comes from the fixed click.Choice list above.
            return_cols.append(f"{score_expr} AS {statistic}")
        if prop_unstd:
            return_cols.append(
                f"v.{_quote_identifier(prop_unstd)} AS {statistic}_unstd")
        if consequence:
            return_cols.append("hc.consequence AS consequence")
            return_cols.append("hc.impact AS impact")
        if gene:
            return_cols.append("g.symbol AS gene")

        cypher = (
            f"{match_clause} "
            f"WHERE {' AND '.join(where_parts)} "
            f"RETURN DISTINCT {', '.join(return_cols)} "
            "ORDER BY v.pos LIMIT $limit"
        )

    else:
        # Window-level statistics stored on GenomicWindow nodes
        if consequence or pathway or gene:
            # These queries have no annotation join; tell the user instead of
            # silently returning unfiltered windows.
            click.echo("Warning: --consequence/--pathway/--gene are not applied "
                       "to window-level statistics; ignoring them.", err=True)

        where_parts = [
            "w.chr = $chr",
            "w.population = $population",
        ]
        # Window scores are compared as-is (no abs()). Both bounds now apply to
        # h12 as well; --max-score used to be dropped for it.
        if min_score is not None:
            where_parts.append(f"w.{prop} >= $min_score")
            params["min_score"] = min_score
        if max_score is not None:
            where_parts.append(f"w.{prop} <= $max_score")
            params["max_score"] = max_score
        where_str = " AND ".join(where_parts)

        if statistic == "h12":
            # Garud's H: strongest signal first
            cypher = (
                "MATCH (w:GenomicWindow) "
                f"WHERE {where_str} "
                "RETURN w.windowId AS window_id, w.chr AS chr, "
                "w.start AS start, w.end AS end, "
                "w.h12 AS h12, w.h2_h1 AS h2_h1, w.hap_diversity AS hap_div "
                "ORDER BY w.h12 DESC LIMIT $limit"
            )
        else:
            # fst, pi, tajima_d: genomic order
            cypher = (
                "MATCH (w:GenomicWindow) "
                f"WHERE {where_str} "
                "RETURN w.windowId AS window_id, w.start AS start, w.end AS end, "
                f"w.{prop} AS {statistic}, w.n_variants AS n_variants "
                "ORDER BY w.start LIMIT $limit"
            )

    records = ctx.run(cypher, params)

    if not records:
        click.echo(f"No results found for {statistic} on {chr}/{population} "
                   f"with given filters.", err=True)
        return

    click.echo(f"Found {len(records)} records.", err=True)
    format_output(records, output_path, fmt, "filter",
                  {"statistic": statistic, "chr": chr, "population": population,
                   "consequence": consequence, "pathway": pathway, "gene": gene})
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""graphpop garud-h — Garud's H statistics for haplotype homozygosity."""
|
|
2
|
+
import click
|
|
3
|
+
from ..cli import pass_ctx
|
|
4
|
+
from ..config import build_options_map, build_cypher
|
|
5
|
+
from ..formatters import format_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _cypher_string(value):
    """Render ``value`` as a single-quoted Cypher string literal.

    Backslashes and single quotes are escaped so that a chromosome or
    population name containing either cannot end the literal early.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


@click.command("garud-h")
@click.argument("chr")
@click.argument("population")
@click.argument("window_size", type=int)
@click.argument("step_size", type=int)
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--min-af", type=float, help="Minimum allele frequency")
@pass_ctx
def garud_h(ctx, chr, population, window_size, step_size, output_path, fmt, min_af):
    """Compute Garud's H1, H12, H2/H1 in sliding windows."""
    opts = build_options_map(min_af=min_af)
    # Positional arguments are handed to build_cypher as ready-made Cypher
    # text: strings as quoted literals, the integer sizes as plain digits.
    cypher = build_cypher(
        "graphpop.garud_h",
        [_cypher_string(chr), _cypher_string(population),
         str(window_size), str(step_size)],
        options=opts or None,  # an empty options map is passed as None
        yield_cols=["chr", "start", "end", "population", "h1", "h12", "h2_h1",
                    "hap_diversity", "n_haplotypes", "n_variants"],
    )
    records = ctx.run(cypher)
    format_output(records, output_path, fmt, "garud-h",
                  {"chr": chr, "pop": population, "window": window_size,
                   "step": step_size})
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""graphpop genome-scan — sliding-window genome scan."""
|
|
2
|
+
import click
|
|
3
|
+
from ..cli import pass_ctx
|
|
4
|
+
from ..config import build_options_map, build_cypher
|
|
5
|
+
from ..formatters import format_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _cypher_string(value):
    """Render ``value`` as a single-quoted Cypher string literal.

    Backslashes and single quotes are escaped so that a chromosome or
    population name containing either cannot end the literal early.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


@click.command("genome-scan")
@click.argument("chr")
@click.argument("population")
@click.argument("window_size", type=int)
@click.argument("step_size", type=int)
@click.option("--pop2", help="Second population for Fst/Dxy/PBS")
@click.option("--persist", is_flag=True, default=False,
              help="Persist window results to graph (default behavior)")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--consequence", help="Filter by VEP consequence type")
@click.option("--pathway", help="Filter by pathway name")
@click.option("--gene", help="Filter by gene name")
@click.option("--min-af", type=float, help="Minimum allele frequency")
@pass_ctx
def genome_scan(ctx, chr, population, window_size, step_size, pop2, persist,
                output_path, fmt, consequence, pathway, gene, min_af):
    """Run a sliding-window genome scan (pi, theta, Tajima's D, Fst, etc.)."""
    # NOTE(review): `persist` is accepted but never forwarded; the option help
    # describes persisting as the default behavior — confirm this is intended.
    opts = build_options_map(consequence=consequence, pathway=pathway, gene=gene,
                             min_af=min_af)
    # Positional arguments are handed to build_cypher as ready-made Cypher
    # text: strings as quoted literals, the integer sizes as plain digits.
    positional = [
        _cypher_string(chr),
        _cypher_string(population),
        str(window_size),
        str(step_size),
    ]
    if pop2:
        # The second population is only appended when one was given.
        positional.append(_cypher_string(pop2))
    cypher = build_cypher(
        "graphpop.genome_scan", positional,
        options=opts or None,  # an empty options map is passed as None
        yield_cols=["window_id", "chr", "start", "end", "population",
                    "n_variants", "n_segregating", "pi", "theta_w", "tajima_d",
                    "fst", "fst_wc", "dxy", "pbs", "fay_wu_h"],
    )
    records = ctx.run(cypher)
    format_output(records, output_path, fmt, "genome-scan",
                  {"chr": chr, "pop": population, "window": window_size,
                   "step": step_size, "pop2": pop2})
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""graphpop ihs — integrated haplotype score."""
|
|
2
|
+
import click
|
|
3
|
+
from ..cli import pass_ctx
|
|
4
|
+
from ..config import build_options_map, build_cypher
|
|
5
|
+
from ..formatters import format_output
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _cypher_string(value):
    """Render ``value`` as a single-quoted Cypher string literal.

    Backslashes and single quotes are escaped so that a chromosome or
    population name containing either cannot end the literal early.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


@click.command()
@click.argument("chr")
@click.argument("population")
@click.option("--min-af", type=float, help="Minimum allele frequency filter")
@click.option("--persist", is_flag=True, default=False,
              help="Write iHS scores to Variant nodes")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def ihs(ctx, chr, population, min_af, persist, output_path, fmt):
    """Compute integrated haplotype score (iHS) across a chromosome."""
    opts = build_options_map(min_af=min_af, persist=persist)
    # Both positional arguments are strings and are handed to build_cypher as
    # quoted Cypher literals.
    cypher = build_cypher(
        "graphpop.ihs",
        [_cypher_string(chr), _cypher_string(population)],
        options=opts or None,  # an empty options map is passed as None
        yield_cols=["variantId", "pos", "af", "ihs_unstd", "ihs"],
    )
    records = ctx.run(cypher)
    format_output(records, output_path, fmt, "ihs",
                  {"chr": chr, "pop": population, "min_af": min_af,
                   "persist": persist})
|