graphpop-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphpop_cli/__init__.py +2 -0
- graphpop_cli/cli.py +161 -0
- graphpop_cli/commands/__init__.py +1 -0
- graphpop_cli/commands/aggregate.py +206 -0
- graphpop_cli/commands/batch.py +155 -0
- graphpop_cli/commands/compare.py +118 -0
- graphpop_cli/commands/config_cmd.py +117 -0
- graphpop_cli/commands/converge.py +156 -0
- graphpop_cli/commands/db.py +188 -0
- graphpop_cli/commands/divergence.py +37 -0
- graphpop_cli/commands/diversity.py +36 -0
- graphpop_cli/commands/dump.py +210 -0
- graphpop_cli/commands/export_bed.py +170 -0
- graphpop_cli/commands/export_windows.py +91 -0
- graphpop_cli/commands/extract.py +271 -0
- graphpop_cli/commands/filter_results.py +165 -0
- graphpop_cli/commands/garud_h.py +30 -0
- graphpop_cli/commands/genome_scan.py +41 -0
- graphpop_cli/commands/ihs.py +29 -0
- graphpop_cli/commands/import_data.py +266 -0
- graphpop_cli/commands/inventory.py +160 -0
- graphpop_cli/commands/joint_sfs.py +38 -0
- graphpop_cli/commands/ld.py +35 -0
- graphpop_cli/commands/lookup.py +207 -0
- graphpop_cli/commands/neighbors.py +175 -0
- graphpop_cli/commands/nsl.py +29 -0
- graphpop_cli/commands/plot.py +1066 -0
- graphpop_cli/commands/pop_summary.py +30 -0
- graphpop_cli/commands/query.py +15 -0
- graphpop_cli/commands/rank_genes.py +177 -0
- graphpop_cli/commands/report.py +264 -0
- graphpop_cli/commands/roh.py +30 -0
- graphpop_cli/commands/run_all.py +276 -0
- graphpop_cli/commands/server.py +98 -0
- graphpop_cli/commands/setup.py +299 -0
- graphpop_cli/commands/sfs.py +38 -0
- graphpop_cli/commands/validate.py +167 -0
- graphpop_cli/commands/xpehh.py +31 -0
- graphpop_cli/config.py +57 -0
- graphpop_cli/connection.py +52 -0
- graphpop_cli/formatters.py +81 -0
- graphpop_cli-0.1.0.dist-info/METADATA +73 -0
- graphpop_cli-0.1.0.dist-info/RECORD +46 -0
- graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
- graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
- graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
graphpop_cli/__init__.py
ADDED
graphpop_cli/cli.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""GraphPop CLI — command-line interface for graph-native population genomics.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
graphpop diversity chr22 1 50000000 EUR -o diversity.tsv
|
|
5
|
+
graphpop ihs chr22 EUR --min-af 0.05 --persist -o ihs.tsv
|
|
6
|
+
graphpop genome-scan chr22 EUR 100000 50000 --persist -o scan.tsv
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import click
|
|
14
|
+
|
|
15
|
+
from .connection import load_config, get_driver
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GraphPopContext:
    """Shared state handed to every GraphPop subcommand.

    Holds the connection settings returned by ``load_config`` (with any
    explicitly supplied values layered on top) and creates the Neo4j driver
    lazily, on first access to :attr:`driver`.
    """

    def __init__(self, uri=None, user=None, password=None, database=None,
                 config_path=None):
        """Load the configuration and apply explicit overrides.

        Args:
            uri: Neo4j URI; overrides the configured value when truthy.
            user: Neo4j username; overrides the configured value when truthy.
            password: Neo4j password; overrides the configured value when truthy.
            database: Database name; overrides the configured value when truthy.
            config_path: Optional path of the config file passed to
                ``load_config``; ``None`` is passed when it is not given.
        """
        cfg = load_config(Path(config_path) if config_path else None)
        overrides = {
            "uri": uri,
            "user": user,
            "password": password,
            "database": database,
        }
        # Only truthy values override the loaded configuration, so an unset
        # option never wipes out a configured setting.
        for key, value in overrides.items():
            if value:
                cfg[key] = value
        self.cfg = cfg
        self._driver = None

    @property
    def driver(self):
        """Return the driver, creating it via ``get_driver`` on first use."""
        if self._driver is None:
            self._driver = get_driver(self.cfg)
        return self._driver

    @property
    def database(self):
        """Return the configured database name."""
        return self.cfg["database"]

    def run(self, cypher: str, parameters: dict | None = None) -> list[dict]:
        """Run Cypher and return records as list of dicts.

        Args:
            cypher: The Cypher statement to execute.
            parameters: Optional query parameters passed to ``session.run``.

        Returns:
            One ``dict`` (``record.data()``) per returned record.

        Raises:
            SystemExit: With status 1 after printing the error to stderr
                when anything goes wrong while opening the session or
                running the query.
        """
        try:
            with self.driver.session(database=self.database) as session:
                return [rec.data() for rec in session.run(cypher, parameters)]
        except Exception as e:
            err_msg = str(e)
            # Connection failures get a hint about how to point the CLI at a
            # server; every other error is reported verbatim.
            if "Connection refused" in err_msg or "Failed to establish" in err_msg:
                click.echo(
                    "Error: Cannot connect to Neo4j at "
                    f"{self.cfg['uri']}.\n"
                    "Is Neo4j running? Check connection with:\n"
                    f" export GRAPHPOP_URI={self.cfg['uri']}\n"
                    " or create ~/.graphpop/config.yaml",
                    err=True,
                )
            else:
                click.echo(f"Error: {e}", err=True)
            raise SystemExit(1)

    def close(self):
        """Close the driver if one was created; safe to call repeatedly.

        The cached driver is dropped before closing so that a second call is
        a no-op and a later :attr:`driver` access builds a fresh driver
        instead of handing back a closed one.
        """
        driver, self._driver = self._driver, None
        if driver:
            driver.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
pass_ctx = click.make_pass_decorator(GraphPopContext, ensure=True)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@click.group()
@click.option("--uri", envvar="GRAPHPOP_URI", help="Neo4j bolt URI")
@click.option("--user", envvar="GRAPHPOP_USER", help="Neo4j username")
@click.option("--password", envvar="GRAPHPOP_PASSWORD", help="Neo4j password")
@click.option("--database", envvar="GRAPHPOP_DATABASE", help="Neo4j database name")
@click.option("--config", "config_path", type=click.Path(),
              help="Config file path (default: ~/.graphpop/config.yaml)")
@click.version_option(package_name="graphpop-cli")
@click.pass_context
def main(ctx, uri, user, password, database, config_path):
    """GraphPop — graph-native population genomics from the command line.

    Compute population genetics statistics via Neo4j stored procedures with
    default TSV output. Use --persist to write results to graph nodes.
    """
    # The shared context replaces whatever object click had attached, so no
    # placeholder object needs to be created first.
    ctx.obj = GraphPopContext(uri=uri, user=user, password=password,
                              database=database, config_path=config_path)
    # Release the Neo4j driver (if a subcommand created one) when click
    # tears the context down; close() does nothing when no driver exists.
    ctx.call_on_close(ctx.obj.close)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Import all command modules
|
|
93
|
+
from .commands import ( # noqa: E402
|
|
94
|
+
diversity, divergence, sfs, joint_sfs,
|
|
95
|
+
genome_scan, pop_summary,
|
|
96
|
+
ld, ihs, xpehh, nsl, roh, garud_h,
|
|
97
|
+
query, run_all, aggregate, export_windows,
|
|
98
|
+
setup, server, db, import_data, dump,
|
|
99
|
+
config_cmd, validate, filter_results, plot,
|
|
100
|
+
lookup, converge, inventory, rank_genes,
|
|
101
|
+
extract, export_bed, batch, compare,
|
|
102
|
+
report, neighbors,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Attach every subcommand to the ``main`` group. The tuple is grouped by
# purpose and its order is the registration order.
for _command in (
    # Individual procedures (12)
    diversity.diversity,
    divergence.divergence,
    sfs.sfs,
    joint_sfs.joint_sfs,
    genome_scan.genome_scan,
    pop_summary.pop_summary,
    ld.ld,
    ihs.ihs,
    xpehh.xpehh,
    nsl.nsl,
    roh.roh,
    garud_h.garud_h,
    # Orchestration and export
    run_all.run_all,
    aggregate.aggregate,
    export_windows.export_windows,
    query.query,
    filter_results.filter_results,
    # Setup and server management
    setup.setup,
    server.start,
    server.stop,
    server.status,
    # Database management
    db.db,
    import_data.import_data,
    dump.dump,
    dump.load,
    # Configuration and validation
    config_cmd.config,
    validate.validate,
    plot.plot,
    # Phase 1 high-priority commands
    lookup.lookup,
    converge.converge,
    inventory.inventory,
    rank_genes.rank_genes,
    # Phase 2 commands
    extract.extract,
    export_bed.export_bed,
    batch.batch,
    compare.compare,
    # Phase 3 commands
    report.report,
    neighbors.neighbors,
):
    main.add_command(_command)
del _command
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
if __name__ == "__main__":
|
|
161
|
+
main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""GraphPop CLI command modules."""
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""graphpop aggregate — aggregate results and generate summary tables."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import csv
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ..cli import pass_ctx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command()
@click.option("--results-dir", "-d", type=click.Path(exists=True), required=True,
              help="Directory with per-procedure TSV results (from run-all)")
@click.option("--json-results", "-j", type=click.Path(exists=True),
              help="JSON results file (from run-all)")
@click.option("--output-dir", "-o", type=click.Path(), default="graphpop_tables",
              help="Output directory for summary tables")
@pass_ctx
def aggregate(ctx, results_dir, json_results, output_dir):
    """Aggregate per-population results into summary tables.

    Reads TSV results from a run-all output directory and produces
    publication-ready summary tables (each only when the matching
    sub-directory exists):

    \b
      population_summary.tsv — per-pop diversity, theta, Tajima's D, Fis
      fst_matrix.tsv — pairwise Fst matrix
      roh_summary.tsv — per-pop FROH statistics
      ihs_peaks.tsv, nsl_peaks.tsv, xpehh_peaks.tsv — top 100 peaks each
      sweep_windows.tsv — Garud's H windows with H12 >= 0.1
    """
    results_path = Path(results_dir)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load JSON results if provided. Only the entry count is reported; the
    # tables below are built from the TSV files alone.
    all_results = {}
    if json_results:
        with open(json_results) as f:
            all_results = json.load(f)
        click.echo(f"Loaded {len(all_results)} results from JSON")

    # --- Table 1: Population Summary ---
    diversity_dir = results_path / "diversity"
    if diversity_dir.exists():
        click.echo("Generating population_summary.tsv...")
        pop_stats = _aggregate_single_row_tsv(diversity_dir)
        _write_summary(out_dir / "population_summary.tsv", pop_stats,
                       ["population", "chr", "pi", "theta_w", "tajima_d",
                        "het_exp", "het_obs", "fis", "n_variants", "n_segregating"])

    # --- Table 2: Fst Matrix ---
    divergence_dir = results_path / "divergence"
    if divergence_dir.exists():
        click.echo("Generating fst_matrix.tsv...")
        div_stats = _aggregate_single_row_tsv(divergence_dir)
        _write_summary(out_dir / "fst_matrix.tsv", div_stats,
                       ["pop1", "pop2", "chr", "fst_hudson", "fst_wc", "dxy", "da"])

    # --- Table 3: ROH Summary ---
    roh_dir = results_path / "roh"
    if roh_dir.exists():
        click.echo("Generating roh_summary.tsv...")
        roh_data = _aggregate_multi_row_tsv(roh_dir)
        # Accumulate per-population totals; means are derived afterwards.
        pop_roh = {}
        for rec in roh_data:
            # Prefer the population column of the row, then the population
            # inferred from the file name.
            pop = rec.get("population", rec.get("file_pop", "unknown"))
            if pop not in pop_roh:
                pop_roh[pop] = {"n_samples": 0, "total_froh": 0.0,
                                "total_n_roh": 0, "max_froh": 0.0}
            pop_roh[pop]["n_samples"] += 1
            froh = float(rec.get("froh", 0))
            pop_roh[pop]["total_froh"] += froh
            pop_roh[pop]["total_n_roh"] += int(rec.get("n_roh", 0))
            pop_roh[pop]["max_froh"] = max(pop_roh[pop]["max_froh"], froh)

        rows = []
        for pop, s in sorted(pop_roh.items()):
            rows.append({
                "population": pop,
                "n_samples": s["n_samples"],
                "mean_froh": f"{s['total_froh'] / s['n_samples']:.6f}",
                "mean_n_roh": f"{s['total_n_roh'] / s['n_samples']:.1f}",
                "max_froh": f"{s['max_froh']:.6f}",
            })
        _write_dict_tsv(out_dir / "roh_summary.tsv", rows)

    # --- Table 4: Selection Peaks ---
    for proc in ("ihs", "nsl", "xpehh"):
        proc_dir = results_path / proc
        if proc_dir.exists():
            click.echo(f"Generating {proc}_peaks.tsv...")
            peaks = _extract_peaks(proc_dir, proc, top_n=100)
            _write_dict_tsv(out_dir / f"{proc}_peaks.tsv", peaks)

    # --- Table 5: Garud's H Sweep Windows ---
    garud_dir = results_path / "garud_h"
    if garud_dir.exists():
        click.echo("Generating sweep_windows.tsv...")
        sweeps = _extract_sweep_windows(garud_dir, h12_threshold=0.1)
        _write_dict_tsv(out_dir / "sweep_windows.tsv", sweeps)

    click.echo(f"\nSummary tables written to {out_dir}/")
    for table in sorted(out_dir.glob("*.tsv")):
        # Count lines inside a context manager so the handle is closed
        # right away; the first line is not counted as a data row.
        with open(table) as fh:
            n_lines = sum(1 for _ in fh) - 1
        click.echo(f" {table.name}: {n_lines} rows")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _aggregate_single_row_tsv(directory: Path) -> list[dict]:
|
|
113
|
+
"""Read TSV files with single data row, extract pop/chr from filename."""
|
|
114
|
+
rows = []
|
|
115
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
116
|
+
parts = tsv.stem.split("_")
|
|
117
|
+
with open(tsv) as f:
|
|
118
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
119
|
+
for rec in reader:
|
|
120
|
+
# Infer pop and chr from filename: POP_CHR.tsv
|
|
121
|
+
if len(parts) >= 2:
|
|
122
|
+
rec["population"] = "_".join(parts[:-1])
|
|
123
|
+
rec["chr"] = parts[-1]
|
|
124
|
+
elif "vs" in tsv.stem:
|
|
125
|
+
# Pairwise: POP1_vs_POP2_CHR.tsv
|
|
126
|
+
vs_idx = parts.index("vs")
|
|
127
|
+
rec["pop1"] = "_".join(parts[:vs_idx])
|
|
128
|
+
rec["pop2"] = "_".join(parts[vs_idx + 1:-1])
|
|
129
|
+
rec["chr"] = parts[-1]
|
|
130
|
+
rows.append(rec)
|
|
131
|
+
return rows
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _aggregate_multi_row_tsv(directory: Path) -> list[dict]:
|
|
135
|
+
"""Read TSV files with multiple data rows."""
|
|
136
|
+
rows = []
|
|
137
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
138
|
+
parts = tsv.stem.split("_")
|
|
139
|
+
pop = "_".join(parts[:-1]) if len(parts) >= 2 else parts[0]
|
|
140
|
+
with open(tsv) as f:
|
|
141
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
142
|
+
for rec in reader:
|
|
143
|
+
rec["file_pop"] = pop
|
|
144
|
+
rows.append(rec)
|
|
145
|
+
return rows
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _extract_peaks(directory: Path, stat_name: str,
|
|
149
|
+
top_n: int = 100) -> list[dict]:
|
|
150
|
+
"""Extract top peaks from per-variant result files."""
|
|
151
|
+
all_variants = []
|
|
152
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
153
|
+
parts = tsv.stem.split("_")
|
|
154
|
+
with open(tsv) as f:
|
|
155
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
156
|
+
for rec in reader:
|
|
157
|
+
score = rec.get(stat_name, rec.get(f"{stat_name}_unstd", "0"))
|
|
158
|
+
try:
|
|
159
|
+
rec["abs_score"] = abs(float(score))
|
|
160
|
+
except (ValueError, TypeError):
|
|
161
|
+
rec["abs_score"] = 0
|
|
162
|
+
rec["source_file"] = tsv.stem
|
|
163
|
+
all_variants.append(rec)
|
|
164
|
+
|
|
165
|
+
all_variants.sort(key=lambda r: r["abs_score"], reverse=True)
|
|
166
|
+
return all_variants[:top_n]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _extract_sweep_windows(directory: Path,
|
|
170
|
+
h12_threshold: float = 0.1) -> list[dict]:
|
|
171
|
+
"""Extract windows exceeding H12 threshold."""
|
|
172
|
+
sweeps = []
|
|
173
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
174
|
+
with open(tsv) as f:
|
|
175
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
176
|
+
for rec in reader:
|
|
177
|
+
try:
|
|
178
|
+
if float(rec.get("h12", 0)) >= h12_threshold:
|
|
179
|
+
sweeps.append(rec)
|
|
180
|
+
except (ValueError, TypeError):
|
|
181
|
+
pass
|
|
182
|
+
sweeps.sort(key=lambda r: float(r.get("h12", 0)), reverse=True)
|
|
183
|
+
return sweeps
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _write_summary(path: Path, rows: list[dict], columns: list[str]):
|
|
187
|
+
"""Write summary table with specified columns."""
|
|
188
|
+
with open(path, "w", newline="") as f:
|
|
189
|
+
writer = csv.DictWriter(f, fieldnames=columns, delimiter="\t",
|
|
190
|
+
extrasaction="ignore")
|
|
191
|
+
writer.writeheader()
|
|
192
|
+
writer.writerows(rows)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _write_dict_tsv(path: Path, rows: list[dict]):
|
|
196
|
+
"""Write list of dicts as TSV."""
|
|
197
|
+
if not rows:
|
|
198
|
+
with open(path, "w") as f:
|
|
199
|
+
f.write("# No results\n")
|
|
200
|
+
return
|
|
201
|
+
keys = list(rows[0].keys())
|
|
202
|
+
with open(path, "w", newline="") as f:
|
|
203
|
+
writer = csv.DictWriter(f, fieldnames=keys, delimiter="\t",
|
|
204
|
+
extrasaction="ignore")
|
|
205
|
+
writer.writeheader()
|
|
206
|
+
writer.writerows(rows)
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""graphpop batch — run any procedure across multiple populations and chromosomes."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ..cli import pass_ctx
|
|
9
|
+
from ..config import build_options_map, build_cypher
|
|
10
|
+
from ..formatters import format_output
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _cypher_literal(value) -> str:
    """Return *value* as a single-quoted Cypher string literal.

    Backslashes and single quotes are backslash-escaped so that a name
    containing them cannot terminate the literal early.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{text}'"


# Start/end positions used when a procedure takes a region: the whole
# chromosome.
_WHOLE_CHROMOSOME = ("1", "999999999")


# The argument builders below are called with keyword arguments
# (chr=..., pop=..., pop2=...), so the parameter names, including the
# builtin-shadowing ``chr``, are part of their interface.

def _region_pop_args(chr, pop, **kw):
    """Build arguments: chromosome, whole-chromosome range, population."""
    return [_cypher_literal(chr), *_WHOLE_CHROMOSOME, _cypher_literal(pop)]


def _region_pair_args(chr, pop, pop2=None, **kw):
    """Like ``_region_pop_args`` plus a second population; None without it."""
    if not pop2:
        return None
    return [_cypher_literal(chr), *_WHOLE_CHROMOSOME,
            _cypher_literal(pop), _cypher_literal(pop2)]


def _pop_args(chr, pop, **kw):
    """Build arguments: chromosome, population."""
    return [_cypher_literal(chr), _cypher_literal(pop)]


def _pair_args(chr, pop, pop2=None, **kw):
    """Build arguments: chromosome and two populations; None without pop2."""
    if not pop2:
        return None
    return [_cypher_literal(chr), _cypher_literal(pop), _cypher_literal(pop2)]


# Map command names to their procedure and yield columns.
# "args" builds the positional procedure arguments (already rendered as
# Cypher literals) and returns None when a required value is missing.
COMMAND_REGISTRY = {
    "diversity": {
        "procedure": "graphpop.diversity",
        "args": _region_pop_args,
        "yield": ["pi", "theta_w", "tajima_d", "fay_wu_h", "fay_wu_h_norm",
                  "het_exp", "het_obs", "fis", "n_variants", "n_segregating",
                  "n_polarized"],
    },
    "divergence": {
        "procedure": "graphpop.divergence",
        "args": _region_pair_args,
        "yield": ["fst_hudson", "fst_wc", "dxy", "da", "n_variants"],
    },
    "ihs": {
        "procedure": "graphpop.ihs",
        "args": _pop_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "xpehh": {
        "procedure": "graphpop.xpehh",
        "args": _pair_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "nsl": {
        "procedure": "graphpop.nsl",
        "args": _pop_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "sfs": {
        "procedure": "graphpop.sfs",
        "args": _region_pop_args,
        "yield": ["sfs", "n_variants", "n_segregating"],
    },
    "roh": {
        "procedure": "graphpop.roh",
        "args": _pop_args,
        "yield": ["n_samples", "mean_froh", "median_froh", "n_roh_segments"],
    },
    "garud-h": {
        "procedure": "graphpop.garud_h",
        "args": _pop_args,
        "yield": ["n_windows", "mean_h12", "max_h12"],
    },
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@click.command("batch")
@click.argument("command")
@click.option("--pops", required=True, help="Comma-separated population list")
@click.option("--chrs", required=True, help="Comma-separated chromosome list")
@click.option("--pop2", help="Second population (for divergence/xpehh, applied to all)")
@click.option("--workers", type=int, default=1,
              help="Parallel workers (default: 1, currently sequential)")
@click.option("-d", "--output-dir", required=True,
              type=click.Path(), help="Output directory (one TSV per pop-chr combo)")
@click.option("--persist", is_flag=True, help="Pass --persist to underlying command")
@click.option("--consequence", help="Filter by VEP consequence type")
@click.option("--pathway", help="Filter by pathway name")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def batch(ctx, command, pops, chrs, pop2, workers, output_dir, persist,
          consequence, pathway, fmt):
    """Run a GraphPop procedure across multiple populations and/or chromosomes.

    COMMAND is the procedure name: diversity, divergence, ihs, xpehh, nsl,
    sfs, roh, garud-h.

    Creates one output file per (population, chromosome) combination in
    the output directory, named {command}_{pop}_{chr}.{ext}.

    \b
    Examples:
      graphpop batch diversity --pops EUR,AFR,EAS --chrs chr1,chr2,chr22 -d output/
      graphpop batch ihs --pops EUR,AFR --chrs chr22 --persist -d output/
      graphpop batch divergence --pops EUR --pop2 AFR --chrs chr22 -d output/
    """
    if command not in COMMAND_REGISTRY:
        available = ", ".join(sorted(COMMAND_REGISTRY.keys()))
        click.echo(f"Error: unknown command '{command}'. Available: {available}", err=True)
        raise SystemExit(1)

    spec = COMMAND_REGISTRY[command]
    # Drop blank entries so a trailing or doubled comma ("EUR,AFR,") does not
    # trigger a run for an empty population or chromosome name.
    pop_list = [p.strip() for p in pops.split(",") if p.strip()]
    chr_list = [c.strip() for c in chrs.split(",") if c.strip()]
    if not pop_list or not chr_list:
        click.echo("Error: --pops and --chrs must each contain at least one name.",
                   err=True)
        raise SystemExit(1)

    # Create output directory
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    ext = fmt
    opts = build_options_map(consequence=consequence, pathway=pathway)
    if persist:
        opts["persist"] = True

    total = len(pop_list) * len(chr_list)
    completed = 0
    failed = 0

    # NOTE: --workers is accepted but the combinations are processed one
    # after another, as the option's help text states.
    for pop in pop_list:
        for chr_name in chr_list:
            completed += 1
            label = f"[{completed}/{total}] {command} {pop} {chr_name}"
            click.echo(f"{label} ...", err=True)

            try:
                args = spec["args"](chr=chr_name, pop=pop, pop2=pop2)
                if args is None:
                    click.echo(" Skipping: missing required argument (e.g., --pop2).", err=True)
                    failed += 1
                    continue

                cypher = build_cypher(
                    spec["procedure"],
                    args,
                    options=opts if opts else None,
                    yield_cols=spec["yield"],
                )
                records = ctx.run(cypher)

                if not records:
                    click.echo(" No results.", err=True)
                    failed += 1
                    continue

                out_file = out_dir / f"{command}_{pop}_{chr_name}.{ext}"
                format_output(records, str(out_file), fmt, f"batch {command}",
                              {"population": pop, "chr": chr_name})
                click.echo(f" -> {out_file} ({len(records)} rows)", err=True)

            except SystemExit:
                # A SystemExit raised while running one combination is
                # recorded as a failure so the remaining ones still run.
                click.echo(" FAILED (query error).", err=True)
                failed += 1
            except Exception as e:
                click.echo(f" FAILED: {e}", err=True)
                failed += 1

    click.echo(f"\nBatch complete: {completed - failed}/{total} succeeded, "
               f"{failed} failed.", err=True)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""graphpop compare — compare statistics between two populations."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ..cli import pass_ctx
|
|
9
|
+
from ..formatters import format_output
|
|
10
|
+
|
|
11
|
+
# Allowed stat names — used to whitelist dynamic property access
|
|
12
|
+
_VALID_STATS = {"pi", "theta_w", "tajima_d", "fst", "ihs"}
|
|
13
|
+
_IDENT_RE = re.compile(r'^[A-Za-z0-9_-]+$')
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _validate_identifier(value: str, label: str) -> str:
|
|
17
|
+
"""Validate that a value is safe for use as a Cypher property name."""
|
|
18
|
+
if not _IDENT_RE.match(value):
|
|
19
|
+
raise click.BadParameter(
|
|
20
|
+
f"Invalid {label}: {value!r}. Only alphanumeric, hyphen, "
|
|
21
|
+
"and underscore characters are allowed."
|
|
22
|
+
)
|
|
23
|
+
return value
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@click.command("compare")
@click.argument("pop1")
@click.argument("pop2")
@click.argument("chr", metavar="CHR")
@click.option("--stat", required=True,
              type=click.Choice(["pi", "theta_w", "tajima_d", "fst", "ihs"]),
              help="Statistic to compare")
@click.option("--window-size", type=int, default=100000,
              help="Sliding window size for comparison (default: 100000)")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--limit", type=int, default=10000, help="Maximum rows (default: 10000)")
@pass_ctx
def compare(ctx, pop1, pop2, chr, stat, window_size, output_path, fmt, limit):
    """Compare statistics between two populations across a chromosome.

    For window-based stats (pi, theta_w, tajima_d, fst), queries GenomicWindow
    nodes for both populations and joins by window position to compute delta.

    For ihs, queries Variant nodes with ihs_{POP1} and ihs_{POP2} properties
    and computes the per-variant difference.

    Output columns: window_start, window_end, stat_pop1, stat_pop2, delta, abs_delta.

    \b
    Examples:
      graphpop compare EUR AFR chr22 --stat pi -o delta_pi.tsv
      graphpop compare GJ-tmp GJ-trp Chr1 --stat fst -o delta.tsv
      graphpop compare EUR EAS chr22 --stat ihs -o ihs_diff.tsv
    """
    # Both population names are spliced into the generated Cypher, so they
    # are checked before any query is built (pop1 first, then pop2).
    pop1, pop2 = (_validate_identifier(name, "population")
                  for name in (pop1, pop2))

    # ihs is compared per variant; every other statistic is compared per
    # window.
    if stat != "ihs":
        records = _compare_window_stat(ctx, pop1, pop2, chr, stat, window_size, limit)
    else:
        records = _compare_variant_stat(ctx, pop1, pop2, chr, stat, limit)

    if records:
        click.echo(f"Found {len(records)} comparison rows.", err=True)
        format_output(records, output_path, fmt, "compare",
                      {"pop1": pop1, "pop2": pop2, "chr": chr, "stat": stat})
        return

    click.echo(f"No comparison data found for {stat} on {chr} "
               f"({pop1} vs {pop2}).", err=True)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _compare_window_stat(ctx, pop1, pop2, chr, stat, window_size, limit):
|
|
76
|
+
"""Compare window-based statistics between two populations."""
|
|
77
|
+
prop = stat
|
|
78
|
+
params = {"chr": chr, "pop1": pop1, "pop2": pop2, "limit": limit}
|
|
79
|
+
|
|
80
|
+
cypher = (
|
|
81
|
+
f"MATCH (w1:GenomicWindow) "
|
|
82
|
+
f"WHERE w1.chr = $chr AND w1.population = $pop1 "
|
|
83
|
+
f"AND w1.{prop} IS NOT NULL "
|
|
84
|
+
f"WITH w1 "
|
|
85
|
+
f"MATCH (w2:GenomicWindow) "
|
|
86
|
+
f"WHERE w2.chr = $chr AND w2.population = $pop2 "
|
|
87
|
+
f"AND w2.start = w1.start AND w2.end = w1.end "
|
|
88
|
+
f"AND w2.{prop} IS NOT NULL "
|
|
89
|
+
f"RETURN w1.start AS window_start, "
|
|
90
|
+
f"w1.end AS window_end, "
|
|
91
|
+
f"w1.{prop} AS {stat}_{pop1}, "
|
|
92
|
+
f"w2.{prop} AS {stat}_{pop2}, "
|
|
93
|
+
f"(w1.{prop} - w2.{prop}) AS delta, "
|
|
94
|
+
f"abs(w1.{prop} - w2.{prop}) AS abs_delta "
|
|
95
|
+
f"ORDER BY w1.start LIMIT $limit"
|
|
96
|
+
)
|
|
97
|
+
return ctx.run(cypher, params)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _compare_variant_stat(ctx, pop1, pop2, chr, stat, limit):
|
|
101
|
+
"""Compare variant-based statistics (ihs) between two populations."""
|
|
102
|
+
prop1 = f"{stat}_{pop1}"
|
|
103
|
+
prop2 = f"{stat}_{pop2}"
|
|
104
|
+
params = {"chr": chr, "limit": limit}
|
|
105
|
+
|
|
106
|
+
cypher = (
|
|
107
|
+
f"MATCH (v:Variant) "
|
|
108
|
+
f"WHERE v.chr = $chr "
|
|
109
|
+
f"AND v.{prop1} IS NOT NULL AND v.{prop2} IS NOT NULL "
|
|
110
|
+
f"RETURN v.pos AS pos, "
|
|
111
|
+
f"v.variantId AS variantId, "
|
|
112
|
+
f"v.{prop1} AS {stat}_{pop1}, "
|
|
113
|
+
f"v.{prop2} AS {stat}_{pop2}, "
|
|
114
|
+
f"(v.{prop1} - v.{prop2}) AS delta, "
|
|
115
|
+
f"abs(v.{prop1} - v.{prop2}) AS abs_delta "
|
|
116
|
+
f"ORDER BY v.pos LIMIT $limit"
|
|
117
|
+
)
|
|
118
|
+
return ctx.run(cypher, params)
|