graphpop-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. graphpop_cli/__init__.py +2 -0
  2. graphpop_cli/cli.py +161 -0
  3. graphpop_cli/commands/__init__.py +1 -0
  4. graphpop_cli/commands/aggregate.py +206 -0
  5. graphpop_cli/commands/batch.py +155 -0
  6. graphpop_cli/commands/compare.py +118 -0
  7. graphpop_cli/commands/config_cmd.py +117 -0
  8. graphpop_cli/commands/converge.py +156 -0
  9. graphpop_cli/commands/db.py +188 -0
  10. graphpop_cli/commands/divergence.py +37 -0
  11. graphpop_cli/commands/diversity.py +36 -0
  12. graphpop_cli/commands/dump.py +210 -0
  13. graphpop_cli/commands/export_bed.py +170 -0
  14. graphpop_cli/commands/export_windows.py +91 -0
  15. graphpop_cli/commands/extract.py +271 -0
  16. graphpop_cli/commands/filter_results.py +165 -0
  17. graphpop_cli/commands/garud_h.py +30 -0
  18. graphpop_cli/commands/genome_scan.py +41 -0
  19. graphpop_cli/commands/ihs.py +29 -0
  20. graphpop_cli/commands/import_data.py +266 -0
  21. graphpop_cli/commands/inventory.py +160 -0
  22. graphpop_cli/commands/joint_sfs.py +38 -0
  23. graphpop_cli/commands/ld.py +35 -0
  24. graphpop_cli/commands/lookup.py +207 -0
  25. graphpop_cli/commands/neighbors.py +175 -0
  26. graphpop_cli/commands/nsl.py +29 -0
  27. graphpop_cli/commands/plot.py +1066 -0
  28. graphpop_cli/commands/pop_summary.py +30 -0
  29. graphpop_cli/commands/query.py +15 -0
  30. graphpop_cli/commands/rank_genes.py +177 -0
  31. graphpop_cli/commands/report.py +264 -0
  32. graphpop_cli/commands/roh.py +30 -0
  33. graphpop_cli/commands/run_all.py +276 -0
  34. graphpop_cli/commands/server.py +98 -0
  35. graphpop_cli/commands/setup.py +299 -0
  36. graphpop_cli/commands/sfs.py +38 -0
  37. graphpop_cli/commands/validate.py +167 -0
  38. graphpop_cli/commands/xpehh.py +31 -0
  39. graphpop_cli/config.py +57 -0
  40. graphpop_cli/connection.py +52 -0
  41. graphpop_cli/formatters.py +81 -0
  42. graphpop_cli-0.1.0.dist-info/METADATA +73 -0
  43. graphpop_cli-0.1.0.dist-info/RECORD +46 -0
  44. graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
  45. graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
  46. graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2 @@
1
+ """GraphPop CLI — command-line interface for graph-native population genomics."""
2
+ __version__ = "0.1.0"
graphpop_cli/cli.py ADDED
@@ -0,0 +1,161 @@
1
+ """GraphPop CLI — command-line interface for graph-native population genomics.
2
+
3
+ Usage:
4
+ graphpop diversity chr22 1 50000000 EUR -o diversity.tsv
5
+ graphpop ihs chr22 EUR --min-af 0.05 --persist -o ihs.tsv
6
+ graphpop genome-scan chr22 EUR 100000 50000 --persist -o scan.tsv
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ import click
14
+
15
+ from .connection import load_config, get_driver
16
+
17
+
18
class GraphPopContext:
    """Shared connection context handed to every sub-command.

    Holds the effective connection settings (``cfg``) and a Neo4j driver
    that is created lazily the first time it is needed.

    Args:
        uri: Bolt URI; replaces the configured value when truthy.
        user: Username; replaces the configured value when truthy.
        password: Password; replaces the configured value when truthy.
        database: Database name; replaces the configured value when truthy.
        config_path: Optional config file path forwarded to ``load_config``.
    """

    def __init__(self, uri=None, user=None, password=None, database=None,
                 config_path=None):
        cfg = load_config(Path(config_path) if config_path else None)
        # Command-line / environment values win over the config file, but
        # only when they were actually supplied (falsy values are ignored).
        overrides = {
            "uri": uri,
            "user": user,
            "password": password,
            "database": database,
        }
        for key, value in overrides.items():
            if value:
                cfg[key] = value
        self.cfg = cfg
        self._driver = None

    @property
    def driver(self):
        """Return the driver, creating it from ``cfg`` on first access."""
        if self._driver is None:
            self._driver = get_driver(self.cfg)
        return self._driver

    @property
    def database(self):
        """Return the configured database name."""
        return self.cfg["database"]

    def run(self, cypher: str, parameters: dict | None = None) -> list[dict]:
        """Run Cypher and return records as list of dicts.

        Args:
            cypher: Query text.
            parameters: Optional query parameters.

        Returns:
            One ``dict`` per result record (``record.data()``).

        Raises:
            SystemExit: With code 1 after printing a message to stderr when
                the query or the connection fails. The original exception
                is attached as ``__cause__``.
        """
        try:
            with self.driver.session(database=self.database) as session:
                return [rec.data() for rec in session.run(cypher, parameters)]
        except Exception as e:
            err_msg = str(e)
            if "Connection refused" in err_msg or "Failed to establish" in err_msg:
                click.echo(
                    "Error: Cannot connect to Neo4j at "
                    f"{self.cfg['uri']}.\n"
                    "Is Neo4j running? Check connection with:\n"
                    f" export GRAPHPOP_URI={self.cfg['uri']}\n"
                    " or create ~/.graphpop/config.yaml",
                    err=True,
                )
            else:
                click.echo(f"Error: {e}", err=True)
            # Chain the cause so debuggers and tracebacks keep the real error.
            raise SystemExit(1) from e

    def close(self):
        """Close the driver if one was created; safe to call repeatedly."""
        # Drop the cached reference first so a second close() is a no-op and
        # a later .driver access builds a fresh driver instead of reusing a
        # closed one.
        driver, self._driver = self._driver, None
        if driver is not None:
            driver.close()
67
+
68
+
69
+ pass_ctx = click.make_pass_decorator(GraphPopContext, ensure=True)
70
+
71
+
72
@click.group()
@click.option("--uri", envvar="GRAPHPOP_URI", help="Neo4j bolt URI")
@click.option("--user", envvar="GRAPHPOP_USER", help="Neo4j username")
@click.option("--password", envvar="GRAPHPOP_PASSWORD", help="Neo4j password")
@click.option("--database", envvar="GRAPHPOP_DATABASE", help="Neo4j database name")
@click.option("--config", "config_path", type=click.Path(),
              help="Config file path (default: ~/.graphpop/config.yaml)")
@click.version_option(package_name="graphpop-cli")
@click.pass_context
def main(ctx, uri, user, password, database, config_path):
    """GraphPop — graph-native population genomics from the command line.

    Compute population genetics statistics via Neo4j stored procedures with
    default TSV output. Use --persist to write results to graph nodes.
    """
    # The shared context object is what ``pass_ctx`` hands to sub-commands.
    ctx.obj = GraphPopContext(uri=uri, user=user, password=password,
                              database=database, config_path=config_path)
    # Release the (lazily created) Neo4j driver when the click context is
    # torn down, whether the sub-command succeeded or failed.
    ctx.call_on_close(ctx.obj.close)
90
+
91
+
92
+ # Import all command modules
93
+ from .commands import ( # noqa: E402
94
+ diversity, divergence, sfs, joint_sfs,
95
+ genome_scan, pop_summary,
96
+ ld, ihs, xpehh, nsl, roh, garud_h,
97
+ query, run_all, aggregate, export_windows,
98
+ setup, server, db, import_data, dump,
99
+ config_cmd, validate, filter_results, plot,
100
+ lookup, converge, inventory, rank_genes,
101
+ extract, export_bed, batch, compare,
102
+ report, neighbors,
103
+ )
104
+
105
# Every sub-command attached to the ``main`` group, grouped by role and
# listed in registration order.
_COMMAND_GROUPS = (
    # Individual procedures (12)
    (
        diversity.diversity,
        divergence.divergence,
        sfs.sfs,
        joint_sfs.joint_sfs,
        genome_scan.genome_scan,
        pop_summary.pop_summary,
        ld.ld,
        ihs.ihs,
        xpehh.xpehh,
        nsl.nsl,
        roh.roh,
        garud_h.garud_h,
    ),
    # Orchestration and export
    (
        run_all.run_all,
        aggregate.aggregate,
        export_windows.export_windows,
        query.query,
        filter_results.filter_results,
    ),
    # Setup and server management
    (
        setup.setup,
        server.start,
        server.stop,
        server.status,
    ),
    # Database management
    (
        db.db,
        import_data.import_data,
        dump.dump,
        dump.load,
    ),
    # Configuration and validation
    (
        config_cmd.config,
        validate.validate,
        plot.plot,
    ),
    # Phase 1 high-priority commands
    (
        lookup.lookup,
        converge.converge,
        inventory.inventory,
        rank_genes.rank_genes,
    ),
    # Phase 2 commands
    (
        extract.extract,
        export_bed.export_bed,
        batch.batch,
        compare.compare,
    ),
    # Phase 3 commands
    (
        report.report,
        neighbors.neighbors,
    ),
)

for _group in _COMMAND_GROUPS:
    for _command in _group:
        main.add_command(_command)


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ """GraphPop CLI command modules."""
@@ -0,0 +1,206 @@
1
+ """graphpop aggregate — aggregate results and generate summary tables."""
2
+ from __future__ import annotations
3
+
4
+ import csv
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from ..cli import pass_ctx
11
+
12
+
13
@click.command()
@click.option("--results-dir", "-d", type=click.Path(exists=True), required=True,
              help="Directory with per-procedure TSV results (from run-all)")
@click.option("--json-results", "-j", type=click.Path(exists=True),
              help="JSON results file (from run-all)")
@click.option("--output-dir", "-o", type=click.Path(), default="graphpop_tables",
              help="Output directory for summary tables")
@pass_ctx
def aggregate(ctx, results_dir, json_results, output_dir):
    """Aggregate per-population results into summary tables.

    Reads TSV results from a run-all output directory and produces
    publication-ready summary tables:

    \b
      population_summary.tsv — per-pop diversity, theta, Tajima's D, Fis
      fst_matrix.tsv — pairwise Fst matrix
      pinpis.tsv — piN/piS ratios (if conditioned results exist)
      selection_peaks.tsv — top iHS/XP-EHH/nSL peaks per population
      roh_summary.tsv — per-pop FROH statistics
    """
    results_path = Path(results_dir)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load JSON results if provided (currently only reported, not merged
    # into any table).
    all_results = {}
    if json_results:
        with open(json_results) as f:
            all_results = json.load(f)
        click.echo(f"Loaded {len(all_results)} results from JSON")

    # --- Table 1: Population Summary ---
    diversity_dir = results_path / "diversity"
    if diversity_dir.exists():
        click.echo("Generating population_summary.tsv...")
        pop_stats = _aggregate_single_row_tsv(diversity_dir)
        _write_summary(out_dir / "population_summary.tsv", pop_stats,
                       ["population", "chr", "pi", "theta_w", "tajima_d",
                        "het_exp", "het_obs", "fis", "n_variants", "n_segregating"])

    # --- Table 2: Fst Matrix ---
    divergence_dir = results_path / "divergence"
    if divergence_dir.exists():
        click.echo("Generating fst_matrix.tsv...")
        div_stats = _aggregate_single_row_tsv(divergence_dir)
        _write_summary(out_dir / "fst_matrix.tsv", div_stats,
                       ["pop1", "pop2", "chr", "fst_hudson", "fst_wc", "dxy", "da"])

    # --- Table 3: ROH Summary ---
    roh_dir = results_path / "roh"
    if roh_dir.exists():
        click.echo("Generating roh_summary.tsv...")
        rows, n_skipped = _summarize_roh(_aggregate_multi_row_tsv(roh_dir))
        if n_skipped:
            click.echo(
                f"Warning: skipped {n_skipped} ROH rows with unparseable froh",
                err=True,
            )
        _write_dict_tsv(out_dir / "roh_summary.tsv", rows)

    # --- Table 4: Selection Peaks ---
    for proc in ("ihs", "nsl", "xpehh"):
        proc_dir = results_path / proc
        if proc_dir.exists():
            click.echo(f"Generating {proc}_peaks.tsv...")
            peaks = _extract_peaks(proc_dir, proc, top_n=100)
            _write_dict_tsv(out_dir / f"{proc}_peaks.tsv", peaks)

    # --- Table 5: Garud's H Sweep Windows ---
    garud_dir = results_path / "garud_h"
    if garud_dir.exists():
        click.echo("Generating sweep_windows.tsv...")
        sweeps = _extract_sweep_windows(garud_dir, h12_threshold=0.1)
        _write_dict_tsv(out_dir / "sweep_windows.tsv", sweeps)

    click.echo(f"\nSummary tables written to {out_dir}/")
    for f in sorted(out_dir.glob("*.tsv")):
        click.echo(f" {f.name}: {_count_data_rows(f)} rows")


def _summarize_roh(records: list[dict]) -> tuple[list[dict], int]:
    """Reduce per-sample ROH records to one summary row per population.

    Records whose ``froh`` cannot be parsed as a number are skipped (and
    counted) instead of aborting the whole aggregation; an unparseable
    ``n_roh`` counts as 0.

    Returns:
        ``(rows, n_skipped)`` where ``rows`` is sorted by population name.
    """
    per_pop: dict = {}
    n_skipped = 0
    for rec in records:
        try:
            froh = float(rec.get("froh", 0))
        except (ValueError, TypeError):
            n_skipped += 1
            continue
        try:
            # Go through float so values such as "3.0" are accepted.
            n_roh = int(float(rec.get("n_roh", 0)))
        except (ValueError, TypeError, OverflowError):
            n_roh = 0

        pop = rec.get("population", rec.get("file_pop", "unknown"))
        stats = per_pop.setdefault(
            pop,
            {"n_samples": 0, "total_froh": 0.0, "total_n_roh": 0, "max_froh": 0.0},
        )
        stats["n_samples"] += 1
        stats["total_froh"] += froh
        stats["total_n_roh"] += n_roh
        stats["max_froh"] = max(stats["max_froh"], froh)

    rows = [
        {
            "population": pop,
            "n_samples": s["n_samples"],
            "mean_froh": f"{s['total_froh'] / s['n_samples']:.6f}",
            "mean_n_roh": f"{s['total_n_roh'] / s['n_samples']:.1f}",
            "max_froh": f"{s['max_froh']:.6f}",
        }
        for pop, s in sorted(per_pop.items())
    ]
    return rows, n_skipped


def _count_data_rows(path: Path) -> int:
    """Return the number of lines after the first one, never below zero."""
    # Use a context manager so the handle is closed deterministically.
    with open(path) as handle:
        return max(sum(1 for _ in handle) - 1, 0)
110
+
111
+
112
def _aggregate_single_row_tsv(directory: Path) -> list[dict]:
    """Read every ``*.tsv`` in *directory* and label rows from the filename.

    Two filename layouts are recognised:

    * ``POP1_vs_POP2_CHR.tsv`` — rows get ``pop1``, ``pop2`` and ``chr``;
    * ``POP_CHR.tsv`` — rows get ``population`` and ``chr``.

    The pairwise layout is checked first: it also contains underscores, so
    testing the generic layout first would make it unreachable and leave
    ``pop1``/``pop2`` unset.

    Args:
        directory: Directory scanned (non-recursively) for ``*.tsv`` files.

    Returns:
        All data rows from all files, in sorted-filename order.
    """
    rows = []
    for tsv in sorted(directory.glob("*.tsv")):
        parts = tsv.stem.split("_")
        labels = {}
        vs_idx = parts.index("vs") if "vs" in parts else -1
        if 0 < vs_idx < len(parts) - 2:
            # Pairwise: POP1_vs_POP2_CHR.tsv
            labels["pop1"] = "_".join(parts[:vs_idx])
            labels["pop2"] = "_".join(parts[vs_idx + 1:-1])
            labels["chr"] = parts[-1]
        elif len(parts) >= 2:
            # Single population: POP_CHR.tsv (POP may contain underscores)
            labels["population"] = "_".join(parts[:-1])
            labels["chr"] = parts[-1]

        with open(tsv) as f:
            for rec in csv.DictReader(f, delimiter="\t"):
                rec.update(labels)
                rows.append(rec)
    return rows
132
+
133
+
134
def _aggregate_multi_row_tsv(directory: Path) -> list[dict]:
    """Collect all rows of every ``*.tsv`` in *directory*.

    Each row is tagged with ``file_pop``: the file stem up to (not
    including) its last underscore, or the whole stem when it has none.
    """
    collected = []
    for tsv_path in sorted(directory.glob("*.tsv")):
        prefix, underscore, _ = tsv_path.stem.rpartition("_")
        file_pop = prefix if underscore else tsv_path.stem
        with open(tsv_path) as handle:
            collected.extend(
                {**record, "file_pop": file_pop}
                for record in csv.DictReader(handle, delimiter="\t")
            )
    return collected
146
+
147
+
148
def _extract_peaks(directory: Path, stat_name: str,
                   top_n: int = 100) -> list[dict]:
    """Extract top peaks from per-variant result files.

    Every row of every ``*.tsv`` in *directory* is scored by the absolute
    value of its ``stat_name`` column (falling back to
    ``{stat_name}_unstd``). Rows gain two keys: ``abs_score`` and
    ``source_file`` (the file stem).

    Values that are missing, non-numeric or NaN score 0. NaN has to be
    normalised because it compares false against everything and would
    leave the sort order undefined.

    Args:
        directory: Directory scanned (non-recursively) for ``*.tsv`` files.
        stat_name: Name of the score column, e.g. ``"ihs"``.
        top_n: Number of highest-scoring rows to keep.

    Returns:
        The ``top_n`` rows with the largest ``abs_score``, descending.
    """
    import math

    fallback_name = f"{stat_name}_unstd"
    all_variants = []
    for tsv in sorted(directory.glob("*.tsv")):
        with open(tsv) as f:
            for rec in csv.DictReader(f, delimiter="\t"):
                score = rec.get(stat_name, rec.get(fallback_name, "0"))
                try:
                    abs_score = abs(float(score))
                except (ValueError, TypeError):
                    abs_score = 0
                if math.isnan(abs_score):
                    abs_score = 0
                rec["abs_score"] = abs_score
                rec["source_file"] = tsv.stem
                all_variants.append(rec)

    all_variants.sort(key=lambda r: r["abs_score"], reverse=True)
    return all_variants[:top_n]
167
+
168
+
169
def _extract_sweep_windows(directory: Path,
                           h12_threshold: float = 0.1) -> list[dict]:
    """Return rows whose ``h12`` value is at least *h12_threshold*.

    Rows are read from every ``*.tsv`` in *directory*; rows whose ``h12``
    cannot be converted to a float are ignored. The result is ordered by
    ``h12``, largest first.
    """
    scored = []  # (h12 as float, row) pairs
    for tsv_path in sorted(directory.glob("*.tsv")):
        with open(tsv_path) as handle:
            for row in csv.DictReader(handle, delimiter="\t"):
                try:
                    h12 = float(row.get("h12", 0))
                except (ValueError, TypeError):
                    continue
                if h12 >= h12_threshold:
                    scored.append((h12, row))

    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [row for _, row in scored]
184
+
185
+
186
def _write_summary(path: Path, rows: list[dict], columns: list[str]):
    """Write *rows* to *path* as a TSV restricted to *columns*.

    A header line is always written. Keys not listed in *columns* are
    dropped, and columns absent from a row are left empty.
    """
    with open(path, "w", newline="") as out:
        tsv = csv.DictWriter(
            out,
            fieldnames=columns,
            delimiter="\t",
            extrasaction="ignore",
        )
        tsv.writeheader()
        for row in rows:
            tsv.writerow(row)
193
+
194
+
195
def _write_dict_tsv(path: Path, rows: list[dict]):
    """Write list of dicts as TSV.

    The header is the union of the keys of *all* rows, in first-seen
    order, so a column that only appears in later rows is not silently
    dropped. Rows lacking a column get an empty cell. ``None`` keys (which
    ``csv.DictReader`` uses for surplus fields) are left out.

    When *rows* is empty the file contains the single line
    ``# No results``.
    """
    if not rows:
        with open(path, "w") as f:
            f.write("# No results\n")
        return

    # dict.fromkeys de-duplicates while preserving first-seen order.
    keys = list(dict.fromkeys(
        key for row in rows for key in row if key is not None
    ))
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys, delimiter="\t",
                                extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
@@ -0,0 +1,155 @@
1
+ """graphpop batch — run any procedure across multiple populations and chromosomes."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
+ from ..cli import pass_ctx
9
+ from ..config import build_options_map, build_cypher
10
+ from ..formatters import format_output
11
+
12
+
13
+ # Map command names to their procedure and yield columns
14
def _cypher_quote(value) -> str:
    """Return *value* as a single-quoted Cypher string literal.

    Backslashes and single quotes are escaped so that a name containing
    either cannot terminate the literal early or alter the query.
    """
    text = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{text}'"


# Start/end positions passed to region-based procedures by the batch runner.
_WHOLE_CHR = ("1", "999999999")


# NOTE: the parameter is named ``chr`` (shadowing the builtin) because the
# builders are invoked with keyword arguments chr=..., pop=..., pop2=...

def _region_pop_args(chr, pop, **kw):
    """Arguments for (chr, start, end, pop) procedures."""
    return [_cypher_quote(chr), *_WHOLE_CHR, _cypher_quote(pop)]


def _region_pair_args(chr, pop, pop2=None, **kw):
    """Arguments for (chr, start, end, pop, pop2); None without pop2."""
    if not pop2:
        return None
    return [_cypher_quote(chr), *_WHOLE_CHR,
            _cypher_quote(pop), _cypher_quote(pop2)]


def _chr_pop_args(chr, pop, **kw):
    """Arguments for (chr, pop) procedures."""
    return [_cypher_quote(chr), _cypher_quote(pop)]


def _chr_pair_args(chr, pop, pop2=None, **kw):
    """Arguments for (chr, pop, pop2) procedures; None without pop2."""
    if not pop2:
        return None
    return [_cypher_quote(chr), _cypher_quote(pop), _cypher_quote(pop2)]


# Map command names to their procedure, argument builder and yield columns.
# An "args" builder returns the already-quoted positional arguments, or
# None when a required input (pop2) is missing.
COMMAND_REGISTRY = {
    "diversity": {
        "procedure": "graphpop.diversity",
        "args": _region_pop_args,
        "yield": ["pi", "theta_w", "tajima_d", "fay_wu_h", "fay_wu_h_norm",
                  "het_exp", "het_obs", "fis", "n_variants", "n_segregating",
                  "n_polarized"],
    },
    "divergence": {
        "procedure": "graphpop.divergence",
        "args": _region_pair_args,
        "yield": ["fst_hudson", "fst_wc", "dxy", "da", "n_variants"],
    },
    "ihs": {
        "procedure": "graphpop.ihs",
        "args": _chr_pop_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "xpehh": {
        "procedure": "graphpop.xpehh",
        "args": _chr_pair_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "nsl": {
        "procedure": "graphpop.nsl",
        "args": _chr_pop_args,
        "yield": ["n_variants", "n_computed", "n_significant"],
    },
    "sfs": {
        "procedure": "graphpop.sfs",
        "args": _region_pop_args,
        "yield": ["sfs", "n_variants", "n_segregating"],
    },
    "roh": {
        "procedure": "graphpop.roh",
        "args": _chr_pop_args,
        "yield": ["n_samples", "mean_froh", "median_froh", "n_roh_segments"],
    },
    "garud-h": {
        "procedure": "graphpop.garud_h",
        "args": _chr_pop_args,
        "yield": ["n_windows", "mean_h12", "max_h12"],
    },
}
62
+
63
+
64
@click.command("batch")
@click.argument("command")
@click.option("--pops", required=True, help="Comma-separated population list")
@click.option("--chrs", required=True, help="Comma-separated chromosome list")
@click.option("--pop2", help="Second population (for divergence/xpehh, applied to all)")
@click.option("--workers", type=int, default=1,
              help="Parallel workers (default: 1, currently sequential)")
@click.option("-d", "--output-dir", required=True,
              type=click.Path(), help="Output directory (one TSV per pop-chr combo)")
@click.option("--persist", is_flag=True, help="Pass --persist to underlying command")
@click.option("--consequence", help="Filter by VEP consequence type")
@click.option("--pathway", help="Filter by pathway name")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@pass_ctx
def batch(ctx, command, pops, chrs, pop2, workers, output_dir, persist,
          consequence, pathway, fmt):
    """Run a GraphPop procedure across multiple populations and/or chromosomes.

    COMMAND is the procedure name: diversity, divergence, ihs, xpehh, nsl,
    sfs, roh, garud-h.

    Creates one output file per (population, chromosome) combination in
    the output directory, named {command}_{pop}_{chr}.{ext}.

    \b
    Examples:
      graphpop batch diversity --pops EUR,AFR,EAS --chrs chr1,chr2,chr22 -d output/
      graphpop batch ihs --pops EUR,AFR --chrs chr22 --persist -d output/
      graphpop batch divergence --pops EUR --pop2 AFR --chrs chr22 -d output/
    """
    if command not in COMMAND_REGISTRY:
        available = ", ".join(sorted(COMMAND_REGISTRY.keys()))
        click.echo(f"Error: unknown command '{command}'. Available: {available}", err=True)
        raise SystemExit(1)

    spec = COMMAND_REGISTRY[command]

    # Drop blank entries so "EUR,,AFR" or a trailing comma does not launch a
    # query for an empty population / chromosome name.
    pop_list = [p.strip() for p in pops.split(",") if p.strip()]
    chr_list = [c.strip() for c in chrs.split(",") if c.strip()]
    if not pop_list:
        raise click.BadParameter("no population names given", param_hint="--pops")
    if not chr_list:
        raise click.BadParameter("no chromosome names given", param_hint="--chrs")

    # Create output directory
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    ext = fmt
    opts = build_options_map(consequence=consequence, pathway=pathway)
    if persist:
        opts["persist"] = True

    # ``workers`` is accepted for forward compatibility; combinations are
    # processed one after another.
    total = len(pop_list) * len(chr_list)
    completed = 0  # combinations attempted so far
    failed = 0

    for pop in pop_list:
        for chr_name in chr_list:
            completed += 1
            label = f"[{completed}/{total}] {command} {pop} {chr_name}"
            click.echo(f"{label} ...", err=True)

            try:
                args = spec["args"](chr=chr_name, pop=pop, pop2=pop2)
                if args is None:
                    click.echo(" Skipping: missing required argument (e.g., --pop2).", err=True)
                    failed += 1
                    continue

                cypher = build_cypher(
                    spec["procedure"],
                    args,
                    options=opts if opts else None,
                    yield_cols=spec["yield"],
                )
                records = ctx.run(cypher)

                if not records:
                    click.echo(" No results.", err=True)
                    failed += 1
                    continue

                out_file = out_dir / f"{command}_{pop}_{chr_name}.{ext}"
                format_output(records, str(out_file), fmt, f"batch {command}",
                              {"population": pop, "chr": chr_name})
                click.echo(f" -> {out_file} ({len(records)} rows)", err=True)

            except SystemExit:
                # ctx.run() reports the error itself and exits; keep going
                # with the remaining combinations.
                click.echo(" FAILED (query error).", err=True)
                failed += 1
            except Exception as e:
                click.echo(f" FAILED: {e}", err=True)
                failed += 1

    click.echo(f"\nBatch complete: {completed - failed}/{total} succeeded, "
               f"{failed} failed.", err=True)
@@ -0,0 +1,118 @@
1
+ """graphpop compare — compare statistics between two populations."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+
6
+ import click
7
+
8
+ from ..cli import pass_ctx
9
+ from ..formatters import format_output
10
+
11
+ # Allowed stat names — used to whitelist dynamic property access
12
_VALID_STATS = {"pi", "theta_w", "tajima_d", "fst", "ihs"}
_IDENT_RE = re.compile(r'^[A-Za-z0-9_-]+$')


def _validate_identifier(value: str, label: str) -> str:
    """Validate that a value is safe for use as a Cypher property name.

    Args:
        value: Candidate identifier (e.g. a population name).
        label: Human-readable name of the value, used in the error message.

    Returns:
        *value* unchanged when it consists solely of ASCII letters, digits,
        hyphens and underscores.

    Raises:
        click.BadParameter: If *value* contains any other character.
    """
    # fullmatch, not match: "$" also matches just before a trailing newline,
    # so match() would let "EUR\n" through.
    if not _IDENT_RE.fullmatch(value):
        raise click.BadParameter(
            f"Invalid {label}: {value!r}. Only alphanumeric, hyphen, "
            "and underscore characters are allowed."
        )
    return value
24
+
25
+
26
@click.command("compare")
@click.argument("pop1")
@click.argument("pop2")
@click.argument("chr", metavar="CHR")
@click.option("--stat", required=True,
              type=click.Choice(["pi", "theta_w", "tajima_d", "fst", "ihs"]),
              help="Statistic to compare")
@click.option("--window-size", type=int, default=100000,
              help="Sliding window size for comparison (default: 100000)")
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--limit", type=int, default=10000, help="Maximum rows (default: 10000)")
@pass_ctx
def compare(ctx, pop1, pop2, chr, stat, window_size, output_path, fmt, limit):
    """Compare statistics between two populations across a chromosome.

    For window-based stats (pi, theta_w, tajima_d, fst), queries GenomicWindow
    nodes for both populations and joins by window position to compute delta.

    For ihs, queries Variant nodes with ihs_{POP1} and ihs_{POP2} properties
    and computes the per-variant difference.

    Output columns: window_start, window_end, stat_pop1, stat_pop2, delta, abs_delta.

    \b
    Examples:
      graphpop compare EUR AFR chr22 --stat pi -o delta_pi.tsv
      graphpop compare GJ-tmp GJ-trp Chr1 --stat fst -o delta.tsv
      graphpop compare EUR EAS chr22 --stat ihs -o ihs_diff.tsv
    """
    # Both names end up inside dynamically built property names, so they
    # are checked (pop1 first) before any query text is assembled.
    pop1, pop2 = (_validate_identifier(name, "population")
                  for name in (pop1, pop2))

    # iHS is stored per variant; every other statistic is stored per window.
    records = (
        _compare_variant_stat(ctx, pop1, pop2, chr, stat, limit)
        if stat == "ihs"
        else _compare_window_stat(ctx, pop1, pop2, chr, stat, window_size, limit)
    )

    if records:
        click.echo(f"Found {len(records)} comparison rows.", err=True)
        format_output(records, output_path, fmt, "compare",
                      {"pop1": pop1, "pop2": pop2, "chr": chr, "stat": stat})
    else:
        click.echo(f"No comparison data found for {stat} on {chr} "
                   f"({pop1} vs {pop2}).", err=True)
73
+
74
+
75
def _compare_window_stat(ctx, pop1, pop2, chr, stat, window_size, limit):
    """Compare window-based statistics between two populations.

    Joins the ``GenomicWindow`` nodes of both populations on identical
    ``start``/``end`` and returns, per window, both values plus ``delta``
    and ``abs_delta``.

    Dynamic property names and column aliases are backtick-quoted: an
    unquoted hyphenated population such as ``GJ-tmp`` is not a valid
    Cypher identifier.

    Args:
        ctx: Object exposing ``run(cypher, params)``.
        pop1: First population name.
        pop2: Second population name.
        chr: Chromosome name.
        stat: Name of the window property to compare.
        window_size: Accepted for interface compatibility; not used in the
            query.
        limit: Maximum number of rows returned.

    Returns:
        Whatever ``ctx.run`` returns for the query.
    """
    def quoted(name):
        # Backtick-quote an identifier, doubling any embedded backtick.
        return "`" + str(name).replace("`", "``") + "`"

    prop = quoted(stat)
    col1 = quoted(f"{stat}_{pop1}")
    col2 = quoted(f"{stat}_{pop2}")
    params = {"chr": chr, "pop1": pop1, "pop2": pop2, "limit": limit}

    cypher = (
        f"MATCH (w1:GenomicWindow) "
        f"WHERE w1.chr = $chr AND w1.population = $pop1 "
        f"AND w1.{prop} IS NOT NULL "
        f"WITH w1 "
        f"MATCH (w2:GenomicWindow) "
        f"WHERE w2.chr = $chr AND w2.population = $pop2 "
        f"AND w2.start = w1.start AND w2.end = w1.end "
        f"AND w2.{prop} IS NOT NULL "
        f"RETURN w1.start AS window_start, "
        f"w1.end AS window_end, "
        f"w1.{prop} AS {col1}, "
        f"w2.{prop} AS {col2}, "
        f"(w1.{prop} - w2.{prop}) AS delta, "
        f"abs(w1.{prop} - w2.{prop}) AS abs_delta "
        f"ORDER BY w1.start LIMIT $limit"
    )
    return ctx.run(cypher, params)
98
+
99
+
100
def _compare_variant_stat(ctx, pop1, pop2, chr, stat, limit):
    """Compare variant-based statistics (ihs) between two populations.

    Reads the ``{stat}_{pop1}`` and ``{stat}_{pop2}`` properties of
    ``Variant`` nodes on the chromosome and returns both values plus
    ``delta`` and ``abs_delta`` for variants that carry both.

    The per-population property names and column aliases are
    backtick-quoted: with a hyphenated population such as ``GJ-tmp`` the
    unquoted ``v.ihs_GJ-tmp`` is not a single identifier in Cypher.

    Args:
        ctx: Object exposing ``run(cypher, params)``.
        pop1: First population name.
        pop2: Second population name.
        chr: Chromosome name.
        stat: Statistic prefix of the variant properties (e.g. ``"ihs"``).
        limit: Maximum number of rows returned.

    Returns:
        Whatever ``ctx.run`` returns for the query.
    """
    def quoted(name):
        # Backtick-quote an identifier, doubling any embedded backtick.
        return "`" + str(name).replace("`", "``") + "`"

    prop1 = quoted(f"{stat}_{pop1}")
    prop2 = quoted(f"{stat}_{pop2}")
    params = {"chr": chr, "limit": limit}

    # The column aliases are the same (quoted) names as the properties.
    cypher = (
        f"MATCH (v:Variant) "
        f"WHERE v.chr = $chr "
        f"AND v.{prop1} IS NOT NULL AND v.{prop2} IS NOT NULL "
        f"RETURN v.pos AS pos, "
        f"v.variantId AS variantId, "
        f"v.{prop1} AS {prop1}, "
        f"v.{prop2} AS {prop2}, "
        f"(v.{prop1} - v.{prop2}) AS delta, "
        f"abs(v.{prop1} - v.{prop2}) AS abs_delta "
        f"ORDER BY v.pos LIMIT $limit"
    )
    return ctx.run(cypher, params)