graphpop-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphpop_cli/__init__.py +2 -0
- graphpop_cli/cli.py +161 -0
- graphpop_cli/commands/__init__.py +1 -0
- graphpop_cli/commands/aggregate.py +206 -0
- graphpop_cli/commands/batch.py +155 -0
- graphpop_cli/commands/compare.py +118 -0
- graphpop_cli/commands/config_cmd.py +117 -0
- graphpop_cli/commands/converge.py +156 -0
- graphpop_cli/commands/db.py +188 -0
- graphpop_cli/commands/divergence.py +37 -0
- graphpop_cli/commands/diversity.py +36 -0
- graphpop_cli/commands/dump.py +210 -0
- graphpop_cli/commands/export_bed.py +170 -0
- graphpop_cli/commands/export_windows.py +91 -0
- graphpop_cli/commands/extract.py +271 -0
- graphpop_cli/commands/filter_results.py +165 -0
- graphpop_cli/commands/garud_h.py +30 -0
- graphpop_cli/commands/genome_scan.py +41 -0
- graphpop_cli/commands/ihs.py +29 -0
- graphpop_cli/commands/import_data.py +266 -0
- graphpop_cli/commands/inventory.py +160 -0
- graphpop_cli/commands/joint_sfs.py +38 -0
- graphpop_cli/commands/ld.py +35 -0
- graphpop_cli/commands/lookup.py +207 -0
- graphpop_cli/commands/neighbors.py +175 -0
- graphpop_cli/commands/nsl.py +29 -0
- graphpop_cli/commands/plot.py +1066 -0
- graphpop_cli/commands/pop_summary.py +30 -0
- graphpop_cli/commands/query.py +15 -0
- graphpop_cli/commands/rank_genes.py +177 -0
- graphpop_cli/commands/report.py +264 -0
- graphpop_cli/commands/roh.py +30 -0
- graphpop_cli/commands/run_all.py +276 -0
- graphpop_cli/commands/server.py +98 -0
- graphpop_cli/commands/setup.py +299 -0
- graphpop_cli/commands/sfs.py +38 -0
- graphpop_cli/commands/validate.py +167 -0
- graphpop_cli/commands/xpehh.py +31 -0
- graphpop_cli/config.py +57 -0
- graphpop_cli/connection.py +52 -0
- graphpop_cli/formatters.py +81 -0
- graphpop_cli-0.1.0.dist-info/METADATA +73 -0
- graphpop_cli-0.1.0.dist-info/RECORD +46 -0
- graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
- graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
- graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""graphpop dump/load — database dump and restore for sharing."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import subprocess
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_neo4j_home() -> Path:
    """Return the Neo4j installation directory.

    Reads ``neo4j_home`` from ``~/.graphpop/config.yaml`` when that file
    exists and defines a non-empty value for it.  In every other case
    (no config file, empty file, missing/empty key, or a YAML document
    whose top level is not a mapping) the default ``~/neo4j`` is returned.
    """
    config_path = Path.home() / ".graphpop" / "config.yaml"
    if config_path.exists():
        with open(config_path) as f:
            cfg = yaml.safe_load(f) or {}
        # safe_load can return any YAML type, and "neo4j_home:" with no value
        # loads as None; Path(None) would raise TypeError, so only trust a
        # mapping that carries a non-empty value.
        if isinstance(cfg, dict) and cfg.get("neo4j_home"):
            return Path(cfg["neo4j_home"])
    return Path.home() / "neo4j"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@click.command()
@click.option("--database", required=True, help="Database name to dump")
@click.option("-o", "--output", "output_path", type=click.Path(),
              help="Output dump file path (default: graphpop_<database>_<date>.dump)")
@click.option("--neo4j-home", type=click.Path(), help="Neo4j installation directory")
@click.option("--manifest/--no-manifest", default=True,
              help="Generate a JSON manifest with database metadata (default: yes)")
def dump(database, output_path, neo4j_home, manifest):
    """Dump a Neo4j database to a file for sharing.

    Creates a neo4j-admin dump file that can be shared and loaded on another
    machine. Optionally generates a JSON manifest with node/edge counts,
    populations, chromosomes, and computed statistics metadata.

    \b
    Examples:
      graphpop dump --database rice3k
      graphpop dump --database rice3k -o rice3k_v1.dump
      graphpop dump --database neo4j --no-manifest
    """
    # Resolve the Neo4j installation: explicit option wins over the config.
    home = Path(neo4j_home) if neo4j_home else _get_neo4j_home()
    admin_bin = home / "bin" / "neo4j-admin"

    if not admin_bin.exists():
        click.echo(f"Error: neo4j-admin not found at {admin_bin}", err=True)
        raise SystemExit(1)

    # Default output path
    if not output_path:
        date_str = datetime.now().strftime("%Y%m%d")
        output_path = f"graphpop_{database}_{date_str}.dump"

    output_file = Path(output_path)
    # neo4j-admin is handed the target *directory*; make sure it exists so a
    # fresh "-o some/new/dir/db.dump" does not fail.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Dumping database '{database}' to {output_file}...")

    # Run neo4j-admin dump
    cmd = [
        str(admin_bin), "database", "dump",
        f"--to-path={output_file.parent}",
        database,
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        click.echo(f"Error: {result.stderr.strip()}", err=True)
        click.echo("\nNote: Neo4j must be stopped before dumping.", err=True)
        click.echo("Run 'graphpop stop' first, then retry.", err=True)
        raise SystemExit(1)

    # The archive is expected at "<database>.dump" inside the target
    # directory; move it to the requested name.  replace() overwrites an
    # existing destination on every platform (rename() raises on Windows).
    expected_dump = output_file.parent / f"{database}.dump"
    if expected_dump.exists() and expected_dump != output_file:
        expected_dump.replace(output_file)

    # Do not report success for a dump file that is not there.
    if not output_file.exists():
        click.echo(
            f"Error: neo4j-admin reported success but {output_file} was not created.",
            err=True,
        )
        raise SystemExit(1)

    size = output_file.stat().st_size
    click.echo(f"Dump complete: {output_file} ({_format_size(size)})")

    # Generate manifest
    if manifest:
        manifest_path = output_file.with_suffix(".manifest.json")
        _generate_manifest(home, database, manifest_path, output_file)
        click.echo(f"Manifest: {manifest_path}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@click.command()
@click.option("--dump-file", required=True, type=click.Path(exists=True),
              help="Path to the .dump file")
@click.option("--database", required=True,
              help="Name for the restored database")
@click.option("--neo4j-home", type=click.Path(), help="Neo4j installation directory")
@click.option("--overwrite", is_flag=True, help="Overwrite existing database")
def load(dump_file, database, neo4j_home, overwrite):
    """Load a database from a dump file.

    Restores a previously dumped Neo4j database. The database name can be
    different from the original.

    \b
    Examples:
      graphpop load --dump-file rice3k_v1.dump --database rice3k
      graphpop load --dump-file shared_db.dump --database myanalysis --overwrite
    """
    # Resolve the Neo4j installation: explicit option wins over the config.
    home = Path(neo4j_home) if neo4j_home else _get_neo4j_home()
    admin_bin = home / "bin" / "neo4j-admin"

    if not admin_bin.exists():
        click.echo(f"Error: neo4j-admin not found at {admin_bin}", err=True)
        raise SystemExit(1)

    dump_path = Path(dump_file)
    click.echo(f"Loading database '{database}' from {dump_path}...")

    # With --from-path neo4j-admin is given a directory and looks for
    # "<database>.dump" inside it, so the file chosen via --dump-file would be
    # ignored unless it happens to carry exactly that name.  Any other file
    # name is streamed through --from-stdin instead, which lets the archive
    # be restored under an arbitrary database name.
    stream_archive = dump_path.name != f"{database}.dump"

    cmd = [str(admin_bin), "database", "load"]
    if stream_archive:
        cmd.append("--from-stdin")
    else:
        cmd.append(f"--from-path={dump_path.parent}")
    if overwrite:
        cmd.append("--overwrite-destination=true")
    cmd.append(database)

    if stream_archive:
        with open(dump_path, "rb") as archive:
            result = subprocess.run(cmd, stdin=archive,
                                    capture_output=True, text=True)
    else:
        result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        click.echo(f"Error: {result.stderr.strip()}", err=True)
        if "already exists" in result.stderr:
            click.echo("Use --overwrite to replace the existing database.", err=True)
        elif "running" in result.stderr.lower():
            click.echo("Stop Neo4j first: graphpop stop", err=True)
        raise SystemExit(1)

    # Update config so later commands target the freshly loaded database.
    config_path = Path.home() / ".graphpop" / "config.yaml"
    cfg = {}
    if config_path.exists():
        with open(config_path) as f:
            cfg = yaml.safe_load(f) or {}
    cfg["database"] = database
    # ~/.graphpop may not exist yet (e.g. --neo4j-home was given explicitly);
    # create it rather than failing after a successful load.
    config_path.parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)

    click.echo(f"Database '{database}' loaded successfully.")
    click.echo(f"Config updated to use database '{database}'.")
    click.echo("\nNext: graphpop start && graphpop db info")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _generate_manifest(neo4j_home: Path, database: str,
                       manifest_path: Path, dump_path: Path):
    """Write a JSON manifest describing a dumped database.

    The manifest always contains the database name, the current timestamp,
    the dump file path and its size in bytes.  When a driver can be obtained
    and the queries succeed, it also contains per-label node counts,
    per-type relationship counts, populations and chromosomes.  Metadata
    collection is best effort: on any failure the basic manifest is still
    written, with a ``note`` and the underlying ``error`` recorded.

    Args:
        neo4j_home: Neo4j installation directory (currently unused here).
        database: Name of the database to describe and query.
        manifest_path: Destination of the JSON manifest.
        dump_path: Path of the dump file the manifest refers to.
    """
    manifest = {
        "database": database,
        "date": datetime.now().isoformat(),
        "dump_file": str(dump_path),
        "dump_size_bytes": dump_path.stat().st_size if dump_path.exists() else 0,
    }

    # Try to query the database for node/edge counts
    # (only works if Neo4j is running — otherwise just save basic info)
    try:
        from ..connection import load_config, get_driver
        cfg = load_config()
        driver = get_driver(cfg)
        try:
            with driver.session(database=database) as session:
                # Node counts
                result = session.run(
                    "CALL db.labels() YIELD label "
                    "CALL { WITH label MATCH (n) WHERE label IN labels(n) "
                    "RETURN count(n) AS cnt } RETURN label, cnt"
                )
                manifest["node_counts"] = {r["label"]: r["cnt"] for r in result}

                # Relationship counts
                result = session.run(
                    "CALL db.relationshipTypes() YIELD relationshipType AS type "
                    "CALL { WITH type MATCH ()-[r]->() WHERE type(r) = type "
                    "RETURN count(r) AS cnt } RETURN type, cnt"
                )
                manifest["edge_counts"] = {r["type"]: r["cnt"] for r in result}

                # Populations
                result = session.run(
                    "MATCH (p:Population) RETURN p.populationId AS pop, "
                    "p.n_samples AS n ORDER BY n DESC"
                )
                manifest["populations"] = {r["pop"]: r["n"] for r in result}

                # Chromosomes
                result = session.run(
                    "MATCH (c:Chromosome) RETURN c.chromosomeId AS chr, "
                    "c.length AS len ORDER BY chr"
                )
                manifest["chromosomes"] = {r["chr"]: r["len"] for r in result}
        finally:
            # Close the driver even when a query fails part-way through.
            driver.close()
    except Exception as exc:
        manifest["note"] = "Neo4j not running; metadata not available"
        # Keep the real cause: the failure is not always a stopped server.
        manifest["error"] = f"{type(exc).__name__}: {exc}"

    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2, default=str)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _format_size(size_bytes: int) -> str:
    """Render a byte count with one decimal and a binary-scaled unit.

    The value is divided by 1024 until its magnitude drops below 1024 or
    the units B, KB, MB, GB, TB are exhausted; anything larger is reported
    in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = size_bytes
    index = 0
    while index < len(units):
        if abs(value) < 1024.0:
            return f"{value:.1f} {units[index]}"
        value /= 1024.0
        index += 1
    return f"{value:.1f} PB"
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""graphpop export-bed — export high-scoring regions as BED format."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from ..cli import pass_ctx
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Statistics read from GenomicWindow nodes (one value per window).
WINDOW_STATS = {
    "fst",
    "pi",
    "tajima_d",
    "h12",
}
# Statistics read from Variant nodes (one value per variant).
VARIANT_STATS = {
    "ihs",
    "xpehh",
    "nsl",
}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.command("export-bed")
@click.option("--stat", required=True,
              type=click.Choice(["fst", "pi", "tajima_d", "h12", "ihs", "xpehh", "nsl"]),
              help="Statistic to threshold")
@click.option("--threshold", required=True, type=float,
              help="Minimum value to include (absolute value for ihs/xpehh/nsl)")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("--pop2", help="Second population (required for xpehh)")
@click.option("--chr", "chromosome", help="Chromosome filter (optional)")
@click.option("--merge-distance", type=int, default=100000,
              help="Merge variants within this distance into intervals (variant-based stats, default: 100000)")
@click.option("-o", "--output", "output_path", required=True, help="Output BED file")
@pass_ctx
def export_bed(ctx, stat, threshold, population, pop2, chromosome,
               merge_distance, output_path):
    """Export regions exceeding a statistic threshold as BED format.

    For window-based stats (fst, pi, tajima_d, h12), queries GenomicWindow
    nodes directly. For variant-based stats (ihs, xpehh, nsl), merges
    consecutive high-scoring variants into intervals using --merge-distance.

    Output: standard 5-column BED (chr, start, end, name, score).

    \b
    Examples:
      graphpop export-bed --stat fst --threshold 0.5 --pop EUR -o high_fst.bed
      graphpop export-bed --stat ihs --threshold 2.5 --pop EUR --chr chr22 -o ihs_peaks.bed
      graphpop export-bed --stat xpehh --threshold 3.0 --pop EUR --pop2 AFR -o xpehh.bed
      graphpop export-bed --stat tajima_d --threshold -2.0 --pop GJ-tmp -o tajimad.bed
    """
    # xpehh is the only statistic that cannot run without a second population.
    if stat == "xpehh" and not pop2:
        click.echo("Error: --pop2 is required for xpehh.", err=True)
        raise SystemExit(1)

    # The BED "name" column carries the statistic and the population(s).
    if pop2:
        bed_name = f"{stat}_{population}_{pop2}"
    else:
        bed_name = f"{stat}_{population}"

    # Variant-level hits are merged into intervals; window hits map 1:1.
    if stat not in WINDOW_STATS:
        hits = _query_variant_stat(ctx, stat, threshold, population, pop2, chromosome)
        bed_lines = _merge_variants_to_bed(hits, merge_distance, bed_name)
    else:
        hits = _query_window_stat(ctx, stat, threshold, population, pop2, chromosome)
        bed_lines = _windows_to_bed(hits, stat, bed_name)

    # Nothing passed the threshold: report it and leave the output untouched.
    if not bed_lines:
        click.echo(f"No regions found exceeding threshold {threshold} for {stat}.", err=True)
        return

    with open(output_path, "w") as f:
        f.writelines(line + "\n" for line in bed_lines)

    click.echo(f"Wrote {len(bed_lines)} BED intervals to {output_path}.", err=True)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _query_window_stat(ctx, stat, threshold, population, pop2, chromosome):
    """Query GenomicWindow nodes whose statistic passes the threshold.

    Windows are filtered by population and, when given, by chromosome.  The
    property compared is ``stat`` itself, except for ``fst`` with a second
    population, where it is ``fst_<population>_<pop2>``.  ``tajima_d`` keeps
    windows at or below the threshold; every other statistic keeps windows
    at or above it.

    Returns whatever ``ctx.run`` returns for the query; each row exposes
    ``chr``, ``start``, ``end`` and ``score``, ordered by chromosome and
    start.
    """
    params = {"population": population, "threshold": threshold}
    where_parts = ["w.population = $population"]
    if chromosome:
        where_parts.append("w.chr = $chromosome")
        params["chromosome"] = chromosome

    prop = stat
    if stat == "fst" and pop2:
        prop = f"fst_{population}_{pop2}"

    # Property names cannot be passed as Cypher parameters, and population
    # names may contain characters such as "-" that would otherwise be parsed
    # as operators.  Backtick-quote the name and escape embedded backticks by
    # doubling them.
    quoted = "`" + prop.replace("`", "``") + "`"

    # For Tajima's D, extreme negative values indicate selection
    comparator = "<=" if stat == "tajima_d" else ">="
    # The threshold travels as a query parameter, like the other values.
    where_parts.append(f"w.{quoted} {comparator} $threshold")

    cypher = (
        "MATCH (w:GenomicWindow) "
        f"WHERE {' AND '.join(where_parts)} "
        "RETURN w.chr AS chr, w.start AS start, w.end AS end, "
        f"w.{quoted} AS score "
        "ORDER BY w.chr, w.start"
    )
    return ctx.run(cypher, params)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _query_variant_stat(ctx, stat, threshold, population, pop2, chromosome):
    """Query Variant nodes whose absolute statistic reaches the threshold.

    The property compared is ``xpehh_<population>_<pop2>`` for ``xpehh`` and
    ``<stat>_<population>`` otherwise.  Variants without that property are
    skipped, and an optional chromosome filter is applied.

    Returns whatever ``ctx.run`` returns for the query; each row exposes
    ``chr``, ``pos`` and ``score``, ordered by chromosome and position.
    """
    if stat == "xpehh":
        prop = f"xpehh_{population}_{pop2}"
    else:
        prop = f"{stat}_{population}"

    # Property names cannot be passed as Cypher parameters, and population
    # names may contain characters such as "-" that would otherwise be parsed
    # as operators.  Backtick-quote the name and escape embedded backticks by
    # doubling them.
    quoted = "`" + prop.replace("`", "``") + "`"

    # The threshold travels as a query parameter rather than as query text.
    params = {"threshold": threshold}
    where_parts = [
        f"v.{quoted} IS NOT NULL",
        f"abs(v.{quoted}) >= $threshold",
    ]
    if chromosome:
        where_parts.append("v.chr = $chromosome")
        params["chromosome"] = chromosome

    cypher = (
        "MATCH (v:Variant) "
        f"WHERE {' AND '.join(where_parts)} "
        f"RETURN v.chr AS chr, v.pos AS pos, v.{quoted} AS score "
        "ORDER BY v.chr, v.pos"
    )
    return ctx.run(cypher, params)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _windows_to_bed(records, stat, bed_name):
    """Turn window records into tab-separated 5-column BED lines.

    Each record contributes one line: chr, start, end, ``bed_name`` and the
    score.  Float scores are printed with six significant digits; any other
    score is printed via ``str``.  Missing keys default to an empty
    chromosome and zero for start, end and score.  ``stat`` is accepted but
    not used.
    """
    def _render(rec):
        value = rec.get("score", 0)
        if isinstance(value, float):
            text = f"{value:.6g}"
        else:
            text = str(value)
        chrom = rec.get("chr", "")
        first = rec.get("start", 0)
        last = rec.get("end", 0)
        return f"{chrom}\t{first}\t{last}\t{bed_name}\t{text}"

    return [_render(rec) for rec in records]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _merge_variants_to_bed(records, merge_distance, bed_name):
    """Merge consecutive high-scoring variants into BED intervals.

    Records are expected in (chr, pos) order.  A variant joins the current
    interval when it is on the same chromosome and no more than
    ``merge_distance`` beyond the interval's last variant; otherwise it
    starts a new interval.  The score column is the mean absolute score of
    the interval's variants (non-numeric scores count as 0).

    Intervals are written as 0-based, half-open BED coordinates: the start
    is the first variant's position minus one and the end is the last
    variant's position, so an isolated variant yields a 1-bp interval
    rather than an empty one.
    NOTE(review): assumes ``pos`` is 1-based (VCF convention) — confirm
    against the importer.

    Returns a list of tab-separated 5-column BED lines.
    """
    if not records:
        return []

    # Each open interval is [chrom, first_pos, last_pos, [abs scores]].
    intervals = []
    for r in records:
        chrom = r.get("chr", "")
        pos = r.get("pos", 0)
        score = r.get("score", 0)
        magnitude = abs(score) if isinstance(score, (int, float)) else 0

        current = intervals[-1] if intervals else None
        if (current is not None and chrom == current[0]
                and pos - current[2] <= merge_distance):
            # Extend the running interval.
            current[2] = pos
            current[3].append(magnitude)
        else:
            # First record, new chromosome, or gap larger than merge_distance.
            intervals.append([chrom, pos, pos, [magnitude]])

    lines = []
    for chrom, first_pos, last_pos, scores in intervals:
        mean_score = sum(scores) / len(scores)
        # Convert the 1-based inclusive start to BED's 0-based start.
        start = max(first_pos - 1, 0)
        lines.append(f"{chrom}\t{start}\t{last_pos}\t{bed_name}\t{mean_score:.6g}")
    return lines
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""graphpop export-windows — batch export GenomicWindow nodes to TSV."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ..cli import pass_ctx
|
|
9
|
+
from ..formatters import format_output
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@click.command("export-windows")
@click.argument("chr", required=False)
@click.argument("population", required=False)
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--min-pi", type=float, help="Minimum pi filter")
@click.option("--max-pi", type=float, help="Maximum pi filter")
@click.option("--min-fst", type=float, help="Minimum Fst filter")
@click.option("--min-tajima-d", type=float, help="Minimum Tajima's D filter")
@click.option("--max-tajima-d", type=float, help="Maximum Tajima's D filter")
@click.option("--run-id", help="Filter by specific run ID")
@click.option("--limit", type=int, help="Maximum number of windows to return")
@pass_ctx
def export_windows(ctx, chr, population, output_path, fmt,
                   min_pi, max_pi, min_fst, min_tajima_d, max_tajima_d,
                   run_id, limit):
    """Export GenomicWindow nodes from the graph as TSV.

    Query persisted genome scan results (GenomicWindow nodes) with optional
    filters. Without arguments, exports all windows. With CHR and POPULATION,
    exports windows for that combination.

    \b
    Examples:
      graphpop export-windows # all windows
      graphpop export-windows chr22 EUR -o windows.tsv # specific region
      graphpop export-windows --min-fst 0.5 # high-Fst windows
      graphpop export-windows chr1 AFR --max-tajima-d -2 # negative Tajima's D
    """
    # Build Cypher query with parameterized values to prevent injection.
    # Every user-supplied value, numeric filters included, is passed as a
    # query parameter; only fixed clause text is concatenated.
    where_clauses = []
    params: dict = {}

    if chr:
        where_clauses.append("w.chr = $chr")
        params["chr"] = chr
    if population:
        where_clauses.append("w.population = $population")
        params["population"] = population

    # (parameter name, Cypher condition, value) for each numeric filter.
    # "is not None" keeps a legitimate 0.0 bound from being ignored.
    numeric_filters = (
        ("min_pi", "w.pi >= $min_pi", min_pi),
        ("max_pi", "w.pi <= $max_pi", max_pi),
        ("min_fst", "w.fst >= $min_fst", min_fst),
        ("min_tajima_d", "w.tajima_d >= $min_tajima_d", min_tajima_d),
        ("max_tajima_d", "w.tajima_d <= $max_tajima_d", max_tajima_d),
    )
    for name, condition, value in numeric_filters:
        if value is not None:
            where_clauses.append(condition)
            params[name] = value

    if run_id:
        where_clauses.append("w.run_id = $run_id")
        params["run_id"] = run_id

    where = " AND ".join(where_clauses) if where_clauses else "TRUE"

    # An explicit "--limit 0" must limit to zero rows, not disable the limit.
    limit_clause = ""
    if limit is not None:
        limit_clause = " LIMIT $limit"
        params["limit"] = limit

    cypher = (
        f"MATCH (w:GenomicWindow) WHERE {where} "
        "RETURN w.windowId AS window_id, w.chr AS chr, "
        "w.start AS start, w.end AS end, "
        "w.population AS population, w.run_id AS run_id, "
        "w.n_variants AS n_variants, w.n_segregating AS n_segregating, "
        "w.pi AS pi, w.theta_w AS theta_w, w.tajima_d AS tajima_d, "
        "w.fst AS fst, w.fst_wc AS fst_wc, w.dxy AS dxy, "
        "w.pbs AS pbs, w.fay_wu_h AS fay_wu_h "
        "ORDER BY w.chr, w.start"
        f"{limit_clause}"
    )

    records = ctx.run(cypher, params)

    if not records:
        click.echo("No windows found matching criteria.", err=True)
        return

    click.echo(f"Exporting {len(records)} windows...", err=True)
    format_output(records, output_path, fmt, "export-windows",
                  {"chr": chr, "population": population})
|