graphpop-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. graphpop_cli/__init__.py +2 -0
  2. graphpop_cli/cli.py +161 -0
  3. graphpop_cli/commands/__init__.py +1 -0
  4. graphpop_cli/commands/aggregate.py +206 -0
  5. graphpop_cli/commands/batch.py +155 -0
  6. graphpop_cli/commands/compare.py +118 -0
  7. graphpop_cli/commands/config_cmd.py +117 -0
  8. graphpop_cli/commands/converge.py +156 -0
  9. graphpop_cli/commands/db.py +188 -0
  10. graphpop_cli/commands/divergence.py +37 -0
  11. graphpop_cli/commands/diversity.py +36 -0
  12. graphpop_cli/commands/dump.py +210 -0
  13. graphpop_cli/commands/export_bed.py +170 -0
  14. graphpop_cli/commands/export_windows.py +91 -0
  15. graphpop_cli/commands/extract.py +271 -0
  16. graphpop_cli/commands/filter_results.py +165 -0
  17. graphpop_cli/commands/garud_h.py +30 -0
  18. graphpop_cli/commands/genome_scan.py +41 -0
  19. graphpop_cli/commands/ihs.py +29 -0
  20. graphpop_cli/commands/import_data.py +266 -0
  21. graphpop_cli/commands/inventory.py +160 -0
  22. graphpop_cli/commands/joint_sfs.py +38 -0
  23. graphpop_cli/commands/ld.py +35 -0
  24. graphpop_cli/commands/lookup.py +207 -0
  25. graphpop_cli/commands/neighbors.py +175 -0
  26. graphpop_cli/commands/nsl.py +29 -0
  27. graphpop_cli/commands/plot.py +1066 -0
  28. graphpop_cli/commands/pop_summary.py +30 -0
  29. graphpop_cli/commands/query.py +15 -0
  30. graphpop_cli/commands/rank_genes.py +177 -0
  31. graphpop_cli/commands/report.py +264 -0
  32. graphpop_cli/commands/roh.py +30 -0
  33. graphpop_cli/commands/run_all.py +276 -0
  34. graphpop_cli/commands/server.py +98 -0
  35. graphpop_cli/commands/setup.py +299 -0
  36. graphpop_cli/commands/sfs.py +38 -0
  37. graphpop_cli/commands/validate.py +167 -0
  38. graphpop_cli/commands/xpehh.py +31 -0
  39. graphpop_cli/config.py +57 -0
  40. graphpop_cli/connection.py +52 -0
  41. graphpop_cli/formatters.py +81 -0
  42. graphpop_cli-0.1.0.dist-info/METADATA +73 -0
  43. graphpop_cli-0.1.0.dist-info/RECORD +46 -0
  44. graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
  45. graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
  46. graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,210 @@
1
+ """graphpop dump/load — database dump and restore for sharing."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import subprocess
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+
9
+ import click
10
+ import yaml
11
+
12
+
13
+ def _get_neo4j_home() -> Path:
14
+ """Get Neo4j home from config."""
15
+ config_path = Path.home() / ".graphpop" / "config.yaml"
16
+ if config_path.exists():
17
+ with open(config_path) as f:
18
+ cfg = yaml.safe_load(f) or {}
19
+ if "neo4j_home" in cfg:
20
+ return Path(cfg["neo4j_home"])
21
+ return Path.home() / "neo4j"
22
+
23
+
24
@click.command()
@click.option("--database", required=True, help="Database name to dump")
@click.option("-o", "--output", "output_path", type=click.Path(),
              help="Output dump file path (default: graphpop_<database>_<date>.dump)")
@click.option("--neo4j-home", type=click.Path(), help="Neo4j installation directory")
@click.option("--manifest/--no-manifest", default=True,
              help="Generate a JSON manifest with database metadata (default: yes)")
def dump(database, output_path, neo4j_home, manifest):
    """Dump a Neo4j database to a file for sharing.

    Creates a neo4j-admin dump file that can be shared and loaded on another
    machine. Optionally generates a JSON manifest with node/edge counts,
    populations, chromosomes, and computed statistics metadata.

    \b
    Examples:
      graphpop dump --database rice3k
      graphpop dump --database rice3k -o rice3k_v1.dump
      graphpop dump --database neo4j --no-manifest
    """
    home = Path(neo4j_home) if neo4j_home else _get_neo4j_home()
    admin_bin = home / "bin" / "neo4j-admin"

    if not admin_bin.exists():
        click.echo(f"Error: neo4j-admin not found at {admin_bin}", err=True)
        raise SystemExit(1)

    # Default output path
    if not output_path:
        date_str = datetime.now().strftime("%Y%m%d")
        output_path = f"graphpop_{database}_{date_str}.dump"

    output_file = Path(output_path)
    # neo4j-admin is only told the destination directory, so make sure it exists.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Dumping database '{database}' to {output_file}...")

    # Run neo4j-admin dump
    cmd = [
        str(admin_bin), "database", "dump",
        f"--to-path={output_file.parent}",
        database,
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        click.echo(f"Error: {result.stderr.strip()}", err=True)
        click.echo("\nNote: Neo4j must be stopped before dumping.", err=True)
        click.echo("Run 'graphpop stop' first, then retry.", err=True)
        raise SystemExit(1)

    # Rename if neo4j-admin used its own naming (<database>.dump).
    # replace() overwrites an existing target on every platform.
    expected_dump = output_file.parent / f"{database}.dump"
    if expected_dump.exists() and expected_dump != output_file:
        expected_dump.replace(output_file)

    # Do not report success (with a 0-byte size) when nothing was written.
    if not output_file.exists():
        click.echo(
            f"Error: neo4j-admin finished but no dump was found at {output_file}",
            err=True,
        )
        raise SystemExit(1)

    size = output_file.stat().st_size
    click.echo(f"Dump complete: {output_file} ({_format_size(size)})")

    # Generate manifest
    if manifest:
        manifest_path = output_file.with_suffix(".manifest.json")
        _generate_manifest(home, database, manifest_path, output_file)
        click.echo(f"Manifest: {manifest_path}")
87
+
88
+
89
@click.command()
@click.option("--dump-file", required=True, type=click.Path(exists=True),
              help="Path to the .dump file")
@click.option("--database", required=True,
              help="Name for the restored database")
@click.option("--neo4j-home", type=click.Path(), help="Neo4j installation directory")
@click.option("--overwrite", is_flag=True, help="Overwrite existing database")
def load(dump_file, database, neo4j_home, overwrite):
    """Load a database from a dump file.

    Restores a previously dumped Neo4j database. The database name can be
    different from the original.

    \b
    Examples:
      graphpop load --dump-file rice3k_v1.dump --database rice3k
      graphpop load --dump-file shared_db.dump --database myanalysis --overwrite
    """
    home = Path(neo4j_home) if neo4j_home else _get_neo4j_home()
    admin_bin = home / "bin" / "neo4j-admin"

    if not admin_bin.exists():
        click.echo(f"Error: neo4j-admin not found at {admin_bin}", err=True)
        raise SystemExit(1)

    dump_path = Path(dump_file)
    click.echo(f"Loading database '{database}' from {dump_path}...")

    # --from-path names a directory in which neo4j-admin looks for
    # "<database>.dump". When the archive has any other file name, stream it
    # on stdin instead so the file the user pointed at is the one restored.
    stream_archive = dump_path.name != f"{database}.dump"

    cmd = [str(admin_bin), "database", "load"]
    if stream_archive:
        cmd.append("--from-stdin")
    else:
        cmd.append(f"--from-path={dump_path.parent}")
    if overwrite:
        cmd.append("--overwrite-destination=true")
    cmd.append(database)

    if stream_archive:
        with open(dump_path, "rb") as archive:
            result = subprocess.run(cmd, stdin=archive, capture_output=True, text=True)
    else:
        result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        click.echo(f"Error: {result.stderr.strip()}", err=True)
        if "already exists" in result.stderr:
            click.echo("Use --overwrite to replace the existing database.", err=True)
        elif "running" in result.stderr.lower():
            click.echo("Stop Neo4j first: graphpop stop", err=True)
        raise SystemExit(1)

    # Update config so later commands target the restored database
    config_path = Path.home() / ".graphpop" / "config.yaml"
    cfg = {}
    if config_path.exists():
        with open(config_path) as f:
            cfg = yaml.safe_load(f) or {}
    cfg["database"] = database
    # The config directory may not exist yet on a fresh machine.
    config_path.parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)

    click.echo(f"Database '{database}' loaded successfully.")
    click.echo(f"Config updated to use database '{database}'.")
    click.echo("\nNext: graphpop start && graphpop db info")
147
+
148
+
149
+ def _generate_manifest(neo4j_home: Path, database: str,
150
+ manifest_path: Path, dump_path: Path):
151
+ """Generate a JSON manifest with database metadata."""
152
+ manifest = {
153
+ "database": database,
154
+ "date": datetime.now().isoformat(),
155
+ "dump_file": str(dump_path),
156
+ "dump_size_bytes": dump_path.stat().st_size if dump_path.exists() else 0,
157
+ }
158
+
159
+ # Try to query the database for node/edge counts
160
+ # (only works if Neo4j is running — otherwise just save basic info)
161
+ try:
162
+ from ..connection import load_config, get_driver
163
+ cfg = load_config()
164
+ driver = get_driver(cfg)
165
+ with driver.session(database=database) as session:
166
+ # Node counts
167
+ result = session.run(
168
+ "CALL db.labels() YIELD label "
169
+ "CALL { WITH label MATCH (n) WHERE label IN labels(n) "
170
+ "RETURN count(n) AS cnt } RETURN label, cnt"
171
+ )
172
+ manifest["node_counts"] = {r["label"]: r["cnt"] for r in result}
173
+
174
+ # Relationship counts
175
+ result = session.run(
176
+ "CALL db.relationshipTypes() YIELD relationshipType AS type "
177
+ "CALL { WITH type MATCH ()-[r]->() WHERE type(r) = type "
178
+ "RETURN count(r) AS cnt } RETURN type, cnt"
179
+ )
180
+ manifest["edge_counts"] = {r["type"]: r["cnt"] for r in result}
181
+
182
+ # Populations
183
+ result = session.run(
184
+ "MATCH (p:Population) RETURN p.populationId AS pop, "
185
+ "p.n_samples AS n ORDER BY n DESC"
186
+ )
187
+ manifest["populations"] = {r["pop"]: r["n"] for r in result}
188
+
189
+ # Chromosomes
190
+ result = session.run(
191
+ "MATCH (c:Chromosome) RETURN c.chromosomeId AS chr, "
192
+ "c.length AS len ORDER BY chr"
193
+ )
194
+ manifest["chromosomes"] = {r["chr"]: r["len"] for r in result}
195
+
196
+ driver.close()
197
+ except Exception:
198
+ manifest["note"] = "Neo4j not running; metadata not available"
199
+
200
+ with open(manifest_path, "w") as f:
201
+ json.dump(manifest, f, indent=2, default=str)
202
+
203
+
204
+ def _format_size(size_bytes: int) -> str:
205
+ """Format bytes as human-readable."""
206
+ for unit in ("B", "KB", "MB", "GB", "TB"):
207
+ if abs(size_bytes) < 1024.0:
208
+ return f"{size_bytes:.1f} {unit}"
209
+ size_bytes /= 1024.0
210
+ return f"{size_bytes:.1f} PB"
@@ -0,0 +1,170 @@
1
+ """graphpop export-bed — export high-scoring regions as BED format."""
2
+ from __future__ import annotations
3
+
4
+ import click
5
+
6
+ from ..cli import pass_ctx
7
+
8
+
9
# Statistics stored on GenomicWindow nodes vs Variant nodes.
# export_bed tests membership in WINDOW_STATS to choose the window query path;
# every other --stat choice is handled by the variant query path.
WINDOW_STATS = {"fst", "pi", "tajima_d", "h12"}
# Per-variant statistics (queried on Variant nodes and merged into intervals).
VARIANT_STATS = {"ihs", "xpehh", "nsl"}
12
+
13
+
14
@click.command("export-bed")
@click.option("--stat", required=True,
              type=click.Choice(["fst", "pi", "tajima_d", "h12", "ihs", "xpehh", "nsl"]),
              help="Statistic to threshold")
@click.option("--threshold", required=True, type=float,
              help="Minimum value to include (absolute value for ihs/xpehh/nsl)")
@click.option("--pop", "population", required=True, help="Population name")
@click.option("--pop2", help="Second population (required for xpehh)")
@click.option("--chr", "chromosome", help="Chromosome filter (optional)")
@click.option("--merge-distance", type=int, default=100000,
              help="Merge variants within this distance into intervals (variant-based stats, default: 100000)")
@click.option("-o", "--output", "output_path", required=True, help="Output BED file")
@pass_ctx
def export_bed(ctx, stat, threshold, population, pop2, chromosome,
               merge_distance, output_path):
    """Export regions exceeding a statistic threshold as BED format.

    For window-based stats (fst, pi, tajima_d, h12), queries GenomicWindow
    nodes directly. For variant-based stats (ihs, xpehh, nsl), merges
    consecutive high-scoring variants into intervals using --merge-distance.

    Output: standard 5-column BED (chr, start, end, name, score).

    \b
    Examples:
      graphpop export-bed --stat fst --threshold 0.5 --pop EUR -o high_fst.bed
      graphpop export-bed --stat ihs --threshold 2.5 --pop EUR --chr chr22 -o ihs_peaks.bed
      graphpop export-bed --stat xpehh --threshold 3.0 --pop EUR --pop2 AFR -o xpehh.bed
      graphpop export-bed --stat tajima_d --threshold -2.0 --pop GJ-tmp -o tajimad.bed
    """
    # xpehh compares two populations, so the second one is mandatory.
    if stat == "xpehh" and not pop2:
        click.echo("Error: --pop2 is required for xpehh.", err=True)
        raise SystemExit(1)

    # BED "name" column: statistic plus the population(s) it was computed for.
    if pop2:
        bed_name = f"{stat}_{population}_{pop2}"
    else:
        bed_name = f"{stat}_{population}"

    # Window statistics map one-to-one onto BED lines; variant statistics
    # are first merged into intervals.
    is_window_stat = stat in WINDOW_STATS
    if is_window_stat:
        rows = _query_window_stat(ctx, stat, threshold, population, pop2, chromosome)
        bed_lines = _windows_to_bed(rows, stat, bed_name)
    else:
        rows = _query_variant_stat(ctx, stat, threshold, population, pop2, chromosome)
        bed_lines = _merge_variants_to_bed(rows, merge_distance, bed_name)

    if not bed_lines:
        click.echo(f"No regions found exceeding threshold {threshold} for {stat}.", err=True)
        return

    with open(output_path, "w") as handle:
        handle.writelines(f"{bed_line}\n" for bed_line in bed_lines)

    click.echo(f"Wrote {len(bed_lines)} BED intervals to {output_path}.", err=True)
66
+
67
+
68
+ def _query_window_stat(ctx, stat, threshold, population, pop2, chromosome):
69
+ """Query GenomicWindow nodes for window-based statistics."""
70
+ params = {"population": population}
71
+ where_parts = ["w.population = $population"]
72
+ if chromosome:
73
+ where_parts.append("w.chr = $chromosome")
74
+ params["chromosome"] = chromosome
75
+
76
+ prop = stat
77
+ if stat == "fst" and pop2:
78
+ prop = f"fst_{population}_{pop2}"
79
+
80
+ # For Tajima's D, extreme negative values indicate selection
81
+ if stat == "tajima_d":
82
+ where_parts.append(f"w.{prop} <= {threshold}")
83
+ else:
84
+ where_parts.append(f"w.{prop} >= {threshold}")
85
+
86
+ cypher = (
87
+ f"MATCH (w:GenomicWindow) "
88
+ f"WHERE {' AND '.join(where_parts)} "
89
+ f"RETURN w.chr AS chr, w.start AS start, w.end AS end, "
90
+ f"w.{prop} AS score "
91
+ f"ORDER BY w.chr, w.start"
92
+ )
93
+ return ctx.run(cypher, params)
94
+
95
+
96
+ def _query_variant_stat(ctx, stat, threshold, population, pop2, chromosome):
97
+ """Query Variant nodes for variant-based statistics."""
98
+ if stat == "xpehh":
99
+ prop = f"xpehh_{population}_{pop2}"
100
+ else:
101
+ prop = f"{stat}_{population}"
102
+
103
+ params = {}
104
+ where_parts = [f"v.{prop} IS NOT NULL", f"abs(v.{prop}) >= {threshold}"]
105
+ if chromosome:
106
+ where_parts.append("v.chr = $chromosome")
107
+ params["chromosome"] = chromosome
108
+
109
+ cypher = (
110
+ f"MATCH (v:Variant) "
111
+ f"WHERE {' AND '.join(where_parts)} "
112
+ f"RETURN v.chr AS chr, v.pos AS pos, v.{prop} AS score "
113
+ f"ORDER BY v.chr, v.pos"
114
+ )
115
+ return ctx.run(cypher, params)
116
+
117
+
118
+ def _windows_to_bed(records, stat, bed_name):
119
+ """Convert window records directly to BED lines."""
120
+ lines = []
121
+ for r in records:
122
+ chrom = r.get("chr", "")
123
+ start = r.get("start", 0)
124
+ end = r.get("end", 0)
125
+ score = r.get("score", 0)
126
+ score_str = f"{score:.6g}" if isinstance(score, float) else str(score)
127
+ lines.append(f"{chrom}\t{start}\t{end}\t{bed_name}\t{score_str}")
128
+ return lines
129
+
130
+
131
+ def _merge_variants_to_bed(records, merge_distance, bed_name):
132
+ """Merge consecutive high-scoring variants into BED intervals."""
133
+ if not records:
134
+ return []
135
+
136
+ intervals = []
137
+ current_chr = None
138
+ current_start = None
139
+ current_end = None
140
+ current_scores = []
141
+
142
+ for r in records:
143
+ chrom = r.get("chr", "")
144
+ pos = r.get("pos", 0)
145
+ score = r.get("score", 0)
146
+
147
+ if (current_chr is None or chrom != current_chr
148
+ or pos - current_end > merge_distance):
149
+ # Emit previous interval
150
+ if current_chr is not None:
151
+ mean_score = sum(current_scores) / len(current_scores)
152
+ intervals.append((current_chr, current_start, current_end, mean_score))
153
+ # Start new interval
154
+ current_chr = chrom
155
+ current_start = pos
156
+ current_end = pos
157
+ current_scores = [abs(score) if isinstance(score, (int, float)) else 0]
158
+ else:
159
+ current_end = pos
160
+ current_scores.append(abs(score) if isinstance(score, (int, float)) else 0)
161
+
162
+ # Emit last interval
163
+ if current_chr is not None:
164
+ mean_score = sum(current_scores) / len(current_scores)
165
+ intervals.append((current_chr, current_start, current_end, mean_score))
166
+
167
+ lines = []
168
+ for chrom, start, end, score in intervals:
169
+ lines.append(f"{chrom}\t{start}\t{end}\t{bed_name}\t{score:.6g}")
170
+ return lines
@@ -0,0 +1,91 @@
1
+ """graphpop export-windows — batch export GenomicWindow nodes to TSV."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
+ from ..cli import pass_ctx
9
+ from ..formatters import format_output
10
+
11
+
12
@click.command("export-windows")
@click.argument("chr", required=False)
@click.argument("population", required=False)
@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
@click.option("--min-pi", type=float, help="Minimum pi filter")
@click.option("--max-pi", type=float, help="Maximum pi filter")
@click.option("--min-fst", type=float, help="Minimum Fst filter")
@click.option("--min-tajima-d", type=float, help="Minimum Tajima's D filter")
@click.option("--max-tajima-d", type=float, help="Maximum Tajima's D filter")
@click.option("--run-id", help="Filter by specific run ID")
@click.option("--limit", type=int, help="Maximum number of windows to return")
@pass_ctx
def export_windows(ctx, chr, population, output_path, fmt,
                   min_pi, max_pi, min_fst, min_tajima_d, max_tajima_d,
                   run_id, limit):
    """Export GenomicWindow nodes from the graph as TSV.

    Query persisted genome scan results (GenomicWindow nodes) with optional
    filters. Without arguments, exports all windows. With CHR and POPULATION,
    exports windows for that combination.

    \b
    Examples:
      graphpop export-windows                              # all windows
      graphpop export-windows chr22 EUR -o windows.tsv     # specific region
      graphpop export-windows --min-fst 0.5                # high-Fst windows
      graphpop export-windows chr1 AFR --max-tajima-d -2   # negative Tajima's D
    """
    # Build Cypher query with parameterized values to prevent injection.
    # Every user-supplied value, numeric filters included, is passed as a
    # query parameter; only fixed property names appear in the query text.
    where_clauses = []
    params: dict = {}

    if chr:
        where_clauses.append("w.chr = $chr")
        params["chr"] = chr
    if population:
        where_clauses.append("w.population = $population")
        params["population"] = population

    # (parameter name, left-hand side of the comparison, supplied value)
    numeric_filters = (
        ("min_pi", "w.pi >=", min_pi),
        ("max_pi", "w.pi <=", max_pi),
        ("min_fst", "w.fst >=", min_fst),
        ("min_tajima_d", "w.tajima_d >=", min_tajima_d),
        ("max_tajima_d", "w.tajima_d <=", max_tajima_d),
    )
    for param_name, comparison, value in numeric_filters:
        # "is not None" keeps a legitimate 0 / 0.0 filter value.
        if value is not None:
            where_clauses.append(f"{comparison} ${param_name}")
            params[param_name] = value

    if run_id:
        where_clauses.append("w.run_id = $run_id")
        params["run_id"] = run_id

    where = " AND ".join(where_clauses) if where_clauses else "TRUE"
    limit_clause = " LIMIT $limit" if limit else ""
    if limit:
        params["limit"] = limit

    cypher = (
        f"MATCH (w:GenomicWindow) WHERE {where} "
        "RETURN w.windowId AS window_id, w.chr AS chr, "
        "w.start AS start, w.end AS end, "
        "w.population AS population, w.run_id AS run_id, "
        "w.n_variants AS n_variants, w.n_segregating AS n_segregating, "
        "w.pi AS pi, w.theta_w AS theta_w, w.tajima_d AS tajima_d, "
        "w.fst AS fst, w.fst_wc AS fst_wc, w.dxy AS dxy, "
        "w.pbs AS pbs, w.fay_wu_h AS fay_wu_h "
        "ORDER BY w.chr, w.start"
        f"{limit_clause}"
    )

    records = ctx.run(cypher, params)

    if not records:
        click.echo("No windows found matching criteria.", err=True)
        return

    click.echo(f"Exporting {len(records)} windows...", err=True)
    format_output(records, output_path, fmt, "export-windows",
                  {"chr": chr, "population": population})