graphpop-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. graphpop_cli-0.1.0/PKG-INFO +73 -0
  2. graphpop_cli-0.1.0/README.md +47 -0
  3. graphpop_cli-0.1.0/pyproject.toml +46 -0
  4. graphpop_cli-0.1.0/setup.cfg +4 -0
  5. graphpop_cli-0.1.0/src/graphpop_cli/__init__.py +2 -0
  6. graphpop_cli-0.1.0/src/graphpop_cli/cli.py +161 -0
  7. graphpop_cli-0.1.0/src/graphpop_cli/commands/__init__.py +1 -0
  8. graphpop_cli-0.1.0/src/graphpop_cli/commands/aggregate.py +206 -0
  9. graphpop_cli-0.1.0/src/graphpop_cli/commands/batch.py +155 -0
  10. graphpop_cli-0.1.0/src/graphpop_cli/commands/compare.py +118 -0
  11. graphpop_cli-0.1.0/src/graphpop_cli/commands/config_cmd.py +117 -0
  12. graphpop_cli-0.1.0/src/graphpop_cli/commands/converge.py +156 -0
  13. graphpop_cli-0.1.0/src/graphpop_cli/commands/db.py +188 -0
  14. graphpop_cli-0.1.0/src/graphpop_cli/commands/divergence.py +37 -0
  15. graphpop_cli-0.1.0/src/graphpop_cli/commands/diversity.py +36 -0
  16. graphpop_cli-0.1.0/src/graphpop_cli/commands/dump.py +210 -0
  17. graphpop_cli-0.1.0/src/graphpop_cli/commands/export_bed.py +170 -0
  18. graphpop_cli-0.1.0/src/graphpop_cli/commands/export_windows.py +91 -0
  19. graphpop_cli-0.1.0/src/graphpop_cli/commands/extract.py +271 -0
  20. graphpop_cli-0.1.0/src/graphpop_cli/commands/filter_results.py +165 -0
  21. graphpop_cli-0.1.0/src/graphpop_cli/commands/garud_h.py +30 -0
  22. graphpop_cli-0.1.0/src/graphpop_cli/commands/genome_scan.py +41 -0
  23. graphpop_cli-0.1.0/src/graphpop_cli/commands/ihs.py +29 -0
  24. graphpop_cli-0.1.0/src/graphpop_cli/commands/import_data.py +266 -0
  25. graphpop_cli-0.1.0/src/graphpop_cli/commands/inventory.py +160 -0
  26. graphpop_cli-0.1.0/src/graphpop_cli/commands/joint_sfs.py +38 -0
  27. graphpop_cli-0.1.0/src/graphpop_cli/commands/ld.py +35 -0
  28. graphpop_cli-0.1.0/src/graphpop_cli/commands/lookup.py +207 -0
  29. graphpop_cli-0.1.0/src/graphpop_cli/commands/neighbors.py +175 -0
  30. graphpop_cli-0.1.0/src/graphpop_cli/commands/nsl.py +29 -0
  31. graphpop_cli-0.1.0/src/graphpop_cli/commands/plot.py +1066 -0
  32. graphpop_cli-0.1.0/src/graphpop_cli/commands/pop_summary.py +30 -0
  33. graphpop_cli-0.1.0/src/graphpop_cli/commands/query.py +15 -0
  34. graphpop_cli-0.1.0/src/graphpop_cli/commands/rank_genes.py +177 -0
  35. graphpop_cli-0.1.0/src/graphpop_cli/commands/report.py +264 -0
  36. graphpop_cli-0.1.0/src/graphpop_cli/commands/roh.py +30 -0
  37. graphpop_cli-0.1.0/src/graphpop_cli/commands/run_all.py +276 -0
  38. graphpop_cli-0.1.0/src/graphpop_cli/commands/server.py +98 -0
  39. graphpop_cli-0.1.0/src/graphpop_cli/commands/setup.py +299 -0
  40. graphpop_cli-0.1.0/src/graphpop_cli/commands/sfs.py +38 -0
  41. graphpop_cli-0.1.0/src/graphpop_cli/commands/validate.py +167 -0
  42. graphpop_cli-0.1.0/src/graphpop_cli/commands/xpehh.py +31 -0
  43. graphpop_cli-0.1.0/src/graphpop_cli/config.py +57 -0
  44. graphpop_cli-0.1.0/src/graphpop_cli/connection.py +52 -0
  45. graphpop_cli-0.1.0/src/graphpop_cli/formatters.py +81 -0
  46. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/PKG-INFO +73 -0
  47. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/SOURCES.txt +53 -0
  48. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/dependency_links.txt +1 -0
  49. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/entry_points.txt +2 -0
  50. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/requires.txt +7 -0
  51. graphpop_cli-0.1.0/src/graphpop_cli.egg-info/top_level.txt +1 -0
  52. graphpop_cli-0.1.0/tests/test_commands.py +476 -0
  53. graphpop_cli-0.1.0/tests/test_config.py +61 -0
  54. graphpop_cli-0.1.0/tests/test_connection.py +76 -0
  55. graphpop_cli-0.1.0/tests/test_formatters.py +121 -0
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphpop-cli
3
+ Version: 0.1.0
4
+ Summary: Graph database-native population genomics CLI with O(V*K) complexity
5
+ Author: Jianfeng Mao
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/jfmao/GraphPop
8
+ Project-URL: Repository, https://github.com/jfmao/GraphPop
9
+ Project-URL: Issues, https://github.com/jfmao/GraphPop/issues
10
+ Keywords: population-genomics,graph-database,neo4j,bioinformatics,genetics
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: click>=8.0
21
+ Requires-Dist: neo4j>=5.0
22
+ Requires-Dist: pyyaml>=6.0
23
+ Provides-Extra: plot
24
+ Requires-Dist: matplotlib>=3.5; extra == "plot"
25
+ Requires-Dist: numpy>=1.22; extra == "plot"
26
+
27
+ # GraphPop CLI
28
+
29
+ Command-line interface for **GraphPop** — a graph database-native population genomics engine that reduces summary statistic complexity from O(V×N) to O(V×K), independent of sample count.
30
+
31
+ ## Quick Start
32
+
33
+ ```bash
34
+ pip install graphpop-cli
35
+ graphpop setup --password mypass # Downloads Neo4j + procedures plugin
36
+ graphpop start # Start the database
37
+ ```
38
+
39
+ **Prerequisites:** Python 3.10+, Java 21+ (for Neo4j runtime).
40
+
41
+ ## Features
42
+
43
+ - **60 commands** across 11 functional domains
44
+ - **12 population genetics procedures**: diversity, Fst, SFS, iHS, XP-EHH, nSL, ROH, Garud's H, LD, genome scan, pop summary, joint SFS
45
+ - **Annotation conditioning**: `--consequence`, `--pathway`, `--gene` flags on any procedure
46
+ - **Persistent analytical records**: `--persist` writes results to graph nodes
47
+ - **Publication-ready plots**: 11 visualization types following Nature Methods guidelines
48
+
49
+ ## Usage
50
+
51
+ ```bash
52
+ # Population diversity
53
+ graphpop diversity chr1 1 50000000 EUR -o diversity.tsv
54
+
55
+ # Annotation-conditioned analysis
56
+ graphpop diversity chr1 1 43270923 GJ-tmp --consequence missense_variant
57
+
58
+ # Selection scan
59
+ graphpop ihs chr22 EUR --persist -o ihs.tsv
60
+
61
+ # Multi-statistic convergence
62
+ graphpop converge --stats ihs,xpehh,h12 --thresholds 2,2,0.3 --pop EUR
63
+ ```
64
+
65
+ ## Documentation
66
+
67
+ - [Full documentation](https://github.com/jfmao/GraphPop)
68
+ - [Rice 3K vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/rice-3k-analysis.md)
69
+ - [Human 1000G vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/human-1000g-analysis.md)
70
+
71
+ ## License
72
+
73
+ MIT
@@ -0,0 +1,47 @@
1
+ # GraphPop CLI
2
+
3
+ Command-line interface for **GraphPop** — a graph database-native population genomics engine that reduces summary statistic complexity from O(V×N) to O(V×K), independent of sample count.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ pip install graphpop-cli
9
+ graphpop setup --password mypass # Downloads Neo4j + procedures plugin
10
+ graphpop start # Start the database
11
+ ```
12
+
13
+ **Prerequisites:** Python 3.10+, Java 21+ (for Neo4j runtime).
14
+
15
+ ## Features
16
+
17
+ - **60 commands** across 11 functional domains
18
+ - **12 population genetics procedures**: diversity, Fst, SFS, iHS, XP-EHH, nSL, ROH, Garud's H, LD, genome scan, pop summary, joint SFS
19
+ - **Annotation conditioning**: `--consequence`, `--pathway`, `--gene` flags on any procedure
20
+ - **Persistent analytical records**: `--persist` writes results to graph nodes
21
+ - **Publication-ready plots**: 11 visualization types following Nature Methods guidelines
22
+
23
+ ## Usage
24
+
25
+ ```bash
26
+ # Population diversity
27
+ graphpop diversity chr1 1 50000000 EUR -o diversity.tsv
28
+
29
+ # Annotation-conditioned analysis
30
+ graphpop diversity chr1 1 43270923 GJ-tmp --consequence missense_variant
31
+
32
+ # Selection scan
33
+ graphpop ihs chr22 EUR --persist -o ihs.tsv
34
+
35
+ # Multi-statistic convergence
36
+ graphpop converge --stats ihs,xpehh,h12 --thresholds 2,2,0.3 --pop EUR
37
+ ```
38
+
39
+ ## Documentation
40
+
41
+ - [Full documentation](https://github.com/jfmao/GraphPop)
42
+ - [Rice 3K vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/rice-3k-analysis.md)
43
+ - [Human 1000G vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/human-1000g-analysis.md)
44
+
45
+ ## License
46
+
47
+ MIT
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "graphpop-cli"
7
+ version = "0.1.0"
8
+ description = "Graph database-native population genomics CLI with O(V*K) complexity"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Jianfeng Mao"},
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
23
+ ]
24
+ keywords = ["population-genomics", "graph-database", "neo4j", "bioinformatics", "genetics"]
25
+ dependencies = [
26
+ "click>=8.0",
27
+ "neo4j>=5.0",
28
+ "pyyaml>=6.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ plot = [
33
+ "matplotlib>=3.5",
34
+ "numpy>=1.22",
35
+ ]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/jfmao/GraphPop"
39
+ Repository = "https://github.com/jfmao/GraphPop"
40
+ Issues = "https://github.com/jfmao/GraphPop/issues"
41
+
42
+ [project.scripts]
43
+ graphpop = "graphpop_cli.cli:main"
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,2 @@
1
+ """GraphPop CLI — command-line interface for graph-native population genomics."""
2
+ __version__ = "0.1.0"
@@ -0,0 +1,161 @@
1
+ """GraphPop CLI — command-line interface for graph-native population genomics.
2
+
3
+ Usage:
4
+ graphpop diversity chr22 1 50000000 EUR -o diversity.tsv
5
+ graphpop ihs chr22 EUR --min-af 0.05 --persist -o ihs.tsv
6
+ graphpop genome-scan chr22 EUR 100000 50000 --persist -o scan.tsv
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ import click
14
+
15
+ from .connection import load_config, get_driver
16
+
17
+
18
+ class GraphPopContext:
19
+ """Shared context passed to all commands."""
20
+ def __init__(self, uri=None, user=None, password=None, database=None,
21
+ config_path=None):
22
+ cfg = load_config(Path(config_path) if config_path else None)
23
+ if uri:
24
+ cfg["uri"] = uri
25
+ if user:
26
+ cfg["user"] = user
27
+ if password:
28
+ cfg["password"] = password
29
+ if database:
30
+ cfg["database"] = database
31
+ self.cfg = cfg
32
+ self._driver = None
33
+
34
+ @property
35
+ def driver(self):
36
+ if self._driver is None:
37
+ self._driver = get_driver(self.cfg)
38
+ return self._driver
39
+
40
+ @property
41
+ def database(self):
42
+ return self.cfg["database"]
43
+
44
+ def run(self, cypher: str, parameters: dict | None = None) -> list[dict]:
45
+ """Run Cypher and return records as list of dicts."""
46
+ try:
47
+ with self.driver.session(database=self.database) as session:
48
+ return [rec.data() for rec in session.run(cypher, parameters)]
49
+ except Exception as e:
50
+ err_msg = str(e)
51
+ if "Connection refused" in err_msg or "Failed to establish" in err_msg:
52
+ click.echo(
53
+ "Error: Cannot connect to Neo4j at "
54
+ f"{self.cfg['uri']}.\n"
55
+ "Is Neo4j running? Check connection with:\n"
56
+ f" export GRAPHPOP_URI={self.cfg['uri']}\n"
57
+ " or create ~/.graphpop/config.yaml",
58
+ err=True,
59
+ )
60
+ else:
61
+ click.echo(f"Error: {e}", err=True)
62
+ raise SystemExit(1)
63
+
64
+ def close(self):
65
+ if self._driver:
66
+ self._driver.close()
67
+
68
+
69
# Decorator that injects the shared GraphPopContext into a command callback.
pass_ctx = click.make_pass_decorator(GraphPopContext, ensure=True)


# Root command group. The connection options may also come from the
# GRAPHPOP_* environment variables named in each option below.
@click.group()
@click.option("--uri", envvar="GRAPHPOP_URI", help="Neo4j bolt URI")
@click.option("--user", envvar="GRAPHPOP_USER", help="Neo4j username")
@click.option("--password", envvar="GRAPHPOP_PASSWORD", help="Neo4j password")
@click.option("--database", envvar="GRAPHPOP_DATABASE", help="Neo4j database name")
@click.option("--config", "config_path", type=click.Path(),
              help="Config file path (default: ~/.graphpop/config.yaml)")
@click.version_option(package_name="graphpop-cli")
@click.pass_context
def main(ctx, uri, user, password, database, config_path):
    """GraphPop — graph-native population genomics from the command line.

    Compute population genetics statistics via Neo4j stored procedures with
    default TSV output. Use --persist to write results to graph nodes.
    """
    ctx.obj = GraphPopContext(uri=uri, user=user, password=password,
                              database=database, config_path=config_path)
    # Release the Neo4j driver (if a subcommand created one) when the click
    # context is torn down, so connections are not left open at exit.
    ctx.call_on_close(ctx.obj.close)
90
+
91
+
92
# The command modules import ``pass_ctx`` from this module, so they can only
# be imported once the definitions above exist.
from .commands import (  # noqa: E402
    diversity, divergence, sfs, joint_sfs,
    genome_scan, pop_summary,
    ld, ihs, xpehh, nsl, roh, garud_h,
    query, run_all, aggregate, export_windows,
    setup, server, db, import_data, dump,
    config_cmd, validate, filter_results, plot,
    lookup, converge, inventory, rank_genes,
    extract, export_bed, batch, compare,
    report, neighbors,
)

# Every command attached to the root group, in registration order.
_COMMANDS = (
    # Individual procedures (12)
    diversity.diversity,
    divergence.divergence,
    sfs.sfs,
    joint_sfs.joint_sfs,
    genome_scan.genome_scan,
    pop_summary.pop_summary,
    ld.ld,
    ihs.ihs,
    xpehh.xpehh,
    nsl.nsl,
    roh.roh,
    garud_h.garud_h,
    # Orchestration and export
    run_all.run_all,
    aggregate.aggregate,
    export_windows.export_windows,
    query.query,
    filter_results.filter_results,
    # Setup and server management
    setup.setup,
    server.start,
    server.stop,
    server.status,
    # Database management
    db.db,
    import_data.import_data,
    dump.dump,
    dump.load,
    # Configuration and validation
    config_cmd.config,
    validate.validate,
    plot.plot,
    # Phase 1 high-priority commands
    lookup.lookup,
    converge.converge,
    inventory.inventory,
    rank_genes.rank_genes,
    # Phase 2 commands
    extract.extract,
    export_bed.export_bed,
    batch.batch,
    compare.compare,
    # Phase 3 commands
    report.report,
    neighbors.neighbors,
)

for _command in _COMMANDS:
    main.add_command(_command)


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ """GraphPop CLI command modules."""
@@ -0,0 +1,206 @@
1
+ """graphpop aggregate — aggregate results and generate summary tables."""
2
+ from __future__ import annotations
3
+
4
+ import csv
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from ..cli import pass_ctx
11
+
12
+
13
@click.command()
@click.option("--results-dir", "-d", type=click.Path(exists=True), required=True,
              help="Directory with per-procedure TSV results (from run-all)")
@click.option("--json-results", "-j", type=click.Path(exists=True),
              help="JSON results file (from run-all)")
@click.option("--output-dir", "-o", type=click.Path(), default="graphpop_tables",
              help="Output directory for summary tables")
@pass_ctx
def aggregate(ctx, results_dir, json_results, output_dir):
    """Aggregate per-population results into summary tables.

    Reads TSV results from a run-all output directory and produces
    publication-ready summary tables (each one only if the matching
    per-procedure sub-directory exists):

    \b
      population_summary.tsv — per-pop diversity, theta, Tajima's D, Fis
      fst_matrix.tsv — pairwise Fst matrix
      roh_summary.tsv — per-pop FROH statistics
      ihs_peaks.tsv, nsl_peaks.tsv, xpehh_peaks.tsv — top 100 variants by |score|
      sweep_windows.tsv — Garud's H windows with H12 >= 0.1
    """
    results_path = Path(results_dir)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load JSON results if provided. The contents are only counted here; the
    # tables below are built from the TSV files.
    all_results = {}
    if json_results:
        with open(json_results) as f:
            all_results = json.load(f)
        click.echo(f"Loaded {len(all_results)} results from JSON")

    # --- Table 1: Population Summary ---
    diversity_dir = results_path / "diversity"
    if diversity_dir.exists():
        click.echo("Generating population_summary.tsv...")
        pop_stats = _aggregate_single_row_tsv(diversity_dir)
        _write_summary(out_dir / "population_summary.tsv", pop_stats,
                       ["population", "chr", "pi", "theta_w", "tajima_d",
                        "het_exp", "het_obs", "fis", "n_variants", "n_segregating"])

    # --- Table 2: Fst Matrix ---
    divergence_dir = results_path / "divergence"
    if divergence_dir.exists():
        click.echo("Generating fst_matrix.tsv...")
        div_stats = _aggregate_single_row_tsv(divergence_dir)
        _write_summary(out_dir / "fst_matrix.tsv", div_stats,
                       ["pop1", "pop2", "chr", "fst_hudson", "fst_wc", "dxy", "da"])

    # --- Table 3: ROH Summary ---
    roh_dir = results_path / "roh"
    if roh_dir.exists():
        click.echo("Generating roh_summary.tsv...")
        roh_data = _aggregate_multi_row_tsv(roh_dir)
        rows, skipped = _summarize_roh(roh_data)
        if skipped:
            click.echo(
                f"Warning: skipped {skipped} ROH rows with non-numeric "
                "froh/n_roh values",
                err=True,
            )
        _write_dict_tsv(out_dir / "roh_summary.tsv", rows)

    # --- Table 4: Selection Peaks ---
    for proc in ("ihs", "nsl", "xpehh"):
        proc_dir = results_path / proc
        if proc_dir.exists():
            click.echo(f"Generating {proc}_peaks.tsv...")
            peaks = _extract_peaks(proc_dir, proc, top_n=100)
            _write_dict_tsv(out_dir / f"{proc}_peaks.tsv", peaks)

    # --- Table 5: Garud's H Sweep Windows ---
    garud_dir = results_path / "garud_h"
    if garud_dir.exists():
        click.echo("Generating sweep_windows.tsv...")
        sweeps = _extract_sweep_windows(garud_dir, h12_threshold=0.1)
        _write_dict_tsv(out_dir / "sweep_windows.tsv", sweeps)

    click.echo(f"\nSummary tables written to {out_dir}/")
    for table in sorted(out_dir.glob("*.tsv")):
        # Count data rows (everything but the first line); the context
        # manager makes sure each handle is closed again.
        with open(table) as fh:
            n_lines = sum(1 for _ in fh) - 1
        click.echo(f"  {table.name}: {n_lines} rows")


def _summarize_roh(records):
    """Collapse per-sample ROH records into one summary row per population.

    Args:
        records: Dicts as returned by ``_aggregate_multi_row_tsv``; the
            ``froh`` and ``n_roh`` fields are read from each.

    Returns:
        A ``(rows, skipped)`` tuple: ``rows`` holds one dict per population
        (sorted by population label) and ``skipped`` counts the records
        dropped because ``froh`` or ``n_roh`` was not numeric.
    """
    pop_roh = {}
    skipped = 0
    for rec in records:
        try:
            froh = float(rec.get("froh", 0))
            n_roh = int(rec.get("n_roh", 0))
        except (TypeError, ValueError):
            # One malformed cell should not abort the whole aggregation.
            skipped += 1
            continue
        pop = rec.get("population", rec.get("file_pop", "unknown"))
        stats = pop_roh.setdefault(
            pop,
            {"n_samples": 0, "total_froh": 0.0, "total_n_roh": 0, "max_froh": 0.0},
        )
        stats["n_samples"] += 1
        stats["total_froh"] += froh
        stats["total_n_roh"] += n_roh
        stats["max_froh"] = max(stats["max_froh"], froh)

    rows = [
        {
            "population": pop,
            "n_samples": s["n_samples"],
            "mean_froh": f"{s['total_froh'] / s['n_samples']:.6f}",
            "mean_n_roh": f"{s['total_n_roh'] / s['n_samples']:.1f}",
            "max_froh": f"{s['max_froh']:.6f}",
        }
        for pop, s in sorted(pop_roh.items())
    ]
    return rows, skipped
110
+
111
+
112
def _aggregate_single_row_tsv(directory: Path) -> list[dict]:
    """Read every ``*.tsv`` in *directory* and label rows from the filename.

    Two filename layouts are recognised (fields separated by ``_``):

    * ``POP1_vs_POP2_CHR.tsv`` — rows get ``pop1``, ``pop2`` and ``chr``.
    * ``POP_CHR.tsv`` — rows get ``population`` and ``chr``; the population
      label may itself contain underscores.

    A stem without any underscore adds no labels.

    Args:
        directory: Directory holding the per-procedure TSV files.

    Returns:
        All data rows from all files (files visited in sorted order), each
        a dict of the TSV columns plus the labels described above.
    """
    rows = []
    for tsv in sorted(directory.glob("*.tsv")):
        parts = tsv.stem.split("_")
        # The pairwise layout has to be tested first: such a stem also has
        # two or more parts and would otherwise be read as POP_CHR.
        vs_idx = parts.index("vs") if "vs" in parts else -1
        if 0 < vs_idx < len(parts) - 2:
            labels = {
                "pop1": "_".join(parts[:vs_idx]),
                "pop2": "_".join(parts[vs_idx + 1:-1]),
                "chr": parts[-1],
            }
        elif len(parts) >= 2:
            labels = {
                "population": "_".join(parts[:-1]),
                "chr": parts[-1],
            }
        else:
            labels = {}
        with open(tsv, newline="") as f:
            for rec in csv.DictReader(f, delimiter="\t"):
                rec.update(labels)
                rows.append(rec)
    return rows
132
+
133
+
134
def _aggregate_multi_row_tsv(directory: Path) -> list[dict]:
    """Collect all data rows from the ``*.tsv`` files in *directory*.

    Each row is tagged with ``file_pop``: the file stem with its last
    ``_``-separated field removed, or the whole stem when it has no
    underscore. Files are visited in sorted order.
    """
    collected = []
    for tsv_path in sorted(directory.glob("*.tsv")):
        tokens = tsv_path.stem.split("_")
        if len(tokens) >= 2:
            file_pop = "_".join(tokens[:-1])
        else:
            file_pop = tokens[0]
        with open(tsv_path) as handle:
            for record in csv.DictReader(handle, delimiter="\t"):
                record["file_pop"] = file_pop
                collected.append(record)
    return collected
146
+
147
+
148
def _extract_peaks(directory: Path, stat_name: str,
                   top_n: int = 100) -> list[dict]:
    """Extract top peaks from per-variant result files.

    Every row of every ``*.tsv`` in *directory* is scored by the absolute
    value of its ``stat_name`` column (falling back to ``<stat_name>_unstd``
    when that column is absent).

    Args:
        directory: Directory holding the per-variant TSV files.
        stat_name: Name of the score column, e.g. ``"ihs"``.
        top_n: Maximum number of rows to return.

    Returns:
        Up to *top_n* rows with the largest absolute score, highest first.
        Each row gains ``abs_score`` and ``source_file`` (the file stem).
        Rows whose score is missing, non-numeric or NaN get ``abs_score`` 0.
    """
    import math  # local import: only this helper needs it

    all_variants = []
    for tsv in sorted(directory.glob("*.tsv")):
        with open(tsv, newline="") as f:
            for rec in csv.DictReader(f, delimiter="\t"):
                score = rec.get(stat_name, rec.get(f"{stat_name}_unstd", "0"))
                try:
                    abs_score = abs(float(score))
                except (ValueError, TypeError):
                    abs_score = 0
                # NaN compares false against everything, which would leave
                # the sort order below undefined; rank it like "no score".
                if math.isnan(abs_score):
                    abs_score = 0
                rec["abs_score"] = abs_score
                rec["source_file"] = tsv.stem
                all_variants.append(rec)

    all_variants.sort(key=lambda r: r["abs_score"], reverse=True)
    return all_variants[:top_n]
167
+
168
+
169
def _extract_sweep_windows(directory: Path,
                           h12_threshold: float = 0.1) -> list[dict]:
    """Return the rows whose ``h12`` value reaches *h12_threshold*.

    All ``*.tsv`` files in *directory* are scanned in sorted order. Rows
    whose ``h12`` cannot be evaluated are ignored. The result is ordered by
    ``h12``, largest first.
    """
    hits = []
    for tsv_path in sorted(directory.glob("*.tsv")):
        with open(tsv_path) as handle:
            for record in csv.DictReader(handle, delimiter="\t"):
                try:
                    qualifies = float(record.get("h12", 0)) >= h12_threshold
                except (ValueError, TypeError):
                    continue
                if qualifies:
                    hits.append(record)

    def _h12(record):
        return float(record.get("h12", 0))

    return sorted(hits, key=_h12, reverse=True)
184
+
185
+
186
def _write_summary(path: Path, rows: list[dict], columns: list[str]):
    """Write *rows* to *path* as a tab-separated table with a header.

    Only the fields named in *columns* are written, in that order; any other
    keys present in a row are ignored.
    """
    with open(path, "w", newline="") as out:
        table = csv.DictWriter(
            out,
            fieldnames=columns,
            delimiter="\t",
            extrasaction="ignore",
        )
        table.writeheader()
        for row in rows:
            table.writerow(row)
193
+
194
+
195
def _write_dict_tsv(path: Path, rows: list[dict]):
    """Write list of dicts as TSV.

    The header is the union of the keys of all rows, in first-seen order,
    so a column that only appears in later rows (for example when rows come
    from files with different layouts) is still written. Rows lacking a
    column get an empty cell.

    Args:
        path: Output file; overwritten if it exists.
        rows: Rows to write. When empty, the file holds only the line
            ``# No results``.
    """
    if not rows:
        with open(path, "w") as f:
            f.write("# No results\n")
        return
    # dict.fromkeys de-duplicates while keeping insertion order. The key
    # None is skipped: csv.DictReader files surplus fields under it, and
    # they have no column name to write.
    keys = list(dict.fromkeys(
        key for row in rows for key in row if key is not None
    ))
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys, delimiter="\t",
                                extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)