graphpop-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphpop_cli-0.1.0/PKG-INFO +73 -0
- graphpop_cli-0.1.0/README.md +47 -0
- graphpop_cli-0.1.0/pyproject.toml +46 -0
- graphpop_cli-0.1.0/setup.cfg +4 -0
- graphpop_cli-0.1.0/src/graphpop_cli/__init__.py +2 -0
- graphpop_cli-0.1.0/src/graphpop_cli/cli.py +161 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/__init__.py +1 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/aggregate.py +206 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/batch.py +155 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/compare.py +118 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/config_cmd.py +117 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/converge.py +156 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/db.py +188 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/divergence.py +37 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/diversity.py +36 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/dump.py +210 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/export_bed.py +170 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/export_windows.py +91 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/extract.py +271 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/filter_results.py +165 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/garud_h.py +30 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/genome_scan.py +41 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/ihs.py +29 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/import_data.py +266 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/inventory.py +160 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/joint_sfs.py +38 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/ld.py +35 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/lookup.py +207 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/neighbors.py +175 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/nsl.py +29 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/plot.py +1066 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/pop_summary.py +30 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/query.py +15 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/rank_genes.py +177 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/report.py +264 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/roh.py +30 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/run_all.py +276 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/server.py +98 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/setup.py +299 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/sfs.py +38 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/validate.py +167 -0
- graphpop_cli-0.1.0/src/graphpop_cli/commands/xpehh.py +31 -0
- graphpop_cli-0.1.0/src/graphpop_cli/config.py +57 -0
- graphpop_cli-0.1.0/src/graphpop_cli/connection.py +52 -0
- graphpop_cli-0.1.0/src/graphpop_cli/formatters.py +81 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/PKG-INFO +73 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/SOURCES.txt +53 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/dependency_links.txt +1 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/entry_points.txt +2 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/requires.txt +7 -0
- graphpop_cli-0.1.0/src/graphpop_cli.egg-info/top_level.txt +1 -0
- graphpop_cli-0.1.0/tests/test_commands.py +476 -0
- graphpop_cli-0.1.0/tests/test_config.py +61 -0
- graphpop_cli-0.1.0/tests/test_connection.py +76 -0
- graphpop_cli-0.1.0/tests/test_formatters.py +121 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graphpop-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Graph database-native population genomics CLI with O(V*K) complexity
|
|
5
|
+
Author: Jianfeng Mao
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jfmao/GraphPop
|
|
8
|
+
Project-URL: Repository, https://github.com/jfmao/GraphPop
|
|
9
|
+
Project-URL: Issues, https://github.com/jfmao/GraphPop/issues
|
|
10
|
+
Keywords: population-genomics,graph-database,neo4j,bioinformatics,genetics
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: neo4j>=5.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Provides-Extra: plot
|
|
24
|
+
Requires-Dist: matplotlib>=3.5; extra == "plot"
|
|
25
|
+
Requires-Dist: numpy>=1.22; extra == "plot"
|
|
26
|
+
|
|
27
|
+
# GraphPop CLI
|
|
28
|
+
|
|
29
|
+
Command-line interface for **GraphPop** — a graph database-native population genomics engine that reduces summary statistic complexity from O(V×N) to O(V×K), independent of sample count.
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install graphpop-cli
|
|
35
|
+
graphpop setup --password mypass # Downloads Neo4j + procedures plugin
|
|
36
|
+
graphpop start # Start the database
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
**Prerequisites:** Python 3.10+, Java 21+ (for Neo4j runtime).
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **60 commands** across 11 functional domains
|
|
44
|
+
- **12 population genetics procedures**: diversity, Fst, SFS, iHS, XP-EHH, nSL, ROH, Garud's H, LD, genome scan, pop summary, joint SFS
|
|
45
|
+
- **Annotation conditioning**: `--consequence`, `--pathway`, `--gene` flags on any procedure
|
|
46
|
+
- **Persistent analytical records**: `--persist` writes results to graph nodes
|
|
47
|
+
- **Publication-ready plots**: 11 visualization types following Nature Methods guidelines
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Population diversity
|
|
53
|
+
graphpop diversity chr1 1 50000000 EUR -o diversity.tsv
|
|
54
|
+
|
|
55
|
+
# Annotation-conditioned analysis
|
|
56
|
+
graphpop diversity chr1 1 43270923 GJ-tmp --consequence missense_variant
|
|
57
|
+
|
|
58
|
+
# Selection scan
|
|
59
|
+
graphpop ihs chr22 EUR --persist -o ihs.tsv
|
|
60
|
+
|
|
61
|
+
# Multi-statistic convergence
|
|
62
|
+
graphpop converge --stats ihs,xpehh,h12 --thresholds 2,2,0.3 --pop EUR
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Documentation
|
|
66
|
+
|
|
67
|
+
- [Full documentation](https://github.com/jfmao/GraphPop)
|
|
68
|
+
- [Rice 3K vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/rice-3k-analysis.md)
|
|
69
|
+
- [Human 1000G vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/human-1000g-analysis.md)
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
MIT
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# GraphPop CLI
|
|
2
|
+
|
|
3
|
+
Command-line interface for **GraphPop** — a graph database-native population genomics engine that reduces summary statistic complexity from O(V×N) to O(V×K), independent of sample count.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install graphpop-cli
|
|
9
|
+
graphpop setup --password mypass # Downloads Neo4j + procedures plugin
|
|
10
|
+
graphpop start # Start the database
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
**Prerequisites:** Python 3.10+, Java 21+ (for Neo4j runtime).
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **60 commands** across 11 functional domains
|
|
18
|
+
- **12 population genetics procedures**: diversity, Fst, SFS, iHS, XP-EHH, nSL, ROH, Garud's H, LD, genome scan, pop summary, joint SFS
|
|
19
|
+
- **Annotation conditioning**: `--consequence`, `--pathway`, `--gene` flags on any procedure
|
|
20
|
+
- **Persistent analytical records**: `--persist` writes results to graph nodes
|
|
21
|
+
- **Publication-ready plots**: 11 visualization types following Nature Methods guidelines
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Population diversity
|
|
27
|
+
graphpop diversity chr1 1 50000000 EUR -o diversity.tsv
|
|
28
|
+
|
|
29
|
+
# Annotation-conditioned analysis
|
|
30
|
+
graphpop diversity chr1 1 43270923 GJ-tmp --consequence missense_variant
|
|
31
|
+
|
|
32
|
+
# Selection scan
|
|
33
|
+
graphpop ihs chr22 EUR --persist -o ihs.tsv
|
|
34
|
+
|
|
35
|
+
# Multi-statistic convergence
|
|
36
|
+
graphpop converge --stats ihs,xpehh,h12 --thresholds 2,2,0.3 --pop EUR
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Documentation
|
|
40
|
+
|
|
41
|
+
- [Full documentation](https://github.com/jfmao/GraphPop)
|
|
42
|
+
- [Rice 3K vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/rice-3k-analysis.md)
|
|
43
|
+
- [Human 1000G vignette](https://github.com/jfmao/GraphPop/blob/main/graphpop-cli/vignettes/human-1000g-analysis.md)
|
|
44
|
+
|
|
45
|
+
## License
|
|
46
|
+
|
|
47
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "graphpop-cli"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Graph database-native population genomics CLI with O(V*K) complexity"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Jianfeng Mao"},
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
23
|
+
]
|
|
24
|
+
keywords = ["population-genomics", "graph-database", "neo4j", "bioinformatics", "genetics"]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"click>=8.0",
|
|
27
|
+
"neo4j>=5.0",
|
|
28
|
+
"pyyaml>=6.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
plot = [
|
|
33
|
+
"matplotlib>=3.5",
|
|
34
|
+
"numpy>=1.22",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/jfmao/GraphPop"
|
|
39
|
+
Repository = "https://github.com/jfmao/GraphPop"
|
|
40
|
+
Issues = "https://github.com/jfmao/GraphPop/issues"
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
graphpop = "graphpop_cli.cli:main"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""GraphPop CLI — command-line interface for graph-native population genomics.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
graphpop diversity chr22 1 50000000 EUR -o diversity.tsv
|
|
5
|
+
graphpop ihs chr22 EUR --min-af 0.05 --persist -o ihs.tsv
|
|
6
|
+
graphpop genome-scan chr22 EUR 100000 50000 --persist -o scan.tsv
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import click
|
|
14
|
+
|
|
15
|
+
from .connection import load_config, get_driver
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GraphPopContext:
    """Shared context passed to all commands.

    Holds the resolved connection settings in ``cfg`` and creates the
    database driver lazily, on first access to :attr:`driver`.

    Args:
        uri: Overrides the ``uri`` config entry when truthy.
        user: Overrides the ``user`` config entry when truthy.
        password: Overrides the ``password`` config entry when truthy.
        database: Overrides the ``database`` config entry when truthy.
        config_path: Optional path handed to ``load_config``; ``None`` lets
            ``load_config`` pick its own default.
    """

    def __init__(self, uri=None, user=None, password=None, database=None,
                 config_path=None):
        cfg = load_config(Path(config_path) if config_path else None)
        # Explicit (truthy) arguments win over whatever the config provided.
        overrides = {
            "uri": uri,
            "user": user,
            "password": password,
            "database": database,
        }
        for key, value in overrides.items():
            if value:
                cfg[key] = value
        self.cfg = cfg
        self._driver = None

    @property
    def driver(self):
        """Return the driver, creating it via ``get_driver`` on first use."""
        if self._driver is None:
            self._driver = get_driver(self.cfg)
        return self._driver

    @property
    def database(self):
        """Return the configured database name (``cfg["database"]``)."""
        return self.cfg["database"]

    def run(self, cypher: str, parameters: dict | None = None) -> list[dict]:
        """Run Cypher and return records as list of dicts.

        Args:
            cypher: Query text passed to ``session.run``.
            parameters: Optional query parameters passed through unchanged.

        Returns:
            One ``rec.data()`` dict per record produced by the query.

        Raises:
            SystemExit: With exit code 1 after printing a message to stderr
                when anything in the session or query raises.
        """
        try:
            with self.driver.session(database=self.database) as session:
                return [rec.data() for rec in session.run(cypher, parameters)]
        except Exception as e:
            err_msg = str(e)
            if "Connection refused" in err_msg or "Failed to establish" in err_msg:
                click.echo(
                    "Error: Cannot connect to Neo4j at "
                    f"{self.cfg['uri']}.\n"
                    "Is Neo4j running? Check connection with:\n"
                    f" export GRAPHPOP_URI={self.cfg['uri']}\n"
                    " or create ~/.graphpop/config.yaml",
                    err=True,
                )
            else:
                click.echo(f"Error: {e}", err=True)
            # Chain the cause so debuggers/tracebacks keep the original error.
            raise SystemExit(1) from e

    def close(self):
        """Close the driver if one was created and forget it.

        Resetting ``_driver`` means a later access to :attr:`driver` builds a
        fresh driver instead of handing back the closed one.
        """
        if self._driver is not None:
            self._driver.close()
            self._driver = None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Decorator for command callbacks that passes in the GraphPopContext found on
# the click context; ensure=True asks click to create one if none is present.
pass_ctx = click.make_pass_decorator(GraphPopContext, ensure=True)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@click.group()
@click.option("--uri", envvar="GRAPHPOP_URI", help="Neo4j bolt URI")
@click.option("--user", envvar="GRAPHPOP_USER", help="Neo4j username")
@click.option("--password", envvar="GRAPHPOP_PASSWORD", help="Neo4j password")
@click.option("--database", envvar="GRAPHPOP_DATABASE", help="Neo4j database name")
@click.option("--config", "config_path", type=click.Path(),
              help="Config file path (default: ~/.graphpop/config.yaml)")
@click.version_option(package_name="graphpop-cli")
@click.pass_context
def main(ctx, uri, user, password, database, config_path):
    """GraphPop — graph-native population genomics from the command line.

    Compute population genetics statistics via Neo4j stored procedures with
    default TSV output. Use --persist to write results to graph nodes.
    """
    ctx.ensure_object(dict)
    # Command-line / environment values are forwarded as overrides; the
    # context object decides how they combine with the config file.
    connection_overrides = {
        "uri": uri,
        "user": user,
        "password": password,
        "database": database,
        "config_path": config_path,
    }
    ctx.obj = GraphPopContext(**connection_overrides)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Import all command modules
|
|
93
|
+
from .commands import ( # noqa: E402
|
|
94
|
+
diversity, divergence, sfs, joint_sfs,
|
|
95
|
+
genome_scan, pop_summary,
|
|
96
|
+
ld, ihs, xpehh, nsl, roh, garud_h,
|
|
97
|
+
query, run_all, aggregate, export_windows,
|
|
98
|
+
setup, server, db, import_data, dump,
|
|
99
|
+
config_cmd, validate, filter_results, plot,
|
|
100
|
+
lookup, converge, inventory, rank_genes,
|
|
101
|
+
extract, export_bed, batch, compare,
|
|
102
|
+
report, neighbors,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Attach every subcommand to the root group, in the same order as before.
for _command in (
    # Individual procedures (12)
    diversity.diversity,
    divergence.divergence,
    sfs.sfs,
    joint_sfs.joint_sfs,
    genome_scan.genome_scan,
    pop_summary.pop_summary,
    ld.ld,
    ihs.ihs,
    xpehh.xpehh,
    nsl.nsl,
    roh.roh,
    garud_h.garud_h,
    # Orchestration and export
    run_all.run_all,
    aggregate.aggregate,
    export_windows.export_windows,
    query.query,
    filter_results.filter_results,
    # Setup and server management
    setup.setup,
    server.start,
    server.stop,
    server.status,
    # Database management
    db.db,
    import_data.import_data,
    dump.dump,
    dump.load,
    # Configuration and validation
    config_cmd.config,
    validate.validate,
    plot.plot,
    # Phase 1 high-priority commands
    lookup.lookup,
    converge.converge,
    inventory.inventory,
    rank_genes.rank_genes,
    # Phase 2 commands
    extract.extract,
    export_bed.export_bed,
    batch.batch,
    compare.compare,
    # Phase 3 commands
    report.report,
    neighbors.neighbors,
):
    main.add_command(_command)
del _command  # keep the module namespace free of the loop variable


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""GraphPop CLI command modules."""
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""graphpop aggregate — aggregate results and generate summary tables."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import csv
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ..cli import pass_ctx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command()
@click.option("--results-dir", "-d", type=click.Path(exists=True), required=True,
              help="Directory with per-procedure TSV results (from run-all)")
@click.option("--json-results", "-j", type=click.Path(exists=True),
              help="JSON results file (from run-all)")
@click.option("--output-dir", "-o", type=click.Path(), default="graphpop_tables",
              help="Output directory for summary tables")
@pass_ctx
def aggregate(ctx, results_dir, json_results, output_dir):
    """Aggregate per-population results into summary tables.

    Reads TSV results from a run-all output directory and produces
    publication-ready summary tables:

    \b
    population_summary.tsv — per-pop diversity, theta, Tajima's D, Fis
    fst_matrix.tsv — pairwise Fst matrix
    roh_summary.tsv — per-pop FROH statistics
    ihs_peaks.tsv, nsl_peaks.tsv, xpehh_peaks.tsv — top 100 peaks per statistic
    sweep_windows.tsv — Garud's H windows with H12 >= 0.1
    """
    # NOTE: the docstring above is the click help text; it lists exactly the
    # files this function writes. Each table is produced only when the
    # matching sub-directory exists under results_dir.
    results_path = Path(results_dir)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load JSON results if provided (currently only reported, not merged).
    all_results = {}
    if json_results:
        with open(json_results) as f:
            all_results = json.load(f)
        click.echo(f"Loaded {len(all_results)} results from JSON")

    # --- Table 1: Population Summary ---
    diversity_dir = results_path / "diversity"
    if diversity_dir.exists():
        click.echo("Generating population_summary.tsv...")
        pop_stats = _aggregate_single_row_tsv(diversity_dir)
        _write_summary(out_dir / "population_summary.tsv", pop_stats,
                       ["population", "chr", "pi", "theta_w", "tajima_d",
                        "het_exp", "het_obs", "fis", "n_variants", "n_segregating"])

    # --- Table 2: Fst Matrix ---
    divergence_dir = results_path / "divergence"
    if divergence_dir.exists():
        click.echo("Generating fst_matrix.tsv...")
        div_stats = _aggregate_single_row_tsv(divergence_dir)
        _write_summary(out_dir / "fst_matrix.tsv", div_stats,
                       ["pop1", "pop2", "chr", "fst_hudson", "fst_wc", "dxy", "da"])

    # --- Table 3: ROH Summary ---
    roh_dir = results_path / "roh"
    if roh_dir.exists():
        click.echo("Generating roh_summary.tsv...")
        roh_data = _aggregate_multi_row_tsv(roh_dir)
        # Accumulate per-population totals; every input row counts as one sample.
        pop_roh = {}
        for rec in roh_data:
            pop = rec.get("population", rec.get("file_pop", "unknown"))
            stats = pop_roh.setdefault(
                pop,
                {"n_samples": 0, "total_froh": 0.0,
                 "total_n_roh": 0, "max_froh": 0.0},
            )
            froh = float(rec.get("froh", 0))
            stats["n_samples"] += 1
            stats["total_froh"] += froh
            stats["total_n_roh"] += int(rec.get("n_roh", 0))
            stats["max_froh"] = max(stats["max_froh"], froh)

        rows = [
            {
                "population": pop,
                "n_samples": s["n_samples"],
                "mean_froh": f"{s['total_froh'] / s['n_samples']:.6f}",
                "mean_n_roh": f"{s['total_n_roh'] / s['n_samples']:.1f}",
                "max_froh": f"{s['max_froh']:.6f}",
            }
            for pop, s in sorted(pop_roh.items())
        ]
        _write_dict_tsv(out_dir / "roh_summary.tsv", rows)

    # --- Table 4: Selection Peaks ---
    for proc in ("ihs", "nsl", "xpehh"):
        proc_dir = results_path / proc
        if proc_dir.exists():
            click.echo(f"Generating {proc}_peaks.tsv...")
            peaks = _extract_peaks(proc_dir, proc, top_n=100)
            _write_dict_tsv(out_dir / f"{proc}_peaks.tsv", peaks)

    # --- Table 5: Garud's H Sweep Windows ---
    garud_dir = results_path / "garud_h"
    if garud_dir.exists():
        click.echo("Generating sweep_windows.tsv...")
        sweeps = _extract_sweep_windows(garud_dir, h12_threshold=0.1)
        _write_dict_tsv(out_dir / "sweep_windows.tsv", sweeps)

    click.echo(f"\nSummary tables written to {out_dir}/")
    for f in sorted(out_dir.glob("*.tsv")):
        # Use a context manager so the handle is closed right after counting;
        # the first line (header or "# No results") is not a data row.
        with open(f) as handle:
            n_lines = sum(1 for _ in handle) - 1
        click.echo(f" {f.name}: {n_lines} rows")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _aggregate_single_row_tsv(directory: Path) -> list[dict]:
|
|
113
|
+
"""Read TSV files with single data row, extract pop/chr from filename."""
|
|
114
|
+
rows = []
|
|
115
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
116
|
+
parts = tsv.stem.split("_")
|
|
117
|
+
with open(tsv) as f:
|
|
118
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
119
|
+
for rec in reader:
|
|
120
|
+
# Infer pop and chr from filename: POP_CHR.tsv
|
|
121
|
+
if len(parts) >= 2:
|
|
122
|
+
rec["population"] = "_".join(parts[:-1])
|
|
123
|
+
rec["chr"] = parts[-1]
|
|
124
|
+
elif "vs" in tsv.stem:
|
|
125
|
+
# Pairwise: POP1_vs_POP2_CHR.tsv
|
|
126
|
+
vs_idx = parts.index("vs")
|
|
127
|
+
rec["pop1"] = "_".join(parts[:vs_idx])
|
|
128
|
+
rec["pop2"] = "_".join(parts[vs_idx + 1:-1])
|
|
129
|
+
rec["chr"] = parts[-1]
|
|
130
|
+
rows.append(rec)
|
|
131
|
+
return rows
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _aggregate_multi_row_tsv(directory: Path) -> list[dict]:
|
|
135
|
+
"""Read TSV files with multiple data rows."""
|
|
136
|
+
rows = []
|
|
137
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
138
|
+
parts = tsv.stem.split("_")
|
|
139
|
+
pop = "_".join(parts[:-1]) if len(parts) >= 2 else parts[0]
|
|
140
|
+
with open(tsv) as f:
|
|
141
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
142
|
+
for rec in reader:
|
|
143
|
+
rec["file_pop"] = pop
|
|
144
|
+
rows.append(rec)
|
|
145
|
+
return rows
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _extract_peaks(directory: Path, stat_name: str,
|
|
149
|
+
top_n: int = 100) -> list[dict]:
|
|
150
|
+
"""Extract top peaks from per-variant result files."""
|
|
151
|
+
all_variants = []
|
|
152
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
153
|
+
parts = tsv.stem.split("_")
|
|
154
|
+
with open(tsv) as f:
|
|
155
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
156
|
+
for rec in reader:
|
|
157
|
+
score = rec.get(stat_name, rec.get(f"{stat_name}_unstd", "0"))
|
|
158
|
+
try:
|
|
159
|
+
rec["abs_score"] = abs(float(score))
|
|
160
|
+
except (ValueError, TypeError):
|
|
161
|
+
rec["abs_score"] = 0
|
|
162
|
+
rec["source_file"] = tsv.stem
|
|
163
|
+
all_variants.append(rec)
|
|
164
|
+
|
|
165
|
+
all_variants.sort(key=lambda r: r["abs_score"], reverse=True)
|
|
166
|
+
return all_variants[:top_n]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _extract_sweep_windows(directory: Path,
|
|
170
|
+
h12_threshold: float = 0.1) -> list[dict]:
|
|
171
|
+
"""Extract windows exceeding H12 threshold."""
|
|
172
|
+
sweeps = []
|
|
173
|
+
for tsv in sorted(directory.glob("*.tsv")):
|
|
174
|
+
with open(tsv) as f:
|
|
175
|
+
reader = csv.DictReader(f, delimiter="\t")
|
|
176
|
+
for rec in reader:
|
|
177
|
+
try:
|
|
178
|
+
if float(rec.get("h12", 0)) >= h12_threshold:
|
|
179
|
+
sweeps.append(rec)
|
|
180
|
+
except (ValueError, TypeError):
|
|
181
|
+
pass
|
|
182
|
+
sweeps.sort(key=lambda r: float(r.get("h12", 0)), reverse=True)
|
|
183
|
+
return sweeps
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _write_summary(path: Path, rows: list[dict], columns: list[str]):
|
|
187
|
+
"""Write summary table with specified columns."""
|
|
188
|
+
with open(path, "w", newline="") as f:
|
|
189
|
+
writer = csv.DictWriter(f, fieldnames=columns, delimiter="\t",
|
|
190
|
+
extrasaction="ignore")
|
|
191
|
+
writer.writeheader()
|
|
192
|
+
writer.writerows(rows)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _write_dict_tsv(path: Path, rows: list[dict]):
|
|
196
|
+
"""Write list of dicts as TSV."""
|
|
197
|
+
if not rows:
|
|
198
|
+
with open(path, "w") as f:
|
|
199
|
+
f.write("# No results\n")
|
|
200
|
+
return
|
|
201
|
+
keys = list(rows[0].keys())
|
|
202
|
+
with open(path, "w", newline="") as f:
|
|
203
|
+
writer = csv.DictWriter(f, fieldnames=keys, delimiter="\t",
|
|
204
|
+
extrasaction="ignore")
|
|
205
|
+
writer.writeheader()
|
|
206
|
+
writer.writerows(rows)
|