PyPI - graphpop-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

graphpop-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

graphpop_cli/__init__.py +2 -0
graphpop_cli/cli.py +161 -0
graphpop_cli/commands/__init__.py +1 -0
graphpop_cli/commands/aggregate.py +206 -0
graphpop_cli/commands/batch.py +155 -0
graphpop_cli/commands/compare.py +118 -0
graphpop_cli/commands/config_cmd.py +117 -0
graphpop_cli/commands/converge.py +156 -0
graphpop_cli/commands/db.py +188 -0
graphpop_cli/commands/divergence.py +37 -0
graphpop_cli/commands/diversity.py +36 -0
graphpop_cli/commands/dump.py +210 -0
graphpop_cli/commands/export_bed.py +170 -0
graphpop_cli/commands/export_windows.py +91 -0
graphpop_cli/commands/extract.py +271 -0
graphpop_cli/commands/filter_results.py +165 -0
graphpop_cli/commands/garud_h.py +30 -0
graphpop_cli/commands/genome_scan.py +41 -0
graphpop_cli/commands/ihs.py +29 -0
graphpop_cli/commands/import_data.py +266 -0
graphpop_cli/commands/inventory.py +160 -0
graphpop_cli/commands/joint_sfs.py +38 -0
graphpop_cli/commands/ld.py +35 -0
graphpop_cli/commands/lookup.py +207 -0
graphpop_cli/commands/neighbors.py +175 -0
graphpop_cli/commands/nsl.py +29 -0
graphpop_cli/commands/plot.py +1066 -0
graphpop_cli/commands/pop_summary.py +30 -0
graphpop_cli/commands/query.py +15 -0
graphpop_cli/commands/rank_genes.py +177 -0
graphpop_cli/commands/report.py +264 -0
graphpop_cli/commands/roh.py +30 -0
graphpop_cli/commands/run_all.py +276 -0
graphpop_cli/commands/server.py +98 -0
graphpop_cli/commands/setup.py +299 -0
graphpop_cli/commands/sfs.py +38 -0
graphpop_cli/commands/validate.py +167 -0
graphpop_cli/commands/xpehh.py +31 -0
graphpop_cli/config.py +57 -0
graphpop_cli/connection.py +52 -0
graphpop_cli/formatters.py +81 -0
graphpop_cli-0.1.0.dist-info/METADATA +73 -0
graphpop_cli-0.1.0.dist-info/RECORD +46 -0
graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0

graphpop_cli/commands/pop_summary.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""graphpop pop-summary — whole-chromosome population summary statistics."""
+import click
+from ..cli import pass_ctx
+from ..config import build_options_map, build_cypher
+from ..formatters import format_output
+@click.command("pop-summary")
+@click.argument("chr")
+@click.argument("population")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@click.option("--consequence", help="Filter by VEP consequence type")
+@click.option("--pathway", help="Filter by pathway name")
+@click.option("--gene", help="Filter by gene name")
+@pass_ctx
+def pop_summary(ctx, chr, population, output_path, fmt,
+                consequence, pathway, gene):
+    """Compute population summary statistics for a chromosome."""
+    opts = build_options_map(consequence=consequence, pathway=pathway, gene=gene)
+    cypher = build_cypher(
+        "graphpop.pop_summary",
+        [f"'{chr}'", f"'{population}'"],
+        options=opts if opts else None,
+        yield_cols=["pi", "theta_w", "tajima_d", "fay_wu_h", "mean_he", "mean_ho",
+                     "mean_fis", "n_variants", "n_segregating", "n_polarized"],
+    )
+    records = ctx.run(cypher)
+    format_output(records, output_path, fmt, "pop-summary",
+                  {"chr": chr, "pop": population})

graphpop_cli/commands/query.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""graphpop query — run arbitrary Cypher and format as TSV."""
+import click
+from ..cli import pass_ctx
+from ..formatters import format_output
+@click.command()
+@click.argument("cypher")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@pass_ctx
+def query(ctx, cypher, output_path, fmt):
+    """Run an arbitrary Cypher statement and format the results."""
+    records = ctx.run(cypher)
+    format_output(records, output_path, fmt, "query", {"cypher": cypher})

graphpop_cli/commands/rank_genes.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""graphpop rank-genes — rank genes by composite selection evidence."""
+from __future__ import annotations
+import click
+from ..cli import pass_ctx
+from ..formatters import format_output
+@click.command("rank-genes")
+@click.option("--pop", "population", required=True, help="Population name")
+@click.option("--pop2", help="Second population (for xpehh)")
+@click.option("--chr", "chromosome", help="Restrict to chromosome")
+@click.option("--top", type=int, default=100, help="Number of top genes (default: 100)")
+@click.option("--sort-by", "sort_by", default="composite",
+              type=click.Choice(["composite", "max_abs_ihs", "max_abs_xpehh",
+                                  "max_h12", "mean_fst", "n_high_impact"]),
+              help="Sort criterion (default: composite)")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@pass_ctx
+def rank_genes(ctx, population, pop2, chromosome, top, sort_by,
+               output_path, fmt):
+    """Rank genes by composite selection evidence.
+    For each gene computes:
+      - max_abs_ihs:   max |iHS| across variants in the gene
+      - max_abs_xpehh: max |XP-EHH| (if --pop2 provided)
+      - max_h12:       max H12 from overlapping GenomicWindow
+      - mean_fst:      mean Fst from GenomicWindow overlapping the gene
+      - n_high_impact: count of HIGH-impact variants
+    Composite score = sum of per-stat percentile ranks (higher = stronger signal).
+    \b
+    Examples:
+      graphpop rank-genes --pop EUR --top 50 -o top_genes.tsv
+      graphpop rank-genes --pop GJ-tmp --pop2 GJ-trop --chr Chr01 --sort-by max_abs_ihs
+      graphpop rank-genes --pop EUR --pop2 AFR --sort-by mean_fst --format json
+    """
+    # Dynamic property names cannot be parameterized — kept as f-strings.
+    ihs_prop = f"ihs_{population}"
+    xpehh_prop = f"xpehh_{population}_{pop2}" if pop2 else None
+    # Build parameterized chromosome filter strings and params dict.
+    chr_filter_v = "AND v.chr = $chromosome" if chromosome else ""
+    chr_filter_g = "AND g.chr = $chromosome" if chromosome else ""
+    params: dict = {}
+    if chromosome:
+        params["chromosome"] = chromosome
+    # --- Query: per-gene iHS, high-impact count ---
+    cypher_variant = f"""
+    MATCH (v:Variant)-[hc:HAS_CONSEQUENCE]->(g:Gene)
+    WHERE v.{ihs_prop} IS NOT NULL {chr_filter_v}
+    WITH g,
+         MAX(abs(v.{ihs_prop})) AS max_abs_ihs,
+         {'MAX(abs(v.' + xpehh_prop + ')) AS max_abs_xpehh,' if xpehh_prop else ''}
+         SUM(CASE WHEN hc.impact = 'HIGH' THEN 1 ELSE 0 END) AS n_high_impact,
+         COUNT(DISTINCT v) AS n_variants
+    RETURN g.symbol AS gene,
+           g.geneId AS gene_id,
+           g.chr AS chr,
+           g.start AS gene_start,
+           g.end AS gene_end,
+           max_abs_ihs,
+           {'max_abs_xpehh,' if xpehh_prop else ''}
+           n_high_impact,
+           n_variants
+    """
+    variant_records = ctx.run(cypher_variant, params)
+    if not variant_records:
+        # Fallback: try without iHS requirement
+        click.echo("No iHS data found; querying genes by annotation only.", err=True)
+        cypher_fallback = f"""
+        MATCH (v:Variant)-[hc:HAS_CONSEQUENCE]->(g:Gene)
+        WHERE TRUE {chr_filter_v}
+        WITH g,
+             SUM(CASE WHEN hc.impact = 'HIGH' THEN 1 ELSE 0 END) AS n_high_impact,
+             COUNT(DISTINCT v) AS n_variants
+        RETURN g.symbol AS gene,
+               g.geneId AS gene_id,
+               g.chr AS chr,
+               g.start AS gene_start,
+               g.end AS gene_end,
+               0.0 AS max_abs_ihs,
+               n_high_impact,
+               n_variants
+        """
+        variant_records = ctx.run(cypher_fallback, params)
+    if not variant_records:
+        click.echo("No genes found.", err=True)
+        return
+    # Build gene lookup
+    gene_data = {}
+    for rec in variant_records:
+        gene = rec.get("gene") or rec.get("gene_id")
+        if not gene:
+            continue
+        gene_data[gene] = {
+            "gene": gene,
+            "gene_id": rec.get("gene_id", ""),
+            "chr": rec.get("chr", ""),
+            "gene_start": rec.get("gene_start", 0),
+            "gene_end": rec.get("gene_end", 0),
+            "max_abs_ihs": rec.get("max_abs_ihs", 0) or 0,
+            "max_abs_xpehh": rec.get("max_abs_xpehh", 0) or 0,
+            "n_high_impact": rec.get("n_high_impact", 0) or 0,
+            "n_variants": rec.get("n_variants", 0) or 0,
+            "max_h12": 0.0,
+            "mean_fst": 0.0,
+        }
+    # --- Query: window-based stats (H12, Fst) overlapping genes ---
+    click.echo("Querying window-based statistics...", err=True)
+    window_params: dict = {
+        "gene_names": list(gene_data.keys()),
+        "population": population,
+    }
+    if chromosome:
+        window_params["chromosome"] = chromosome
+    cypher_windows = f"""
+    MATCH (g:Gene)
+    WHERE g.symbol IN $gene_names {chr_filter_g}
+    MATCH (w:GenomicWindow)
+    WHERE w.chr = g.chr AND w.population = $population
+      AND w.start <= g.end AND w.end >= g.start
+    WITH g, MAX(w.h12) AS max_h12, AVG(w.fst) AS mean_fst
+    RETURN g.symbol AS gene, max_h12, mean_fst
+    """
+    try:
+        window_records = ctx.run(cypher_windows, window_params)
+        for rec in window_records:
+            gene = rec.get("gene")
+            if gene in gene_data:
+                gene_data[gene]["max_h12"] = rec.get("max_h12", 0) or 0
+                gene_data[gene]["mean_fst"] = rec.get("mean_fst", 0) or 0
+    except SystemExit:
+        click.echo("Warning: could not query GenomicWindow stats.", err=True)
+    # --- Compute composite score (sum of ranks) ---
+    genes = list(gene_data.values())
+    stat_cols = ["max_abs_ihs", "max_abs_xpehh", "max_h12", "mean_fst", "n_high_impact"]
+    for col in stat_cols:
+        vals = sorted(set(g[col] for g in genes))
+        rank_map = {v: i for i, v in enumerate(vals)}
+        n = max(len(vals) - 1, 1)
+        for g in genes:
+            g[f"_rank_{col}"] = rank_map[g[col]] / n if n > 0 else 0
+    for g in genes:
+        g["composite"] = sum(g[f"_rank_{col}"] for col in stat_cols)
+    # Clean up internal rank columns
+    for g in genes:
+        for col in stat_cols:
+            del g[f"_rank_{col}"]
+    # Sort
+    if sort_by == "composite":
+        genes.sort(key=lambda g: g["composite"], reverse=True)
+    else:
+        genes.sort(key=lambda g: g.get(sort_by, 0), reverse=True)
+    # Top N
+    genes = genes[:top]
+    click.echo(f"Ranked {len(genes)} genes by {sort_by}.", err=True)
+    format_output(genes, output_path, fmt, "rank-genes",
+                  {"pop": population, "pop2": pop2, "chr": chromosome,
+                   "sort_by": sort_by, "top": top})

graphpop_cli/commands/report.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""graphpop report -- generate an automated HTML analysis summary report."""
+from __future__ import annotations
+import click
+from ..cli import pass_ctx
+@click.command()
+@click.option("-o", "--output", "output_path", required=True,
+              help="Output HTML file path")
+@click.option("--database", help="Override database name for the report title")
+@pass_ctx
+def report(ctx, output_path, database):
+    """Generate a self-contained HTML analysis report.
+    Queries the graph database for dataset overview, per-population diversity,
+    pairwise Fst, top selection signals, and annotation summary.
+    \b
+    Examples:
+      graphpop report -o report.html
+      graphpop report --database rice3k -o rice3k_report.html
+    """
+    db_name = database or ctx.database or "GraphPop"
+    click.echo(f"Generating report for database: {db_name} ...", err=True)
+    # ---- 1. Dataset overview ------------------------------------------------
+    overview_rows = ctx.run(
+        "MATCH (v:Variant) "
+        "WITH count(v) AS n_variants "
+        "OPTIONAL MATCH (s:Sample) "
+        "WITH n_variants, count(DISTINCT s) AS n_samples "
+        "OPTIONAL MATCH (g:Gene) "
+        "WITH n_variants, n_samples, count(DISTINCT g) AS n_genes "
+        "RETURN n_variants, n_samples, n_genes"
+    )
+    overview = overview_rows[0] if overview_rows else {
+        "n_variants": 0, "n_samples": 0, "n_genes": 0,
+    }
+    pop_list_rows = ctx.run(
+        "MATCH (v:Variant) WHERE v.pop_ids IS NOT NULL "
+        "RETURN v.pop_ids AS pids LIMIT 1"
+    )
+    populations = pop_list_rows[0]["pids"] if pop_list_rows else []
+    chr_rows = ctx.run(
+        "MATCH (v:Variant) "
+        "RETURN DISTINCT v.chr AS chr ORDER BY chr"
+    )
+    chromosomes = [r["chr"] for r in chr_rows]
+    # ---- 2. Per-population diversity ----------------------------------------
+    diversity_rows = ctx.run(
+        "MATCH (w:GenomicWindow) "
+        "WHERE w.pi IS NOT NULL "
+        "RETURN w.population AS population, "
+        "       avg(w.pi) AS mean_pi, "
+        "       avg(w.theta_w) AS mean_theta_w, "
+        "       avg(w.tajima_d) AS mean_tajima_d "
+        "ORDER BY mean_pi DESC"
+    )
+    # ---- 3. Pairwise Fst (top 10 pairs) ------------------------------------
+    fst_rows = ctx.run(
+        "MATCH (w:GenomicWindow) "
+        "WHERE w.fst IS NOT NULL AND w.pop_pair IS NOT NULL "
+        "RETURN w.pop_pair AS pop_pair, avg(w.fst) AS mean_fst "
+        "ORDER BY mean_fst DESC LIMIT 10"
+    )
+    # ---- 4. Top selection signals -------------------------------------------
+    ihs_rows = ctx.run(
+        "MATCH (v:Variant) "
+        "WHERE any(k IN keys(v) WHERE k STARTS WITH 'ihs_') "
+        "WITH v, [k IN keys(v) WHERE k STARTS WITH 'ihs_'] AS ks "
+        "UNWIND ks AS k "
+        "WITH v.variantId AS variant, v.chr AS chr, v.pos AS pos, "
+        "     k AS stat, v[k] AS value "
+        "WHERE abs(value) > 2.0 "
+        "RETURN variant, chr, pos, stat, value "
+        "ORDER BY abs(value) DESC LIMIT 20"
+    )
+    xpehh_rows = ctx.run(
+        "MATCH (v:Variant) "
+        "WHERE any(k IN keys(v) WHERE k STARTS WITH 'xpehh_') "
+        "WITH v, [k IN keys(v) WHERE k STARTS WITH 'xpehh_'] AS ks "
+        "UNWIND ks AS k "
+        "WITH v.variantId AS variant, v.chr AS chr, v.pos AS pos, "
+        "     k AS stat, v[k] AS value "
+        "WHERE abs(value) > 2.0 "
+        "RETURN variant, chr, pos, stat, value "
+        "ORDER BY abs(value) DESC LIMIT 20"
+    )
+    selection_rows = ihs_rows + xpehh_rows
+    selection_rows.sort(key=lambda r: abs(r.get("value", 0)), reverse=True)
+    selection_rows = selection_rows[:20]
+    # ---- 5. Annotation summary ----------------------------------------------
+    annot_rows = ctx.run(
+        "OPTIONAL MATCH (g:Gene) "
+        "WITH count(DISTINCT g) AS n_genes "
+        "OPTIONAL MATCH (pw:Pathway) "
+        "WITH n_genes, count(DISTINCT pw) AS n_pathways "
+        "OPTIONAL MATCH (go:GOTerm) "
+        "RETURN n_genes, n_pathways, count(DISTINCT go) AS n_go_terms"
+    )
+    annot = annot_rows[0] if annot_rows else {
+        "n_genes": 0, "n_pathways": 0, "n_go_terms": 0,
+    }
+    # ---- Build HTML ---------------------------------------------------------
+    def _table(headers, rows_data):
+        """Build an HTML table from headers and list-of-lists."""
+        lines = ['<table>', '<tr>' + ''.join(f'<th>{h}</th>' for h in headers) + '</tr>']
+        for row in rows_data:
+            lines.append('<tr>' + ''.join(f'<td>{_fmt(v)}</td>' for v in row) + '</tr>')
+        lines.append('</table>')
+        return '\n'.join(lines)
+    def _fmt(v):
+        if v is None:
+            return "NA"
+        if isinstance(v, float):
+            return f"{v:.6g}"
+        if isinstance(v, list):
+            return ", ".join(str(x) for x in v)
+        return str(v)
+    # Overview table
+    overview_html = _table(
+        ["Metric", "Value"],
+        [
+            ["Variants", overview.get("n_variants", 0)],
+            ["Samples", overview.get("n_samples", 0)],
+            ["Genes", overview.get("n_genes", 0)],
+            ["Populations", len(populations)],
+            ["Population IDs", ", ".join(str(p) for p in populations)],
+            ["Chromosomes", ", ".join(str(c) for c in chromosomes)],
+        ],
+    )
+    # Diversity table
+    if diversity_rows:
+        div_html = _table(
+            ["Population", "Mean pi", "Mean theta_W", "Mean Tajima's D"],
+            [[r["population"], r["mean_pi"], r["mean_theta_w"], r["mean_tajima_d"]]
+             for r in diversity_rows],
+        )
+    else:
+        div_html = "<p>No GenomicWindow diversity data found.</p>"
+    # Fst table
+    if fst_rows:
+        fst_html = _table(
+            ["Population Pair", "Mean Fst"],
+            [[r["pop_pair"], r["mean_fst"]] for r in fst_rows],
+        )
+    else:
+        fst_html = "<p>No pairwise Fst data found in GenomicWindow nodes.</p>"
+    # Selection signals table
+    if selection_rows:
+        sel_html = _table(
+            ["Variant", "Chr", "Pos", "Statistic", "Value"],
+            [[r["variant"], r["chr"], r["pos"], r["stat"], r["value"]]
+             for r in selection_rows],
+        )
+    else:
+        sel_html = "<p>No iHS or XP-EHH signals above threshold found.</p>"
+    # Annotation summary table
+    annot_html = _table(
+        ["Annotation Type", "Count"],
+        [
+            ["Genes", annot.get("n_genes", 0)],
+            ["Pathways", annot.get("n_pathways", 0)],
+            ["GO Terms", annot.get("n_go_terms", 0)],
+        ],
+    )
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>GraphPop Report: {db_name}</title>
+<style>
+body {{
+    font-family: Arial, Helvetica, sans-serif;
+    max-width: 960px;
+    margin: 40px auto;
+    padding: 0 20px;
+    color: #333;
+    line-height: 1.5;
+}}
+h1 {{
+    color: #0072B2;
+    border-bottom: 2px solid #0072B2;
+    padding-bottom: 8px;
+}}
+h2 {{
+    color: #555;
+    margin-top: 32px;
+}}
+table {{
+    border-collapse: collapse;
+    width: 100%;
+    margin: 12px 0 24px 0;
+    font-size: 13px;
+}}
+th, td {{
+    border: 1px solid #ddd;
+    padding: 6px 10px;
+    text-align: left;
+}}
+th {{
+    background-color: #0072B2;
+    color: white;
+    font-weight: 600;
+}}
+tr:nth-child(even) {{
+    background-color: #f9f9f9;
+}}
+tr:hover {{
+    background-color: #e9f3fb;
+}}
+footer {{
+    margin-top: 48px;
+    padding-top: 12px;
+    border-top: 1px solid #ddd;
+    font-size: 11px;
+    color: #999;
+}}
+</style>
+</head>
+<body>
+<h1>GraphPop Analysis Report: {db_name}</h1>
+<h2>Dataset Overview</h2>
+{overview_html}
+<h2>Population Diversity</h2>
+{div_html}
+<h2>Pairwise Fst (Top 10)</h2>
+{fst_html}
+<h2>Top Selection Signals</h2>
+{sel_html}
+<h2>Annotation Summary</h2>
+{annot_html}
+<footer>Generated by GraphPop CLI v0.1.0</footer>
+</body>
+</html>
+"""
+    with open(output_path, "w") as f:
+        f.write(html)
+    click.echo(f"Report saved to: {output_path}")

graphpop_cli/commands/roh.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""graphpop roh — runs of homozygosity."""
+import click
+from ..cli import pass_ctx
+from ..config import build_options_map, build_cypher
+from ..formatters import format_output
+@click.command()
+@click.argument("chr")
+@click.argument("population")
+@click.option("--method", type=click.Choice(["hmm", "window"]), default="hmm",
+              help="ROH detection method (default: hmm)")
+@click.option("--min-length", type=int, help="Minimum ROH length in bp")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@pass_ctx
+def roh(ctx, chr, population, method, min_length, output_path, fmt):
+    """Detect runs of homozygosity (ROH) per sample."""
+    opts = build_options_map(method=method, min_length=min_length)
+    cypher = build_cypher(
+        "graphpop.roh",
+        [f"'{chr}'", f"'{population}'"],
+        options=opts if opts else None,
+        yield_cols=["sampleId", "n_roh", "total_length", "froh",
+                     "mean_length", "max_length"],
+    )
+    records = ctx.run(cypher)
+    format_output(records, output_path, fmt, "roh",
+                  {"chr": chr, "pop": population, "method": method,
+                   "min_length": min_length})