PyPI - graphpop-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

graphpop-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

graphpop_cli/__init__.py +2 -0
graphpop_cli/cli.py +161 -0
graphpop_cli/commands/__init__.py +1 -0
graphpop_cli/commands/aggregate.py +206 -0
graphpop_cli/commands/batch.py +155 -0
graphpop_cli/commands/compare.py +118 -0
graphpop_cli/commands/config_cmd.py +117 -0
graphpop_cli/commands/converge.py +156 -0
graphpop_cli/commands/db.py +188 -0
graphpop_cli/commands/divergence.py +37 -0
graphpop_cli/commands/diversity.py +36 -0
graphpop_cli/commands/dump.py +210 -0
graphpop_cli/commands/export_bed.py +170 -0
graphpop_cli/commands/export_windows.py +91 -0
graphpop_cli/commands/extract.py +271 -0
graphpop_cli/commands/filter_results.py +165 -0
graphpop_cli/commands/garud_h.py +30 -0
graphpop_cli/commands/genome_scan.py +41 -0
graphpop_cli/commands/ihs.py +29 -0
graphpop_cli/commands/import_data.py +266 -0
graphpop_cli/commands/inventory.py +160 -0
graphpop_cli/commands/joint_sfs.py +38 -0
graphpop_cli/commands/ld.py +35 -0
graphpop_cli/commands/lookup.py +207 -0
graphpop_cli/commands/neighbors.py +175 -0
graphpop_cli/commands/nsl.py +29 -0
graphpop_cli/commands/plot.py +1066 -0
graphpop_cli/commands/pop_summary.py +30 -0
graphpop_cli/commands/query.py +15 -0
graphpop_cli/commands/rank_genes.py +177 -0
graphpop_cli/commands/report.py +264 -0
graphpop_cli/commands/roh.py +30 -0
graphpop_cli/commands/run_all.py +276 -0
graphpop_cli/commands/server.py +98 -0
graphpop_cli/commands/setup.py +299 -0
graphpop_cli/commands/sfs.py +38 -0
graphpop_cli/commands/validate.py +167 -0
graphpop_cli/commands/xpehh.py +31 -0
graphpop_cli/config.py +57 -0
graphpop_cli/connection.py +52 -0
graphpop_cli/formatters.py +81 -0
graphpop_cli-0.1.0.dist-info/METADATA +73 -0
graphpop_cli-0.1.0.dist-info/RECORD +46 -0
graphpop_cli-0.1.0.dist-info/WHEEL +5 -0
graphpop_cli-0.1.0.dist-info/entry_points.txt +2 -0
graphpop_cli-0.1.0.dist-info/top_level.txt +1 -0

graphpop_cli/commands/import_data.py ADDED Viewed

@@ -0,0 +1,266 @@
+"""graphpop import — import VCF data into a Neo4j graph database."""
+from __future__ import annotations
+import subprocess
+import sys
+from pathlib import Path
+import click
+import yaml
+def _get_neo4j_home() -> Path:
+    """Get Neo4j home from config."""
+    config_path = Path.home() / ".graphpop" / "config.yaml"
+    if config_path.exists():
+        with open(config_path) as f:
+            cfg = yaml.safe_load(f) or {}
+        if "neo4j_home" in cfg:
+            return Path(cfg["neo4j_home"])
+    return Path.home() / "neo4j"
+@click.command("import")
+@click.option("--vcf", required=True, type=click.Path(exists=True),
+              help="Input VCF file (bgzipped recommended)")
+@click.option("--panel", required=True, type=click.Path(exists=True),
+              help="Population panel file (TSV: sample_id, population)")
+@click.option("--database", required=True,
+              help="Name for the Neo4j database")
+@click.option("--vep", type=click.Path(exists=True),
+              help="VEP/SnpEff annotation file")
+@click.option("--pathways", type=click.Path(exists=True),
+              help="Reactome/Plant Reactome pathway file")
+@click.option("--go-terms", type=click.Path(exists=True),
+              help="GO term annotation file (UniProt GOA format)")
+@click.option("--ancestral", type=click.Path(exists=True),
+              help="Ancestral allele FASTA (Ensembl EPO)")
+@click.option("--csv-dir", type=click.Path(),
+              help="Directory for intermediate CSV files (default: temp)")
+@click.option("--neo4j-home", type=click.Path(),
+              help="Neo4j installation directory")
+@click.option("--threads", type=int, default=4,
+              help="Import threads (default: 4)")
+@click.option("--skip-csv", is_flag=True,
+              help="Skip CSV generation (reuse existing CSVs)")
+@click.option("--skip-import", is_flag=True,
+              help="Skip neo4j-admin import (CSVs only)")
+@click.option("--skip-annotations", is_flag=True,
+              help="Skip annotation loading")
+def import_data(vcf, panel, database, vep, pathways, go_terms, ancestral,
+                csv_dir, neo4j_home, threads, skip_csv, skip_import,
+                skip_annotations):
+    """Import VCF data into a Neo4j graph database.
+    This command orchestrates the full import pipeline:
+    \b
+    1. Parse VCF + panel → generate CSV files (Variant, Sample, Population, etc.)
+    2. Run neo4j-admin database import to bulk-load CSVs
+    3. Load functional annotations (VEP, pathways, GO terms, ancestral alleles)
+    The database name is user-specified and stored in the GraphPop config.
+    \b
+    Examples:
+      graphpop import --vcf data.vcf.gz --panel panel.txt --database myproject
+      graphpop import --vcf rice.vcf.gz --panel rice_panel.txt \\
+        --database rice3k --vep rice_vep.vcf --pathways plant_reactome.tsv
+    """
+    neo4j_path = Path(neo4j_home) if neo4j_home else _get_neo4j_home()
+    csv_path = Path(csv_dir) if csv_dir else Path(f"/tmp/graphpop_csv_{database}")
+    csv_path.mkdir(parents=True, exist_ok=True)
+    click.echo(f"GraphPop Import Pipeline")
+    click.echo(f"  VCF:       {vcf}")
+    click.echo(f"  Panel:     {panel}")
+    click.echo(f"  Database:  {database}")
+    click.echo(f"  Neo4j:     {neo4j_path}")
+    click.echo(f"  CSV dir:   {csv_path}")
+    click.echo()
+    # Step 1: Generate CSVs
+    if not skip_csv:
+        click.echo("Step 1/3: Generating CSV files from VCF...")
+        _run_csv_generation(vcf, panel, csv_path, threads)
+    else:
+        click.echo("Step 1/3: Skipping CSV generation (--skip-csv)")
+    # Step 2: neo4j-admin import
+    if not skip_import:
+        click.echo("\nStep 2/3: Running neo4j-admin bulk import...")
+        _run_bulk_import(neo4j_path, csv_path, database)
+    else:
+        click.echo("\nStep 2/3: Skipping bulk import (--skip-import)")
+    # Step 3: Load annotations
+    if not skip_annotations:
+        click.echo("\nStep 3/3: Loading annotations...")
+        _load_annotations(neo4j_path, database, vep, pathways, go_terms, ancestral)
+    else:
+        click.echo("\nStep 3/3: Skipping annotations (--skip-annotations)")
+    # Update config with new database
+    _update_config(database)
+    click.echo(f"""
+Import complete!
+  Database:  {database}
+  GraphPop config updated to use database '{database}'.
+Next steps:
+  graphpop start                                     # Start Neo4j (if not running)
+  graphpop db info                                   # Verify node/edge counts
+  graphpop diversity chr1 1 50000000 YOUR_POP        # Run first analysis
+  graphpop run-all --database {database} -d results/ # Full-genome analysis
+""")
+def _run_csv_generation(vcf: str, panel: str, csv_dir: Path, threads: int):
+    """Run the graphpop-import CSV generation."""
+    try:
+        import importlib
+        # Try importing graphpop_import directly
+        spec = importlib.util.find_spec("graphpop_import")
+        if spec:
+            click.echo("  Using graphpop-import Python package...")
+            from graphpop_import.vcf_parser import VCFParser
+            from graphpop_import.csv_emitter import CSVEmitter
+            parser = VCFParser(vcf, panel)
+            emitter = CSVEmitter(str(csv_dir))
+            parser.parse(emitter)
+            click.echo(f"  CSVs written to {csv_dir}")
+            return
+    except ImportError:
+        pass
+    # Fallback: run as subprocess
+    click.echo("  Running graphpop-import as subprocess...")
+    scripts = [
+        Path("graphpop-import/src/graphpop_import/vcf_parser.py"),
+        Path("scripts/rice_csv_parallel.py"),
+    ]
+    for script in scripts:
+        if script.exists():
+            result = subprocess.run(
+                [sys.executable, str(script),
+                 "--vcf", vcf, "--panel", panel, "--output", str(csv_dir),
+                 "--threads", str(threads)],
+                capture_output=True, text=True,
+            )
+            if result.returncode == 0:
+                click.echo(f"  CSVs written to {csv_dir}")
+                return
+            else:
+                click.echo(f"  Warning: {result.stderr[:200]}", err=True)
+    click.echo(
+        "  Error: graphpop-import not found.\n"
+        "  Install with: pip install -e graphpop-import/\n"
+        "  Or generate CSVs manually and use --skip-csv",
+        err=True,
+    )
+    raise SystemExit(1)
+def _run_bulk_import(neo4j_home: Path, csv_dir: Path, database: str):
+    """Run neo4j-admin database import."""
+    admin_bin = neo4j_home / "bin" / "neo4j-admin"
+    if not admin_bin.exists():
+        click.echo(f"  Error: neo4j-admin not found at {admin_bin}", err=True)
+        raise SystemExit(1)
+    # Check if database already exists
+    db_dir = neo4j_home / "data" / "databases" / database
+    if db_dir.exists():
+        if not click.confirm(f"  Database '{database}' already exists. Overwrite?"):
+            click.echo("  Import cancelled.")
+            raise SystemExit(0)
+    # Build neo4j-admin import command
+    cmd = [
+        str(admin_bin), "database", "import", "full",
+        f"--nodes=Variant={csv_dir}/variant_header.csv,{csv_dir}/variants_*.csv",
+        f"--nodes=Sample={csv_dir}/sample_header.csv,{csv_dir}/samples.csv",
+        f"--nodes=Population={csv_dir}/population_header.csv,{csv_dir}/populations.csv",
+        f"--nodes=Chromosome={csv_dir}/chromosome_header.csv,{csv_dir}/chromosomes.csv",
+        f"--relationships=NEXT={csv_dir}/next_header.csv,{csv_dir}/next_*.csv",
+        f"--relationships=ON_CHROMOSOME={csv_dir}/on_chromosome_header.csv,{csv_dir}/on_chromosome_*.csv",
+        f"--relationships=IN_POPULATION={csv_dir}/in_population_header.csv,{csv_dir}/in_population.csv",
+        "--overwrite-destination=true",
+        database,
+    ]
+    click.echo(f"  Running: neo4j-admin database import {database}")
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        click.echo(f"  Import failed: {result.stderr[:500]}", err=True)
+        click.echo("  You may need to stop Neo4j first: graphpop stop", err=True)
+        raise SystemExit(1)
+    click.echo("  Bulk import complete.")
+def _load_annotations(neo4j_home: Path, database: str,
+                      vep: str | None, pathways: str | None,
+                      go_terms: str | None, ancestral: str | None):
+    """Load functional annotations via Cypher transactions."""
+    if not any([vep, pathways, go_terms, ancestral]):
+        click.echo("  No annotations specified, skipping.")
+        return
+    # Load annotations by running the appropriate Python scripts
+    scripts_dir = Path("scripts")
+    annotation_scripts = []
+    if vep:
+        click.echo(f"  Loading VEP annotations from {vep}...")
+        annotation_scripts.append(("load_annotations", ["--vep", vep]))
+    if pathways:
+        click.echo(f"  Loading pathway annotations from {pathways}...")
+        annotation_scripts.append(("load_annotations", ["--pathways", pathways]))
+    if go_terms:
+        click.echo(f"  Loading GO term annotations from {go_terms}...")
+        annotation_scripts.append(("load_annotations", ["--go", go_terms]))
+    if ancestral:
+        click.echo(f"  Loading ancestral alleles from {ancestral}...")
+        annotation_scripts.append(("load_annotations", ["--ancestral", ancestral]))
+    for script_name, args in annotation_scripts:
+        # Try to find the annotation loading script
+        candidates = [
+            scripts_dir / f"{script_name}.py",
+            scripts_dir / "load_rice_annotations.py",
+            Path(f"graphpop-import/src/graphpop_import/{script_name}.py"),
+        ]
+        for script in candidates:
+            if script.exists():
+                result = subprocess.run(
+                    [sys.executable, str(script), "--database", database] + args,
+                    capture_output=True, text=True,
+                )
+                if result.returncode == 0:
+                    click.echo(f"  Loaded: {script_name}")
+                    break
+                else:
+                    click.echo(f"  Warning: {result.stderr[:200]}", err=True)
+        else:
+            click.echo(f"  Annotation script not found for: {script_name}")
+            click.echo("  You can load annotations manually after import.")
+def _update_config(database: str):
+    """Update GraphPop config to use the new database."""
+    config_path = Path.home() / ".graphpop" / "config.yaml"
+    cfg = {}
+    if config_path.exists():
+        with open(config_path) as f:
+            cfg = yaml.safe_load(f) or {}
+    cfg["database"] = database
+    config_path.parent.mkdir(exist_ok=True)
+    with open(config_path, "w") as f:
+        yaml.dump(cfg, f, default_flow_style=False)
+    click.echo(f"  Config updated: database = {database}")

graphpop_cli/commands/inventory.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""graphpop inventory — comprehensive database inventory."""
+from __future__ import annotations
+import click
+from ..cli import pass_ctx
+from ..formatters import format_output
+@click.command("inventory")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@pass_ctx
+def inventory(ctx, output_path, fmt):
+    """Show comprehensive database inventory.
+    Reports node/relationship counts, populations, chromosomes, loaded
+    annotations, and persisted statistics. No arguments needed.
+    \b
+    Examples:
+      graphpop inventory
+      graphpop inventory --format json -o db_inventory.json
+    """
+    sections = []
+    # --- 1. Node label counts ---
+    click.echo("Querying node counts...", err=True)
+    labels = ["Variant", "Sample", "Population", "Gene", "Pathway",
+              "GOTerm", "GenomicWindow"]
+    for label in labels:
+        recs = ctx.run(f"MATCH (n:{label}) RETURN count(n) AS count")
+        count = recs[0]["count"] if recs else 0
+        sections.append({"section": "nodes", "item": label, "value": str(count)})
+    # --- 2. Relationship type counts ---
+    click.echo("Querying relationship counts...", err=True)
+    rel_types = ["CARRIES", "HAS_CONSEQUENCE", "IN_PATHWAY", "HAS_GO_TERM",
+                 "NEXT", "LD", "BELONGS_TO"]
+    for rel in rel_types:
+        recs = ctx.run(f"MATCH ()-[r:{rel}]->() RETURN count(r) AS count")
+        count = recs[0]["count"] if recs else 0
+        sections.append({"section": "relationships", "item": rel, "value": str(count)})
+    # --- 3. Populations and sample counts ---
+    click.echo("Querying populations...", err=True)
+    recs = ctx.run(
+        "MATCH (p:Population) "
+        "OPTIONAL MATCH (s:Sample)-[:BELONGS_TO]->(p) "
+        "RETURN p.popId AS population, count(s) AS sample_count "
+        "ORDER BY p.popId"
+    )
+    for rec in recs:
+        sections.append({
+            "section": "populations",
+            "item": rec["population"],
+            "value": str(rec["sample_count"]),
+        })
+    # --- 4. Chromosomes and variant counts ---
+    click.echo("Querying chromosomes...", err=True)
+    recs = ctx.run(
+        "MATCH (v:Variant) "
+        "RETURN v.chr AS chr, count(v) AS variant_count "
+        "ORDER BY v.chr"
+    )
+    for rec in recs:
+        sections.append({
+            "section": "chromosomes",
+            "item": rec["chr"],
+            "value": str(rec["variant_count"]),
+        })
+    # --- 5. Annotation coverage ---
+    click.echo("Querying annotations...", err=True)
+    # HAS_CONSEQUENCE edges
+    recs = ctx.run("MATCH ()-[r:HAS_CONSEQUENCE]->() RETURN count(r) AS count")
+    has_conseq = recs[0]["count"] if recs else 0
+    sections.append({"section": "annotations", "item": "HAS_CONSEQUENCE edges",
+                     "value": str(has_conseq)})
+    # IN_PATHWAY edges
+    recs = ctx.run("MATCH ()-[r:IN_PATHWAY]->() RETURN count(r) AS count")
+    has_pw = recs[0]["count"] if recs else 0
+    sections.append({"section": "annotations", "item": "IN_PATHWAY edges",
+                     "value": str(has_pw)})
+    # HAS_GO_TERM edges
+    recs = ctx.run("MATCH ()-[r:HAS_GO_TERM]->() RETURN count(r) AS count")
+    has_go = recs[0]["count"] if recs else 0
+    sections.append({"section": "annotations", "item": "HAS_GO_TERM edges",
+                     "value": str(has_go)})
+    # Ancestral allele coverage
+    recs = ctx.run(
+        "MATCH (v:Variant) WHERE v.ancestral_allele IS NOT NULL "
+        "RETURN count(v) AS count"
+    )
+    aa_count = recs[0]["count"] if recs else 0
+    sections.append({"section": "annotations", "item": "variants_with_ancestral_allele",
+                     "value": str(aa_count)})
+    # --- 6. Persisted statistics ---
+    click.echo("Querying persisted statistics...", err=True)
+    # Check for ihs/xpehh/nsl properties on Variant nodes (sample a few)
+    for stat_prefix in ["ihs_", "xpehh_", "nsl_"]:
+        recs = ctx.run(
+            "MATCH (v:Variant) "
+            "WITH v LIMIT 1 "
+            "UNWIND keys(v) AS k "
+            "WITH k WHERE k STARTS WITH $stat_prefix "
+            "RETURN COLLECT(DISTINCT k) AS props",
+            {"stat_prefix": stat_prefix},
+        )
+        props = recs[0]["props"] if recs and recs[0]["props"] else []
+        if props:
+            for p in props:
+                sections.append({"section": "persisted_stats", "item": p,
+                                 "value": "on Variant nodes"})
+        else:
+            sections.append({"section": "persisted_stats",
+                             "item": f"{stat_prefix}*",
+                             "value": "none found"})
+    # GenomicWindow statistics
+    recs = ctx.run("MATCH (w:GenomicWindow) RETURN count(w) AS count")
+    gw_count = recs[0]["count"] if recs else 0
+    sections.append({"section": "persisted_stats", "item": "GenomicWindow count",
+                     "value": str(gw_count)})
+    # Fst properties on GenomicWindow
+    recs = ctx.run(
+        "MATCH (w:GenomicWindow) "
+        "WITH w LIMIT 1 "
+        "UNWIND keys(w) AS k "
+        "WITH k WHERE k STARTS WITH 'fst_' "
+        "RETURN COLLECT(DISTINCT k) AS props"
+    )
+    fst_props = recs[0]["props"] if recs and recs[0]["props"] else []
+    for p in fst_props:
+        sections.append({"section": "persisted_stats", "item": p,
+                         "value": "on GenomicWindow nodes"})
+    # --- Print summary ---
+    if fmt == "tsv" and not output_path:
+        _print_inventory(sections)
+    else:
+        format_output(sections, output_path, fmt, "inventory", {})
+def _print_inventory(sections: list[dict]):
+    """Pretty-print inventory to stderr/stdout."""
+    current_section = None
+    for row in sections:
+        sec = row["section"]
+        if sec != current_section:
+            current_section = sec
+            click.echo(f"\n=== {sec.upper().replace('_', ' ')} ===")
+        click.echo(f"  {row['item']:40s} {row['value']}")
+    click.echo()

graphpop_cli/commands/joint_sfs.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""graphpop joint-sfs — joint site frequency spectrum between two populations."""
+import click
+from ..cli import pass_ctx
+from ..config import build_options_map, build_cypher
+from ..formatters import format_output
+@click.command("joint-sfs")
+@click.argument("chr")
+@click.argument("start", type=int)
+@click.argument("end", type=int)
+@click.argument("pop1")
+@click.argument("pop2")
+@click.option("--unfolded", is_flag=True, default=False,
+              help="Compute unfolded joint SFS (requires ancestral allele)")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@click.option("--consequence", help="Filter by VEP consequence type")
+@click.option("--pathway", help="Filter by pathway name")
+@click.option("--gene", help="Filter by gene name")
+@click.option("--min-af", type=float, help="Minimum allele frequency")
+@pass_ctx
+def joint_sfs(ctx, chr, start, end, pop1, pop2, unfolded, output_path, fmt,
+              consequence, pathway, gene, min_af):
+    """Compute the joint site frequency spectrum between two populations."""
+    opts = build_options_map(consequence=consequence, pathway=pathway, gene=gene,
+                             min_af=min_af)
+    cypher = build_cypher(
+        "graphpop.joint_sfs",
+        [f"'{chr}'", str(start), str(end), f"'{pop1}'", f"'{pop2}'",
+         "true" if unfolded else "false"],
+        options=opts if opts else None,
+        yield_cols=["joint_sfs", "n_variants", "max_ac1", "max_ac2", "dim1", "dim2"],
+    )
+    records = ctx.run(cypher)
+    format_output(records, output_path, fmt, "joint-sfs",
+                  {"chr": chr, "start": start, "end": end,
+                   "pop1": pop1, "pop2": pop2, "unfolded": unfolded})

graphpop_cli/commands/ld.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""graphpop ld — linkage disequilibrium (r2, D')."""
+import click
+from ..cli import pass_ctx
+from ..config import build_options_map, build_cypher
+from ..formatters import format_output
+@click.command()
+@click.argument("chr")
+@click.argument("start", type=int)
+@click.argument("end", type=int)
+@click.argument("population")
+@click.argument("max_dist", type=int)
+@click.argument("threshold", type=float)
+@click.option("--persist", is_flag=True, default=False,
+              help="Write LD edges to the graph")
+@click.option("-o", "--output", "output_path", help="Output file (default: stdout)")
+@click.option("--format", "fmt", default="tsv", type=click.Choice(["tsv", "csv", "json"]))
+@click.option("--min-af", type=float, help="Minimum allele frequency")
+@pass_ctx
+def ld(ctx, chr, start, end, population, max_dist, threshold, persist,
+       output_path, fmt, min_af):
+    """Compute pairwise linkage disequilibrium (r2 and D')."""
+    opts = build_options_map(min_af=min_af, write_edges=persist)
+    cypher = build_cypher(
+        "graphpop.ld",
+        [f"'{chr}'", str(start), str(end), f"'{population}'",
+         str(max_dist), str(threshold)],
+        options=opts if opts else None,
+        yield_cols=["variant1", "variant2", "r2", "dprime", "distance"],
+    )
+    records = ctx.run(cypher)
+    format_output(records, output_path, fmt, "ld",
+                  {"chr": chr, "start": start, "end": end, "pop": population,
+                   "max_dist": max_dist, "threshold": threshold, "persist": persist})