causaliq-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. causaliq_knowledge/__init__.py +6 -3
  2. causaliq_knowledge/action.py +480 -0
  3. causaliq_knowledge/cache/__init__.py +18 -0
  4. causaliq_knowledge/cache/encoders/__init__.py +13 -0
  5. causaliq_knowledge/cache/encoders/base.py +90 -0
  6. causaliq_knowledge/cache/encoders/json_encoder.py +430 -0
  7. causaliq_knowledge/cache/token_cache.py +666 -0
  8. causaliq_knowledge/cli/__init__.py +15 -0
  9. causaliq_knowledge/cli/cache.py +478 -0
  10. causaliq_knowledge/cli/generate.py +410 -0
  11. causaliq_knowledge/cli/main.py +172 -0
  12. causaliq_knowledge/cli/models.py +309 -0
  13. causaliq_knowledge/graph/__init__.py +78 -0
  14. causaliq_knowledge/graph/generator.py +457 -0
  15. causaliq_knowledge/graph/loader.py +222 -0
  16. causaliq_knowledge/graph/models.py +426 -0
  17. causaliq_knowledge/graph/params.py +175 -0
  18. causaliq_knowledge/graph/prompts.py +445 -0
  19. causaliq_knowledge/graph/response.py +392 -0
  20. causaliq_knowledge/graph/view_filter.py +154 -0
  21. causaliq_knowledge/llm/base_client.py +147 -1
  22. causaliq_knowledge/llm/cache.py +443 -0
  23. causaliq_knowledge/py.typed +0 -0
  24. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/METADATA +10 -6
  25. causaliq_knowledge-0.4.0.dist-info/RECORD +42 -0
  26. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/WHEEL +1 -1
  27. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/entry_points.txt +3 -0
  28. causaliq_knowledge/cli.py +0 -414
  29. causaliq_knowledge-0.2.0.dist-info/RECORD +0 -22
  30. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/licenses/LICENSE +0 -0
  31. {causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,410 @@
1
+ """Graph generation CLI commands.
2
+
3
+ This module provides commands for generating causal graphs from
4
+ model specifications using LLMs.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING, Optional
13
+
14
+ import click
15
+ from pydantic import ValidationError
16
+
17
+ from causaliq_knowledge.graph.params import GenerateGraphParams
18
+ from causaliq_knowledge.graph.view_filter import PromptDetail
19
+
20
+ if TYPE_CHECKING: # pragma: no cover
21
+ from causaliq_knowledge.graph.models import ModelSpec
22
+ from causaliq_knowledge.graph.response import GeneratedGraph
23
+
24
+
25
def _map_graph_names(
    graph: "GeneratedGraph", mapping: dict[str, str]
) -> "GeneratedGraph":
    """Map variable names in a graph using a mapping dictionary.

    Args:
        graph: The generated graph with edges to map.
        mapping: Dictionary mapping old names to new names.

    Returns:
        New GeneratedGraph with mapped variable names.  Names absent
        from the mapping are passed through unchanged.
    """
    # Local import: response module is only needed when mapping occurs.
    from causaliq_knowledge.graph.response import GeneratedGraph, ProposedEdge

    # Rebuild each edge with mapped endpoints.  Fix: carry the per-edge
    # reasoning across — previously it was dropped here, so any graph
    # generated with LLM names lost edge reasoning in the JSON output
    # and the edge listing.
    new_edges = [
        ProposedEdge(
            source=mapping.get(edge.source, edge.source),
            target=mapping.get(edge.target, edge.target),
            confidence=edge.confidence,
            reasoning=edge.reasoning,
        )
        for edge in graph.edges
    ]

    # Map variable names too
    new_variables = [mapping.get(v, v) for v in graph.variables]

    return GeneratedGraph(
        edges=new_edges,
        variables=new_variables,
        reasoning=graph.reasoning,
        metadata=graph.metadata,
    )
57
+
58
+
59
@click.command("generate_graph")
@click.option(
    "--model-spec",
    "-s",
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="Path to model specification JSON file.",
)
@click.option(
    "--prompt-detail",
    "-p",
    "prompt_detail",
    default="standard",
    type=click.Choice(["minimal", "standard", "rich"], case_sensitive=False),
    help="Detail level for variable information in prompts.",
)
@click.option(
    "--use-benchmark-names/--use-llm-names",
    "use_benchmark_names",
    default=False,
    help="Use benchmark names instead of LLM names (test memorisation).",
)
@click.option(
    "--llm-model",
    "-m",
    "llm_model",
    default="groq/llama-3.1-8b-instant",
    help="LLM model to use (e.g., groq/llama-3.1-8b-instant).",
)
@click.option(
    "--output",
    "-o",
    required=True,
    help="Output: .json file path or 'none' for adjacency matrix to stdout.",
)
@click.option(
    "--llm-cache",
    "-c",
    "llm_cache",
    required=True,
    help="Path to cache database (.db) or 'none' to disable caching.",
)
@click.option(
    "--llm-temperature",
    "-t",
    type=float,
    default=0.1,
    help="LLM temperature (0.0-1.0). Lower = more deterministic.",
)
def generate_graph(
    model_spec: Path,
    prompt_detail: str,
    use_benchmark_names: bool,
    llm_model: str,
    output: str,
    llm_cache: str,
    llm_temperature: float,
) -> None:
    """Generate a causal graph from a model specification.

    Reads variable definitions from a JSON model specification file and
    uses an LLM to propose causal relationships between variables.

    By default, LLM names are used in prompts to prevent memorisation.
    Use --use-benchmark-names to test with original benchmark names.

    Output behaviour:

    - If output is a .json file: writes JSON to file, prints edges to stdout
    - If output is 'none': prints adjacency matrix to stdout, edges to stderr

    Examples:

        cqknow generate_graph -s model.json -c cache.db -o graph.json

        cqknow generate_graph -s model.json -c cache.db -o none

        cqknow generate_graph -s model.json -c none -o none --use-benchmark-names
    """
    # Import here to avoid slow startup for --help
    from causaliq_knowledge.cache import TokenCache
    from causaliq_knowledge.graph import ModelLoader
    from causaliq_knowledge.graph.generator import (
        GraphGenerator,
        GraphGeneratorConfig,
    )
    from causaliq_knowledge.graph.prompts import OutputFormat

    # Validate all parameters using shared model
    try:
        params = GenerateGraphParams(
            model_spec=model_spec,
            prompt_detail=PromptDetail(prompt_detail.lower()),
            use_benchmark_names=use_benchmark_names,
            llm_model=llm_model,
            output=output,
            llm_cache=llm_cache,
            llm_temperature=llm_temperature,
        )
    except ValidationError as e:
        # Format Pydantic errors for CLI.  Only the first element of each
        # error location is reported; for these flat parameters that is
        # the field name itself.
        for error in e.errors():
            field = error.get("loc", ["unknown"])[0]
            msg = error.get("msg", "validation error")
            click.echo(f"Error: --{field}: {msg}", err=True)
        sys.exit(1)

    # Get effective paths from validated params.
    # None here means "print adjacency matrix to stdout" further below.
    output_path = params.get_effective_output_path()

    # Load model specification
    try:
        spec = ModelLoader.load(params.model_spec)
        click.echo(
            f"Loaded model specification: {spec.dataset_id} "
            f"({len(spec.variables)} variables)",
            err=True,
        )
    except Exception as e:
        click.echo(f"Error loading model specification: {e}", err=True)
        sys.exit(1)

    # Track mapping for converting LLM output back to benchmark names.
    # Stays empty when benchmark names are used directly, or when the
    # spec has no distinct LLM names.
    llm_to_benchmark_mapping: dict[str, str] = {}

    # Determine naming mode
    use_llm_names = not params.use_benchmark_names
    if use_llm_names and spec.uses_distinct_llm_names():
        llm_to_benchmark_mapping = spec.get_llm_to_name_mapping()
        click.echo("Using LLM names (prevents memorisation)", err=True)
    elif params.use_benchmark_names:
        click.echo("Using benchmark names (memorisation test)", err=True)

    # Set up cache
    cache: Optional[TokenCache] = None
    cache_path = params.get_effective_cache_path()
    if cache_path is not None:
        try:
            cache = TokenCache(str(cache_path))
            cache.open()
            click.echo(f"Using cache: {cache_path}", err=True)
        except Exception as e:
            click.echo(f"Error opening cache: {e}", err=True)
            sys.exit(1)
    else:
        click.echo("Cache disabled", err=True)

    # Create generator - use edge_list format for structured output
    # NOTE(review): an opened cache is not closed on the sys.exit error
    # paths below — presumably TokenCache releases its resources at
    # process exit; confirm.
    try:
        # Derive request_id from output filename stem
        if params.output.lower() == "none":
            request_id = "none"
        else:
            request_id = Path(params.output).stem

        config = GraphGeneratorConfig(
            temperature=params.llm_temperature,
            output_format=OutputFormat.EDGE_LIST,
            prompt_detail=params.prompt_detail,
            use_llm_names=use_llm_names,
            request_id=request_id,
        )
        generator = GraphGenerator(
            model=params.llm_model, config=config, cache=cache
        )
    except ValueError as e:
        click.echo(f"Error creating generator: {e}", err=True)
        sys.exit(1)

    # Generate graph
    click.echo(f"Generating graph using {params.llm_model}...", err=True)
    click.echo(f"View level: {params.prompt_detail.value}", err=True)

    try:
        graph = generator.generate_from_spec(spec, level=params.prompt_detail)
    except Exception as e:
        click.echo(f"Error generating graph: {e}", err=True)
        sys.exit(1)

    # Map LLM names back to benchmark names
    if llm_to_benchmark_mapping:
        graph = _map_graph_names(graph, llm_to_benchmark_mapping)
        click.echo("Mapped LLM names back to benchmark names", err=True)

    # Build JSON output
    result = _build_output(graph, spec, params.llm_model, params.prompt_detail)

    # Output results - always print edges summary to stdout
    _print_edges(graph)
    _print_summary(graph, err=False)

    if output_path:
        # Write JSON to file
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
        click.echo(f"\nOutput written to: {output_path}", err=True)
    else:
        # Print adjacency matrix to stdout
        click.echo()
        _print_adjacency_matrix(graph, spec)

    # Show stats (only when at least one real LLM call was made, i.e.
    # not everything was served from cache)
    stats = generator.get_stats()
    if stats.get("client_call_count", 0) > 0:
        click.echo(
            f"\nLLM calls: {stats['client_call_count']}, "
            f"Generator calls: {stats['call_count']}",
            err=True,
        )

    # Close cache if opened
    if cache:
        cache.close()
271
+
272
+
273
+ def _build_output(
274
+ graph: GeneratedGraph,
275
+ spec: ModelSpec,
276
+ llm_model: str,
277
+ level: PromptDetail,
278
+ ) -> dict:
279
+ """Build output dictionary for the generated graph.
280
+
281
+ Args:
282
+ graph: The GeneratedGraph result.
283
+ spec: The ModelSpec used.
284
+ llm_model: LLM model identifier.
285
+ level: View level used.
286
+
287
+ Returns:
288
+ Dictionary suitable for JSON output.
289
+ """
290
+ edges = []
291
+ for edge in graph.edges:
292
+ edge_dict = {
293
+ "source": edge.source,
294
+ "target": edge.target,
295
+ "confidence": edge.confidence,
296
+ }
297
+ if edge.reasoning:
298
+ edge_dict["reasoning"] = edge.reasoning
299
+ edges.append(edge_dict)
300
+
301
+ result = {
302
+ "dataset_id": spec.dataset_id,
303
+ "domain": spec.domain,
304
+ "variable_count": len(spec.variables),
305
+ "edge_count": len(edges),
306
+ "edges": edges,
307
+ "generation": {
308
+ "model": llm_model,
309
+ "prompt_detail": level.value,
310
+ },
311
+ }
312
+
313
+ # Add metadata if available
314
+ if graph.metadata:
315
+ result["metadata"] = {
316
+ "model": graph.metadata.model,
317
+ "provider": graph.metadata.provider,
318
+ "input_tokens": graph.metadata.input_tokens,
319
+ "output_tokens": graph.metadata.output_tokens,
320
+ "from_cache": graph.metadata.from_cache,
321
+ }
322
+
323
+ return result
324
+
325
+
326
def _print_edges(graph: GeneratedGraph) -> None:
    """Print proposed edges, highest confidence first.

    Each edge is shown with a ten-segment confidence bar and its
    confidence as a percentage; truncated reasoning (at most 100
    characters) is printed beneath when present.

    Args:
        graph: The GeneratedGraph result.
    """
    if not graph.edges:
        click.echo("\nNo edges proposed by the LLM.")
        return

    click.echo(f"\nProposed Edges ({len(graph.edges)}):\n")

    # Rank edges so the most confident ones appear at the top.
    ranked = sorted(graph.edges, key=lambda e: e.confidence, reverse=True)

    for i, edge in enumerate(ranked, 1):
        # Ten-character bar: one filled segment per 10% of confidence.
        filled = int(edge.confidence * 10)
        conf_bar = "█" * filled + "░" * (10 - filled)
        conf_pct = edge.confidence * 100
        click.echo(
            f"  {i:2d}. {edge.source} → {edge.target} "
            f"[{conf_bar}] {conf_pct:5.1f}%"
        )
        if edge.reasoning:
            # Show at most the first 100 characters of the reasoning.
            reasoning = edge.reasoning[:100]
            if len(edge.reasoning) > 100:
                reasoning += "..."
            click.echo(f"      {reasoning}")
358
+
359
+
360
def _print_summary(graph: GeneratedGraph, err: bool = False) -> None:
    """Print a brief summary of the generated graph.

    Edges are bucketed by confidence: high (>= 0.7), medium
    (0.4 inclusive to 0.7 exclusive) and low (< 0.4).

    Args:
        graph: The GeneratedGraph result.
        err: Whether to print to stderr.
    """
    edge_count = len(graph.edges)

    # Single pass: each edge falls into exactly one band.
    high_conf = med_conf = low_conf = 0
    for e in graph.edges:
        if e.confidence >= 0.7:
            high_conf += 1
        elif e.confidence >= 0.4:
            med_conf += 1
        else:
            low_conf += 1

    click.echo(f"\nEdge Confidence Summary ({edge_count} edges):", err=err)
    click.echo(f"  High confidence (>=0.7):     {high_conf}", err=err)
    click.echo(f"  Medium confidence (0.4-0.7): {med_conf}", err=err)
    click.echo(f"  Low confidence (<0.4):       {low_conf}", err=err)
376
+
377
+
378
def _print_adjacency_matrix(graph: GeneratedGraph, spec: ModelSpec) -> None:
    """Print adjacency matrix representation of the graph.

    Row-variable -> column-variable cells show the edge confidence to
    one decimal place; absent edges are shown as ".".  Column headers
    are truncated to the first three characters of each name.

    Args:
        graph: The GeneratedGraph result.
        spec: The ModelSpec used for variable names.
    """
    # Get variable names in order
    var_names = [v.name for v in spec.variables]

    click.echo("Adjacency Matrix:")
    click.echo()

    # Robustness: a spec with no variables would previously crash the
    # max() call below with ValueError.
    if not var_names:
        click.echo("(no variables)")
        return

    # Build edge lookup (source, target) -> confidence
    edge_lookup = {(e.source, e.target): e.confidence for e in graph.edges}

    # Header row: blank stub over the row-name column, then one
    # 4-character cell per variable.
    max_name_len = max(len(name) for name in var_names)
    header = " " * (max_name_len + 2)
    for name in var_names:
        header += f"{name[:3]:>4}"
    click.echo(header)

    # Data rows.  The enumerate() indices were unused, so iterate the
    # names directly; use .get() to avoid a double dict lookup.
    for row_name in var_names:
        row = f"{row_name:<{max_name_len}}  "
        for col_name in var_names:
            conf = edge_lookup.get((row_name, col_name))
            if conf is None:
                row += "   ."
            else:
                row += f"{conf:4.1f}"
        click.echo(row)
@@ -0,0 +1,172 @@
1
+ """Main CLI entry point and core commands.
2
+
3
+ This module provides the main CLI group and the query command for
4
+ querying LLMs about causal relationships between variables.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import sys
11
+ from typing import Optional
12
+
13
+ import click
14
+
15
+ from causaliq_knowledge import __version__
16
+
17
+
18
@click.group()
@click.version_option(version=__version__)
def cli() -> None:
    """CausalIQ Knowledge - LLM knowledge for causal discovery.

    Query LLMs about causal relationships between variables.
    """
    # Group body intentionally empty: subcommands are attached to the
    # group elsewhere via cli.add_command().  The docstring above is the
    # body, so no `pass` is needed.
26
+
27
+
28
@cli.command("query")
@click.argument("node_a")
@click.argument("node_b")
@click.option(
    "--model",
    "-m",
    multiple=True,
    default=["groq/llama-3.1-8b-instant"],
    help="LLM model(s) to query. Can be specified multiple times.",
)
@click.option(
    "--domain",
    "-d",
    default=None,
    help="Domain context (e.g., 'medicine', 'economics').",
)
@click.option(
    "--strategy",
    "-s",
    type=click.Choice(["weighted_vote", "highest_confidence"]),
    default="weighted_vote",
    help="Consensus strategy for multi-model queries.",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Output result as JSON.",
)
@click.option(
    "--llm-temperature",
    "-t",
    type=float,
    default=0.1,
    help="LLM temperature (0.0-1.0).",
)
def query_edge(
    node_a: str,
    node_b: str,
    model: tuple[str, ...],
    domain: Optional[str],
    strategy: str,
    output_json: bool,
    llm_temperature: float,
) -> None:
    """Query LLMs about a causal relationship between two variables.

    NODE_A and NODE_B are the variable names to query about.

    Examples:

        cqknow query smoking lung_cancer

        cqknow query smoking lung_cancer --domain medicine

        cqknow query X Y --model groq/llama-3.1-8b-instant \
            --model gemini/gemini-2.5-flash
    """
    # Import here to avoid slow startup for --help
    from causaliq_knowledge.llm import LLMKnowledge

    # Build context: only a domain hint is passed through at present.
    context = None
    if domain:
        context = {"domain": domain}

    # Create provider
    try:
        provider = LLMKnowledge(
            models=list(model),
            consensus_strategy=strategy,
            temperature=llm_temperature,
        )
    except Exception as e:
        click.echo(f"Error creating provider: {e}", err=True)
        sys.exit(1)

    # Query (progress goes to stderr so stdout stays machine-readable)
    click.echo(
        f"Querying {len(model)} model(s) about: {node_a} -> {node_b}",
        err=True,
    )

    try:
        result = provider.query_edge(node_a, node_b, context=context)
    except Exception as e:
        click.echo(f"Error querying LLM: {e}", err=True)
        sys.exit(1)

    # Output
    if output_json:
        output = {
            "node_a": node_a,
            "node_b": node_b,
            "exists": result.exists,
            "direction": result.direction.value if result.direction else None,
            "confidence": result.confidence,
            "reasoning": result.reasoning,
            "model": result.model,
        }
        click.echo(json.dumps(output, indent=2))
    else:
        # Human-readable output
        # NOTE(review): assumes result.exists is always True/False/None;
        # any other value would raise KeyError here — confirm the
        # provider's contract.
        exists_map = {True: "Yes", False: "No", None: "Uncertain"}
        exists_str = exists_map[result.exists]
        direction_str = result.direction.value if result.direction else "N/A"

        click.echo(f"\n{'='*60}")
        click.echo(f"Query: Does '{node_a}' cause '{node_b}'?")
        click.echo("=" * 60)
        click.echo(f"Exists:      {exists_str}")
        click.echo(f"Direction:   {direction_str}")
        click.echo(f"Confidence:  {result.confidence:.2f}")
        click.echo(f"Model(s):    {result.model or 'unknown'}")
        click.echo(f"{'='*60}")
        click.echo(f"Reasoning: {result.reasoning}")
        click.echo()

    # Show stats (skipped entirely for free/zero-cost calls)
    stats = provider.get_stats()
    if stats["total_cost"] > 0:
        click.echo(
            f"Cost: ${stats['total_cost']:.6f} "
            f"({stats['total_calls']} call(s))",
            err=True,
        )
154
+
155
+
156
# Import and register command groups.
# NOTE(review): these imports sit below the command definitions
# (hence the noqa: E402 markers) — presumably so the subcommand modules
# can themselves import from this package without a circular import;
# confirm before moving them to the top of the file.
from causaliq_knowledge.cli.cache import cache_group  # noqa: E402
from causaliq_knowledge.cli.generate import generate_graph  # noqa: E402
from causaliq_knowledge.cli.models import list_models  # noqa: E402

# Attach the subcommands to the top-level `cli` group at import time.
cli.add_command(cache_group)
cli.add_command(generate_graph)
cli.add_command(list_models)
164
+
165
+
166
def main() -> None:
    """Entry point for the CLI.

    Thin wrapper around the click group — presumably the target of the
    package's console-script entry point (confirm against
    entry_points.txt).  Click handles argument parsing and exit codes.
    """
    cli()


if __name__ == "__main__":  # pragma: no cover
    main()