PyPI - riptide-watergraph - Versions diffs - 0.9.0__py3-none-any.whl - Mend

riptide-watergraph 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73)

riptide_watergraph/__init__.py +82 -0
riptide_watergraph/cli.py +364 -0
riptide_watergraph/config.py +58 -0
riptide_watergraph/evaluation/__init__.py +18 -0
riptide_watergraph/evaluation/runner.py +135 -0
riptide_watergraph/evaluation/suite.py +51 -0
riptide_watergraph/gateway/__init__.py +7 -0
riptide_watergraph/gateway/demo_gateway.py +177 -0
riptide_watergraph/gateway/litellm_gateway.py +106 -0
riptide_watergraph/gateway/resilient.py +72 -0
riptide_watergraph/graph/__init__.py +6 -0
riptide_watergraph/graph/builder.py +164 -0
riptide_watergraph/graph/nodes.py +1012 -0
riptide_watergraph/graph/state.py +63 -0
riptide_watergraph/graph/waves.py +35 -0
riptide_watergraph/guardrails/__init__.py +12 -0
riptide_watergraph/guardrails/injection.py +39 -0
riptide_watergraph/guardrails/pii.py +41 -0
riptide_watergraph/guardrails/pipeline.py +43 -0
riptide_watergraph/interfaces/__init__.py +37 -0
riptide_watergraph/interfaces/agent.py +17 -0
riptide_watergraph/interfaces/embedding.py +12 -0
riptide_watergraph/interfaces/gateway.py +72 -0
riptide_watergraph/interfaces/guardrail.py +32 -0
riptide_watergraph/interfaces/memory.py +54 -0
riptide_watergraph/interfaces/reflector.py +33 -0
riptide_watergraph/interfaces/reranker.py +18 -0
riptide_watergraph/interfaces/swarm.py +48 -0
riptide_watergraph/interfaces/tools.py +65 -0
riptide_watergraph/mcp/__init__.py +18 -0
riptide_watergraph/mcp/adapter.py +66 -0
riptide_watergraph/mcp/client.py +57 -0
riptide_watergraph/mcp/stdio.py +83 -0
riptide_watergraph/memory/__init__.py +26 -0
riptide_watergraph/memory/embedding.py +48 -0
riptide_watergraph/memory/inmemory.py +59 -0
riptide_watergraph/memory/jsonfile.py +143 -0
riptide_watergraph/memory/pgvector.py +101 -0
riptide_watergraph/memory/ranking.py +128 -0
riptide_watergraph/memory/reflection.py +80 -0
riptide_watergraph/memory/rerank.py +27 -0
riptide_watergraph/memory/types.py +49 -0
riptide_watergraph/observability/__init__.py +13 -0
riptide_watergraph/observability/cost.py +121 -0
riptide_watergraph/observability/tracing.py +78 -0
riptide_watergraph/py.typed +0 -0
riptide_watergraph/server/__init__.py +9 -0
riptide_watergraph/server/app.py +578 -0
riptide_watergraph/server/static/app.js +1139 -0
riptide_watergraph/server/static/index.html +49 -0
riptide_watergraph/server/static/styles.css +329 -0
riptide_watergraph/service.py +447 -0
riptide_watergraph/swarm/__init__.py +13 -0
riptide_watergraph/swarm/cost.py +46 -0
riptide_watergraph/swarm/heuristic_composer.py +75 -0
riptide_watergraph/swarm/llm_composer.py +110 -0
riptide_watergraph/swarm/plan_composer.py +50 -0
riptide_watergraph/swarm/role_library.py +324 -0
riptide_watergraph/swarm/roles.py +127 -0
riptide_watergraph/swarm/static_composer.py +27 -0
riptide_watergraph/tools/__init__.py +6 -0
riptide_watergraph/tools/dev_tools.py +298 -0
riptide_watergraph/tools/enterprise.py +96 -0
riptide_watergraph/tools/examples.py +179 -0
riptide_watergraph/tools/library.py +925 -0
riptide_watergraph/tools/registry.py +114 -0
riptide_watergraph/workflows.py +154 -0
riptide_watergraph-0.9.0.dist-info/METADATA +470 -0
riptide_watergraph-0.9.0.dist-info/RECORD +73 -0
riptide_watergraph-0.9.0.dist-info/WHEEL +5 -0
riptide_watergraph-0.9.0.dist-info/entry_points.txt +3 -0
riptide_watergraph-0.9.0.dist-info/licenses/LICENSE +21 -0
riptide_watergraph-0.9.0.dist-info/top_level.txt +1 -0

riptide_watergraph/__init__.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Riptide-Watergraph — a 'like water', layered multi-agent framework on LangGraph.
+Public surface (Stage 1):
+    from riptide_watergraph import build_graph, LiteLLMGateway, InMemoryMemory
+    from riptide_watergraph import default_registry, SingleAgentComposer
+"""
+from __future__ import annotations
+__version__ = "0.9.0"
+from .gateway import DemoGateway, LiteLLMGateway, ResilientGateway
+from .graph import build_graph
+from .guardrails import (
+    GuardrailPipeline,
+    PiiGuardrail,
+    PromptInjectionGuardrail,
+    default_guardrails,
+)
+from .interfaces import (
+    Agent,
+    CompletionResult,
+    Guardrail,
+    GuardrailResult,
+    Memory,
+    Message,
+    ModelGateway,
+    Reflector,
+    SwarmComposer,
+    SwarmDecision,
+    ToolRegistry,
+    ToolSpec,
+    Trajectory,
+)
+from .mcp import FakeMcpClient, McpToolInfo, register_mcp_tools
+from .memory import InMemoryMemory, JsonFileMemory, LLMReflector, MemoryType
+from .observability import CostTracker, UsageRecord
+from .swarm import HeuristicSwarmComposer, LLMSwarmComposer, SingleAgentComposer
+from .tools import StaticToolRegistry, default_registry
+# Public API of the package: every name listed here is imported above and
+# re-exported for ``from riptide_watergraph import ...``.
+__all__ = [
+    "__version__",
+    # graph construction (from .graph)
+    "build_graph",
+    # model gateways (from .gateway)
+    "LiteLLMGateway",
+    "DemoGateway",
+    "ResilientGateway",
+    # memory stores + reflection (from .memory)
+    "InMemoryMemory",
+    "JsonFileMemory",
+    "LLMReflector",
+    "MemoryType",
+    # tool registry (from .tools)
+    "StaticToolRegistry",
+    "default_registry",
+    # swarm composers (from .swarm)
+    "SingleAgentComposer",
+    "HeuristicSwarmComposer",
+    "LLMSwarmComposer",
+    # guardrails + observability (Stage 4)
+    "GuardrailPipeline",
+    "default_guardrails",
+    "PiiGuardrail",
+    "PromptInjectionGuardrail",
+    "Guardrail",
+    "GuardrailResult",
+    "CostTracker",
+    "UsageRecord",
+    # interfaces
+    "Agent",
+    "ModelGateway",
+    "Message",
+    "CompletionResult",
+    "Memory",
+    "Reflector",
+    "Trajectory",
+    "ToolRegistry",
+    "ToolSpec",
+    "SwarmComposer",
+    "SwarmDecision",
+    # MCP tool interop
+    "register_mcp_tools",
+    "FakeMcpClient",
+    "McpToolInfo",
+]

riptide_watergraph/cli.py ADDED Viewed

@@ -0,0 +1,364 @@
+"""Command-line entrypoint.
+``riptide run "<task>"`` runs a task end-to-end (guardrails -> recall -> orchestrate ->
+worker/swarm -> approval -> finalize -> reflect -> output), attributing usage to a
+tenant. ``riptide costs`` prints the per-tenant cost dashboard.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+import time
+import uuid
+from pathlib import Path
+from typing import Any
+from langgraph.checkpoint.sqlite import SqliteSaver
+from langgraph.types import Command
+from .config import get_settings
+from .evaluation import EvalRunner
+from .gateway import DemoGateway, LiteLLMGateway, ResilientGateway
+from .graph import build_graph
+from .guardrails import default_guardrails
+from .memory import HashingEmbedding, JsonFileMemory, LexicalOverlapReranker
+from .memory.reflection import LLMReflector
+from .observability.cost import (
+    BudgetExceeded,
+    CostTracker,
+    UsageRecord,
+    cost_from_usage,
+    estimate_tokens,
+)
+from .observability.tracing import init_tracing
+from .service import enforce_budget
+from .interfaces import SwarmComposer
+from .swarm import HeuristicSwarmComposer, LLMSwarmComposer, SingleAgentComposer
+from .tools import default_registry
+def _prompt_approval(payload: dict[str, Any]) -> bool:
+    """Ask the operator to approve a pending side-effecting action."""
+    print("\n  HUMAN APPROVAL REQUIRED")
+    print(f"   tool:      {payload.get('tool')}")
+    print(f"   arguments: {payload.get('arguments')}")
+    print(f"   subtask:   {payload.get('subtask')}")
+    reply = input("   Approve? [y/N] ").strip().lower()
+    return reply in ("y", "yes")
+def _prompt_clarification(payload: dict[str, Any]) -> str:
+    """Ask the operator to answer a worker's clarifying question."""
+    print("\n  CLARIFICATION REQUESTED")
+    print(f"   subtask:  {payload.get('subtask')}")
+    print(f"   question: {payload.get('question')}")
+    return input("   Your answer: ").strip()
+def _run_task(
+    task: str,
+    *,
+    auto_approve: bool,
+    offline: bool = False,
+    memory_on: bool = True,
+    single: bool = False,
+    tenant_id: str = "default",
+    guardrails_on: bool = True,
+    llm_composer: bool = False,
+    critic: bool = False,
+    supervisor: bool = False,
+    react_steps: int = 1,
+    vote_k: int = 1,
+    final_schema: dict[str, Any] | None = None,
+) -> int:
+    """Run one task through the graph and return a process exit code.
+    Builds the gateway, composer, memory, reflector and guardrails from the
+    flags, invokes the graph under a SQLite checkpointer, answers every
+    interrupt the graph raises, then prints the result and records usage.
+    Args:
+        task: The task text handed to the graph.
+        auto_approve: Answer interrupts without prompting (approvals are
+            granted; clarifications get a canned "best assumption" answer).
+        offline: Use ``DemoGateway`` instead of ``LiteLLMGateway``.
+        memory_on: Create the per-tenant ``JsonFileMemory`` and the reflector.
+        single: Use ``SingleAgentComposer``; takes precedence over
+            ``llm_composer``.
+        tenant_id: Tenant used for the budget check, memory path and usage log.
+        guardrails_on: Attach ``default_guardrails()`` to the graph.
+        llm_composer: Use ``LLMSwarmComposer`` instead of the heuristic one.
+        critic: Forwarded to ``build_graph`` as ``enable_critic``.
+        supervisor: Forwarded to ``build_graph`` as ``enable_supervisor``.
+        react_steps: Forwarded to ``build_graph`` as ``max_steps``.
+        vote_k: Forwarded to ``build_graph`` as ``vote_k``.
+        final_schema: Forwarded to ``build_graph`` as ``final_schema``.
+    Returns:
+        0 once the run has finished, or 2 when ``enforce_budget`` raises
+        ``BudgetExceeded`` for the tenant.
+    """
+    settings = get_settings()
+    init_tracing(settings)
+    # Make sure the directory that will hold the checkpoint database exists.
+    Path(settings.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
+    # Budget gate: bail out before any model or graph object is created.
+    try:
+        enforce_budget(settings, tenant_id)
+    except BudgetExceeded as exc:
+        print(f" BUDGET EXCEEDED: {exc}")
+        return 2
+    # Per-role models fall back to the default model when left empty.
+    model = settings.riptide_watergraph_model
+    planner_model = settings.planner_model or model
+    worker_model = settings.worker_model or model
+    base_gateway = DemoGateway() if offline else LiteLLMGateway(default_model=model)
+    # Wrap with timeout + retry so transient API failures don't crash the run.
+    gateway = ResilientGateway(base_gateway)
+    registry = default_registry()
+    # Composer precedence: --single, then --llm-composer, then the heuristic default.
+    composer: SwarmComposer
+    if single:
+        composer = SingleAgentComposer(model=planner_model)
+    elif llm_composer:
+        composer = LLMSwarmComposer(gateway, model=planner_model)
+    else:
+        composer = HeuristicSwarmComposer(model=planner_model)
+    # Stage 2 + 4: per-tenant persistent memory (lessons never leak across tenants),
+    # with hybrid dense+lexical retrieval (offline embedder) and reranking.
+    memory = (
+        JsonFileMemory(
+            settings.tenant_memory_path(tenant_id),
+            embedding=HashingEmbedding(),
+            reranker=LexicalOverlapReranker(),
+        )
+        if memory_on
+        else None
+    )
+    reflector = LLMReflector(gateway, model=planner_model) if memory_on else None
+    guardrails = default_guardrails() if guardrails_on else None
+    # A fresh thread id per run; the same config is passed to every invoke below
+    # so resumed calls address the same checkpointed thread.
+    thread_id = str(uuid.uuid4())
+    config = {"configurable": {"thread_id": thread_id}}
+    with SqliteSaver.from_conn_string(settings.checkpoint_path) as checkpointer:
+        graph = build_graph(
+            gateway=gateway,
+            registry=registry,
+            composer=composer,
+            model=model,
+            checkpointer=checkpointer,
+            memory=memory,
+            reflector=reflector,
+            guardrails=guardrails,
+            planner_model=planner_model,
+            worker_model=worker_model,
+            enable_critic=critic,
+            enable_supervisor=supervisor,
+            max_steps=react_steps,
+            vote_k=vote_k,
+            final_schema=final_schema,
+        )
+        print(f" tenant={tenant_id} thread={thread_id}")
+        result = graph.invoke(
+            {"task": task, "session_id": thread_id, "tenant_id": tenant_id}, config
+        )
+        # Resume loop: handle approval and clarification interrupts.
+        # Only the first pending interrupt is answered per iteration; the loop
+        # repeats until the returned state carries no "__interrupt__" key.
+        while "__interrupt__" in result:
+            payload = result["__interrupt__"][0].value
+            if isinstance(payload, dict) and payload.get("type") == "clarification":
+                if auto_approve:
+                    answer = "(no clarification available; proceed with your best assumption)"
+                    print(f" auto-clarify: {payload.get('question')}")
+                else:
+                    answer = _prompt_clarification(payload)
+                result = graph.invoke(Command(resume={"answer": answer}), config)
+            else:
+                # Any non-clarification interrupt is treated as an approval request.
+                approved = True if auto_approve else _prompt_approval(payload)
+                if auto_approve:
+                    print(f" auto-approved: {payload.get('tool')}")
+                result = graph.invoke(Command(resume={"approved": approved}), config)
+        _print_result(result, memory_on=memory_on, memory=memory)
+        _record_usage(settings, tenant_id, task, result)
+    return 0
+def _print_result(result: dict, *, memory_on: bool, memory) -> None:
+    if result.get("blocked"):
+        print(f" BLOCKED by guardrails: {', '.join(result.get('guard_violations') or [])}")
+        print("\n FINAL ANSWER\n" + (result.get("final_answer") or "(none)"))
+        return
+    decision = result.get("swarm_decision") or {}
+    if decision:
+        print(f" composition: {decision.get('mode')} "
+              f"(parallelism={decision.get('parallelism')}) - {decision.get('rationale')}")
+    roles = result.get("roles") or []
+    plan = result.get("plan") or []
+    if roles:
+        print(" roles: " + ", ".join(
+            f"{plan[i] if i < len(plan) else '?'} -> {roles[i]}" for i in range(len(roles))
+        ))
+    verdicts = result.get("verdicts") or []
+    if verdicts:
+        n_pass = sum(1 for v in verdicts if v.get("verdict") == "pass")
+        print(f" critic: {n_pass}/{len(verdicts)} subtasks verified")
+    for tag in ("guard_violations", "guard_violations_out"):
+        if result.get(tag):
+            print(f" guardrails ({tag}): {', '.join(result[tag])}")
+    recalled = result.get("recalled_lessons") or []
+    if recalled:
+        print(f"\n recalled {len(recalled)} lesson(s):")
+        for ln in recalled:
+            print(f"   - {ln}")
+    print("\n FINAL ANSWER\n" + (result.get("final_answer") or "(none)"))
+    structured = result.get("structured_output")
+    if structured:
+        print("\n STRUCTURED OUTPUT\n" + json.dumps(structured, indent=2))
+    metrics = result.get("metrics") or {}
+    total = metrics.get("tool_calls_total", 0)
+    valid = metrics.get("tool_calls_valid", 0)
+    if total:
+        print(f"\n tool-call validity: {valid}/{total} = {valid / total:.0%}")
+    if memory_on and memory is not None:
+        stored = result.get("stored_lessons") or []
+        outcome = "success" if result.get("success") else "needs-improvement"
+        print(f" outcome: {outcome}; learned {len(stored)} lesson(s) "
+              f"(memory now holds {len(memory)})")
+def _record_usage(settings, tenant_id: str, task: str, result: dict) -> None:
+    decision = result.get("swarm_decision") or {}
+    blob = (
+        task
+        + " ".join(r.get("output", "") for r in (result.get("results") or []))
+        + (result.get("final_answer") or "")
+    )
+    # Prefer real token usage from the gateway; fall back to the composer estimate.
+    usage = (result.get("metrics") or {}).get("usage") or {}
+    actual_total = int(usage.get("total_tokens", 0) or 0)
+    if actual_total > 0:
+        cost = cost_from_usage(settings.riptide_watergraph_model, usage)
+    else:
+        cost = float(decision.get("estimated_cost_usd", 0.0))
+    tracker = CostTracker(settings.usage_log_path)
+    tracker.record(
+        UsageRecord(
+            tenant_id=tenant_id,
+            task=task,
+            mode=decision.get("mode", "single"),
+            est_tokens=estimate_tokens(blob),
+            actual_tokens=actual_total,
+            cost_usd=cost,
+            blocked=bool(result.get("blocked")),
+            ts=time.time(),
+        )
+    )
+def _show_costs() -> int:
+    settings = get_settings()
+    totals = CostTracker(settings.usage_log_path).by_tenant()
+    if not totals:
+        print("no usage recorded yet.")
+        return 0
+    print(f"{'tenant':<16}{'runs':>6}{'tokens':>10}{'cost_usd':>12}{'blocked':>9}")
+    print("-" * 53)
+    for t in sorted(totals.values(), key=lambda x: x.cost_usd, reverse=True):
+        print(f"{t.tenant_id:<16}{t.runs:>6}{t.est_tokens:>10}"
+              f"{t.cost_usd:>12.6f}{t.blocked:>9}")
+    return 0
+def _run_eval(offline: bool) -> int:
+    try:
+        report = EvalRunner(offline=offline).run()
+    except Exception as exc:  # noqa: BLE001 - surface a friendly hint for real runs
+        if not offline:
+            print(f" real-model eval failed: {exc}")
+            print(' hint: pip install -e ".[litellm]", set OPENAI_API_KEY and '
+                  "AGENTIC_WATER_MODEL, or use --offline.")
+            return 1
+        raise
+    print(f"{'task':<14}{'pass':>6}{'mode':>10}{'tool_valid':>12}  notes")
+    print("-" * 60)
+    for r in report.results:
+        rate = "-" if r.tool_valid_rate is None else f"{r.tool_valid_rate:.0%}"
+        mark = "PASS" if r.passed else "FAIL"
+        print(f"{r.task_id:<14}{mark:>6}{r.mode:>10}{rate:>12}  {r.notes}")
+    print("-" * 60)
+    print(f" pass rate: {report.n_passed}/{report.n_total} = {report.pass_rate:.0%}")
+    print(f" routing: {report.modes}; blocked: {report.blocked}; "
+          f"self-learning recall: {report.learning_recall}")
+    return 0 if report.pass_rate == 1.0 else 1
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="riptide-watergraph")
+    sub = parser.add_subparsers(dest="command", required=True)
+    run_p = sub.add_parser("run", help="Run a task end-to-end.")
+    run_p.add_argument("task", help="The task for the agent to perform.")
+    run_p.add_argument("--auto-approve", action="store_true",
+                       help="Approve side-effecting tools without prompting (for CI).")
+    run_p.add_argument("--offline", action="store_true",
+                       help="Use the deterministic offline gateway (no API key).")
+    run_p.add_argument("--no-memory", action="store_true",
+                       help="Disable long-term memory recall + reflection.")
+    run_p.add_argument("--single", action="store_true",
+                       help="Force a single agent (skip the swarm composer).")
+    run_p.add_argument("--tenant", default="default",
+                       help="Tenant id for memory isolation + cost attribution.")
+    run_p.add_argument("--no-guardrails", action="store_true",
+                       help="Disable input/output guardrails for this run.")
+    run_p.add_argument("--llm-composer", action="store_true",
+                       help="Use the LLM swarm composer (plan + dependencies) instead "
+                            "of the heuristic one.")
+    run_p.add_argument("--critic", action="store_true",
+                       help="Add a critic agent that verifies each subtask result.")
+    run_p.add_argument("--supervisor", action="store_true",
+                       help="Add a supervisor that re-plans corrective subtasks (implies "
+                            "--critic).")
+    run_p.add_argument("--react", type=int, default=1, metavar="N",
+                       help="Max think->act->observe steps per subtask (default 1).")
+    run_p.add_argument("--vote", type=int, default=1, metavar="K",
+                       help="Self-consistency samples for direct answers (default 1).")
+    run_p.add_argument("--schema", metavar="PATH",
+                       help="Path to a JSON Schema file; finalize emits a validated "
+                            "structured output matching it.")
+    sub.add_parser("costs", help="Show the per-tenant cost dashboard.")
+    eval_p = sub.add_parser("eval", help="Run the evaluation suite and report metrics.")
+    eval_p.add_argument("--offline", action="store_true",
+                        help="Evaluate with the deterministic offline gateway.")
+    serve_p = sub.add_parser("serve", help="Run the HTTP service (needs the [server] extra).")
+    serve_p.add_argument("--host", default="127.0.0.1")
+    serve_p.add_argument("--port", type=int, default=8000)
+    args = parser.parse_args(argv)
+    if args.command == "run":
+        final_schema = json.loads(Path(args.schema).read_text()) if args.schema else None
+        return _run_task(
+            args.task,
+            auto_approve=args.auto_approve,
+            offline=args.offline,
+            memory_on=not args.no_memory,
+            single=args.single,
+            tenant_id=args.tenant,
+            guardrails_on=not args.no_guardrails,
+            llm_composer=args.llm_composer,
+            critic=args.critic,
+            supervisor=args.supervisor,
+            react_steps=args.react,
+            vote_k=args.vote,
+            final_schema=final_schema,
+        )
+    if args.command == "costs":
+        return _show_costs()
+    if args.command == "eval":
+        return _run_eval(args.offline)
+    if args.command == "serve":
+        return _serve(args.host, args.port)
+    parser.print_help()
+    return 1
+def _serve(host: str, port: int) -> int:
+    try:
+        import uvicorn
+    except ImportError:
+        print('the HTTP server needs the [server] extra: pip install -e ".[server]"')
+        return 1
+    print(f" serving riptide-watergraph on http://{host}:{port}")
+    uvicorn.run("riptide_watergraph.server:app", host=host, port=port)
+    return 0
+# Script entry point: run the CLI and use main()'s return value as the exit status.
+if __name__ == "__main__":
+    sys.exit(main())

riptide_watergraph/config.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Runtime configuration via pydantic-settings (reads from env / .env)."""
+from __future__ import annotations
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    """Framework settings. All fields overridable by environment variables."""
+    # Values are also read from a local ``.env`` file. No prefix is applied to
+    # variable names, unknown entries are ignored, and name matching is
+    # case-insensitive.
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_prefix="",
+        extra="ignore",
+        case_sensitive=False,
+    )
+    # Default model string passed to LiteLLM (orchestrator/worker/finalizer).
+    riptide_watergraph_model: str = "gpt-4o-mini"
+    # Optional per-role model routing (Phase C). Empty => use the default model.
+    planner_model: str = ""  # orchestrator + finalize (the "thinking" steps)
+    worker_model: str = ""  # workers (often a cheaper model)
+    # Checkpoint database path for the LangGraph SqliteSaver.
+    checkpoint_path: str = ".riptide_watergraph/checkpoints.sqlite"
+    # Persistent long-term memory store (Stage 2: lessons accumulate here across runs).
+    # NOTE(review): the CLI run path builds its memory from tenant_memory_path()
+    # instead; confirm whether this field is still read anywhere.
+    memory_path: str = ".riptide_watergraph/memory.json"
+    # Stage 4: multi-tenancy + cost attribution.
+    tenant_id: str = "default"
+    data_dir: str = ".riptide_watergraph"  # base dir for per-tenant memory + usage log
+    # Sandbox root the agentic developer tools (read_file/write_file/run_*) are confined to.
+    # All file paths are resolved under this dir; ``..``/absolute escapes are refused.
+    workspace_dir: str = ".riptide_watergraph/workspace"
+    # Phase D: per-tenant spend ceiling in USD (0 = unlimited). Runs are refused once a
+    # tenant's accumulated cost reaches this.
+    tenant_budget_usd: float = 0.0
+    def tenant_memory_path(self, tenant_id: str) -> str:
+        """Per-tenant memory namespace so lessons never leak across tenants."""
+        # NOTE(review): tenant_id is interpolated into the path as-is, so a value
+        # containing "/" or ".." would resolve outside the tenants directory.
+        # Confirm that callers validate tenant ids before they reach this point.
+        return f"{self.data_dir}/tenants/{tenant_id}/memory.json"
+    @property
+    def usage_log_path(self) -> str:
+        """Path of the JSON-lines usage log shared by all tenants under ``data_dir``."""
+        return f"{self.data_dir}/usage.jsonl"
+    # Observability
+    langfuse_public_key: str | None = None
+    langfuse_secret_key: str | None = None
+    langfuse_host: str = "https://cloud.langfuse.com"
+    riptide_watergraph_disable_tracing: bool = False
+def get_settings() -> Settings:
+    """Load settings from environment / .env."""
+    return Settings()

riptide_watergraph/evaluation/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""Offline evaluation harness — measure the framework on a task suite.
+The research consensus is to run your own evals on your task distribution rather than
+trust vendor benchmarks. This harness makes the framework's behavior measurable:
+pass rate, single-vs-swarm routing, guardrail blocking, tool-call validity, and
+self-learning gain — deterministically, offline.
+"""
+from .runner import EvalReport, EvalResult, EvalRunner
+from .suite import EvalTask, default_suite
+# Public names of the evaluation package, re-exported from .suite and .runner.
+__all__ = [
+    # task definitions (from .suite)
+    "EvalTask",
+    "default_suite",
+    # execution + reporting (from .runner)
+    "EvalRunner",
+    "EvalResult",
+    "EvalReport",
+]

riptide_watergraph/evaluation/runner.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Eval runner: build a graph, run the suite, score and aggregate.
+Offline + deterministic by default (DemoGateway), so the suite doubles as a behavioral
+regression gate in CI. Pass ``offline=False`` to evaluate against a real model.
+"""
+from __future__ import annotations
+from pydantic import BaseModel, Field
+from ..config import get_settings
+from ..gateway import DemoGateway, LiteLLMGateway, ResilientGateway
+from ..graph import build_graph
+from ..guardrails import default_guardrails
+from ..memory import HashingEmbedding, InMemoryMemory, LexicalOverlapReranker
+from ..memory.reflection import LLMReflector
+from ..swarm import HeuristicSwarmComposer
+from ..tools import default_registry
+from .suite import EvalTask, default_suite
+class EvalResult(BaseModel):
+    """Outcome of running and scoring a single evaluation task."""
+    task_id: str  # id of the EvalTask this result belongs to
+    passed: bool  # whether the task met its expectations
+    mode: str  # single | swarm | blocked
+    blocked: bool = False  # whether the final state was flagged as blocked
+    # valid / total tool calls; None when the run made no tool calls
+    tool_valid_rate: float | None = None
+    notes: str = ""  # failure explanation; empty when the task passed
+class EvalReport(BaseModel):
+    """Aggregate of all task results from one evaluation run."""
+    results: list[EvalResult] = Field(default_factory=list)  # one entry per task
+    pass_rate: float = 0.0  # n_passed / n_total, or 0.0 when there were no tasks
+    n_passed: int = 0  # number of results with passed=True
+    n_total: int = 0  # number of results
+    modes: dict[str, int] = Field(default_factory=dict)  # result count per mode
+    blocked: int = 0  # number of results with blocked=True
+    learning_recall: bool = False  # did a repeated task recall a prior lesson?
+class EvalRunner:
+    """Runs the task suite through a freshly built graph."""
+    def __init__(self, *, offline: bool = True, model: str | None = None) -> None:
+        self.offline = offline
+        # For a real run, default to the configured model rather than a placeholder.
+        self.model = model or ("demo" if offline else get_settings().riptide_watergraph_model)
+    def _gateway(self):
+        if self.offline:
+            return DemoGateway()
+        # Real model: wrap LiteLLM in the resilient gateway (timeouts + retries).
+        return ResilientGateway(LiteLLMGateway(default_model=self.model))
+    def _build(self, memory):
+        gateway = self._gateway()
+        return build_graph(
+            gateway=gateway,
+            registry=default_registry(),
+            composer=HeuristicSwarmComposer(model=self.model),
+            model=self.model,
+            memory=memory,
+            reflector=LLMReflector(gateway, model=self.model),
+            guardrails=default_guardrails(),
+        )
+    def run(self, suite: list[EvalTask] | None = None) -> EvalReport:
+        suite = suite or default_suite()
+        memory = InMemoryMemory(
+            embedding=HashingEmbedding(), reranker=LexicalOverlapReranker()
+        )
+        graph = self._build(memory)
+        results = [self._run_task(graph, t) for t in suite]
+        report = EvalReport(
+            results=results,
+            n_total=len(results),
+            n_passed=sum(1 for r in results if r.passed),
+            blocked=sum(1 for r in results if r.blocked),
+            learning_recall=self._probe_learning(),
+        )
+        report.pass_rate = (report.n_passed / report.n_total) if report.n_total else 0.0
+        for r in results:
+            report.modes[r.mode] = report.modes.get(r.mode, 0) + 1
+        return report
+    def _run_task(self, graph, task: EvalTask) -> EvalResult:
+        state = graph.invoke(
+            {"task": task.prompt, "session_id": task.id, "tenant_id": "eval"},
+            {"configurable": {"thread_id": task.id}},
+        )
+        blocked = bool(state.get("blocked"))
+        decision = state.get("swarm_decision") or {}
+        mode = "blocked" if blocked else decision.get("mode", "single")
+        metrics = state.get("metrics") or {}
+        total = metrics.get("tool_calls_total", 0)
+        valid = metrics.get("tool_calls_valid", 0)
+        rate = (valid / total) if total else None
+        passed, notes = self._score(task, state, blocked, mode)
+        return EvalResult(
+            task_id=task.id, passed=passed, mode=mode, blocked=blocked,
+            tool_valid_rate=rate, notes=notes,
+        )
+    @staticmethod
+    def _score(task: EvalTask, state: dict, blocked: bool, mode: str) -> tuple[bool, str]:
+        if task.expect_blocked:
+            return (blocked, "" if blocked else "expected block, was allowed")
+        if blocked:
+            return (False, "unexpectedly blocked")
+        if task.expect_mode and mode != task.expect_mode:
+            return (False, f"expected {task.expect_mode}, got {mode}")
+        if task.expect_substring:
+            blob = (
+                task.prompt
+                + " ".join(r.get("output", "") for r in (state.get("results") or []))
+                + (state.get("final_answer") or "")
+            ).lower()
+            if task.expect_substring.lower() not in blob:
+                return (False, f"missing expected {task.expect_substring!r}")
+        return (True, "")
+    def _probe_learning(self) -> bool:
+        """Run one task twice; the second run should recall the first run's lesson."""
+        memory = InMemoryMemory(
+            embedding=HashingEmbedding(), reranker=LexicalOverlapReranker()
+        )
+        graph = self._build(memory)
+        cfg1 = {"configurable": {"thread_id": "probe-1"}}
+        cfg2 = {"configurable": {"thread_id": "probe-2"}}
+        graph.invoke({"task": "compute 7 * 7", "session_id": "p1", "tenant_id": "eval"}, cfg1)
+        s2 = graph.invoke({"task": "compute 7 * 7", "session_id": "p2", "tenant_id": "eval"}, cfg2)
+        return bool(s2.get("recalled_lessons"))