PyPI - agentforge-py - Versions diffs - 0.2.1__py3-none-any.whl - Mend

agentforge-py 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157) hide show

agentforge/__init__.py +114 -0
agentforge/_testing/__init__.py +19 -0
agentforge/_testing/fake_llm.py +126 -0
agentforge/_testing/fake_tool.py +122 -0
agentforge/_tools/__init__.py +14 -0
agentforge/_tools/calculator.py +102 -0
agentforge/_tools/decorator.py +300 -0
agentforge/_tools/file_read.py +112 -0
agentforge/_tools/shell.py +134 -0
agentforge/_tools/web_search.py +207 -0
agentforge/agent.py +817 -0
agentforge/auth.py +42 -0
agentforge/cli/__init__.py +18 -0
agentforge/cli/_build.py +323 -0
agentforge/cli/_scaffold_state.py +250 -0
agentforge/cli/_shared_scaffold.py +174 -0
agentforge/cli/config_cmd.py +174 -0
agentforge/cli/db_cmd.py +262 -0
agentforge/cli/debug_cmd.py +168 -0
agentforge/cli/docs_cmd.py +217 -0
agentforge/cli/eval_cmd.py +181 -0
agentforge/cli/health_cmd.py +139 -0
agentforge/cli/list_modules.py +85 -0
agentforge/cli/main.py +81 -0
agentforge/cli/manifest_apply.py +368 -0
agentforge/cli/module_cmd.py +247 -0
agentforge/cli/new_cmd.py +171 -0
agentforge/cli/run_cmd.py +234 -0
agentforge/cli/upgrade_cmd.py +230 -0
agentforge/config/__init__.py +45 -0
agentforge/eval/__init__.py +18 -0
agentforge/eval/consistency.py +107 -0
agentforge/eval/coverage.py +100 -0
agentforge/eval/format_compliance.py +107 -0
agentforge/eval/regression.py +143 -0
agentforge/findings.py +166 -0
agentforge/guardrails/__init__.py +32 -0
agentforge/guardrails/allowlist.py +49 -0
agentforge/guardrails/capability_check.py +58 -0
agentforge/guardrails/engine.py +289 -0
agentforge/guardrails/pii_redact_basic.py +61 -0
agentforge/guardrails/prompt_injection_basic.py +90 -0
agentforge/memory/__init__.py +16 -0
agentforge/memory/in_memory.py +130 -0
agentforge/memory/in_memory_graph.py +262 -0
agentforge/memory/in_memory_vector.py +167 -0
agentforge/pipeline/__init__.py +26 -0
agentforge/pipeline/engine.py +189 -0
agentforge/pipeline/errors.py +19 -0
agentforge/pipeline/tool.py +93 -0
agentforge/py.typed +0 -0
agentforge/recording.py +189 -0
agentforge/renderers/__init__.py +28 -0
agentforge/renderers/_defaults.py +32 -0
agentforge/renderers/markdown.py +44 -0
agentforge/renderers/patch_applier.py +46 -0
agentforge/renderers/registry.py +108 -0
agentforge/renderers/scorecard.py +59 -0
agentforge/renderers/span_table.py +71 -0
agentforge/replay.py +260 -0
agentforge/resolver_register.py +41 -0
agentforge/retrieval.py +410 -0
agentforge/runtime.py +63 -0
agentforge/strategies/__init__.py +27 -0
agentforge/strategies/_base.py +280 -0
agentforge/strategies/_plan.py +93 -0
agentforge/strategies/multi_agent.py +541 -0
agentforge/strategies/plan_execute.py +506 -0
agentforge/strategies/react.py +237 -0
agentforge/strategies/tot.py +472 -0
agentforge/templates/_shared/.cursorrules +12 -0
agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
agentforge/templates/_shared/.gitkeep +0 -0
agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
agentforge/templates/_shared/CLAUDE.md +13 -0
agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
agentforge/templates/code-reviewer/.env.example +8 -0
agentforge/templates/code-reviewer/.gitignore +7 -0
agentforge/templates/code-reviewer/README.md +12 -0
agentforge/templates/code-reviewer/agentforge.yaml +23 -0
agentforge/templates/code-reviewer/copier.yml +34 -0
agentforge/templates/code-reviewer/pyproject.toml +18 -0
agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
agentforge/templates/docs-qa/.env.example +8 -0
agentforge/templates/docs-qa/.gitignore +7 -0
agentforge/templates/docs-qa/README.md +14 -0
agentforge/templates/docs-qa/agentforge.yaml +19 -0
agentforge/templates/docs-qa/copier.yml +31 -0
agentforge/templates/docs-qa/pyproject.toml +18 -0
agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
agentforge/templates/minimal/.env.example +11 -0
agentforge/templates/minimal/.gitignore +10 -0
agentforge/templates/minimal/README.md +28 -0
agentforge/templates/minimal/agentforge.yaml +10 -0
agentforge/templates/minimal/copier.yml +52 -0
agentforge/templates/minimal/pyproject.toml +18 -0
agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
agentforge/templates/patch-bot/.env.example +8 -0
agentforge/templates/patch-bot/.gitignore +7 -0
agentforge/templates/patch-bot/README.md +13 -0
agentforge/templates/patch-bot/agentforge.yaml +15 -0
agentforge/templates/patch-bot/copier.yml +31 -0
agentforge/templates/patch-bot/pyproject.toml +18 -0
agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
agentforge/templates/research/.env.example +8 -0
agentforge/templates/research/.gitignore +7 -0
agentforge/templates/research/README.md +14 -0
agentforge/templates/research/agentforge.yaml +17 -0
agentforge/templates/research/copier.yml +31 -0
agentforge/templates/research/pyproject.toml +18 -0
agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
agentforge/templates/triage/.env.example +8 -0
agentforge/templates/triage/.gitignore +7 -0
agentforge/templates/triage/README.md +14 -0
agentforge/templates/triage/agentforge.yaml +25 -0
agentforge/templates/triage/copier.yml +31 -0
agentforge/templates/triage/pyproject.toml +18 -0
agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
agentforge/testing/__init__.py +69 -0
agentforge/testing/conformance.py +40 -0
agentforge/testing/factory.py +89 -0
agentforge/testing/fixtures.py +42 -0
agentforge/testing/llm.py +235 -0
agentforge/testing/recording.py +177 -0
agentforge/tools/__init__.py +41 -0
agentforge_py-0.2.1.dist-info/METADATA +158 -0
agentforge_py-0.2.1.dist-info/RECORD +157 -0
agentforge_py-0.2.1.dist-info/WHEEL +4 -0
agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0

agentforge/agent.py ADDED Viewed

@@ -0,0 +1,817 @@
+"""`Agent` — the framework's top-level orchestrator.
+Per feat-001 §4.2 and ADR-0007, the constructor surface is locked.
+Adding a kwarg with a safe default is a minor bump; removing or
+renaming requires a major bump.
+Lifecycle (per ADR-0010):
+    Agent.__init__: load config → resolve modules → wire defaults →
+                    install RunIdFilter (if configured)
+    Agent.run(task): bind RunContext → call strategy.run(state) →
+                     run evaluators → fire on_finish → return RunResult
+    Agent.close(): release LLM client / memory / hooks (async ctx mgr OK)
+feat-001 ships the lifecycle + locked surface; feat-002 adds the
+default `ReActLoop`, feat-003 the provider surface, feat-007 the full
+fallback chain. The `Agent` constructor stays unchanged across those
+features.
+"""
+from __future__ import annotations
+import logging
+import time
+from collections.abc import AsyncIterator, Awaitable, Callable
+from pathlib import Path
+from types import TracebackType
+from typing import Any
+from agentforge_core.config.schema import GuardrailPolicy
+from agentforge_core.contracts.evaluator import EvalResult, Evaluator
+from agentforge_core.contracts.graph_store import GraphStore
+from agentforge_core.contracts.guardrails import (
+    InputValidator,
+    OutputValidator,
+    ToolCallGate,
+)
+from agentforge_core.contracts.llm import LLMClient
+from agentforge_core.contracts.memory import MemoryStore
+from agentforge_core.contracts.strategy import ReasoningStrategy
+from agentforge_core.contracts.tool import Tool
+from agentforge_core.observability import get_tracer
+from agentforge_core.production.budget import BudgetPolicy
+from agentforge_core.production.exceptions import (
+    AgentForgeError,
+    BudgetExceeded,
+    GuardrailViolation,
+    ModuleError,
+)
+from agentforge_core.production.log_filter import (
+    install_run_id_filter,
+    uninstall_run_id_filter,
+)
+from agentforge_core.production.log_format import (
+    install_json_formatter,
+    uninstall_json_formatter,
+)
+from agentforge_core.production.run_context import (
+    RunContext,
+    bind_run,
+    new_run,
+    reset_run,
+)
+from agentforge_core.resolver import Resolver, parse_model_string
+from agentforge_core.values.chat import StreamingEvent
+from agentforge_core.values.state import AgentState, FinishReason, RunResult, Step
+from agentforge.config import AgentForgeConfig, load_config
+from agentforge.memory import InMemoryStore
+from agentforge.pipeline import Pipeline, PipelineFailure, PipelineFindingsTool, PipelineResult
+from agentforge.retrieval import Retriever
+from agentforge.runtime import RUNTIME_KEY, RuntimeContext
+# Module-level loggers under the "agentforge" namespace.
+_evaluator_log = logging.getLogger("agentforge.evaluators")
+_observability_log = logging.getLogger("agentforge.observability")
+# NOTE: `Callable[..., X]` leaves the argument list unchecked; the
+# "takes a Step" / "takes a RunResult" contract below is documentation only.
+StepHook = Callable[..., Awaitable[None] | None]
+"""Hook signature: takes a Step, returns awaitable-or-None."""
+FinishHook = Callable[..., Awaitable[None] | None]
+"""Hook signature: takes a RunResult, returns awaitable-or-None."""
+StepHooks = StepHook | list[StepHook]
+"""Constructor accepts a single hook or a list. Internally normalised
+to a list — see `Agent.__init__`. feat-009 spec §4.4: multiple
+observability backends can run concurrently against the same run."""
+# Single-hook-or-list form for `on_finish`, parallel to `StepHooks`.
+FinishHooks = FinishHook | list[FinishHook]
+class Agent:
+    """Framework-level agent orchestrator.
+    The constructor signature is the locked public API; see
+    `docs/features/feat-001-core-contracts-and-agent.md` §4.2.
+    """
+    def __init__(
+        self,
+        *,
+        model: str | LLMClient | None = None,
+        tools: list[Tool] | None = None,
+        strategy: str | ReasoningStrategy | None = None,
+        memory: MemoryStore | None = None,
+        retriever: Retriever | None = None,
+        graph_store: GraphStore | None = None,
+        evaluators: list[Evaluator] | None = None,
+        system_prompt: str | None = None,
+        budget_usd: float | None = None,
+        max_iterations: int | None = None,
+        on_step: StepHooks | None = None,
+        on_finish: FinishHooks | None = None,
+        config_path: str | Path | None = None,
+        install_log_filter: bool = True,
+        record_runs: MemoryStore | None = None,
+        input_validators: list[InputValidator] | None = None,
+        output_validators: list[OutputValidator] | None = None,
+        tool_gates: list[ToolCallGate] | None = None,
+        guardrail_policy: GuardrailPolicy | None = None,
+        pipeline: Pipeline | None = None,
+    ) -> None:
+        """Build an agent from keyword arguments layered over the loaded config.
+        All parameters are keyword-only. The config is loaded first via
+        `load_config(config_path)`; several kwargs fall back to it when
+        left as `None`.
+        Args:
+            model: LLM client instance or model string; combined with
+                `config.agent.model` by `_pick_str_form`, then resolved
+                by `_resolve_model`.
+            tools: Tools for the run; copied into a new list.
+            strategy: Strategy instance or registered name; combined with
+                `config.agent.strategy` by `_pick_str_form`, then
+                resolved by `_resolve_strategy`.
+            memory: Memory store; a fresh `InMemoryStore` when `None`.
+            retriever: Optional retriever, stored as given.
+            graph_store: Optional graph store, stored as given.
+            evaluators: Evaluators; copied into a new list.
+            system_prompt: Overrides `config.agent.system_prompt` when set.
+            budget_usd: Overrides `config.agent.budget.usd` when set.
+            max_iterations: Overrides `config.agent.max_iterations` when set.
+            on_step: One hook or a list; normalised by `_normalise_hooks`.
+            on_finish: One hook or a list; normalised by `_normalise_hooks`.
+            config_path: Passed straight to `load_config`.
+            install_log_filter: Gates BOTH logging installs at the end of
+                this method (run-id filter and JSON formatter); each one
+                additionally requires its own config setting.
+            record_runs: When set, a `RecordRunHook` writing to this store
+                is appended to the step and finish hooks.
+            input_validators: Input validators handed to the guardrail engine.
+            output_validators: Output validators handed to the guardrail engine.
+            tool_gates: Tool-call gates handed to the guardrail engine.
+            guardrail_policy: Overrides `config.guardrail_policy` when set.
+            pipeline: When set, a `PipelineFindingsTool` is created and
+                appended to the tool list.
+        Raises:
+            ModuleError: Propagated from `_resolve_model` /
+                `_resolve_strategy`, e.g. when no strategy can be resolved.
+        """
+        self._config: AgentForgeConfig = load_config(config_path)
+        # Resolve model. The widened config (feat-012) allows
+        # `model:` to be a dict for inline llm_options, but feat-001's
+        # constructor still only accepts `str | LLMClient` directly.
+        # When the YAML form is a dict, prefer the explicit kwarg or
+        # error at startup.
+        self._llm: LLMClient | None = self._resolve_model(
+            _pick_str_form(model, self._config.agent.model, field="model")
+        )
+        # Resolve strategy. Same shape constraint as `model`.
+        self._strategy: ReasoningStrategy = self._resolve_strategy(
+            _pick_str_form(strategy, self._config.agent.strategy, field="strategy")
+        )
+        # Defaults: in-memory store, no evaluators, no tools.
+        self._memory: MemoryStore = memory if memory is not None else InMemoryStore()
+        self._retriever: Retriever | None = retriever
+        self._graph_store: GraphStore | None = graph_store
+        # Copy the caller's lists so later appends (e.g. the pipeline
+        # tool below) never touch the objects that were passed in.
+        self._tools: list[Tool] = list(tools) if tools is not None else []
+        self._evaluators: list[Evaluator] = list(evaluators) if evaluators is not None else []
+        self._system_prompt: str | None = (
+            system_prompt if system_prompt is not None else self._config.agent.system_prompt
+        )
+        # Budget — kwargs override config; config overrides Pydantic default.
+        cap_usd = budget_usd if budget_usd is not None else self._config.agent.budget.usd
+        max_iter = (
+            max_iterations if max_iterations is not None else self._config.agent.max_iterations
+        )
+        self._budget = BudgetPolicy(usd=cap_usd, max_iterations=max_iter)
+        self._on_step: list[StepHook] = _normalise_hooks(on_step)
+        self._on_finish: list[FinishHook] = _normalise_hooks(on_finish)
+        # feat-018: build the GuardrailEngine. Built-ins on by default
+        # (modules.guardrails.defaults) and combined with any
+        # validators passed explicitly via the constructor kwargs.
+        from agentforge.guardrails.engine import GuardrailEngine  # noqa: PLC0415
+        policy = guardrail_policy if guardrail_policy is not None else self._config.guardrail_policy
+        self._guardrails = GuardrailEngine(
+            input_validators=list(input_validators or []),
+            output_validators=list(output_validators or []),
+            tool_gates=list(tool_gates or []),
+            policy=policy,
+        )
+        # feat-017: optional run recording. When `record_runs` is set,
+        # install hooks that persist every step + the final result as
+        # claims so `agentforge run --replay` and `agentforge debug`
+        # can reconstruct the run. Recording errors fall under the
+        # same isolation as other hooks (logged at WARN, never break
+        # the run — feat-009 §4.3).
+        if record_runs is not None:
+            from agentforge.recording import RecordRunHook  # noqa: PLC0415
+            recorder = RecordRunHook(
+                memory=record_runs,
+                project="default",
+                agent_name=self._config.agent.name or "agent",
+            )
+            # NOTE(review): if `_normalise_hooks` hands back the caller's
+            # own list rather than a copy, these appends mutate it — confirm.
+            self._on_step.append(recorder.on_step)
+            self._on_finish.append(recorder.on_finish)
+        self._record_runs: MemoryStore | None = record_runs
+        # feat-015: optional pre-LLM pipeline. When set, runs to
+        # completion before the strategy loop; findings are exposed
+        # via a built-in `pipeline_findings` tool and a system-prompt
+        # addendum. Replay short-circuits actual execution by reading
+        # the recorded `__pipeline` claim.
+        self._pipeline: Pipeline | None = pipeline
+        self._pipeline_tool: PipelineFindingsTool | None = None
+        if pipeline is not None:
+            self._pipeline_tool = PipelineFindingsTool()
+            self._tools.append(self._pipeline_tool)
+        self._closed = False
+        # Logging installs come last, after every step that can raise.
+        if install_log_filter and self._config.logging.run_id_filter:
+            install_run_id_filter()
+        if install_log_filter and self._config.logging.format == "json":
+            install_json_formatter()
+    # ------------------------------------------------------------------
+    # Resolution helpers (used at construction; raise at startup, P11).
+    # ------------------------------------------------------------------
+    def _resolve_model(self, model: str | LLMClient | None) -> LLMClient | None:
+        if model is None:
+            return None
+        if isinstance(model, LLMClient):
+            return model
+        # String — parse "<provider>:<model_id>" and look up the
+        # provider in the resolver. feat-003 lights up the bedrock
+        # provider; future provider packages (anthropic, openai, ...)
+        # register themselves the same way at import time.
+        provider, model_id = parse_model_string(model)
+        try:
+            cls = Resolver.global_().resolve("providers", provider)
+        except ModuleError as exc:
+            raise ModuleError(
+                f"No LLM provider registered for {provider!r}. "
+                f"Install agentforge-{provider} (e.g. `uv add agentforge-{provider}`) "
+                f"or pass a typed LLMClient instance via Agent(model=...)."
+            ) from exc
+        instance = cls(model_id=model_id)
+        if not isinstance(instance, LLMClient):
+            raise ModuleError(
+                f"Resolved provider {provider!r} ({cls.__name__}) does not implement LLMClient."
+            )
+        return instance
+    def _resolve_strategy(self, strategy: str | ReasoningStrategy | None) -> ReasoningStrategy:
+        if isinstance(strategy, ReasoningStrategy):
+            return strategy
+        if strategy is None:
+            raise ModuleError(
+                "No reasoning strategy provided. feat-001 ships only the "
+                "ReasoningStrategy ABC; install agentforge[react] (when feat-002 "
+                "ships) or pass a custom ReasoningStrategy instance via "
+                "Agent(strategy=...)."
+            )
+        # String name — look up in the resolver (feat-002 will register
+        # ReActLoop here when it ships).
+        cls = Resolver.global_().resolve("strategies", strategy)
+        if not callable(cls):
+            raise ModuleError(f"Resolved strategy {strategy!r} is not constructible: {cls!r}.")
+        instance = cls()
+        if not isinstance(instance, ReasoningStrategy):
+            raise ModuleError(
+                f"Resolved strategy {strategy!r} ({cls.__name__}) does not "
+                f"implement ReasoningStrategy."
+            )
+        return instance
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    @property
+    def memory(self) -> MemoryStore:
+        """The memory store in use (the one passed in, or the default `InMemoryStore`)."""
+        return self._memory
+    @property
+    def tools(self) -> list[Tool]:
+        return list(self._tools)
+    @property
+    def budget(self) -> BudgetPolicy:
+        """The `BudgetPolicy` built at construction time (returned by reference, not copied)."""
+        return self._budget
+    @property
+    def pipeline(self) -> Pipeline | None:
+        """The pipeline passed to the constructor, or `None` when none was given."""
+        return self._pipeline
+    def _build_runtime_metadata(
+        self,
+        run_budget: BudgetPolicy,
+        guard_ctx: dict[str, Any],
+        *,
+        system_prompt: str | None = None,
+    ) -> dict[str, object]:
+        """Build the `state.metadata` mapping that carries the
+        per-run `RuntimeContext`. Wraps the LLM + tools with the
+        guardrail engine so output validation and tool-call gating
+        happen inside the strategy loop transparently.
+        `system_prompt`, when provided, overrides `self._system_prompt`
+        for this single run only (feat-015 uses this to append the
+        pipeline-findings addendum without mutating the configured
+        prompt).
+        """
+        metadata: dict[str, object] = {}
+        if self._llm is None:
+            return metadata
+        def _ctx_factory() -> dict[str, object]:
+            return dict(guard_ctx)
+        metadata[RUNTIME_KEY] = RuntimeContext(
+            llm=self._guardrails.wrap_llm(self._llm, _ctx_factory),
+            tools=tuple(self._guardrails.wrap_tool(t, _ctx_factory) for t in self._tools),
+            memory=self._memory,
+            budget=run_budget,
+            system_prompt=system_prompt if system_prompt is not None else self._system_prompt,
+            retriever=self._retriever,
+            graph_store=self._graph_store,
+        )
+        return metadata
+    async def _maybe_run_pipeline(
+        self,
+        *,
+        context: dict[str, Any] | None,
+        run_budget: BudgetPolicy,
+        run_id: str,
+        replay_pipeline: PipelineResult | None,
+    ) -> PipelineResult | None:
+        """Run the configured pipeline (or load it from a replay), apply
+        cost accounting, and bind the findings to the built-in tool.
+        Returns ``None`` when the agent has no pipeline configured.
+        Raises `BudgetExceeded` if the pipeline alone exhausts the run
+        budget. Raises `PipelineFailure` if `on_task_error="fail"`
+        and a task errors.
+        """
+        if self._pipeline is None and replay_pipeline is None:
+            return None
+        if replay_pipeline is not None:
+            result = replay_pipeline
+        else:
+            assert self._pipeline is not None  # narrowing for mypy
+            result = await self._pipeline.run(context or {})
+        # Charge declared pipeline cost against the budget.
+        if result.total_cost_usd > 0.0:
+            run_budget.commit(result.total_cost_usd)
+            run_budget.check()
+        if self._pipeline_tool is not None:
+            self._pipeline_tool._set_cache(list(result.findings))
+        # Persist as a `__pipeline` claim when recording.
+        if self._record_runs is not None and replay_pipeline is None:
+            from agentforge.recording import record_pipeline_result  # noqa: PLC0415
+            await record_pipeline_result(
+                memory=self._record_runs,
+                run_id=run_id,
+                project="default",
+                agent_name=self._config.agent.name or "agent",
+                result=result,
+            )
+        return result
+    def _compose_system_prompt(self, pipeline_result: PipelineResult | None) -> str | None:
+        """Produce the per-run system prompt: the configured prompt
+        with the optional pipeline-findings addendum appended."""
+        if pipeline_result is None or not pipeline_result.findings:
+            return self._system_prompt
+        addendum = _format_pipeline_addendum(pipeline_result)
+        if self._system_prompt is None:
+            return addendum
+        return f"{self._system_prompt}\n\n{addendum}"
+    async def run(
+        self,
+        task: str,
+        *,
+        context: dict[str, Any] | None = None,
+        replay_pipeline: PipelineResult | None = None,
+    ) -> RunResult:
+        """Execute the agent's reasoning loop on `task`.
+        Order of operations: bind a new run context, open the
+        "agent.run" span, copy the budget for this run, run (or replay)
+        the pipeline, validate the input through the guardrails, run
+        the strategy, fire `on_step` for the recorded steps, build the
+        result, tag the span, fire `on_finish`, and finally reset the
+        run context.
+        Args:
+            task: The task text the agent should reason about.
+            context: Extra key-value context passed to a configured
+                pipeline (feat-015). Ignored when no pipeline is set.
+            replay_pipeline: When replaying a recorded run, the
+                previously recorded `PipelineResult` is threaded in
+                here so the pipeline doesn't re-execute. Set by the
+                replay CLI; user code rarely passes it directly.
+        Returns:
+            A `RunResult` with the agent's output, full trace, and cost.
+        Raises:
+            ModuleError: If the agent has already been closed.
+            PipelineFailure: Re-raised from the pipeline stage.
+            BudgetExceeded: Re-raised from the strategy stage, or
+                propagated from the pipeline stage.
+            GuardrailViolation: Re-raised from input validation or the
+                strategy stage.
+            AgentForgeError: Any other framework error, re-raised.
+        """
+        if self._closed:
+            raise ModuleError("Agent has been closed; create a new instance.")
+        ctx: RunContext = new_run(task=task)
+        token = bind_run(ctx)
+        # NOTE: despite the name, this holds `time.monotonic()` seconds;
+        # `_finalize_result` converts the elapsed delta to milliseconds.
+        started_ms = time.monotonic()
+        finish_reason: FinishReason = "completed"
+        tracer = get_tracer()
+        try:
+            with tracer.start_as_current_span(
+                "agent.run",
+                attributes={
+                    "agentforge.run_id": ctx.run_id,
+                    "agentforge.task": task,
+                },
+            ) as run_span:
+                # Fresh per-run budget copied from the agent-level limits,
+                # so spend from one run is not carried on `self._budget`.
+                run_budget = BudgetPolicy(
+                    usd=self._budget.usd,
+                    max_tokens=self._budget.max_tokens,
+                    max_iterations=self._budget.max_iterations,
+                    error_streak_limit=self._budget.error_streak_limit,
+                )
+                guard_ctx: dict[str, Any] = {
+                    "run_id": ctx.run_id,
+                    "project": self._config.agent.name or "default",
+                }
+                pipeline_result: PipelineResult | None = None
+                state: AgentState | None = None
+                # Only PipelineFailure is intercepted here; anything else
+                # the pipeline stage raises (e.g. BudgetExceeded from the
+                # budget check) propagates without touching `finish_reason`.
+                try:
+                    pipeline_result = await self._maybe_run_pipeline(
+                        context=context,
+                        run_budget=run_budget,
+                        run_id=ctx.run_id,
+                        replay_pipeline=replay_pipeline,
+                    )
+                except PipelineFailure:
+                    finish_reason = "pipeline"
+                    raise
+                run_system_prompt = self._compose_system_prompt(pipeline_result)
+                metadata = self._build_runtime_metadata(
+                    run_budget, guard_ctx, system_prompt=run_system_prompt
+                )
+                # NOTE: every handler below re-raises, so `_finalize_result`
+                # and `on_finish` are reached on the success path only; the
+                # `finish_reason` values assigned just before each `raise`
+                # are not read again within this method.
+                try:
+                    validated_task = await self._guardrails.check_input(task, guard_ctx)
+                    state = AgentState(
+                        run_id=ctx.run_id,
+                        task=validated_task,
+                        metadata=metadata,
+                    )
+                    await self._strategy.run(state)
+                except BudgetExceeded:
+                    finish_reason = "budget_exceeded"
+                    raise
+                except GuardrailViolation:
+                    finish_reason = "guardrail"
+                    raise
+                except AgentForgeError:
+                    finish_reason = "error"
+                    raise
+                finally:
+                    # Fire `on_step` for every step the strategy appended,
+                    # even on error paths — observability of the partial
+                    # trace is just as important as the happy path.
+                    # `state` is still None when input validation itself
+                    # failed, in which case there are no steps to report.
+                    if state is not None:
+                        await self._fire_steps(list(state.steps))
+                result = await self._finalize_result(
+                    state=state,
+                    task=task,
+                    run_budget=run_budget,
+                    run_id=ctx.run_id,
+                    started_ms=started_ms,
+                    finish_reason=finish_reason,
+                )
+                _tag_run_span(run_span, result, finish_reason)
+                await self._fire_finish(result)
+                return result
+        finally:
+            # Always undo the run-context binding made by `bind_run` above.
+            reset_run(token)
+    async def stream(
+        self,
+        task: str,
+        *,
+        context: dict[str, Any] | None = None,
+        replay_pipeline: PipelineResult | None = None,
+    ) -> AsyncIterator[StreamingEvent]:
+        """Streaming counterpart to :meth:`run` (feat-020 v0.2).
+        Drives the agent via ``strategy.stream(state)`` and yields
+        every event as it arrives. Same setup as ``run()`` —
+        guardrails on input, pipeline, RunContext binding, span
+        tracing, finalize-result, on_finish hook. The terminal
+        ``done`` event carries the full :class:`RunResult` shape in
+        ``content`` (``output`` / ``run_id`` / ``cost_usd`` /
+        ``tokens_in`` / ``tokens_out`` / ``finish_reason`` /
+        ``duration_ms``) so callers don't need a second round-trip.
+        Strategies that don't override
+        :meth:`ReasoningStrategy.stream` get the ABC's default
+        behaviour: one terminal ``done`` event. Callers (e.g.
+        :class:`ChatSession`) check the override and fall back to
+        the buffered ``run()`` + segment-and-stream path when the
+        default is in effect.
+        Args:
+            task: The task text the agent should reason about.
+            context: Extra key-value context passed to the pipeline stage.
+            replay_pipeline: Previously recorded pipeline result to use
+                instead of executing the pipeline.
+        Yields:
+            Every non-``done`` event produced by the strategy, followed
+            by exactly one ``done`` event built from the final result.
+        Raises:
+            ModuleError: If the agent has already been closed (raised on
+                first iteration, since this is an async generator).
+            PipelineFailure, BudgetExceeded, GuardrailViolation,
+            AgentForgeError: Re-raised; no terminal ``done`` event is
+                emitted on these paths.
+        """
+        if self._closed:
+            raise ModuleError("Agent has been closed; create a new instance.")
+        ctx: RunContext = new_run(task=task)
+        token = bind_run(ctx)
+        # NOTE: despite the name, this holds `time.monotonic()` seconds;
+        # `_finalize_result` converts the elapsed delta to milliseconds.
+        started_ms = time.monotonic()
+        finish_reason: FinishReason = "completed"
+        tracer = get_tracer()
+        try:
+            with tracer.start_as_current_span(
+                "agent.stream",
+                attributes={
+                    "agentforge.run_id": ctx.run_id,
+                    "agentforge.task": task,
+                },
+            ) as run_span:
+                # Fresh per-run budget copied from the agent-level limits.
+                run_budget = BudgetPolicy(
+                    usd=self._budget.usd,
+                    max_tokens=self._budget.max_tokens,
+                    max_iterations=self._budget.max_iterations,
+                    error_streak_limit=self._budget.error_streak_limit,
+                )
+                guard_ctx: dict[str, Any] = {
+                    "run_id": ctx.run_id,
+                    "project": self._config.agent.name or "default",
+                }
+                pipeline_result: PipelineResult | None = None
+                state: AgentState | None = None
+                try:
+                    pipeline_result = await self._maybe_run_pipeline(
+                        context=context,
+                        run_budget=run_budget,
+                        run_id=ctx.run_id,
+                        replay_pipeline=replay_pipeline,
+                    )
+                except PipelineFailure:
+                    finish_reason = "pipeline"
+                    raise
+                run_system_prompt = self._compose_system_prompt(pipeline_result)
+                metadata = self._build_runtime_metadata(
+                    run_budget, guard_ctx, system_prompt=run_system_prompt
+                )
+                # NOTE: every handler below re-raises, so the finalize /
+                # `on_finish` / terminal-`done` steps run on the success
+                # path only.
+                try:
+                    validated_task = await self._guardrails.check_input(task, guard_ctx)
+                    state = AgentState(
+                        run_id=ctx.run_id,
+                        task=validated_task,
+                        metadata=metadata,
+                    )
+                    async for event in self._strategy.stream(state):
+                        # Strategy may emit a terminal `done` itself
+                        # (default ABC impl does). Swallow it — we
+                        # emit the canonical terminal `done` below
+                        # with the full RunResult shape.
+                        if event.kind == "done":
+                            continue
+                        yield event
+                except BudgetExceeded:
+                    finish_reason = "budget_exceeded"
+                    raise
+                except GuardrailViolation:
+                    finish_reason = "guardrail"
+                    raise
+                except AgentForgeError:
+                    finish_reason = "error"
+                    raise
+                finally:
+                    # Report whatever steps were recorded, including on
+                    # error paths; `state` is None if input validation failed.
+                    if state is not None:
+                        await self._fire_steps(list(state.steps))
+                result = await self._finalize_result(
+                    state=state,
+                    task=task,
+                    run_budget=run_budget,
+                    run_id=ctx.run_id,
+                    started_ms=started_ms,
+                    finish_reason=finish_reason,
+                )
+                _tag_run_span(run_span, result, finish_reason)
+                await self._fire_finish(result)
+                # Values are coerced to plain builtins (float / int / str)
+                # before being placed in the event payload.
+                yield StreamingEvent(
+                    kind="done",
+                    content={
+                        "output": result.output,
+                        "run_id": result.run_id,
+                        "cost_usd": float(result.cost_usd),
+                        "tokens_in": int(result.tokens_in),
+                        "tokens_out": int(result.tokens_out),
+                        "finish_reason": str(result.finish_reason),
+                        "duration_ms": int(result.duration_ms),
+                    },
+                )
+        finally:
+            # NOTE(review): presumably `reset_run` resets a contextvars
+            # token; if so, finalising this generator from a different
+            # context than the one that started it could make the reset
+            # fail — confirm against `run_context`.
+            reset_run(token)
+    async def _finalize_result(
+        self,
+        *,
+        state: AgentState,
+        task: str,
+        run_budget: BudgetPolicy,
+        run_id: str,
+        started_ms: float,
+        finish_reason: FinishReason,
+    ) -> RunResult:
+        """Assemble the `RunResult` for a finished run and attach evaluator scores.
+        A provisional result is built from the recorded steps, the
+        budget's `spent_usd` and the guardrail event log. The configured
+        evaluators are then run against that provisional result, and a
+        copy carrying their results under `eval_scores` is returned.
+        `started_ms` is subtracted from `time.monotonic()` and the
+        difference is multiplied by 1000, so despite its name it is
+        expected to be a monotonic-clock reading taken at run start.
+        """
+        elapsed_ms = int((time.monotonic() - started_ms) * 1000)
+        final_output = self._extract_output(state)
+        # Token usage is the sum over every recorded step.
+        total_in = 0
+        total_out = 0
+        for step in state.steps:
+            total_in += step.tokens_in
+            total_out += step.tokens_out
+        provisional = RunResult(
+            output=final_output,
+            steps=tuple(state.steps),
+            cost_usd=run_budget.spent_usd,
+            tokens_in=total_in,
+            tokens_out=total_out,
+            run_id=run_id,
+            duration_ms=elapsed_ms,
+            finish_reason=finish_reason,
+            guardrail_events=tuple(self._guardrails.events),
+        )
+        # Evaluators see the provisional result; their scores are added on a copy.
+        scores = await self._run_evaluators(
+            provisional,
+            task=task,
+            state=state,
+            budget=run_budget,
+        )
+        return provisional.model_copy(update={"eval_scores": scores})
+    async def _run_evaluators(
+        self,
+        result: RunResult,
+        *,
+        task: str,
+        state: AgentState,
+        budget: BudgetPolicy,
+    ) -> tuple[EvalResult, ...]:
+        """Run every configured evaluator that the remaining budget can afford.
+        Evaluators are visited in configuration order. One whose
+        `cost_estimate_usd` (0.0 when the attribute is missing) exceeds
+        `budget.remaining_usd()` is skipped with a WARN log line and
+        contributes nothing to the returned tuple (feat-006 §4.3).
+        Each evaluator that does run is awaited inside its own
+        `evaluator.<name>` tracing span, which is then stamped with the
+        score, the reported cost and the elapsed time in milliseconds.
+        It receives `result` as the finding plus a context dict exposing
+        `task`, `state` and the live `budget`, so judge graders can
+        reserve / commit against that policy.
+        Returns an empty tuple when no evaluators are configured.
+        """
+        if not self._evaluators:
+            return ()
+        eval_context: dict[str, object] = {"task": task, "state": state, "budget": budget}
+        collected: list[EvalResult] = []
+        for evaluator in self._evaluators:
+            estimate = float(getattr(evaluator, "cost_estimate_usd", 0.0))
+            headroom = budget.remaining_usd()
+            if estimate > headroom:
+                # Unaffordable: log why and move on without running it.
+                _evaluator_log.warning(
+                    "skipping evaluator %r: budget exhausted (need=$%.4f, remaining=$%.4f)",
+                    evaluator.name,
+                    estimate,
+                    headroom,
+                )
+                continue
+            tracer = get_tracer()
+            clock_start = time.monotonic()
+            span_name = f"evaluator.{evaluator.name}"
+            span_attributes = {
+                "agentforge.evaluator.name": evaluator.name,
+                "agentforge.evaluator.cost_estimate_usd": estimate,
+            }
+            with tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
+                verdict = await evaluator.evaluate(result, eval_context)
+                span.set_attribute("agentforge.evaluator.score", float(verdict.score))
+                reported_cost = float(getattr(verdict, "cost_usd", 0.0))
+                span.set_attribute("agentforge.evaluator.cost_usd", reported_cost)
+                elapsed_ms = int((time.monotonic() - clock_start) * 1000)
+                span.set_attribute("agentforge.evaluator.duration_ms", elapsed_ms)
+            # Appended only after the span has closed, as a completed evaluation.
+            collected.append(verdict)
+        return tuple(collected)
+    async def close(self) -> None:
+        """Release resources held by the agent (LLM, memory, log filter).
+        Idempotent: the first call sets `_closed` and later calls return
+        immediately. Teardown order is the LLM (when set), memory, the
+        graph store (when set), then the run-id log filter and the JSON
+        formatter.
+        Each stage runs in the `finally` of the previous one. An
+        exception raised by one resource's `close()` still propagates to
+        the caller, but it no longer stops the remaining resources from
+        being released. This matters because `_closed` is already set by
+        then, so a retry would do nothing.
+        """
+        if self._closed:
+            return
+        self._closed = True
+        try:
+            if self._llm is not None:
+                await self._llm.close()
+        finally:
+            try:
+                await self._memory.close()
+            finally:
+                try:
+                    if self._graph_store is not None:
+                        await self._graph_store.close()
+                finally:
+                    # Logging hooks are removed last, whatever happened above.
+                    uninstall_run_id_filter()
+                    uninstall_json_formatter()
+    async def __aenter__(self) -> Agent:
+        """Enter `async with`: return this agent itself; nothing is acquired here."""
+        return self
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
+    ) -> None:
+        """Leave `async with` by awaiting `close()`.
+        The exception triple is ignored and nothing truthy is returned,
+        so an exception raised inside the block is not suppressed.
+        """
+        await self.close()
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _extract_output(state: AgentState) -> str:
+        """Return the run's final output as text, taken from `state.steps`.
+        The steps are scanned from newest to oldest and the first one
+        whose `kind` is not "system" wins. Its `content` is returned
+        unchanged when it is already a `str` and passed through `str()`
+        otherwise. An empty string is returned when no such step exists.
+        This is the simple feat-001 rule; feat-002 strategies are
+        expected to set a richer convention.
+        """
+        latest = next(
+            (step for step in reversed(state.steps) if step.kind != "system"),
+            None,
+        )
+        if latest is None:
+            return ""
+        payload = latest.content
+        if isinstance(payload, str):
+            return payload
+        return str(payload)
+    async def _fire_finish(self, result: RunResult) -> None:
+        """Invoke the registered finish hooks, one after another, with `result`.
+        Hooks run in registration order and each call goes through
+        `_safe_call_hook`, so a hook that raises is logged at WARN on the
+        `agentforge.observability` logger instead of propagating.
+        Per feat-009 §4.3: "Observability must never break the run."
+        """
+        for finish_hook in self._on_finish:
+            await _safe_call_hook(finish_hook, result, kind="on_finish")
+    async def _fire_steps(self, new_steps: list[Step]) -> None:
+        """Deliver each newly-appended step to every registered step hook.
+        Delivery is step-major: (step1, hook_a), (step1, hook_b),
+        (step2, hook_a), ... so one step's fan-out completes before the
+        next step starts. Nothing happens when there are no hooks or no
+        steps. Failures are isolated per hook, as in `_fire_finish`.
+        """
+        if not (self._on_step and new_steps):
+            return
+        # Lazy pairing keeps the step-major order without nesting the awaits.
+        deliveries = ((step, hook) for step in new_steps for hook in self._on_step)
+        for step, hook in deliveries:
+            await _safe_call_hook(hook, step, kind="on_step")
+def _pick_str_form(
+    kwarg_value: Any,
+    config_value: Any,
+    *,
+    field: str,
+) -> Any:
+    """Resolve one agent field from an explicit kwarg or the YAML config.
+    An explicit `kwarg_value` (anything other than `None`) always wins.
+    Otherwise `config_value` is returned as-is, unless it is a dict: the
+    inline-options form (feat-012 §4.5) is not yet supported at Agent
+    construction and raises `ModuleError` naming `field`. The result is
+    typed `Any` so each per-field resolver can narrow it on its own.
+    """
+    if kwarg_value is None:
+        if isinstance(config_value, dict):
+            raise ModuleError(
+                f"agent.{field} in agentforge.yaml is a dict (inline options form); "
+                f"not yet supported at Agent construction. Pass {field}= explicitly "
+                "or use the string form in YAML."
+            )
+        return config_value
+    return kwarg_value
+def _normalise_hooks(hooks: Any) -> list[Any]:
+    """Coerce an `on_step` / `on_finish` argument into a fresh list of hooks.
+    Accepts `None` (no hooks), a single callable, or a list / tuple of
+    callables. Lists and tuples are copied so later mutation of the
+    caller's sequence does not affect the agent. A tuple is treated as a
+    sequence of hooks, not as one hook: a tuple is not callable, so
+    wrapping it whole would register an entry that fails on every call.
+    Centralised so the on_step / on_finish surfaces stay in sync.
+    """
+    if hooks is None:
+        return []
+    if isinstance(hooks, (list, tuple)):
+        return list(hooks)
+    return [hooks]
+def _tag_run_span(span: Any, result: RunResult, finish_reason: FinishReason) -> None:
+    """Stamp the run span with the run summary before it closes.
+    Writes the finish reason, cost, token counts, duration and step
+    count as `agentforge.*` attributes via `span.set_attribute`.
+    Numeric values are coerced to plain `float` / `int` first, matching
+    what the terminal `done` event and the evaluator spans do, so a
+    non-primitive numeric type on the result is not handed to the span.
+    """
+    span.set_attribute("agentforge.finish_reason", finish_reason)
+    span.set_attribute("agentforge.cost_usd", float(result.cost_usd))
+    span.set_attribute("agentforge.tokens_in", int(result.tokens_in))
+    span.set_attribute("agentforge.tokens_out", int(result.tokens_out))
+    span.set_attribute("agentforge.duration_ms", int(result.duration_ms))
+    span.set_attribute("agentforge.n_steps", len(result.steps))
+def _format_pipeline_addendum(result: PipelineResult) -> str:
+    """Render `result.findings` as the markdown section the LLM sees in
+    the per-run system prompt (feat-015 §4.3).
+    Layout:
+        ## Pipeline findings
+        - [severity] category: message
+    A finding missing `severity` renders as "info"; a missing
+    `category` or `message` renders as an empty string. The caller
+    short-circuits on empty findings, so this normally runs with at
+    least one finding to render.
+    """
+    def _bullet(finding: Any) -> str:
+        # One markdown bullet per finding, with defaults for absent fields.
+        severity = getattr(finding, "severity", "info")
+        category = getattr(finding, "category", "")
+        message = getattr(finding, "message", "")
+        return f"- [{severity}] {category}: {message}"
+    header = ["## Pipeline findings", ""]
+    return "\n".join(header + [_bullet(item) for item in result.findings])
+async def _safe_call_hook(hook: Any, payload: Any, *, kind: str) -> None:
+    """Call `hook(payload)` without letting a failure escape.
+    When the call returns something other than `None` that exposes
+    `__await__`, it is awaited, so both sync and async hooks are
+    supported. Any `Exception` raised by the call or the await is logged
+    at WARN with the hook kind, the exception type and message, and the
+    hook's `__name__` (or the hook object itself when it has none).
+    "Observability must never break the run" per feat-009 §4.3.
+    """
+    try:
+        returned = hook(payload)
+        if returned is None or not hasattr(returned, "__await__"):
+            return
+        await returned
+    except Exception as err:
+        hook_label = getattr(hook, "__name__", hook)
+        _observability_log.warning(
+            "hook %s raised %s: %s (hook=%r)",
+            kind,
+            type(err).__name__,
+            err,
+            hook_label,
+        )