agentforge-py 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. agentforge/__init__.py +114 -0
  2. agentforge/_testing/__init__.py +19 -0
  3. agentforge/_testing/fake_llm.py +126 -0
  4. agentforge/_testing/fake_tool.py +122 -0
  5. agentforge/_tools/__init__.py +14 -0
  6. agentforge/_tools/calculator.py +102 -0
  7. agentforge/_tools/decorator.py +300 -0
  8. agentforge/_tools/file_read.py +112 -0
  9. agentforge/_tools/shell.py +134 -0
  10. agentforge/_tools/web_search.py +207 -0
  11. agentforge/agent.py +817 -0
  12. agentforge/auth.py +42 -0
  13. agentforge/cli/__init__.py +18 -0
  14. agentforge/cli/_build.py +323 -0
  15. agentforge/cli/_scaffold_state.py +250 -0
  16. agentforge/cli/_shared_scaffold.py +174 -0
  17. agentforge/cli/config_cmd.py +174 -0
  18. agentforge/cli/db_cmd.py +262 -0
  19. agentforge/cli/debug_cmd.py +168 -0
  20. agentforge/cli/docs_cmd.py +217 -0
  21. agentforge/cli/eval_cmd.py +181 -0
  22. agentforge/cli/health_cmd.py +139 -0
  23. agentforge/cli/list_modules.py +85 -0
  24. agentforge/cli/main.py +81 -0
  25. agentforge/cli/manifest_apply.py +368 -0
  26. agentforge/cli/module_cmd.py +247 -0
  27. agentforge/cli/new_cmd.py +171 -0
  28. agentforge/cli/run_cmd.py +234 -0
  29. agentforge/cli/upgrade_cmd.py +230 -0
  30. agentforge/config/__init__.py +45 -0
  31. agentforge/eval/__init__.py +18 -0
  32. agentforge/eval/consistency.py +107 -0
  33. agentforge/eval/coverage.py +100 -0
  34. agentforge/eval/format_compliance.py +107 -0
  35. agentforge/eval/regression.py +143 -0
  36. agentforge/findings.py +166 -0
  37. agentforge/guardrails/__init__.py +32 -0
  38. agentforge/guardrails/allowlist.py +49 -0
  39. agentforge/guardrails/capability_check.py +58 -0
  40. agentforge/guardrails/engine.py +289 -0
  41. agentforge/guardrails/pii_redact_basic.py +61 -0
  42. agentforge/guardrails/prompt_injection_basic.py +90 -0
  43. agentforge/memory/__init__.py +16 -0
  44. agentforge/memory/in_memory.py +130 -0
  45. agentforge/memory/in_memory_graph.py +262 -0
  46. agentforge/memory/in_memory_vector.py +167 -0
  47. agentforge/pipeline/__init__.py +26 -0
  48. agentforge/pipeline/engine.py +189 -0
  49. agentforge/pipeline/errors.py +19 -0
  50. agentforge/pipeline/tool.py +93 -0
  51. agentforge/py.typed +0 -0
  52. agentforge/recording.py +189 -0
  53. agentforge/renderers/__init__.py +28 -0
  54. agentforge/renderers/_defaults.py +32 -0
  55. agentforge/renderers/markdown.py +44 -0
  56. agentforge/renderers/patch_applier.py +46 -0
  57. agentforge/renderers/registry.py +108 -0
  58. agentforge/renderers/scorecard.py +59 -0
  59. agentforge/renderers/span_table.py +71 -0
  60. agentforge/replay.py +260 -0
  61. agentforge/resolver_register.py +41 -0
  62. agentforge/retrieval.py +410 -0
  63. agentforge/runtime.py +63 -0
  64. agentforge/strategies/__init__.py +27 -0
  65. agentforge/strategies/_base.py +280 -0
  66. agentforge/strategies/_plan.py +93 -0
  67. agentforge/strategies/multi_agent.py +541 -0
  68. agentforge/strategies/plan_execute.py +506 -0
  69. agentforge/strategies/react.py +237 -0
  70. agentforge/strategies/tot.py +472 -0
  71. agentforge/templates/_shared/.cursorrules +12 -0
  72. agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
  73. agentforge/templates/_shared/.gitkeep +0 -0
  74. agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
  75. agentforge/templates/_shared/CLAUDE.md +13 -0
  76. agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
  77. agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
  78. agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
  79. agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
  80. agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
  81. agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
  82. agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
  83. agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
  84. agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
  85. agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
  86. agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
  87. agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
  88. agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
  89. agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
  90. agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
  91. agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
  92. agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
  93. agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
  94. agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
  95. agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
  96. agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
  97. agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
  98. agentforge/templates/code-reviewer/.env.example +8 -0
  99. agentforge/templates/code-reviewer/.gitignore +7 -0
  100. agentforge/templates/code-reviewer/README.md +12 -0
  101. agentforge/templates/code-reviewer/agentforge.yaml +23 -0
  102. agentforge/templates/code-reviewer/copier.yml +34 -0
  103. agentforge/templates/code-reviewer/pyproject.toml +18 -0
  104. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  105. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  106. agentforge/templates/docs-qa/.env.example +8 -0
  107. agentforge/templates/docs-qa/.gitignore +7 -0
  108. agentforge/templates/docs-qa/README.md +14 -0
  109. agentforge/templates/docs-qa/agentforge.yaml +19 -0
  110. agentforge/templates/docs-qa/copier.yml +31 -0
  111. agentforge/templates/docs-qa/pyproject.toml +18 -0
  112. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  113. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  114. agentforge/templates/minimal/.env.example +11 -0
  115. agentforge/templates/minimal/.gitignore +10 -0
  116. agentforge/templates/minimal/README.md +28 -0
  117. agentforge/templates/minimal/agentforge.yaml +10 -0
  118. agentforge/templates/minimal/copier.yml +52 -0
  119. agentforge/templates/minimal/pyproject.toml +18 -0
  120. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  121. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
  122. agentforge/templates/patch-bot/.env.example +8 -0
  123. agentforge/templates/patch-bot/.gitignore +7 -0
  124. agentforge/templates/patch-bot/README.md +13 -0
  125. agentforge/templates/patch-bot/agentforge.yaml +15 -0
  126. agentforge/templates/patch-bot/copier.yml +31 -0
  127. agentforge/templates/patch-bot/pyproject.toml +18 -0
  128. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  129. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  130. agentforge/templates/research/.env.example +8 -0
  131. agentforge/templates/research/.gitignore +7 -0
  132. agentforge/templates/research/README.md +14 -0
  133. agentforge/templates/research/agentforge.yaml +17 -0
  134. agentforge/templates/research/copier.yml +31 -0
  135. agentforge/templates/research/pyproject.toml +18 -0
  136. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  137. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
  138. agentforge/templates/triage/.env.example +8 -0
  139. agentforge/templates/triage/.gitignore +7 -0
  140. agentforge/templates/triage/README.md +14 -0
  141. agentforge/templates/triage/agentforge.yaml +25 -0
  142. agentforge/templates/triage/copier.yml +31 -0
  143. agentforge/templates/triage/pyproject.toml +18 -0
  144. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  145. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
  146. agentforge/testing/__init__.py +69 -0
  147. agentforge/testing/conformance.py +40 -0
  148. agentforge/testing/factory.py +89 -0
  149. agentforge/testing/fixtures.py +42 -0
  150. agentforge/testing/llm.py +235 -0
  151. agentforge/testing/recording.py +177 -0
  152. agentforge/tools/__init__.py +41 -0
  153. agentforge_py-0.2.1.dist-info/METADATA +158 -0
  154. agentforge_py-0.2.1.dist-info/RECORD +157 -0
  155. agentforge_py-0.2.1.dist-info/WHEEL +4 -0
  156. agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
  157. agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
agentforge/agent.py ADDED
@@ -0,0 +1,817 @@
1
+ """`Agent` — the framework's top-level orchestrator.
2
+
3
+ Per feat-001 §4.2 and ADR-0007, the constructor surface is locked.
4
+ Adding a kwarg with a safe default is a minor bump; removing or
5
+ renaming requires a major bump.
6
+
7
+ Lifecycle (per ADR-0010):
8
+
9
+ Agent.__init__: load config → resolve modules → wire defaults →
10
+ install RunIdFilter (if configured)
11
+ Agent.run(task): bind RunContext → call strategy.run(state) →
12
+ run evaluators → fire on_finish → return RunResult
13
+ Agent.close(): release LLM client / memory / hooks (async ctx mgr OK)
14
+
15
+ feat-001 ships the lifecycle + locked surface; feat-002 adds the
16
+ default `ReActLoop`, feat-003 the provider surface, feat-007 the full
17
+ fallback chain. The `Agent` constructor stays unchanged across those
18
+ features.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+ import time
25
+ from collections.abc import AsyncIterator, Awaitable, Callable
26
+ from pathlib import Path
27
+ from types import TracebackType
28
+ from typing import Any
29
+
30
+ from agentforge_core.config.schema import GuardrailPolicy
31
+ from agentforge_core.contracts.evaluator import EvalResult, Evaluator
32
+ from agentforge_core.contracts.graph_store import GraphStore
33
+ from agentforge_core.contracts.guardrails import (
34
+ InputValidator,
35
+ OutputValidator,
36
+ ToolCallGate,
37
+ )
38
+ from agentforge_core.contracts.llm import LLMClient
39
+ from agentforge_core.contracts.memory import MemoryStore
40
+ from agentforge_core.contracts.strategy import ReasoningStrategy
41
+ from agentforge_core.contracts.tool import Tool
42
+ from agentforge_core.observability import get_tracer
43
+ from agentforge_core.production.budget import BudgetPolicy
44
+ from agentforge_core.production.exceptions import (
45
+ AgentForgeError,
46
+ BudgetExceeded,
47
+ GuardrailViolation,
48
+ ModuleError,
49
+ )
50
+ from agentforge_core.production.log_filter import (
51
+ install_run_id_filter,
52
+ uninstall_run_id_filter,
53
+ )
54
+ from agentforge_core.production.log_format import (
55
+ install_json_formatter,
56
+ uninstall_json_formatter,
57
+ )
58
+ from agentforge_core.production.run_context import (
59
+ RunContext,
60
+ bind_run,
61
+ new_run,
62
+ reset_run,
63
+ )
64
+ from agentforge_core.resolver import Resolver, parse_model_string
65
+ from agentforge_core.values.chat import StreamingEvent
66
+ from agentforge_core.values.state import AgentState, FinishReason, RunResult, Step
67
+
68
+ from agentforge.config import AgentForgeConfig, load_config
69
+ from agentforge.memory import InMemoryStore
70
+ from agentforge.pipeline import Pipeline, PipelineFailure, PipelineFindingsTool, PipelineResult
71
+ from agentforge.retrieval import Retriever
72
+ from agentforge.runtime import RUNTIME_KEY, RuntimeContext
73
+
74
# Logger used to warn when an evaluator is skipped for lack of budget.
_evaluator_log = logging.getLogger("agentforge.evaluators")
# Logger for observability messages (no use is visible in this part of the
# file — TODO confirm where it is consumed).
_observability_log = logging.getLogger("agentforge.observability")


# Parameters are typed as `...`, so the argument shape described in the
# string below is a convention rather than something the type checker enforces.
StepHook = Callable[..., Awaitable[None] | None]
"""Hook signature: takes a Step, returns awaitable-or-None."""

# Same loose typing as `StepHook`; the hook may be sync or async.
FinishHook = Callable[..., Awaitable[None] | None]
"""Hook signature: takes a RunResult, returns awaitable-or-None."""

StepHooks = StepHook | list[StepHook]
"""Constructor accepts a single hook or a list. Internally normalised
to a list — see `Agent.__init__`. feat-009 spec §4.4: multiple
observability backends can run concurrently against the same run."""

# Single-or-list form for finish hooks, mirroring `StepHooks`.
FinishHooks = FinishHook | list[FinishHook]
90
+
91
+
92
+ class Agent:
93
+ """Framework-level agent orchestrator.
94
+
95
+ The constructor signature is the locked public API; see
96
+ `docs/features/feat-001-core-contracts-and-agent.md` §4.2.
97
+ """
98
+
99
def __init__(
    self,
    *,
    model: str | LLMClient | None = None,
    tools: list[Tool] | None = None,
    strategy: str | ReasoningStrategy | None = None,
    memory: MemoryStore | None = None,
    retriever: Retriever | None = None,
    graph_store: GraphStore | None = None,
    evaluators: list[Evaluator] | None = None,
    system_prompt: str | None = None,
    budget_usd: float | None = None,
    max_iterations: int | None = None,
    on_step: StepHooks | None = None,
    on_finish: FinishHooks | None = None,
    config_path: str | Path | None = None,
    install_log_filter: bool = True,
    record_runs: MemoryStore | None = None,
    input_validators: list[InputValidator] | None = None,
    output_validators: list[OutputValidator] | None = None,
    tool_gates: list[ToolCallGate] | None = None,
    guardrail_policy: GuardrailPolicy | None = None,
    pipeline: Pipeline | None = None,
) -> None:
    """Load configuration, resolve modules, and wire up per-agent defaults.

    Every argument is keyword-only. Explicit keyword values take
    precedence over the loaded configuration for `model`, `strategy`,
    `system_prompt`, `budget_usd`, `max_iterations` and
    `guardrail_policy`. `tools` and `evaluators` are copied, so later
    mutation of the caller's lists does not affect the agent.

    Raises:
        ModuleError: Propagated from model / strategy resolution when a
            provider or strategy cannot be resolved.
    """
    cfg = load_config(config_path)
    self._config: AgentForgeConfig = cfg
    agent_cfg = cfg.agent

    # Model and strategy: the explicit kwarg or the config value is
    # reduced to a `str | instance` form by `_pick_str_form` before
    # being resolved.
    model_choice = _pick_str_form(model, agent_cfg.model, field="model")
    self._llm: LLMClient | None = self._resolve_model(model_choice)

    strategy_choice = _pick_str_form(strategy, agent_cfg.strategy, field="strategy")
    self._strategy: ReasoningStrategy = self._resolve_strategy(strategy_choice)

    # Collaborators: in-memory store by default, empty tool and
    # evaluator lists when none are supplied.
    self._memory: MemoryStore = InMemoryStore() if memory is None else memory
    self._retriever: Retriever | None = retriever
    self._graph_store: GraphStore | None = graph_store
    self._tools: list[Tool] = [] if tools is None else list(tools)
    self._evaluators: list[Evaluator] = [] if evaluators is None else list(evaluators)
    self._system_prompt: str | None = system_prompt
    if system_prompt is None:
        self._system_prompt = agent_cfg.system_prompt

    # Budget limits: keyword argument first, configuration second.
    usd_cap = agent_cfg.budget.usd if budget_usd is None else budget_usd
    iteration_cap = agent_cfg.max_iterations if max_iterations is None else max_iterations
    self._budget = BudgetPolicy(usd=usd_cap, max_iterations=iteration_cap)

    self._on_step: list[StepHook] = _normalise_hooks(on_step)
    self._on_finish: list[FinishHook] = _normalise_hooks(on_finish)

    # feat-018: guardrail engine built from the explicitly supplied
    # validators / gates and the effective policy.
    from agentforge.guardrails.engine import GuardrailEngine  # noqa: PLC0415

    effective_policy = (
        self._config.guardrail_policy if guardrail_policy is None else guardrail_policy
    )
    self._guardrails = GuardrailEngine(
        input_validators=[*(input_validators or [])],
        output_validators=[*(output_validators or [])],
        tool_gates=[*(tool_gates or [])],
        policy=effective_policy,
    )

    # feat-017: optional run recording. The recorder's callbacks are
    # appended to the step / finish hook lists so they fire alongside
    # user-supplied hooks.
    self._record_runs: MemoryStore | None = record_runs
    if record_runs is not None:
        from agentforge.recording import RecordRunHook  # noqa: PLC0415

        run_recorder = RecordRunHook(
            memory=record_runs,
            project="default",
            agent_name=self._config.agent.name or "agent",
        )
        self._on_step.append(run_recorder.on_step)
        self._on_finish.append(run_recorder.on_finish)

    # feat-015: optional pre-LLM pipeline. When one is configured, a
    # `PipelineFindingsTool` is added to the agent's tool list.
    self._pipeline: Pipeline | None = pipeline
    self._pipeline_tool: PipelineFindingsTool | None = None
    if pipeline is not None:
        findings_tool = PipelineFindingsTool()
        self._pipeline_tool = findings_tool
        self._tools.append(findings_tool)

    self._closed = False

    # Logging integration is opt-out via `install_log_filter` and
    # further gated by the logging configuration.
    if install_log_filter:
        log_cfg = self._config.logging
        if log_cfg.run_id_filter:
            install_run_id_filter()
        if log_cfg.format == "json":
            install_json_formatter()
207
+
208
+ # ------------------------------------------------------------------
209
+ # Resolution helpers (used at construction; raise at startup, P11).
210
+ # ------------------------------------------------------------------
211
+
212
+ def _resolve_model(self, model: str | LLMClient | None) -> LLMClient | None:
213
+ if model is None:
214
+ return None
215
+ if isinstance(model, LLMClient):
216
+ return model
217
+ # String — parse "<provider>:<model_id>" and look up the
218
+ # provider in the resolver. feat-003 lights up the bedrock
219
+ # provider; future provider packages (anthropic, openai, ...)
220
+ # register themselves the same way at import time.
221
+ provider, model_id = parse_model_string(model)
222
+ try:
223
+ cls = Resolver.global_().resolve("providers", provider)
224
+ except ModuleError as exc:
225
+ raise ModuleError(
226
+ f"No LLM provider registered for {provider!r}. "
227
+ f"Install agentforge-{provider} (e.g. `uv add agentforge-{provider}`) "
228
+ f"or pass a typed LLMClient instance via Agent(model=...)."
229
+ ) from exc
230
+ instance = cls(model_id=model_id)
231
+ if not isinstance(instance, LLMClient):
232
+ raise ModuleError(
233
+ f"Resolved provider {provider!r} ({cls.__name__}) does not implement LLMClient."
234
+ )
235
+ return instance
236
+
237
def _resolve_strategy(self, strategy: str | ReasoningStrategy | None) -> ReasoningStrategy:
    """Turn the `strategy` argument into a `ReasoningStrategy` instance.

    Args:
        strategy: An already-built `ReasoningStrategy`, the registered
            name of one, or `None`.

    Returns:
        The instance itself when one was passed; otherwise a new instance
        created by calling the resolved entry with no arguments.

    Raises:
        ModuleError: If `strategy` is `None`, if the resolved entry is not
            callable, or if the object it builds is not a
            `ReasoningStrategy`. Errors raised by the resolver lookup
            itself propagate unchanged.
    """
    if isinstance(strategy, ReasoningStrategy):
        return strategy
    if strategy is None:
        raise ModuleError(
            "No reasoning strategy provided. feat-001 ships only the "
            "ReasoningStrategy ABC; install agentforge[react] (when feat-002 "
            "ships) or pass a custom ReasoningStrategy instance via "
            "Agent(strategy=...)."
        )
    # String name: look it up in the global resolver.
    cls = Resolver.global_().resolve("strategies", strategy)
    if not callable(cls):
        raise ModuleError(f"Resolved strategy {strategy!r} is not constructible: {cls!r}.")
    instance = cls()
    if not isinstance(instance, ReasoningStrategy):
        # A callable factory (e.g. functools.partial) may lack `__name__`;
        # fall back to repr so building the message cannot itself fail.
        cls_name = getattr(cls, "__name__", repr(cls))
        raise ModuleError(
            f"Resolved strategy {strategy!r} ({cls_name}) does not "
            f"implement ReasoningStrategy."
        )
    return instance
259
+
260
+ # ------------------------------------------------------------------
261
+ # Public API
262
+ # ------------------------------------------------------------------
263
+
264
+ @property
265
+ def memory(self) -> MemoryStore:
266
+ return self._memory
267
+
268
+ @property
269
+ def tools(self) -> list[Tool]:
270
+ return list(self._tools)
271
+
272
+ @property
273
+ def budget(self) -> BudgetPolicy:
274
+ return self._budget
275
+
276
+ @property
277
+ def pipeline(self) -> Pipeline | None:
278
+ return self._pipeline
279
+
280
+ def _build_runtime_metadata(
281
+ self,
282
+ run_budget: BudgetPolicy,
283
+ guard_ctx: dict[str, Any],
284
+ *,
285
+ system_prompt: str | None = None,
286
+ ) -> dict[str, object]:
287
+ """Build the `state.metadata` mapping that carries the
288
+ per-run `RuntimeContext`. Wraps the LLM + tools with the
289
+ guardrail engine so output validation and tool-call gating
290
+ happen inside the strategy loop transparently.
291
+
292
+ `system_prompt`, when provided, overrides `self._system_prompt`
293
+ for this single run only (feat-015 uses this to append the
294
+ pipeline-findings addendum without mutating the configured
295
+ prompt).
296
+ """
297
+ metadata: dict[str, object] = {}
298
+ if self._llm is None:
299
+ return metadata
300
+
301
+ def _ctx_factory() -> dict[str, object]:
302
+ return dict(guard_ctx)
303
+
304
+ metadata[RUNTIME_KEY] = RuntimeContext(
305
+ llm=self._guardrails.wrap_llm(self._llm, _ctx_factory),
306
+ tools=tuple(self._guardrails.wrap_tool(t, _ctx_factory) for t in self._tools),
307
+ memory=self._memory,
308
+ budget=run_budget,
309
+ system_prompt=system_prompt if system_prompt is not None else self._system_prompt,
310
+ retriever=self._retriever,
311
+ graph_store=self._graph_store,
312
+ )
313
+ return metadata
314
+
315
+ async def _maybe_run_pipeline(
316
+ self,
317
+ *,
318
+ context: dict[str, Any] | None,
319
+ run_budget: BudgetPolicy,
320
+ run_id: str,
321
+ replay_pipeline: PipelineResult | None,
322
+ ) -> PipelineResult | None:
323
+ """Run the configured pipeline (or load it from a replay), apply
324
+ cost accounting, and bind the findings to the built-in tool.
325
+
326
+ Returns ``None`` when the agent has no pipeline configured.
327
+ Raises `BudgetExceeded` if the pipeline alone exhausts the run
328
+ budget. Raises `PipelineFailure` if `on_task_error="fail"`
329
+ and a task errors.
330
+ """
331
+ if self._pipeline is None and replay_pipeline is None:
332
+ return None
333
+ if replay_pipeline is not None:
334
+ result = replay_pipeline
335
+ else:
336
+ assert self._pipeline is not None # narrowing for mypy
337
+ result = await self._pipeline.run(context or {})
338
+ # Charge declared pipeline cost against the budget.
339
+ if result.total_cost_usd > 0.0:
340
+ run_budget.commit(result.total_cost_usd)
341
+ run_budget.check()
342
+ if self._pipeline_tool is not None:
343
+ self._pipeline_tool._set_cache(list(result.findings))
344
+ # Persist as a `__pipeline` claim when recording.
345
+ if self._record_runs is not None and replay_pipeline is None:
346
+ from agentforge.recording import record_pipeline_result # noqa: PLC0415
347
+
348
+ await record_pipeline_result(
349
+ memory=self._record_runs,
350
+ run_id=run_id,
351
+ project="default",
352
+ agent_name=self._config.agent.name or "agent",
353
+ result=result,
354
+ )
355
+ return result
356
+
357
+ def _compose_system_prompt(self, pipeline_result: PipelineResult | None) -> str | None:
358
+ """Produce the per-run system prompt: the configured prompt
359
+ with the optional pipeline-findings addendum appended."""
360
+ if pipeline_result is None or not pipeline_result.findings:
361
+ return self._system_prompt
362
+ addendum = _format_pipeline_addendum(pipeline_result)
363
+ if self._system_prompt is None:
364
+ return addendum
365
+ return f"{self._system_prompt}\n\n{addendum}"
366
+
367
async def run(
    self,
    task: str,
    *,
    context: dict[str, Any] | None = None,
    replay_pipeline: PipelineResult | None = None,
) -> RunResult:
    """Execute the agent's reasoning loop on `task`.

    Args:
        task: The task text the agent should reason about.
        context: Extra key-value context passed to a configured
            pipeline (feat-015). Ignored when no pipeline is set.
        replay_pipeline: When replaying a recorded run, the
            previously recorded `PipelineResult` is threaded in
            here so the pipeline doesn't re-execute. Set by the
            replay CLI; user code rarely passes it directly.

    Returns:
        A `RunResult` with the agent's output, full trace, and cost.

    Raises:
        ModuleError: If the agent has already been closed.
        PipelineFailure: Re-raised from the pipeline stage.
        BudgetExceeded: Re-raised from input validation or the strategy.
        GuardrailViolation: Re-raised from input validation or the strategy.
        AgentForgeError: Any other framework error from input validation
            or the strategy, re-raised.
    """
    if self._closed:
        raise ModuleError("Agent has been closed; create a new instance.")
    ctx: RunContext = new_run(task=task)
    # The token is handed back to `reset_run` in the outer `finally`.
    token = bind_run(ctx)
    # NOTE: despite the name, this is a `time.monotonic()` reading in
    # seconds; `_finalize_result` converts the elapsed delta to ms.
    started_ms = time.monotonic()
    finish_reason: FinishReason = "completed"
    tracer = get_tracer()
    try:
        with tracer.start_as_current_span(
            "agent.run",
            attributes={
                "agentforge.run_id": ctx.run_id,
                "agentforge.task": task,
            },
        ) as run_span:
            # Fresh per-run policy copied from the agent-level limits, so
            # spend is tracked on this object rather than `self._budget`.
            run_budget = BudgetPolicy(
                usd=self._budget.usd,
                max_tokens=self._budget.max_tokens,
                max_iterations=self._budget.max_iterations,
                error_streak_limit=self._budget.error_streak_limit,
            )
            # Context handed to input validation and to the guardrail
            # wrappers built in `_build_runtime_metadata`.
            guard_ctx: dict[str, Any] = {
                "run_id": ctx.run_id,
                "project": self._config.agent.name or "default",
            }
            pipeline_result: PipelineResult | None = None
            # Stays None until input validation has passed.
            state: AgentState | None = None
            try:
                pipeline_result = await self._maybe_run_pipeline(
                    context=context,
                    run_budget=run_budget,
                    run_id=ctx.run_id,
                    replay_pipeline=replay_pipeline,
                )
            except PipelineFailure:
                finish_reason = "pipeline"
                raise
            run_system_prompt = self._compose_system_prompt(pipeline_result)
            metadata = self._build_runtime_metadata(
                run_budget, guard_ctx, system_prompt=run_system_prompt
            )
            try:
                # The strategy sees the task as returned by the input
                # guardrails, not the raw `task` argument.
                validated_task = await self._guardrails.check_input(task, guard_ctx)
                state = AgentState(
                    run_id=ctx.run_id,
                    task=validated_task,
                    metadata=metadata,
                )
                await self._strategy.run(state)
            except BudgetExceeded:
                finish_reason = "budget_exceeded"
                raise
            except GuardrailViolation:
                finish_reason = "guardrail"
                raise
            except AgentForgeError:
                finish_reason = "error"
                raise
            finally:
                # Fire `on_step` for every step the strategy appended,
                # even on error paths — observability of the partial
                # trace is just as important as the happy path.
                if state is not None:
                    await self._fire_steps(list(state.steps))
            # `_finalize_result` receives the original `task` text, not
            # `validated_task`.
            result = await self._finalize_result(
                state=state,
                task=task,
                run_budget=run_budget,
                run_id=ctx.run_id,
                started_ms=started_ms,
                finish_reason=finish_reason,
            )
            _tag_run_span(run_span, result, finish_reason)
            await self._fire_finish(result)
            return result
    finally:
        # Always release the run binding taken by `bind_run` above.
        reset_run(token)
465
+
466
async def stream(
    self,
    task: str,
    *,
    context: dict[str, Any] | None = None,
    replay_pipeline: PipelineResult | None = None,
) -> AsyncIterator[StreamingEvent]:
    """Streaming counterpart to :meth:`run` (feat-020 v0.2).

    Drives the agent via ``strategy.stream(state)`` and yields
    every event as it arrives. Same setup as ``run()`` —
    guardrails on input, pipeline, RunContext binding, span
    tracing, finalize-result, on_finish hook. The terminal
    ``done`` event carries the full :class:`RunResult` shape in
    ``content`` (``output`` / ``run_id`` / ``cost_usd`` /
    ``tokens_in`` / ``tokens_out`` / ``finish_reason`` /
    ``duration_ms``) so callers don't need a second round-trip.

    Strategies that don't override
    :meth:`ReasoningStrategy.stream` get the ABC's default
    behaviour: one terminal ``done`` event. Callers (e.g.
    :class:`ChatSession`) check the override and fall back to
    the buffered ``run()`` + segment-and-stream path when the
    default is in effect.

    Args:
        task: The task text the agent should reason about.
        context: Extra key-value context forwarded to
            ``_maybe_run_pipeline``.
        replay_pipeline: A previously recorded ``PipelineResult``
            forwarded to ``_maybe_run_pipeline``.

    Yields:
        Every non-``done`` event from the strategy, followed by one
        terminal ``done`` event built from the finalized result.

    Raises:
        ModuleError: If the agent has already been closed.
        PipelineFailure: Re-raised from the pipeline stage.
        BudgetExceeded: Re-raised from input validation or the strategy.
        GuardrailViolation: Re-raised from input validation or the strategy.
        AgentForgeError: Any other framework error from input validation
            or the strategy, re-raised.
    """
    if self._closed:
        raise ModuleError("Agent has been closed; create a new instance.")
    ctx: RunContext = new_run(task=task)
    # The token is handed back to `reset_run` in the outer `finally`.
    token = bind_run(ctx)
    # NOTE: despite the name, this is a `time.monotonic()` reading in
    # seconds; `_finalize_result` converts the elapsed delta to ms.
    started_ms = time.monotonic()
    finish_reason: FinishReason = "completed"
    tracer = get_tracer()
    try:
        with tracer.start_as_current_span(
            "agent.stream",
            attributes={
                "agentforge.run_id": ctx.run_id,
                "agentforge.task": task,
            },
        ) as run_span:
            # Fresh per-run policy copied from the agent-level limits.
            run_budget = BudgetPolicy(
                usd=self._budget.usd,
                max_tokens=self._budget.max_tokens,
                max_iterations=self._budget.max_iterations,
                error_streak_limit=self._budget.error_streak_limit,
            )
            guard_ctx: dict[str, Any] = {
                "run_id": ctx.run_id,
                "project": self._config.agent.name or "default",
            }
            pipeline_result: PipelineResult | None = None
            # Stays None until input validation has passed.
            state: AgentState | None = None
            try:
                pipeline_result = await self._maybe_run_pipeline(
                    context=context,
                    run_budget=run_budget,
                    run_id=ctx.run_id,
                    replay_pipeline=replay_pipeline,
                )
            except PipelineFailure:
                finish_reason = "pipeline"
                raise
            run_system_prompt = self._compose_system_prompt(pipeline_result)
            metadata = self._build_runtime_metadata(
                run_budget, guard_ctx, system_prompt=run_system_prompt
            )
            try:
                validated_task = await self._guardrails.check_input(task, guard_ctx)
                state = AgentState(
                    run_id=ctx.run_id,
                    task=validated_task,
                    metadata=metadata,
                )
                async for event in self._strategy.stream(state):
                    # Strategy may emit a terminal `done` itself
                    # (default ABC impl does). Swallow it — we
                    # emit the canonical terminal `done` below
                    # with the full RunResult shape.
                    if event.kind == "done":
                        continue
                    yield event
            except BudgetExceeded:
                finish_reason = "budget_exceeded"
                raise
            except GuardrailViolation:
                finish_reason = "guardrail"
                raise
            except AgentForgeError:
                finish_reason = "error"
                raise
            finally:
                # Step hooks fire on error paths too, once a state exists.
                if state is not None:
                    await self._fire_steps(list(state.steps))
            result = await self._finalize_result(
                state=state,
                task=task,
                run_budget=run_budget,
                run_id=ctx.run_id,
                started_ms=started_ms,
                finish_reason=finish_reason,
            )
            _tag_run_span(run_span, result, finish_reason)
            await self._fire_finish(result)
            # Canonical terminal event: numeric fields are coerced to
            # plain float / int and `finish_reason` to str.
            yield StreamingEvent(
                kind="done",
                content={
                    "output": result.output,
                    "run_id": result.run_id,
                    "cost_usd": float(result.cost_usd),
                    "tokens_in": int(result.tokens_in),
                    "tokens_out": int(result.tokens_out),
                    "finish_reason": str(result.finish_reason),
                    "duration_ms": int(result.duration_ms),
                },
            )
    finally:
        # Always release the run binding taken by `bind_run` above.
        reset_run(token)
583
+
584
async def _finalize_result(
    self,
    *,
    state: AgentState,
    task: str,
    run_budget: BudgetPolicy,
    run_id: str,
    started_ms: float,
    finish_reason: FinishReason,
) -> RunResult:
    """Build the `RunResult` for a finished run and attach evaluator scores.

    Args:
        state: The agent state whose steps make up the trace.
        task: The task text, forwarded to the evaluators.
        run_budget: The per-run budget; its `spent_usd` becomes the
            result's cost and it is forwarded to the evaluators.
        run_id: Identifier stored on the result.
        started_ms: Start reading from `time.monotonic()`. The elapsed
            difference is multiplied by 1000 and truncated to an int to
            give `duration_ms`.
        finish_reason: Finish reason stored on the result.

    Returns:
        A `RunResult` whose `eval_scores` holds whatever
        `_run_evaluators` returned for the interim result.
    """
    elapsed_ms = int((time.monotonic() - started_ms) * 1000)
    final_output = self._extract_output(state)

    # Token totals are summed over every step in the trace.
    total_in = 0
    for step in state.steps:
        total_in = total_in + step.tokens_in
    total_out = 0
    for step in state.steps:
        total_out = total_out + step.tokens_out

    draft = RunResult(
        output=final_output,
        steps=tuple(state.steps),
        cost_usd=run_budget.spent_usd,
        tokens_in=total_in,
        tokens_out=total_out,
        run_id=run_id,
        duration_ms=elapsed_ms,
        finish_reason=finish_reason,
        guardrail_events=tuple(self._guardrails.events),
    )
    # Evaluators see the draft result; their scores are attached to a copy.
    scores = await self._run_evaluators(draft, task=task, state=state, budget=run_budget)
    return draft.model_copy(update={"eval_scores": scores})
611
+
612
async def _run_evaluators(
    self,
    result: RunResult,
    *,
    task: str,
    state: AgentState,
    budget: BudgetPolicy,
) -> tuple[EvalResult, ...]:
    """Run each configured evaluator that the remaining budget can afford.

    Per feat-006 §4.3, an evaluator whose `cost_estimate_usd` (treated
    as 0.0 when the attribute is missing) exceeds
    `budget.remaining_usd()` is skipped with a WARN log line. Every
    other evaluator is awaited inside its own `evaluator.<name>` span
    and receives the just-built `result` plus a context dict holding
    `task`, `state`, and `budget`.

    Returns:
        The results of the evaluators that actually ran, in
        configuration order. Skipped evaluators contribute nothing;
        with no evaluators configured the result is `()`.
    """
    evaluators = self._evaluators
    if not evaluators:
        return ()

    eval_context: dict[str, object] = {"task": task, "state": state, "budget": budget}
    collected: list[EvalResult] = []
    for ev in evaluators:
        estimate = float(getattr(ev, "cost_estimate_usd", 0.0))
        available = budget.remaining_usd()
        if estimate > available:
            # Not affordable: log and move on to the next evaluator.
            _evaluator_log.warning(
                "skipping evaluator %r: budget exhausted (need=$%.4f, remaining=$%.4f)",
                ev.name,
                estimate,
                available,
            )
            continue

        tracer = get_tracer()
        t0 = time.monotonic()
        span_name = f"evaluator.{ev.name}"
        span_attrs = {
            "agentforge.evaluator.name": ev.name,
            "agentforge.evaluator.cost_estimate_usd": estimate,
        }
        with tracer.start_as_current_span(span_name, attributes=span_attrs) as ev_span:
            verdict = await ev.evaluate(result, eval_context)
            ev_span.set_attribute("agentforge.evaluator.score", float(verdict.score))
            ev_span.set_attribute(
                "agentforge.evaluator.cost_usd",
                float(getattr(verdict, "cost_usd", 0.0)),
            )
            ev_span.set_attribute(
                "agentforge.evaluator.duration_ms",
                int((time.monotonic() - t0) * 1000),
            )
        collected.append(verdict)
    return tuple(collected)
669
+
670
async def close(self) -> None:
    """Release resources held by the agent (LLM, memory, log filter).

    Idempotent: once `_closed` is set, later calls return immediately.

    Teardown order is: LLM (when set), memory, graph store (when set),
    the run-id log filter, then the JSON formatter. Every step is
    attempted even if an earlier one raises. Because `_closed` is set
    before teardown starts, a step skipped after a failure could never
    be retried by calling `close()` again.

    Raises:
        Exception: Whatever a teardown step raised, re-raised after the
            remaining steps have run.
    """
    # Function-scope import: the block does not rely on a file-level import.
    from contextlib import AsyncExitStack

    if self._closed:
        return
    # Flag first so a repeated or re-entrant call cannot double-close.
    self._closed = True

    # The stack unwinds LIFO, so callbacks are registered in the reverse
    # of the intended teardown order. A failing callback does not stop
    # the ones registered before it from running.
    async with AsyncExitStack() as teardown:
        teardown.callback(uninstall_json_formatter)
        teardown.callback(uninstall_run_id_filter)
        if self._graph_store is not None:
            teardown.push_async_callback(self._graph_store.close)
        teardown.push_async_callback(self._memory.close)
        if self._llm is not None:
            teardown.push_async_callback(self._llm.close)
682
+
683
async def __aenter__(self) -> Agent:
    """Enter `async with`: no setup is done, the agent itself is bound."""
    return self
685
+
686
async def __aexit__(
    self,
    exc_type: type[BaseException] | None,
    exc: BaseException | None,
    tb: TracebackType | None,
) -> None:
    """Leave `async with` by closing the agent.

    The exception triple is ignored and nothing truthy is returned, so
    an exception raised inside the `async with` body is not suppressed.
    """
    await self.close()
693
+
694
+ # ------------------------------------------------------------------
695
+
696
+ @staticmethod
697
+ def _extract_output(state: AgentState) -> str:
698
+ """Pick the agent's final output from `state.steps`.
699
+
700
+ feat-001 uses the simplest rule: the content of the last
701
+ non-system step, stringified. feat-002 strategies will set
702
+ a richer convention.
703
+ """
704
+ for step in reversed(state.steps):
705
+ if step.kind != "system":
706
+ content = step.content
707
+ return content if isinstance(content, str) else str(content)
708
+ return ""
709
+
710
async def _fire_finish(self, result: RunResult) -> None:
    """Invoke the registered finish hooks, one after another, with `result`.

    Hooks run in registration order and each call goes through
    `_safe_call_hook`, so a hook that raises is logged rather than
    propagated.

    Per feat-009 §4.3: "Observability must never break the run."
    """
    for finish_hook in self._on_finish:
        await _safe_call_hook(finish_hook, result, kind="on_finish")
719
+
720
async def _fire_steps(self, new_steps: list[Step]) -> None:
    """Deliver each step in `new_steps` to every registered step hook.

    Iteration is step-major: all hooks see step 1 before any hook sees
    step 2, i.e. (step1, hook_a), (step1, hook_b), (step2, hook_a), ...
    Each call goes through `_safe_call_hook`, so failures are isolated
    per hook exactly as in `_fire_finish`. Nothing happens when there
    are no hooks or no steps.
    """
    if self._on_step and new_steps:
        for emitted in new_steps:
            for step_hook in self._on_step:
                await _safe_call_hook(step_hook, emitted, kind="on_step")
732
+
733
+
734
def _pick_str_form(
    kwarg_value: Any,
    config_value: Any,
    *,
    field: str,
) -> Any:
    """Resolve one agent field from its constructor kwarg and YAML value.

    An explicit `kwarg_value` (anything other than `None`) always wins.
    Otherwise `config_value` is returned, unless it is a dict: that is
    the inline-options form (feat-012 §4.5), which is not yet supported
    at Agent construction.

    Returns:
        `str | object | None`, typed as `Any` so each per-field
        resolver can narrow it on its own.

    Raises:
        ModuleError: `kwarg_value` is `None` and `config_value` is a dict.
    """
    if kwarg_value is None:
        if isinstance(config_value, dict):
            raise ModuleError(
                f"agent.{field} in agentforge.yaml is a dict (inline options form); "
                f"not yet supported at Agent construction. Pass {field}= explicitly "
                "or use the string form in YAML."
            )
        return config_value
    return kwarg_value
754
+
755
+
756
def _normalise_hooks(hooks: Any) -> list[Any]:
    """Coerce `None | Callable | list[Callable]` into a new list of hooks.

    `None` becomes an empty list, a list is shallow-copied (the
    caller's list is never aliased), and any other value is wrapped as
    a single-element list. Shared by the on_step / on_finish surfaces
    so both accept the same shapes.
    """
    if isinstance(hooks, list):
        return [*hooks]
    return [] if hooks is None else [hooks]
766
+
767
+
768
def _tag_run_span(span: Any, result: RunResult, finish_reason: FinishReason) -> None:
    """Stamp the run span with the run summary before it closes.

    Numeric fields are coerced to plain `float` / `int` before being
    set. Tracing backends generally accept only primitive attribute
    values (presumably OpenTelemetry here — confirm), so a non-primitive
    numeric such as a `Decimal` cost is converted rather than handed
    over raw.

    Args:
        span: Object exposing `set_attribute(key, value)`.
        result: Finished run; supplies cost, token totals, duration and
            the step count.
        finish_reason: Why the run ended; recorded as given.
    """
    span.set_attribute("agentforge.finish_reason", finish_reason)
    span.set_attribute("agentforge.cost_usd", float(result.cost_usd))
    span.set_attribute("agentforge.tokens_in", int(result.tokens_in))
    span.set_attribute("agentforge.tokens_out", int(result.tokens_out))
    span.set_attribute("agentforge.duration_ms", int(result.duration_ms))
    span.set_attribute("agentforge.n_steps", len(result.steps))
776
+
777
+
778
def _format_pipeline_addendum(result: PipelineResult) -> str:
    """Render `PipelineResult.findings` as the markdown section shown to
    the LLM in the per-run system prompt (feat-015 §4.3).

    Layout:

        ## Pipeline findings

        - [severity] category: message

    A finding missing an attribute falls back to severity "info" and an
    empty category / message. The caller short-circuits on empty
    findings, so this is only reached with at least one to render.
    """
    bullets = [
        "- [{}] {}: {}".format(
            getattr(finding, "severity", "info"),
            getattr(finding, "category", ""),
            getattr(finding, "message", ""),
        )
        for finding in result.findings
    ]
    return "\n".join(["## Pipeline findings", "", *bullets])
798
+
799
+
800
async def _safe_call_hook(hook: Any, payload: Any, *, kind: str) -> None:
    """Call `hook(payload)` without letting a failure escape.

    If the call returns something awaitable (it exposes `__await__`),
    that is awaited as well. Any `Exception` from the call or the await
    is logged at WARN with the hook kind, the exception type and
    message, and the hook's `__name__` (or the hook object itself when
    it has none).

    "Observability must never break the run" per feat-009 §4.3.
    """
    try:
        returned = hook(payload)
        if returned is None:
            return
        if hasattr(returned, "__await__"):
            await returned
    except Exception as err:
        hook_label = getattr(hook, "__name__", hook)
        _observability_log.warning(
            "hook %s raised %s: %s (hook=%r)",
            kind,
            type(err).__name__,
            err,
            hook_label,
        )