@agent-native/core 0.51.15 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. package/README.md +42 -96
  2. package/blueprints/action/crud.md +98 -0
  3. package/blueprints/channel/discord.md +74 -0
  4. package/blueprints/provider/stripe.md +87 -0
  5. package/blueprints/sandbox/docker.md +78 -0
  6. package/dist/action.d.ts +24 -0
  7. package/dist/action.d.ts.map +1 -1
  8. package/dist/action.js +4 -0
  9. package/dist/action.js.map +1 -1
  10. package/dist/agent/observational-memory/compactor.d.ts +43 -0
  11. package/dist/agent/observational-memory/compactor.d.ts.map +1 -0
  12. package/dist/agent/observational-memory/compactor.js +50 -0
  13. package/dist/agent/observational-memory/compactor.js.map +1 -0
  14. package/dist/agent/observational-memory/config.d.ts +37 -0
  15. package/dist/agent/observational-memory/config.d.ts.map +1 -0
  16. package/dist/agent/observational-memory/config.js +48 -0
  17. package/dist/agent/observational-memory/config.js.map +1 -0
  18. package/dist/agent/observational-memory/index.d.ts +26 -0
  19. package/dist/agent/observational-memory/index.d.ts.map +1 -0
  20. package/dist/agent/observational-memory/index.js +25 -0
  21. package/dist/agent/observational-memory/index.js.map +1 -0
  22. package/dist/agent/observational-memory/internal-run.d.ts +37 -0
  23. package/dist/agent/observational-memory/internal-run.d.ts.map +1 -0
  24. package/dist/agent/observational-memory/internal-run.js +59 -0
  25. package/dist/agent/observational-memory/internal-run.js.map +1 -0
  26. package/dist/agent/observational-memory/message-text.d.ts +13 -0
  27. package/dist/agent/observational-memory/message-text.d.ts.map +1 -0
  28. package/dist/agent/observational-memory/message-text.js +46 -0
  29. package/dist/agent/observational-memory/message-text.js.map +1 -0
  30. package/dist/agent/observational-memory/migrations.d.ts +13 -0
  31. package/dist/agent/observational-memory/migrations.d.ts.map +1 -0
  32. package/dist/agent/observational-memory/migrations.js +43 -0
  33. package/dist/agent/observational-memory/migrations.js.map +1 -0
  34. package/dist/agent/observational-memory/observer.d.ts +37 -0
  35. package/dist/agent/observational-memory/observer.d.ts.map +1 -0
  36. package/dist/agent/observational-memory/observer.js +82 -0
  37. package/dist/agent/observational-memory/observer.js.map +1 -0
  38. package/dist/agent/observational-memory/plugin.d.ts +16 -0
  39. package/dist/agent/observational-memory/plugin.d.ts.map +1 -0
  40. package/dist/agent/observational-memory/plugin.js +26 -0
  41. package/dist/agent/observational-memory/plugin.js.map +1 -0
  42. package/dist/agent/observational-memory/prompts.d.ts +27 -0
  43. package/dist/agent/observational-memory/prompts.d.ts.map +1 -0
  44. package/dist/agent/observational-memory/prompts.js +42 -0
  45. package/dist/agent/observational-memory/prompts.js.map +1 -0
  46. package/dist/agent/observational-memory/read.d.ts +47 -0
  47. package/dist/agent/observational-memory/read.d.ts.map +1 -0
  48. package/dist/agent/observational-memory/read.js +99 -0
  49. package/dist/agent/observational-memory/read.js.map +1 -0
  50. package/dist/agent/observational-memory/reflector.d.ts +31 -0
  51. package/dist/agent/observational-memory/reflector.d.ts.map +1 -0
  52. package/dist/agent/observational-memory/reflector.js +76 -0
  53. package/dist/agent/observational-memory/reflector.js.map +1 -0
  54. package/dist/agent/observational-memory/schema.d.ts +267 -0
  55. package/dist/agent/observational-memory/schema.d.ts.map +1 -0
  56. package/dist/agent/observational-memory/schema.js +48 -0
  57. package/dist/agent/observational-memory/schema.js.map +1 -0
  58. package/dist/agent/observational-memory/store.d.ts +52 -0
  59. package/dist/agent/observational-memory/store.d.ts.map +1 -0
  60. package/dist/agent/observational-memory/store.js +197 -0
  61. package/dist/agent/observational-memory/store.js.map +1 -0
  62. package/dist/agent/observational-memory/types.d.ts +61 -0
  63. package/dist/agent/observational-memory/types.d.ts.map +1 -0
  64. package/dist/agent/observational-memory/types.js +9 -0
  65. package/dist/agent/observational-memory/types.js.map +1 -0
  66. package/dist/agent/production-agent.d.ts +15 -0
  67. package/dist/agent/production-agent.d.ts.map +1 -1
  68. package/dist/agent/production-agent.js +240 -1
  69. package/dist/agent/production-agent.js.map +1 -1
  70. package/dist/agent/run-loop-with-resume.d.ts.map +1 -1
  71. package/dist/agent/run-loop-with-resume.js +49 -0
  72. package/dist/agent/run-loop-with-resume.js.map +1 -1
  73. package/dist/agent/run-store.d.ts +17 -0
  74. package/dist/agent/run-store.d.ts.map +1 -1
  75. package/dist/agent/run-store.js +55 -0
  76. package/dist/agent/run-store.js.map +1 -1
  77. package/dist/agent/runtime-context.d.ts +30 -0
  78. package/dist/agent/runtime-context.d.ts.map +1 -1
  79. package/dist/agent/runtime-context.js +54 -1
  80. package/dist/agent/runtime-context.js.map +1 -1
  81. package/dist/agent/tool-call-journal.d.ts +101 -0
  82. package/dist/agent/tool-call-journal.d.ts.map +1 -0
  83. package/dist/agent/tool-call-journal.js +214 -0
  84. package/dist/agent/tool-call-journal.js.map +1 -0
  85. package/dist/agent/types.d.ts +24 -0
  86. package/dist/agent/types.d.ts.map +1 -1
  87. package/dist/agent/types.js.map +1 -1
  88. package/dist/cli/add.d.ts +109 -0
  89. package/dist/cli/add.d.ts.map +1 -0
  90. package/dist/cli/add.js +352 -0
  91. package/dist/cli/add.js.map +1 -0
  92. package/dist/cli/connect.d.ts +5 -4
  93. package/dist/cli/connect.d.ts.map +1 -1
  94. package/dist/cli/connect.js +157 -48
  95. package/dist/cli/connect.js.map +1 -1
  96. package/dist/cli/eval.d.ts +17 -0
  97. package/dist/cli/eval.d.ts.map +1 -0
  98. package/dist/cli/eval.js +121 -0
  99. package/dist/cli/eval.js.map +1 -0
  100. package/dist/cli/index.js +44 -3
  101. package/dist/cli/index.js.map +1 -1
  102. package/dist/cli/mcp-config-writers.d.ts +20 -13
  103. package/dist/cli/mcp-config-writers.d.ts.map +1 -1
  104. package/dist/cli/mcp-config-writers.js +152 -13
  105. package/dist/cli/mcp-config-writers.js.map +1 -1
  106. package/dist/cli/mcp.d.ts +2 -2
  107. package/dist/cli/mcp.d.ts.map +1 -1
  108. package/dist/cli/mcp.js +50 -196
  109. package/dist/cli/mcp.js.map +1 -1
  110. package/dist/cli/plan-local.d.ts +69 -6
  111. package/dist/cli/plan-local.d.ts.map +1 -1
  112. package/dist/cli/plan-local.js +517 -23
  113. package/dist/cli/plan-local.js.map +1 -1
  114. package/dist/cli/recap.d.ts.map +1 -1
  115. package/dist/cli/recap.js +1 -1
  116. package/dist/cli/recap.js.map +1 -1
  117. package/dist/cli/skills.d.ts +13 -6
  118. package/dist/cli/skills.d.ts.map +1 -1
  119. package/dist/cli/skills.js +287 -111
  120. package/dist/cli/skills.js.map +1 -1
  121. package/dist/client/AssistantChat.d.ts.map +1 -1
  122. package/dist/client/AssistantChat.js +118 -92
  123. package/dist/client/AssistantChat.js.map +1 -1
  124. package/dist/client/agent-chat-adapter.d.ts.map +1 -1
  125. package/dist/client/agent-chat-adapter.js +16 -0
  126. package/dist/client/agent-chat-adapter.js.map +1 -1
  127. package/dist/client/agent-engine-key.d.ts +6 -4
  128. package/dist/client/agent-engine-key.d.ts.map +1 -1
  129. package/dist/client/agent-engine-key.js +9 -6
  130. package/dist/client/agent-engine-key.js.map +1 -1
  131. package/dist/client/chat/run-recovery.js +1 -1
  132. package/dist/client/chat/run-recovery.js.map +1 -1
  133. package/dist/client/chat/tool-call-display.d.ts +20 -1
  134. package/dist/client/chat/tool-call-display.d.ts.map +1 -1
  135. package/dist/client/chat/tool-call-display.js +32 -7
  136. package/dist/client/chat/tool-call-display.js.map +1 -1
  137. package/dist/client/settings/SettingsPanel.d.ts.map +1 -1
  138. package/dist/client/settings/SettingsPanel.js +7 -14
  139. package/dist/client/settings/SettingsPanel.js.map +1 -1
  140. package/dist/client/sse-event-processor.d.ts +13 -0
  141. package/dist/client/sse-event-processor.d.ts.map +1 -1
  142. package/dist/client/sse-event-processor.js +21 -0
  143. package/dist/client/sse-event-processor.js.map +1 -1
  144. package/dist/coding-tools/run-code.d.ts +7 -0
  145. package/dist/coding-tools/run-code.d.ts.map +1 -1
  146. package/dist/coding-tools/run-code.js +21 -106
  147. package/dist/coding-tools/run-code.js.map +1 -1
  148. package/dist/coding-tools/sandbox/adapter.d.ts +79 -0
  149. package/dist/coding-tools/sandbox/adapter.d.ts.map +1 -0
  150. package/dist/coding-tools/sandbox/adapter.js +24 -0
  151. package/dist/coding-tools/sandbox/adapter.js.map +1 -0
  152. package/dist/coding-tools/sandbox/index.d.ts +51 -0
  153. package/dist/coding-tools/sandbox/index.d.ts.map +1 -0
  154. package/dist/coding-tools/sandbox/index.js +79 -0
  155. package/dist/coding-tools/sandbox/index.js.map +1 -0
  156. package/dist/coding-tools/sandbox/local-child-process-adapter.d.ts +24 -0
  157. package/dist/coding-tools/sandbox/local-child-process-adapter.d.ts.map +1 -0
  158. package/dist/coding-tools/sandbox/local-child-process-adapter.js +141 -0
  159. package/dist/coding-tools/sandbox/local-child-process-adapter.js.map +1 -0
  160. package/dist/db/client.d.ts +4 -2
  161. package/dist/db/client.d.ts.map +1 -1
  162. package/dist/db/client.js +6 -4
  163. package/dist/db/client.js.map +1 -1
  164. package/dist/deploy/route-discovery.d.ts.map +1 -1
  165. package/dist/deploy/route-discovery.js +1 -0
  166. package/dist/deploy/route-discovery.js.map +1 -1
  167. package/dist/eval/agent-runner.d.ts +63 -0
  168. package/dist/eval/agent-runner.d.ts.map +1 -0
  169. package/dist/eval/agent-runner.js +142 -0
  170. package/dist/eval/agent-runner.js.map +1 -0
  171. package/dist/eval/define-eval.d.ts +29 -0
  172. package/dist/eval/define-eval.d.ts.map +1 -0
  173. package/dist/eval/define-eval.js +43 -0
  174. package/dist/eval/define-eval.js.map +1 -0
  175. package/dist/eval/index.d.ts +18 -0
  176. package/dist/eval/index.d.ts.map +1 -0
  177. package/dist/eval/index.js +17 -0
  178. package/dist/eval/index.js.map +1 -0
  179. package/dist/eval/report.d.ts +8 -0
  180. package/dist/eval/report.d.ts.map +1 -0
  181. package/dist/eval/report.js +44 -0
  182. package/dist/eval/report.js.map +1 -0
  183. package/dist/eval/runner.d.ts +67 -0
  184. package/dist/eval/runner.d.ts.map +1 -0
  185. package/dist/eval/runner.js +256 -0
  186. package/dist/eval/runner.js.map +1 -0
  187. package/dist/eval/scorer.d.ts +83 -0
  188. package/dist/eval/scorer.d.ts.map +1 -0
  189. package/dist/eval/scorer.js +195 -0
  190. package/dist/eval/scorer.js.map +1 -0
  191. package/dist/eval/types.d.ts +162 -0
  192. package/dist/eval/types.d.ts.map +1 -0
  193. package/dist/eval/types.js +20 -0
  194. package/dist/eval/types.js.map +1 -0
  195. package/dist/observability/traces.d.ts.map +1 -1
  196. package/dist/observability/traces.js +100 -1
  197. package/dist/observability/traces.js.map +1 -1
  198. package/dist/observability/tracing.d.ts +73 -0
  199. package/dist/observability/tracing.d.ts.map +1 -0
  200. package/dist/observability/tracing.js +126 -0
  201. package/dist/observability/tracing.js.map +1 -0
  202. package/dist/onboarding/default-steps.d.ts.map +1 -1
  203. package/dist/onboarding/default-steps.js +4 -1
  204. package/dist/onboarding/default-steps.js.map +1 -1
  205. package/dist/provider-api/actions/query-staged-dataset.d.ts +1 -1
  206. package/dist/scripts/agent-engines/list-agent-engines.d.ts.map +1 -1
  207. package/dist/scripts/agent-engines/list-agent-engines.js +10 -3
  208. package/dist/scripts/agent-engines/list-agent-engines.js.map +1 -1
  209. package/dist/server/action-discovery.d.ts.map +1 -1
  210. package/dist/server/action-discovery.js +4 -0
  211. package/dist/server/action-discovery.js.map +1 -1
  212. package/dist/server/agent-chat-plugin.d.ts +9 -0
  213. package/dist/server/agent-chat-plugin.d.ts.map +1 -1
  214. package/dist/server/agent-chat-plugin.js +118 -110
  215. package/dist/server/agent-chat-plugin.js.map +1 -1
  216. package/dist/server/agent-engine-api-key-route.d.ts +37 -0
  217. package/dist/server/agent-engine-api-key-route.d.ts.map +1 -0
  218. package/dist/server/agent-engine-api-key-route.js +105 -0
  219. package/dist/server/agent-engine-api-key-route.js.map +1 -0
  220. package/dist/server/agent-teams.d.ts +62 -0
  221. package/dist/server/agent-teams.d.ts.map +1 -1
  222. package/dist/server/agent-teams.js +99 -2
  223. package/dist/server/agent-teams.js.map +1 -1
  224. package/dist/server/core-routes-plugin.d.ts.map +1 -1
  225. package/dist/server/core-routes-plugin.js +17 -10
  226. package/dist/server/core-routes-plugin.js.map +1 -1
  227. package/dist/server/create-server.js +1 -1
  228. package/dist/server/create-server.js.map +1 -1
  229. package/dist/server/credential-provider.d.ts.map +1 -1
  230. package/dist/server/credential-provider.js +2 -0
  231. package/dist/server/credential-provider.js.map +1 -1
  232. package/dist/server/framework-request-handler.d.ts.map +1 -1
  233. package/dist/server/framework-request-handler.js +33 -1
  234. package/dist/server/framework-request-handler.js.map +1 -1
  235. package/dist/server/index.d.ts +1 -0
  236. package/dist/server/index.d.ts.map +1 -1
  237. package/dist/server/index.js +1 -0
  238. package/dist/server/index.js.map +1 -1
  239. package/dist/templates/workspace-core/.agents/skills/external-agents/SKILL.md +17 -4
  240. package/dist/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
  241. package/dist/templates/workspace-core/.agents/skills/observability/SKILL.md +20 -0
  242. package/docs/content/agent-teams.md +32 -0
  243. package/docs/content/blueprint-installer.md +73 -0
  244. package/docs/content/evals.md +141 -0
  245. package/docs/content/pr-visual-recap.md +7 -4
  246. package/docs/content/sandbox-adapters.md +134 -0
  247. package/docs/content/template-plan.md +20 -8
  248. package/package.json +5 -1
  249. package/src/templates/workspace-core/.agents/skills/external-agents/SKILL.md +17 -4
  250. package/src/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
  251. package/src/templates/workspace-core/.agents/skills/observability/SKILL.md +20 -0
@@ -0,0 +1,141 @@
1
+ ---
2
+ title: "Evals (CI Gate)"
3
+ description: "Write *.eval.ts test cases that run the real agent against fixed inputs, score the output with composable scorers, and gate CI/deploys on a threshold."
4
+ ---
5
+
6
+ # Evals (CI Gate)
7
+
8
+ Evals are a first-class testing primitive: you declare a prompt plus the behavior you expect, the runner **actually runs the agent loop** against that input, scores the output with composable scorers, and exits non-zero if any case scores below its threshold. That non-zero exit makes `agent-native eval` a drop-in CI deploy gate.
9
+
10
+ This is complementary to the post-hoc scoring in [Observability](/docs/observability):
11
+
12
+ - **Observability evals** (`observability/evals.ts`) — _"how did this real run do?"_ Passive, sampled, lives next to traces.
13
+ - **`*.eval.ts` (this primitive)** — _"does the agent do the right thing on this fixed input?"_ Active, deterministic, a CI gate run via the CLI.
14
+
15
+ The runner resolves a provider-agnostic engine/model from the existing registry — no model is hardcoded — so the same suite runs against whatever engine the app is configured for.
16
+
17
+ ## Writing an eval {#writing}
18
+
19
+ Drop a `*.eval.ts` file anywhere in the app (or an `evals/*.ts` file). Each file `export default defineEval(...)` (or exports an array of them):
20
+
21
+ ```ts
22
+ // evals/greeting.eval.ts
23
+ import { defineEval, contains, llmJudge } from "@agent-native/core/eval";
24
+
25
+ export default defineEval({
26
+ name: "greets the user by name",
27
+ input: { prompt: "Say hi to Ada." },
28
+ threshold: 0.7, // per-scorer pass bar; default 0.5
29
+ scorers: [
30
+ contains("Ada"),
31
+ llmJudge({ criteria: "friendliness", rubric: "1.0 = warm greeting" }),
32
+ ],
33
+ });
34
+ ```
35
+
36
+ An eval passes only when **every** scorer meets the threshold. Key `defineEval` fields:
37
+
38
+ | Field | Type | Notes |
39
+ | ----------- | --------------------- | ------------------------------------------------------------- |
40
+ | `name` | string | Required. Shown in the report. |
41
+ | `input` | `{ prompt, history }` | Required `prompt`; optional `history` of prior `{ role, text }` turns. |
42
+ | `scorers` | `Scorer[]` | Required, at least one. |
43
+ | `threshold` | number `0..1` | Per-scorer pass bar. Default `0.5`; overridable from the CLI. |
44
+ | `run` | function | Optional override for custom setup (seed data, multi-turn). |
45
+
46
+ The agent run handed to scorers is small and transport-agnostic:
47
+
48
+ ```ts
49
+ interface AgentRunOutput {
50
+ text: string; // concatenated assistant text
51
+ toolCalls: readonly string[]; // tool/action names, in call order
52
+ ok: boolean; // completed without a terminal error
53
+ error?: string;
54
+ runId: string;
55
+ durationMs: number;
56
+ }
57
+ ```
58
+
59
+ ## Built-in scorers {#built-in}
60
+
61
+ Imported from `@agent-native/core/eval`:
62
+
63
+ | Scorer | Score | Model? |
64
+ | ------------------------ | ----------------------------------------------------------------- | ------ |
65
+ | `exactMatch(expected)` | `1.0` if text equals `expected` (trimmed, case-insensitive) | No |
66
+ | `contains(needles)` | Fraction of required substrings present (so partial hits surface) | No |
67
+ | `usesTool(toolName)` | `1.0` if the agent invoked that tool/action at least once | No |
68
+ | `llmJudge({ criteria })` | LLM-as-judge score against a natural-language rubric, normalized to `0..1` | Yes |
69
+
70
+ `exactMatch` and `contains` take an optional `{ caseSensitive }`. `llmJudge` takes `{ criteria, rubric?, name?, scoreRange? }` — its output is normalized to `[0, 1]`, and the judge model is whatever the runner resolved (never a hardcoded provider).
71
+
72
+ ## Custom scorers: the 4-step pipeline {#custom}
73
+
74
+ `createScorer` builds a scorer from a Mastra-style 4-step pipeline. Only `generateScore` is required:
75
+
76
+ ```txt
77
+ preprocess(run) → x transform the run/output (optional)
78
+ analyze(x, ctx) → analysis plain JS OR an LLM judge (optional)
79
+ generateScore(a) → 0..1 REQUIRED, normalized
80
+ generateReason(...) → string human-readable why (optional)
81
+ ```
82
+
83
+ `preprocess` and `analyze` default to identity (the scorer sees the raw `AgentRunOutput`). The `analyze` step receives a `ctx` with a provider-agnostic `judge()` helper for LLM-backed scoring:
84
+
85
+ ```ts
86
+ import { createScorer, clamp01 } from "@agent-native/core/eval";
87
+
88
+ // A scorer that rewards short, tool-using answers.
89
+ const concise = createScorer({
90
+ name: "concise_with_tool",
91
+ analyze(run) {
92
+ return {
93
+ words: run.text.trim().split(/\s+/).length,
94
+ usedTool: run.toolCalls.length > 0,
95
+ };
96
+ },
97
+ generateScore({ words, usedTool }) {
98
+ if (!usedTool) return 0;
99
+ return clamp01(1 - Math.max(0, words - 40) / 200);
100
+ },
101
+ generateReason({ analysis }) {
102
+ return `${analysis.words} words, tool used: ${analysis.usedTool}`;
103
+ },
104
+ });
105
+ ```
106
+
107
+ ## Running the gate {#cli}
108
+
109
+ ```bash
110
+ agent-native eval # run every *.eval.ts; non-zero exit on failure
111
+ agent-native eval billing # only files whose path contains "billing"
112
+ agent-native eval --json # machine-readable report (for CI)
113
+ agent-native eval --threshold 0.8 # override every eval's pass threshold (0..1)
114
+ ```
115
+
116
+ The command discovers `**/*.eval.ts` and `evals/*.ts` under the current app, runs the agent for each input, scores it, prints a readable table (or JSON), and **exits non-zero if any eval scores below its threshold**.
117
+
118
+ Exit codes:
119
+
120
+ | Code | Meaning |
121
+ | ---- | --------------------------------------------------------------- |
122
+ | `0` | All evals passed — _or_ no eval files were found (CI-friendly). |
123
+ | `1` | At least one eval scored below threshold, or the suite errored. |
124
+ | `2` | Bad arguments (e.g. `--threshold` outside `[0, 1]`). |
125
+
126
+ ### As a CI deploy gate {#ci}
127
+
128
+ Add it to the pipeline that runs before a deploy:
129
+
130
+ ```yaml
131
+ # .github/workflows/deploy.yml (excerpt)
132
+ - run: npx agent-native eval --json
133
+ ```
134
+
135
+ A regression that drops any scorer below threshold fails the step and blocks the deploy. An app with no eval files exits `0`, so adopting evals is opt-in per app.
136
+
137
+ ## What's next
138
+
139
+ - [**Observability**](/docs/observability) — post-hoc scoring of real production runs (the complementary layer)
140
+ - [**Actions**](/docs/actions) — the tools/actions that show up in `toolCalls`
141
+ - [**Agent Teams**](/docs/agent-teams) — sub-agents an eval might exercise
@@ -216,12 +216,15 @@ the prompt instructs the agent to write `plans/pr-123-visual-recap/plan.mdx`
216
216
  plus optional visual files and then run:
217
217
 
218
218
  ```bash
219
- npx @agent-native/core@latest plan local preview --dir plans/pr-123-visual-recap --kind recap --open
219
+ npx @agent-native/core@latest plan local serve --dir plans/pr-123-visual-recap --kind recap --open
220
220
  ```
221
221
 
222
- The returned `file://` preview, or `/local-plans/pr-123-visual-recap` in a local
223
- Plan app using the same `PLAN_LOCAL_DIR`, is the review link. This mode disables
224
- the hosted sticky PR comment, inline screenshot upload, usage attachment, and
222
+ The returned URL opens the hosted Plan UI while the browser reads the recap MDX
223
+ from a localhost bridge. Recap content is not written to the hosted Plan
224
+ database, and the URL only works on the machine running the bridge. If you run
225
+ the Plan app locally with the same `PLAN_LOCAL_DIR`, the
226
+ `/local-plans/pr-123-visual-recap` route is also valid. This mode disables the
227
+ hosted sticky PR comment, inline screenshot upload, usage attachment, and
225
228
  browser comments until you explicitly publish.
226
229
 
227
230
  ## It's informational, not a gate
@@ -0,0 +1,134 @@
1
+ ---
2
+ title: "Sandbox Adapters"
3
+ description: "Swap the backend that runs the agent's run-code tool — local child process by default, a remote/durable runner when you need to exceed the hosted code-exec ceiling."
4
+ ---
5
+
6
+ # Sandbox Adapters
7
+
8
+ The `run-code` tool runs agent-supplied JavaScript in an isolated environment. **Sandbox adapters** factor the _execution_ concern out of that tool so the backend can be swapped — a local child process by default, or a Docker / remote / durable runner — without touching the agent loop, `run-code.ts`, the localhost bridge, the env scrub, or the output formatting.
9
+
10
+ ## Why a seam {#why}
11
+
12
+ The default backend spawns a locked-down local Node child process. That's bounded by the hosting process: on the hosted platform it shares the agent loop's soft execution ceiling (roughly 40s, after which a run either times out or thrashes through repeated continuations). A remote or durable adapter is the lever to exceed that ceiling — it runs large data jobs to completion independently of the request lifecycle.
13
+
14
+ Keeping the contract narrow means a remote adapter inherits the same security posture. The parent process keeps ownership of everything secret-bearing: it builds the sandbox module, runs the localhost bridge (which holds the request context and applies host allowlists + SSRF guards), scrubs the env, and formats output. An adapter only receives an already-prepared, **non-secret** module source plus resource limits — it is responsible solely for _running_ it and capturing stdout/stderr/exit status.
15
+
16
+ ## The interface {#interface}
17
+
18
+ The seam lives in core at `packages/core/src/coding-tools/sandbox/` — `adapter.ts` (the contract), `index.ts` (selection: `getSandboxAdapter()` / `registerSandboxAdapter()`), and `local-child-process-adapter.ts` (the default). It is wired in-package by `run-code.ts`; a host plugs in a different backend through the `index.ts` registration helper (or, for a Docker backend, by following the [blueprint](/docs/blueprint-installer) recipe, which edits these files directly).
19
+
20
+ Every backend implements `SandboxAdapter`:
21
+
22
+ ```ts
23
+ interface SandboxAdapter {
24
+ /** Stable id, surfaced for diagnostics and adapter selection. */
25
+ readonly id: string;
26
+ /** Execute one prepared sandbox module and capture its output. */
27
+ run(request: SandboxRunRequest): Promise<SandboxRunResult>;
28
+ }
29
+ ```
30
+
31
+ The request and result are intentionally small and opaque:
32
+
33
+ ```ts
34
+ interface SandboxRunRequest {
35
+ /**
36
+ * The complete ESM module source to execute. Already wraps the user's code
37
+ * and embeds the loopback bridge URL/token; the adapter does NOT parse or
38
+ * rewrite it.
39
+ */
40
+ moduleSource: string;
41
+ /**
42
+ * Scrubbed environment — only safe POSIX vars (PATH/HOME/TMPDIR/…), never app
43
+ * secrets. Adapters must not augment this with the parent's own environment.
44
+ */
45
+ env: Record<string, string>;
46
+ /** Hard wall-clock timeout in milliseconds. The adapter must enforce it. */
47
+ timeoutMs: number;
48
+ /**
49
+ * Loopback port of the parent's bridge server (reachable over 127.0.0.1). A
50
+ * remote adapter that can't reach the parent's loopback must tunnel or proxy
51
+ * this to support bridge-backed globals (`appAction`, `providerFetch`, …).
52
+ */
53
+ bridgePort: number;
54
+ }
55
+
56
+ interface SandboxRunResult {
57
+ stdout: string;
58
+ stderr: string;
59
+ /** `0` on clean exit, non-zero on failure, `null` when killed by a signal. */
60
+ exitCode: number | null;
61
+ /** True when the run was killed for exceeding `timeoutMs`. */
62
+ timedOut: boolean;
63
+ }
64
+ ```
65
+
66
+ ## The default: `LocalChildProcessAdapter` {#default}
67
+
68
+ Out of the box, `getSandboxAdapter()` returns `LocalChildProcessAdapter` (`id: "local-child-process"`). It preserves the historical `run-code` behavior byte-for-byte:
69
+
70
+ - The prepared module source is written to a fresh temp dir.
71
+ - The child runs with the scrubbed env (no secrets), with `TMPDIR`/`TEMP`/`TMP` pointed inside the sandbox dir.
72
+ - When the Node permission model is available (`--permission`, or `--experimental-permission` on Node 20), the child is denied filesystem access outside its temp dir, plus child processes, workers, and native addons. Outbound network is _not_ blocked by the permission model — but the env scrub means such requests carry no credentials, and all authenticated calls go through the parent's loopback bridge.
73
+ - A timeout sends `SIGTERM`, then `SIGKILL` after a 2s grace period.
74
+ - Temp files are cleaned up best-effort after the run.
75
+
76
+ > [!WARNING]
77
+ > The default adapter uses `node:child_process`, which does not exist on edge/worker runtimes (Cloudflare Workers, Netlify Edge Functions). Run `run-code` in a standard Node.js environment, or register a remote adapter.
78
+
79
+ ## Selecting an adapter {#selection}
80
+
81
+ Resolution order — an explicitly registered adapter wins; otherwise the env var selects a built-in; otherwise the local default is used:
82
+
83
+ ```txt
84
+ registerSandboxAdapter(adapter) → AGENT_NATIVE_SANDBOX → local default
85
+ ```
86
+
87
+ ### `AGENT_NATIVE_SANDBOX` env var {#env}
88
+
89
+ Selects a built-in adapter by id. Currently only `local` (the default) is wired; unknown values fall back to local rather than failing the run.
90
+
91
+ ```bash
92
+ AGENT_NATIVE_SANDBOX=local # the default — explicit
93
+ ```
94
+
95
+ ### `registerSandboxAdapter()` {#register}
96
+
97
+ A host process overrides the backend for all subsequent `run-code` invocations through the seam's `index.ts` — for example, to run every call in a remote container:
98
+
99
+ ```ts
100
+ import {
101
+ registerSandboxAdapter,
102
+ type SandboxAdapter,
103
+ } from "./coding-tools/sandbox/index.js";
104
+
105
+ class RemoteSandboxAdapter implements SandboxAdapter {
106
+ readonly id = "remote";
107
+ async run(request) {
108
+ // Ship request.moduleSource to the durable runner, enforce request.timeoutMs,
109
+ // proxy bridge calls back to request.bridgePort, and return stdout/stderr/exitCode.
110
+ }
111
+ }
112
+
113
+ registerSandboxAdapter(new RemoteSandboxAdapter());
114
+ // Pass `null` to clear the override and fall back to env-var / default resolution.
115
+ ```
116
+
117
+ ## The seam for a durable runner {#durable}
118
+
119
+ This interface is deliberately the seam for a future remote/durable sandbox. A remote or durable adapter (Docker, a Vercel-Sandbox-style runner, or a queued background worker) would:
120
+
121
+ 1. Implement `SandboxAdapter.run` against an out-of-process runtime.
122
+ 2. Tunnel the loopback bridge (or proxy bridge calls back to the parent).
123
+ 3. Let large data jobs run to completion independently of the request lifecycle — exceeding the hosted ~40s code-exec ceiling that bounds the local child-process adapter.
124
+
125
+ Register it under a new `AGENT_NATIVE_SANDBOX` value (e.g. `remote`) and/or via `registerSandboxAdapter()`. The agent loop and `run-code.ts` never change.
126
+
127
+ > [!TIP]
128
+ > The `agent-native add sandbox docker` blueprint emits a full, self-contained recipe for implementing a Docker adapter against this seam. See [Blueprint Installer](/docs/blueprint-installer).
129
+
130
+ ## What's next
131
+
132
+ - [**Blueprint Installer**](/docs/blueprint-installer) — `agent-native add sandbox docker` prints a Docker-adapter recipe
133
+ - [**Agent Teams**](/docs/agent-teams) — delegating heavy work to sub-agents
134
+ - [**Security**](/docs/security) — the env scrub and bridge allowlist posture
@@ -180,22 +180,34 @@ or set the convention for your agent environment:
180
180
  export AGENT_NATIVE_PLANS_MODE=local-files
181
181
  ```
182
182
 
183
- In this mode the agent writes a local MDX folder under `plans/<slug>/` and must
184
- not call the hosted Plan MCP tools. The durable files are:
183
+ In this mode the agent writes a local MDX folder and must not call the hosted
184
+ Plan MCP tools. Use a repo folder such as `plans/<slug>/` when you want the plan
185
+ checked in with the code. Use a temp or ignored folder, such as
186
+ `/tmp/agent-native-plans/<slug>/` or `.agent-native/plans/<slug>/`, when the
187
+ plan should stay out of git. The folder contains:
185
188
 
186
189
  - `plan.mdx`
187
190
  - optional `canvas.mdx`
188
191
  - optional `prototype.mdx`
189
192
  - optional `.plan-state.json`
190
193
 
191
- After writing the folder, the agent validates and previews it locally:
194
+ After writing the folder, the agent starts a tiny localhost bridge and opens the
195
+ hosted Plan UI against that local-only source:
192
196
 
193
197
  ```bash
194
- npx @agent-native/core@latest plan local preview --dir plans/<slug> --kind plan --open
198
+ npx @agent-native/core@latest plan local serve --dir plans/<slug> --kind plan --open
195
199
  ```
196
200
 
197
- If you run the Plan app locally with the same `PLAN_LOCAL_DIR`, you can open the
198
- read-only app route:
201
+ The bridge URL looks like
202
+ `https://plan.agent-native.com/local-plans/<slug>?bridge=http://127.0.0.1:...`.
203
+ The page is the normal Plan viewer, but the browser fetches `plan.mdx`,
204
+ `canvas.mdx`, `prototype.mdx`, `.plan-state.json`, and local image assets from
205
+ the localhost bridge. Plan content is not written to the hosted database and is
206
+ not sent through hosted Plan actions. Keep the bridge process running while you
207
+ review; the URL is local to your machine and is not a shareable team link.
208
+
209
+ If you run the Plan app locally with the same `PLAN_LOCAL_DIR`, you can also
210
+ open the read-only app route:
199
211
 
200
212
  ```text
201
213
  http://localhost:<port>/local-plans/<slug>
@@ -206,8 +218,8 @@ Plan database. It also disables hosted sharing, browser comments, plan history,
206
218
  and publish/export receipts until you explicitly opt into publishing. To move a
207
219
  local plan into the hosted database, call `publish-visual-plan` with the local
208
220
  MDX folder path; this uploads the plan, assigns it a hosted ID, enables sharing
209
- and commenting, and returns the hosted URL. It does
210
- not automatically make your coding agent's LLM local; choose a local or approved
221
+ and commenting, and returns the hosted URL. Local-files mode does not
222
+ automatically make your coding agent's LLM local; choose a local or approved
211
223
  model if that privacy boundary matters too.
212
224
 
213
225
  ## Desktop local file sync {#desktop-local-sync}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agent-native/core",
3
- "version": "0.51.15",
3
+ "version": "0.53.0",
4
4
  "type": "module",
5
5
  "engines": {
6
6
  "node": ">=22"
@@ -74,6 +74,7 @@
74
74
  "./agent/context-xray/actions/context-evict": "./dist/agent/context-xray/actions/context-evict.js",
75
75
  "./agent/context-xray/actions/context-restore": "./dist/agent/context-xray/actions/context-restore.js",
76
76
  "./agent/context-xray/actions/context-report": "./dist/agent/context-xray/actions/context-report.js",
77
+ "./agent/observational-memory": "./dist/agent/observational-memory/index.js",
77
78
  "./resources": "./dist/resources/index.js",
78
79
  "./resources/store": "./dist/resources/store.js",
79
80
  "./resources/metadata": "./dist/resources/metadata.js",
@@ -135,6 +136,7 @@
135
136
  "./styles/agent-native.css": "./dist/styles/agent-native.css",
136
137
  "./agent/engine": "./dist/agent/engine/index.js",
137
138
  "./agent/harness": "./dist/agent/harness/index.js",
139
+ "./eval": "./dist/eval/index.js",
138
140
  "./tsconfig.base.json": "./tsconfig.base.json"
139
141
  },
140
142
  "sideEffects": [
@@ -142,6 +144,7 @@
142
144
  ],
143
145
  "files": [
144
146
  "bin",
147
+ "blueprints",
145
148
  "dist",
146
149
  "docs",
147
150
  "tsconfig.base.json",
@@ -264,6 +267,7 @@
264
267
  "ws": ">=8"
265
268
  },
266
269
  "optionalDependencies": {
270
+ "@opentelemetry/api": "^1.9.1",
267
271
  "playwright": "^1.60.0"
268
272
  },
269
273
  "peerDependenciesMeta": {
@@ -329,16 +329,19 @@ The hosted `connect` flow above is the recommended path. For local dev, run
329
329
  the app (`pnpm dev` / `pnpm exec agent-native dev`) then point a local agent at it:
330
330
 
331
331
  ```bash
332
- pnpm exec agent-native mcp install --client claude-code|claude-code-cli|codex|cowork \
332
+ pnpm exec agent-native mcp install --client claude-code|claude-code-cli|codex|cowork|cursor|opencode|github-copilot \
333
333
  [--app <id>] [--scope user|project]
334
334
  ```
335
335
 
336
336
  It provisions a token (random `ACCESS_TOKEN` into the workspace `.env` for
337
337
  local dev, or a `signA2AToken` JWT for a detected hosted origin) and writes an
338
338
  idempotent stdio server entry — `.mcp.json` / `~/.claude.json` for Claude Code,
339
- the `[mcp_servers.*]` block in `~/.codex/config.toml` for Codex, the
340
- Claude-Code JSON shape for Cowork. The entry runs `pnpm exec agent-native mcp serve
341
- --app <id>`, by default a **thin stdio proxy** to the running local app's
339
+ the `[mcp_servers.*]` block in `~/.codex/config.toml` for Codex,
340
+ `.cursor/mcp.json` / `~/.cursor/mcp.json` for Cursor, `opencode.json` /
341
+ `~/.config/opencode/opencode.json` for OpenCode, `.vscode/mcp.json` / VS Code
342
+ user `mcp.json` for GitHub Copilot / VS Code, and the Claude-Code JSON shape
343
+ for Cowork. The entry runs `pnpm exec agent-native mcp serve --app <id>`, by
344
+ default a **thin stdio proxy** to the running local app's
342
345
  `/_agent-native/mcp` (live registry + HMR + correct deep links stay the single
343
346
  source of truth; `--standalone` builds the registry in-process). Companion
344
347
  subcommands: `mcp uninstall`, `mcp status`, `mcp token [--rotate]`. You can
@@ -413,3 +416,13 @@ before telling the user they are unauthenticated.
413
416
  - **a2a-protocol** — the `ask-agent` meta-tool and JSON-RPC peer calls
414
417
  - **adding-a-feature** — the four-area checklist (add a `link` builder when a
415
418
  feature produces a navigable resource)
419
+
420
+ ## Blueprint installer
421
+
422
+ To add a whole new integration the agent-native way, `agent-native add <kind>
423
+ <name|url>` prints a curated Markdown blueprint to stdout — pipe it into the
424
+ external coding agent you connected (`agent-native add provider stripe |
425
+ claude`) and that agent applies the changes against the live repo. A URL emits a
426
+ generic research-and-integrate blueprint instead. Seeded kinds:
427
+ `provider` / `channel` / `sandbox` / `action`. Add your own by dropping a
428
+ `.md` file in `packages/core/blueprints/<kind>/`. See the Blueprint Installer doc.
@@ -80,6 +80,26 @@ existing run routes as `goalId=agent-harness`.
80
80
  Preserve `defineAction` auth, request context, timeouts, truncation, and
81
81
  read-only metadata.
82
82
 
83
+ ## Code Execution Sandbox
84
+
85
+ - The `run-code` tool executes through a pluggable `SandboxAdapter`
86
+ (`packages/core/src/coding-tools/sandbox/`). The default
87
+ `LocalChildProcessAdapter` spawns a locked-down local Node child process;
88
+ swap it via `AGENT_NATIVE_SANDBOX` or `registerSandboxAdapter()` for a
89
+ Docker/remote/durable backend (the lever to exceed the hosted ~40s code-exec
90
+ ceiling). An adapter only runs the already-prepared, non-secret module source
91
+ — it never sees app secrets. See the Sandbox Adapters doc; `agent-native add
92
+ sandbox docker` emits a full Docker-adapter recipe.
93
+
94
+ ## Sub-Agent Delegation Depth
95
+
96
+ - Sub-agent spawning is capped server-side (default depth `2`) so delegation
97
+ chains can't fan out indefinitely. Override at deploy time with
98
+ `AGENT_NATIVE_MAX_SUBAGENT_DEPTH` (`0` disables sub-agents; values above `16` are clamped to `16`).
99
+ Enforcement is ambient via `evaluateSubagentDepth` in
100
+ `packages/core/src/server/agent-teams.ts` — independent of any tool-level
101
+ guard. See the Agent Teams doc for the depth model.
102
+
83
103
  ## Don't
84
104
 
85
105
  - Don't add Claude Code, Codex, Cursor, Mastra, or Pi as an `AgentEngine`.
@@ -75,6 +75,26 @@ const criteria: EvalCriteria = {
75
75
  };
76
76
  ```
77
77
 
78
+ #### Evals (CI gate)
79
+
80
+ The three layers above score *real production runs* after the fact. For an active, deterministic gate, use the first-class `*.eval.ts` primitive from `@agent-native/core/eval` (source: `packages/core/src/eval/*`). It runs the actual agent loop against fixed inputs and exits non-zero below threshold, so it gates CI/deploys.
81
+
82
+ ```ts
83
+ // evals/faq.eval.ts
84
+ import { defineEval, contains, llmJudge } from "@agent-native/core/eval";
85
+
86
+ export default defineEval({
87
+ name: "answers the FAQ",
88
+ input: { prompt: "What is your return policy?" },
89
+ threshold: 0.7,
90
+ scorers: [contains("30 days"), llmJudge({ criteria: "accuracy" })],
91
+ });
92
+ ```
93
+
94
+ - Built-in scorers: `exactMatch` / `contains` / `usesTool` (pure JS) and `llmJudge` (provider-agnostic judge).
95
+ - Custom scorers: `createScorer` with the 4-step `preprocess → analyze → generateScore → generateReason` pipeline (only `generateScore` is required).
96
+ - Run as a gate: `agent-native eval [pattern] [--json] [--threshold N]` — discovers `**/*.eval.ts` and `evals/*.ts`, runs the agent, and exits non-zero if any eval is below its threshold. An app with no eval files exits `0`. Complements (does not replace) the post-hoc scoring in `evals.ts`. See the Evals doc.
97
+
78
98
  ### 4. Experiments
79
99
 
80
100
  A/B testing with sticky user-level assignment: