@kontourai/flow-agents 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/.github/dependabot.yml +23 -0
  2. package/.github/workflows/publish-npm.yml +1 -1
  3. package/.github/workflows/release-please.yml +31 -0
  4. package/.github/workflows/runtime-compat.yml +118 -0
  5. package/CHANGELOG.md +38 -0
  6. package/CONTRIBUTING.md +4 -0
  7. package/README.md +58 -19
  8. package/build/src/cli/init.js +215 -5
  9. package/build/src/cli/utterance-check.js +236 -0
  10. package/build/src/cli.js +3 -0
  11. package/build/src/tools/build-universal-bundles.js +268 -0
  12. package/build/src/tools/filter-installed-packs.js +3 -0
  13. package/build/src/tools/validate-source-tree.js +6 -1
  14. package/context/scripts/telemetry/lib/config.sh +5 -1
  15. package/context/settings/flow-agents-settings.json +7 -0
  16. package/docs/agent-system-guidebook.md +4 -5
  17. package/docs/context-map.md +1 -0
  18. package/docs/index.md +46 -6
  19. package/docs/integrations/conformance.md +246 -0
  20. package/docs/integrations/framework-adapter.md +275 -0
  21. package/docs/integrations/harness-install.md +213 -0
  22. package/docs/integrations/index.md +54 -0
  23. package/docs/north-star.md +3 -3
  24. package/docs/repository-structure.md +1 -1
  25. package/docs/skills-map.md +10 -4
  26. package/docs/spec/runtime-hook-surface.md +472 -0
  27. package/docs/survey-utterance-check.md +308 -0
  28. package/docs/vision.md +45 -0
  29. package/docs/workflow-usage-guide.md +1 -1
  30. package/evals/acceptance/run.sh +4 -2
  31. package/evals/acceptance/test_opencode_harness.sh +121 -0
  32. package/evals/acceptance/test_pi_harness.sh +98 -0
  33. package/evals/integration/test_bundle_install.sh +226 -1
  34. package/evals/integration/test_bundle_lifecycle.sh +641 -0
  35. package/evals/integration/test_utterance_check.sh +518 -0
  36. package/evals/run.sh +2 -0
  37. package/evals/static/test_universal_bundles.sh +137 -2
  38. package/integrations/strands/README.md +256 -0
  39. package/integrations/strands/example.py +74 -0
  40. package/integrations/strands/flow_agents_strands/__init__.py +27 -0
  41. package/integrations/strands/flow_agents_strands/hooks.py +194 -0
  42. package/integrations/strands/flow_agents_strands/policy.py +348 -0
  43. package/integrations/strands/flow_agents_strands/steering.py +172 -0
  44. package/integrations/strands/flow_agents_strands/telemetry.py +238 -0
  45. package/integrations/strands/pyproject.toml +38 -0
  46. package/integrations/strands/tests/__init__.py +0 -0
  47. package/integrations/strands/tests/test_hooks.py +304 -0
  48. package/integrations/strands/tests/test_policy.py +315 -0
  49. package/integrations/strands/tests/test_telemetry.py +184 -0
  50. package/integrations/strands-ts/README.md +224 -0
  51. package/integrations/strands-ts/bin/conformance-shim.mjs +257 -0
  52. package/integrations/strands-ts/package.json +53 -0
  53. package/integrations/strands-ts/src/hooks.ts +208 -0
  54. package/integrations/strands-ts/src/index.ts +22 -0
  55. package/integrations/strands-ts/src/policy.ts +345 -0
  56. package/integrations/strands-ts/src/telemetry.ts +251 -0
  57. package/integrations/strands-ts/test/test-policy.ts +322 -0
  58. package/integrations/strands-ts/test/test-telemetry.ts +226 -0
  59. package/integrations/strands-ts/tsconfig.json +20 -0
  60. package/package.json +7 -2
  61. package/packaging/conformance/README.md +142 -0
  62. package/packaging/conformance/fixtures/config-protection--allow-no-path.json +18 -0
  63. package/packaging/conformance/fixtures/config-protection--allow-safe-file.json +20 -0
  64. package/packaging/conformance/fixtures/config-protection--block-biome.json +20 -0
  65. package/packaging/conformance/fixtures/config-protection--block-eslintrc.json +20 -0
  66. package/packaging/conformance/fixtures/quality-gate--allow-no-path.json +17 -0
  67. package/packaging/conformance/fixtures/quality-gate--allow-nonexistent-file.json +19 -0
  68. package/packaging/conformance/fixtures/stop-goal-fit--allow-clean-cwd.json +17 -0
  69. package/packaging/conformance/fixtures/stop-goal-fit--block-strict-mode.json +23 -0
  70. package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +21 -0
  71. package/packaging/conformance/fixtures/workflow-steering--allow-no-state.json +16 -0
  72. package/packaging/conformance/fixtures/workflow-steering--inject-active-state.json +29 -0
  73. package/packaging/conformance/fixtures/workflow-steering--inject-subagent-steering.json +25 -0
  74. package/packaging/conformance/package.json +4 -0
  75. package/packaging/conformance/run-conformance.js +322 -0
  76. package/packaging/manifest.json +59 -0
  77. package/schemas/flow-agents-settings.schema.json +48 -0
  78. package/scripts/README.md +5 -0
  79. package/scripts/dogfood.js +16 -0
  80. package/scripts/hooks/opencode-hook-adapter.js +123 -0
  81. package/scripts/hooks/opencode-telemetry-hook.js +101 -0
  82. package/scripts/hooks/pi-hook-adapter.js +123 -0
  83. package/scripts/hooks/pi-telemetry-hook.js +105 -0
  84. package/scripts/hooks/run-hook.js +8 -0
  85. package/scripts/hooks/utterance-check.js +327 -0
  86. package/scripts/telemetry/lib/config.sh +5 -1
  87. package/skills/idea-to-backlog/SKILL.md +1 -1
  88. package/src/cli/init.ts +219 -6
  89. package/src/cli/utterance-check.ts +324 -0
  90. package/src/cli.ts +3 -0
  91. package/src/tools/build-universal-bundles.ts +266 -0
  92. package/src/tools/filter-installed-packs.ts +3 -0
  93. package/src/tools/validate-source-tree.ts +6 -1
  94. package/build/src/cli/docs-preview.js +0 -39
  95. package/build/src/cli/export-bookmarks.js +0 -38
  96. package/build/src/cli/import-bookmarks.js +0 -50
  97. package/build/src/cli/instinct-cli.js +0 -93
@@ -0,0 +1,308 @@
1
+ ---
2
+ title: Survey Utterance Check Integration
3
+ ---
4
+
5
+ # Survey Utterance Check Integration
6
+
7
+ When an agent says something factual — "test coverage is 92%", "the API is backward-compatible", "no breaking changes in this release" — that claim either has evidence behind it or it doesn't. The utterance check feature bridges Flow Agents hooks to `@kontourai/survey` so that every factual statement in an agent response is compared against a trust bundle and tagged with a badge. Statements with no backing evidence are flagged inline so the agent can acknowledge the gap rather than assert silently.
8
+
9
+ This document explains how to enable and configure the feature, what the workflow looks like end to end, and what to watch out for.
10
+
11
+ ---
12
+
13
+ ## What actually happens
14
+
15
+ Here is a concrete walkthrough from agent response to badge guidance:
16
+
17
+ ```
18
+ Agent says: "The test coverage for auth-service is 92%.
19
+ All critical paths have been verified."
20
+
21
+ Flow Agents hook (PostToolUse):
22
+ 1. Captures the agent response text from the PostToolUse event.
23
+ 2. Invokes the utterance-check CLI with the response text and your trust bundle.
24
+
25
+ @kontourai/survey (inside the CLI):
26
+ 3. Extractor splits the response into factual statements:
27
+ - "test coverage for auth-service is 92%"
28
+ - "All critical paths have been verified"
29
+ 4. Each statement is resolved against the trust bundle.
30
+ 5. Neither statement has a matching verified claim → both resolve as "unsupported".
31
+
32
+ Flow Agents hook injects guidance into the agent context:
33
+ UTTERANCE CHECK: 2 statement(s) in this response lack evidence coverage.
34
+ Summary: unsupported:2
35
+ - [unsupported] "test coverage for auth-service is 92%"
36
+ - [unsupported] "All critical paths have been verified"
37
+ Evidence note: unsupported = no matching claim in the trust bundle; ...
38
+ ```
39
+
40
+ The agent sees honest gap disclosure rather than silent pass-through. It can then cite sources, note the gap explicitly, or record a coverage claim via `@kontourai/survey`.
41
+
42
+ ---
43
+
44
+ ## Deciding between report and strict mode
45
+
46
+ The hook has two modes:
47
+
48
+ | Mode | Effect |
49
+ |------|--------|
50
+ | `report` (default) | Appends badge guidance to the agent context. Never blocks. Agent decides next step. |
51
+ | `strict` | If any statement is `unsupported`, `disputed`, or `rejected`, the hook exits 2, which routes the Stop event back to the agent for revision. |
52
+
53
+ Use **report** when you want visibility without gate behavior — good for exploratory sessions, onboarding, or repos where the trust bundle is still being built out. Use **strict** when you want the agent to revise or cite sources before completing a turn — appropriate for regulated workflows, production deployments, or repos with a well-populated bundle.
54
+
55
+ The empty-bundle caveat: if you enable the hook without a `bundlePath`, every factual statement the extractor finds will resolve as `unsupported` because there are no claims to match against. In strict mode this means every response with factual statements will be blocked. Make sure you either provide a `bundlePath` or use report mode until you have a bundle.
56
+
57
+ ---
58
+
59
+ ## The trust bundle
60
+
61
+ The trust bundle is a JSON file with a `claims` array. It is the authoritative record of what is considered evidenced for your codebase. Three practical sources:
62
+
63
+ - **Veritas-generated bundle**: if your repo uses `@veritas/veritas`, it can produce a `trust.bundle.json` from `.veritas/evidence`. Point `bundlePath` at that output.
64
+ - **Surface report**: the `@kontourai/surface` package can generate a trust bundle from a surface verification run. If your repo runs surface checks, look for the generated bundle in the surface output directory (e.g. `dist/trust-bundle.json` or a named artifact).
65
+ - **Hand-authored bundle**: a minimal bundle is just `{ "claims": [] }`. Add claims incrementally as you record evidence.
66
+
67
+ An empty or missing bundle means everything is unsupported. That is not necessarily wrong — it is an honest starting state — but it is only useful in report mode.
68
+
69
+ ---
70
+
71
+ ## Choosing an extractor
72
+
73
+ The extractor is responsible for splitting the agent utterance into discrete factual statements. Two are available:
74
+
75
+ | Extractor | How it works | Requirements |
76
+ |-----------|-------------|--------------|
77
+ | `reference` (default) | Pattern-based heuristics. Fast, no API call, no key needed. Works offline. Lower recall on complex prose. | `@kontourai/survey` installed |
78
+ | `anthropic` | Model-backed extraction via `@kontourai/survey/anthropic`. Higher recall, understands context and nuance, can split compound claims. | `@kontourai/survey` + `@anthropic-ai/sdk` installed, `ANTHROPIC_API_KEY` set |
79
+
80
+ For most exploratory use, `reference` is sufficient. Switch to `anthropic` when you find the reference extractor is missing statements that matter for your domain.
81
+
82
+ The `anthropic` extractor fails open: if `ANTHROPIC_API_KEY` is missing or `@anthropic-ai/sdk` is not installed, the CLI emits `status: "not_configured"` (with a clear explanation in `summary`) and exits 0. The hook treats this as a silent pass-through. You will see a message in stderr explaining what is missing, but the hook will not block.
83
+
84
+ ---
85
+
86
+ ## Per-repo configuration
87
+
88
+ The canonical way to enable utterance checking is a `context/settings/flow-agents-settings.json` file in the consumer repo. This is a peer to `context/settings/backlog-provider-settings.json` — the same directory, the same convention.
89
+
90
+ **Minimal example (report mode, reference extractor):**
91
+
92
+ ```json
93
+ {
94
+ "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
95
+ "schema_version": "1.0",
96
+ "utteranceCheck": {
97
+ "enabled": true,
98
+ "mode": "report",
99
+ "extractor": "reference"
100
+ }
101
+ }
102
+ ```
103
+
104
+ **With a trust bundle and anthropic extractor:**
105
+
106
+ ```json
107
+ {
108
+ "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
109
+ "schema_version": "1.0",
110
+ "utteranceCheck": {
111
+ "enabled": true,
112
+ "mode": "report",
113
+ "extractor": "anthropic",
114
+ "bundlePath": ".veritas/trust.bundle.json",
115
+ "model": "claude-haiku-4-5",
116
+ "agentId": "surface-agent"
117
+ }
118
+ }
119
+ ```
120
+
121
+ **Strict mode:**
122
+
123
+ ```json
124
+ {
125
+ "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
126
+ "schema_version": "1.0",
127
+ "utteranceCheck": {
128
+ "enabled": true,
129
+ "mode": "strict",
130
+ "extractor": "anthropic",
131
+ "bundlePath": "dist/trust-bundle.json"
132
+ }
133
+ }
134
+ ```
135
+
136
+ Config field reference:
137
+
138
+ | Field | Type | Default | Description |
139
+ |-------|------|---------|-------------|
140
+ | `enabled` | boolean | `false` | Whether utterance checking is active for this repo. |
141
+ | `mode` | `"report"` \| `"strict"` | `"report"` | How to handle concerning badges. See above. |
142
+ | `extractor` | `"reference"` \| `"anthropic"` | `"reference"` | Extractor to use. See above. |
143
+ | `bundlePath` | string | — | Repo-relative or absolute path to the trust bundle JSON. Omit to use an empty bundle. |
144
+ | `model` | string | — | Model for the anthropic extractor. Only used when `extractor` is `"anthropic"`. |
145
+ | `agentId` | string | `"flow-agents-hook"` | Agent identifier for provenance in the trust report. |
146
+
147
+ ---
148
+
149
+ ## Environment variable overrides
150
+
151
+ For one-off sessions or CI pipelines, you can override the config with environment variables. These take precedence over `flow-agents-settings.json`.
152
+
153
+ | Variable | Effect |
154
+ |----------|--------|
155
+ | `FLOW_AGENTS_UTTERANCE_CHECK_ENABLED=true\|false` | Force the hook on or off, overriding the config `enabled` field. |
156
+ | `FLOW_AGENTS_UTTERANCE_CHECK_STRICT=true` | Force strict mode. |
157
+ | `FLOW_AGENTS_UTTERANCE_CHECK_BUNDLE_PATH=/path/to/bundle.json` | Override `bundlePath`. |
158
+ | `FLOW_AGENTS_UTTERANCE_CHECK_AGENT_ID=my-agent` | Override `agentId`. |
159
+ | `FLOW_AGENTS_UTTERANCE_CHECK_EXTRACTOR=anthropic\|reference` | Override `extractor`. |
160
+
161
+ **When the config file is absent and no env vars are set**, the hook is disabled. This is the safe default — existing repos are not affected until they opt in.
162
+
163
+ ---
164
+
165
+ ## Registering the hook
166
+
167
+ Add the utterance check to a Claude Code session via `.claude/settings.json`:
168
+
169
+ ```json
170
+ {
171
+ "hooks": {
172
+ "PostToolUse": [
173
+ {
174
+ "matcher": ".*",
175
+ "hooks": [
176
+ {
177
+ "type": "command",
178
+ "command": "node scripts/hooks/claude-hook-adapter.js PostToolUse post:utterance-check utterance-check.js standard,strict"
179
+ }
180
+ ]
181
+ }
182
+ ]
183
+ }
184
+ }
185
+ ```
186
+
187
+ Or run the hook directly (Kiro/Codex convention, exit 2 blocks):
188
+
189
+ ```bash
190
+ node scripts/hooks/run-hook.js post:utterance-check utterance-check.js standard,strict
191
+ ```
192
+
193
+ The hook reads `context/settings/flow-agents-settings.json` relative to the repo root it detects from the hook event `cwd` or `process.cwd()`. No configuration needed in the hook command itself.
194
+
195
+ ---
196
+
197
+ ## CLI reference
198
+
199
+ The utterance check CLI is available as:
200
+
201
+ ```bash
202
+ node build/src/cli.js utterance-check check \
203
+ --utterance "The coverage is 92% and all tests pass." \
204
+ --bundle-path .veritas/trust.bundle.json \
205
+ --extractor anthropic \
206
+ --model claude-haiku-4-5 \
207
+ --agent-id my-session
208
+ ```
209
+
210
+ Options:
211
+
212
+ ```
213
+ --utterance TEXT Utterance text to check (required unless --not-configured).
214
+ --bundle-path FILE Trust bundle JSON file. Omit for an empty bundle (all unsupported).
215
+ --agent-id ID Agent identifier for provenance (default: flow-agents-utterance-check).
216
+ --extractor NAME 'reference' (default) or 'anthropic'.
217
+ --model MODEL Model for the anthropic extractor (e.g. claude-haiku-4-5).
218
+ --not-configured Skip survey call; output not_configured without error.
219
+ --strict Exit non-zero when any badge is disputed, rejected, or unsupported.
220
+ --help Show this help.
221
+ ```
222
+
223
+ The CLI outputs a JSON report to stdout:
224
+
225
+ ```json
226
+ {
227
+ "status": "ok",
228
+ "agent_id": "my-session",
229
+ "utterance_excerpt": "The coverage is 92% and all tests pass.",
230
+ "statements": [
231
+ {
232
+ "excerpt": "coverage is 92%",
233
+ "badge": "unsupported",
234
+ "target": {
235
+ "subjectType": "unknown",
236
+ "subjectId": "coverage",
237
+ "fieldOrBehavior": "is"
238
+ }
239
+ }
240
+ ],
241
+ "summary": "unsupported:1"
242
+ }
243
+ ```
244
+
245
+ Badge values:
246
+
247
+ | Badge | Meaning |
248
+ |-------|---------|
249
+ | `verified` | Matched a claim with verified status. |
250
+ | `assumed` | Matched a claim with assumed status. |
251
+ | `stale` | Matched a claim that is stale. |
252
+ | `disputed` | Matched a claim with conflicting evidence. |
253
+ | `rejected` | Matched a claim that was rejected. |
254
+ | `unsupported` | No matching claim in the trust bundle. |
255
+
256
+ Exit codes: `0` = pass (also returned when the anthropic extractor is `not_configured`, because that case fails open), `1` = survey unavailable, `2` = strict mode with concerning badges, `3` = usage error.
257
+
258
+ ---
259
+
260
+ ## Installing dependencies
261
+
262
+ The CLI adapter uses dynamic imports so flow-agents itself does not list `@kontourai/survey` as a dependency. Install in the target workspace:
263
+
264
+ ```bash
265
+ # Reference extractor only (default)
266
+ npm install @kontourai/survey
267
+
268
+ # Anthropic extractor (model-backed)
269
+ npm install @kontourai/survey @anthropic-ai/sdk
270
+ ```
271
+
272
+ ---
273
+
274
+ ## Ownership split
275
+
276
+ | Area | Flow Agents owns | Survey owns |
277
+ |------|-----------------|-------------|
278
+ | Hook wiring | PostToolUse/Stop hook, badge guidance format, config loading | None |
279
+ | Extraction | Invoking the CLI, extractor selection, fail-open handling | Statement extraction, extractor interface, anthropic integration |
280
+ | Resolution | Passing the trust bundle path | Inquiry pipeline, claim resolution |
281
+ | Output | Guidance text injected into agent context | UtteranceTrustReport with per-statement badges |
282
+ | Config | Per-repo `flow-agents-settings.json`, env var overrides | None |
283
+
284
+ Flow Agents does not own trust claim models, inquiry semantics, or extractor implementations.
285
+
286
+ ---
287
+
288
+ ## Non-goals
289
+
290
+ - Do not make `@kontourai/survey` a mandatory dependency of flow-agents.
291
+ - Do not copy Survey's extraction or inquiry schemas into flow-agents.
292
+ - Do not auto-register the hook in the default pack; it is opt-in only.
293
+ - Do not make the hook blocking without explicit `mode: "strict"` or the env override.
294
+ - Do not silently decide anything. The hook injects guidance; the agent decides next steps.
295
+
296
+ ---
297
+
298
+ ## Current integration shape
299
+
300
+ The integration delivers:
301
+
302
+ 1. `src/cli/utterance-check.ts` — TypeScript CLI adapter. Accepts utterance text, optional bundle path, agent ID, extractor name, and model. Dynamically imports `@kontourai/survey` (and optionally `@kontourai/survey/anthropic`). Outputs a JSON badge report to stdout and human-readable guidance to stderr.
303
+
304
+ 2. `scripts/hooks/utterance-check.js` — CJS hook script. PostToolUse/Stop, non-blocking in report mode. Reads per-repo policy from `context/settings/flow-agents-settings.json`, uses env vars as overrides. Resolves repo root from hook event `cwd`. Always fails open.
305
+
306
+ 3. `schemas/flow-agents-settings.schema.json` — JSON Schema for the per-repo settings file.
307
+
308
+ Survey source and API details: https://github.com/kontourai/survey
package/docs/vision.md ADDED
@@ -0,0 +1,45 @@
1
+ ---
2
+ title: Flow Agents Vision and Direction
3
+ ---
4
+
5
+ # Vision and Direction
6
+
7
+ This page captures where Flow Agents is headed, clearly labeled as direction rather than shipped capability. Shipped artifacts are documented in the [Runtime Hook Surface spec](spec/runtime-hook-surface.html) and the [Runtime and support matrix](index.html#runtime-and-support-matrix) on the overview page.
8
+
9
+ ---
10
+
11
+ ## What ships today
12
+
13
+ Flow Agents currently ships as a harness adapter layer: six core harness runtimes (base, Claude Code, Codex, Kiro, opencode, pi) receive bundled agents, skills, context, scripts, and hook wiring through the `npx @kontourai/flow-agents init` installer. The four canonical policy classes — workflow steering, quality gate, stop-goal-fit, and config protection — are implemented as canonical scripts under `scripts/hooks/` and wired to each host's native event surface at conformance levels L0, L1, or L2.
14
+
15
+ One official framework adapter spike exists: `integrations/strands/` is a Python `HookProvider` for AWS Strands that emits the canonical telemetry taxonomy and enforces config protection via tool-call cancellation. It is preview-status with documented limitations.
16
+
17
+ ---
18
+
19
+ ## Direction
20
+
21
+ The items below are direction, not committed delivery dates. They record the intended shape of where this work goes.
22
+
23
+ ### Kits beyond coding
24
+
25
+ The process-discipline layer is not coding-specific. The canonical policies, sidecar state model, and evidence taxonomy are defined without reference to source code, build systems, or CI. The direction is deployable agentic workflows — Flow Kits for domains beyond software delivery: knowledge work, research, operations, sales contexts, and personal productivity. The [North Star](north-star.html) records the broader scope.
26
+
27
+ ### TypeScript framework adapters
28
+
29
+ The Strands Python spike proves the thesis: the policy engine is not harness-specific. The direction is TypeScript framework adapters that consume the canonical policy engine natively via the published `@kontourai/flow-agents` npm package, rather than shelling out to bash scripts. Candidate frameworks include LangGraph, VoltAgent, and the OpenAI Agents SDK. The [Runtime Hook Surface spec](spec/runtime-hook-surface.html) documents the adapter contract and the framework event mapping tables for each.
30
+
31
+ ### Kontour Console as the unifying telemetry surface
32
+
33
+ Today, telemetry writes to local JSONL files by default, with optional sinks to a local or hosted Kontour Console. The direction is Kontour Console as the unifying surface that spans both harness sessions (Claude Code, Codex, Kiro, opencode, pi) and deployed framework agents (Strands, LangGraph, etc.) — so the same workflow state, evidence, and hook telemetry are visible regardless of which runtime executed the work.
34
+
35
+ ### Conformance kit for community adapters
36
+
37
+ The runtime matrix includes a "conformance-certified" tier for community and third-party adapters that self-certify at a declared L0, L1, or L2 level. A conformance kit — a test suite and declaration format — is in development. It does not yet ship.
38
+
39
+ ---
40
+
41
+ ## What this is not
42
+
43
+ Flow Agents is not building another agent runtime, coding assistant, workflow engine, or orchestration control plane. The model, the runtime, the IDE, the agent UI, the workflow engine, and the repo governance engine are all deliberately out of scope. Flow Agents owns the glue: discovery, just-in-time guidance, scoped delegation, Flow-backed workflow state inside agent runtimes, evidence-backed completion, and feedback loops.
44
+
45
+ See the [North Star](north-star.html) for the full design principles and the [Developer Architecture](developer-architecture.html) for the product boundary map.
@@ -378,7 +378,7 @@ Completion gate:
378
378
 
379
379
  The validator and stop hook enforce this shape for terminal workflows. If a delivery is terminal and neither the Markdown artifact nor `state.json.artifact_paths` points at durable docs, validation should fail unless the artifact records an explicit no-docs decision.
380
380
 
381
- ## 10. Capture Learning
381
+ ## 11. Capture Learning
382
382
 
383
383
  Use `learning-review` after release, failed gates, incidents, repeated friction, or workflow gaps.
384
384
 
@@ -12,7 +12,7 @@ run_one() {
12
12
  }
13
13
 
14
14
  case "$TARGET" in
15
- kiro|claude|codex)
15
+ kiro|claude|codex|opencode|pi)
16
16
  run_one "$TARGET"
17
17
  ;;
18
18
  all)
@@ -20,10 +20,12 @@ case "$TARGET" in
20
20
  run_one kiro || status=1
21
21
  run_one claude || status=1
22
22
  run_one codex || status=1
23
+ run_one opencode || status=1
24
+ run_one pi || status=1
23
25
  exit "$status"
24
26
  ;;
25
27
  *)
26
- echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex]"
28
+ echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex|opencode|pi]"
27
29
  exit 1
28
30
  ;;
29
31
  esac
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ source "$ROOT_DIR/evals/lib/node.sh"
6
+ TMP_WORK=""
7
+ pass=0
8
+ fail=0
9
+ skip=0
10
+
11
+ cleanup() {  # EXIT-trap handler: delete the temporary workspace if one was created.
12
+ if [[ -n "$TMP_WORK" ]]; then rm -rf -- "$TMP_WORK"; fi  # `if` rather than `&&`: the function must return 0 when TMP_WORK is empty, otherwise `set -e` fires inside the EXIT trap and replaces the script's exit status (e.g. the early `exit 0` skip path would exit 1)
13
+ }
14
+ trap cleanup EXIT
15
+
16
+ _pass() { echo " ✓ $1"; pass=$((pass + 1)); }
17
+ _fail() { echo " ✗ $1"; fail=$((fail + 1)); }
18
+ _skip() { echo " ○ $1"; skip=$((skip + 1)); }
19
+
20
+ wait_for_telemetry() {  # Poll until the file named by $1 exists and is non-empty. Returns 0 once it does, 1 if it never does.
21
+ local file="$1"
22
+ local i=0
23
+ while [[ $i -lt 150 ]]; do  # up to 150 polls with a 0.1s sleep between them
24
+ [[ -s "$file" ]] && return 0  # -s: file exists and has size greater than zero
25
+ sleep 0.1
26
+ i=$((i + 1))
27
+ done
28
+ return 1  # all polls exhausted without seeing a non-empty file
29
+ }
30
+
31
+ echo "=== Harness Acceptance: opencode ==="
32
+ echo ""
33
+
34
+ if ! command -v opencode >/dev/null 2>&1; then
35
+ _skip "opencode CLI not installed"
36
+ echo ""
37
+ echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
38
+ exit 0
39
+ fi
40
+
41
+ cd "$ROOT_DIR"
42
+ flow_agents_node scripts/build-universal-bundles.js >/dev/null
43
+
44
+ TMP_WORK="$(mktemp -d /tmp/opencode-acceptance-work.XXXXXX)"
45
+ (cd dist/opencode && bash install.sh "$TMP_WORK") >/dev/null
46
+
47
+ echo "--- Plugin Load + Telemetry ---"
48
+ cd "$TMP_WORK"
49
+ rm -rf .telemetry
50
+
51
+ MODEL_ARGS=()
52
+ if [[ -n "${FLOW_AGENTS_ACCEPT_OPENCODE_MODEL:-}" ]]; then
53
+ MODEL_ARGS=(-m "$FLOW_AGENTS_ACCEPT_OPENCODE_MODEL")
54
+ fi
55
+
56
+ # Models sometimes answer without calling the tool (nondeterminism), which
57
+ # would void the tool.invoke/tool.result assertions — force the tool call
58
+ # and retry once if no tool events landed.
59
+ ACCEPT_PROMPT="You MUST call the read tool before replying — answering from memory is a failure. Read the first 5 lines of README.md with the read tool, then reply: done"
60
+ run_output=""
61
+ provider_error=0
62
+ for _attempt in 1 2; do  # at most two attempts: the second runs only if the first produced no tool.invoke event
63
+ run_output="$(opencode run ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} "$ACCEPT_PROMPT" 2>&1 || true)"  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a bare "${MODEL_ARGS[@]}" is an unbound-variable error under `set -u` on bash older than 4.4
64
+ if echo "$run_output" | grep -qi "error"; then  # any "error" text in the CLI output is treated as a provider/auth failure
65
+ provider_error=1
66
+ break
67
+ fi
68
+ provider_error=0
69
+ for _i in $(seq 1 50); do  # poll for a tool.invoke event: up to 50 checks with a 0.3s sleep between them
70
+ [[ -s "$TMP_WORK/.telemetry/full.jsonl" ]] && grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break
71
+ sleep 0.3
72
+ done
73
+ grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break  # stop retrying once a tool event has landed
74
+ done
75
+
76
+ LATEST_LOG="$(ls -t ~/.local/share/opencode/log/*.log 2>/dev/null | head -1 || true)"
77
+ if [[ -n "$LATEST_LOG" ]] && grep -q "plugins/flow-agents.js loading plugin" "$LATEST_LOG" 2>/dev/null; then
78
+ _pass "opencode log confirms flow-agents plugin loaded"
79
+ else
80
+ _fail "opencode log did not confirm flow-agents plugin loaded"
81
+ fi
82
+
83
+ telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
84
+ if [[ "$provider_error" -eq 1 ]]; then
85
+ _skip "opencode telemetry assertions skipped (provider/auth error)"
86
+ _skip "opencode telemetry tool events skipped (provider/auth error)"
87
+ else
88
+ if wait_for_telemetry "$telemetry_file"; then
89
+ _pass "opencode telemetry log was written"
90
+ else
91
+ _fail "opencode telemetry log was not written"
92
+ fi
93
+
94
+ if [[ -f "$telemetry_file" ]] && \
95
+ node -e "
96
+ const fs = require('fs');
97
+ const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
98
+ const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
99
+ const hasInvoke = types.some(t => t === 'tool.invoke');
100
+ const hasResult = types.some(t => t === 'tool.result');
101
+ process.exit(hasInvoke && hasResult ? 0 : 1);
102
+ " 2>/dev/null; then
103
+ _pass "opencode telemetry contains tool.invoke and tool.result events"
104
+ else
105
+ _fail "opencode telemetry missing tool.invoke or tool.result events"
106
+ fi
107
+ fi
108
+
109
+ PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
110
+ if [[ -d "$PARENT_TELEMETRY" ]]; then
111
+ _fail "opencode wrote .telemetry to workspace parent directory"
112
+ else
113
+ _pass "no .telemetry leak to workspace parent directory"
114
+ fi
115
+
116
+ echo ""
117
+ echo "==========================="
118
+ total=$((pass + fail))
119
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
120
+ [[ "$fail" -gt 0 ]] && exit 1
121
+ exit 0
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ source "$ROOT_DIR/evals/lib/node.sh"
6
+ TMP_WORK=""
7
+ pass=0
8
+ fail=0
9
+ skip=0
10
+
11
+ cleanup() {  # EXIT-trap handler: delete the temporary workspace if one was created.
12
+ if [[ -n "$TMP_WORK" ]]; then rm -rf -- "$TMP_WORK"; fi  # `if` rather than `&&`: the function must return 0 when TMP_WORK is empty, otherwise `set -e` fires inside the EXIT trap and replaces the script's exit status (e.g. the early `exit 0` skip path would exit 1)
13
+ }
14
+ trap cleanup EXIT
15
+
16
+ _pass() { echo " ✓ $1"; pass=$((pass + 1)); }
17
+ _fail() { echo " ✗ $1"; fail=$((fail + 1)); }
18
+ _skip() { echo " ○ $1"; skip=$((skip + 1)); }
19
+
20
+ wait_for_telemetry() {  # Poll until the file named by $1 exists and is non-empty. Returns 0 once it does, 1 if it never does.
21
+ local file="$1"
22
+ local i=0
23
+ while [[ $i -lt 150 ]]; do  # up to 150 polls with a 0.1s sleep between them
24
+ [[ -s "$file" ]] && return 0  # -s: file exists and has size greater than zero
25
+ sleep 0.1
26
+ i=$((i + 1))
27
+ done
28
+ return 1  # all polls exhausted without seeing a non-empty file
29
+ }
30
+
31
+ echo "=== Harness Acceptance: pi ==="
32
+ echo ""
33
+
34
+ if ! command -v pi >/dev/null 2>&1; then
35
+ _skip "pi CLI not installed"
36
+ echo ""
37
+ echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
38
+ exit 0
39
+ fi
40
+
41
+ cd "$ROOT_DIR"
42
+ flow_agents_node scripts/build-universal-bundles.js >/dev/null
43
+
44
+ TMP_WORK="$(mktemp -d /tmp/pi-acceptance-work.XXXXXX)"
45
+ (cd dist/pi && bash install.sh "$TMP_WORK") >/dev/null
46
+
47
+ echo "--- Telemetry ---"
48
+ cd "$TMP_WORK"
49
+ rm -rf .telemetry
50
+
51
+ run_output="$(pi --approve -p \
52
+ "Use your read tool to read the first 5 lines of README.md, then reply: done" 2>&1 || true)"
53
+ provider_error=0
54
+ if echo "$run_output" | grep -qi "error"; then
55
+ provider_error=1
56
+ fi
57
+
58
+ telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
59
+ if [[ "$provider_error" -eq 1 ]]; then
60
+ _skip "pi telemetry assertions skipped (provider/auth error)"
61
+ _skip "pi telemetry event types skipped (provider/auth error)"
62
+ _skip "pi telemetry session events skipped (provider/auth error)"
63
+ else
64
+ if wait_for_telemetry "$telemetry_file"; then
65
+ _pass "pi telemetry log was written"
66
+ else
67
+ _fail "pi telemetry log was not written"
68
+ fi
69
+
70
+ if [[ -f "$telemetry_file" ]] && \
71
+ node -e "
72
+ const fs = require('fs');
73
+ const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
74
+ const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
75
+ const required = ['session.start', 'tool.invoke', 'tool.result', 'session.end'];
76
+ const missing = required.filter(t => !types.includes(t));
77
+ if (missing.length > 0) { process.stderr.write('missing: ' + missing.join(', ') + '\n'); process.exit(1); }
78
+ process.exit(0);
79
+ " 2>/dev/null; then
80
+ _pass "pi telemetry contains session.start, tool.invoke, tool.result, session.end"
81
+ else
82
+ _fail "pi telemetry missing one or more required event types (session.start, tool.invoke, tool.result, session.end)"
83
+ fi
84
+ fi
85
+
86
+ PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
87
+ if [[ -d "$PARENT_TELEMETRY" ]]; then
88
+ _fail "pi wrote .telemetry to workspace parent directory"
89
+ else
90
+ _pass "no .telemetry leak to workspace parent directory"
91
+ fi
92
+
93
+ echo ""
94
+ echo "==========================="
95
+ total=$((pass + fail))
96
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
97
+ [[ "$fail" -gt 0 ]] && exit 1
98
+ exit 0