npm - @kontourai/flow-agents - Versions diffs - 0.1.1 → 0.2.0 - Mend

@kontourai/flow-agents 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

package/.github/dependabot.yml +23 -0
package/.github/workflows/publish-npm.yml +1 -1
package/.github/workflows/release-please.yml +31 -0
package/.github/workflows/runtime-compat.yml +118 -0
package/CHANGELOG.md +38 -0
package/CONTRIBUTING.md +4 -0
package/README.md +58 -19
package/build/src/cli/init.js +215 -5
package/build/src/cli/utterance-check.js +236 -0
package/build/src/cli.js +3 -0
package/build/src/tools/build-universal-bundles.js +268 -0
package/build/src/tools/filter-installed-packs.js +3 -0
package/build/src/tools/validate-source-tree.js +6 -1
package/context/scripts/telemetry/lib/config.sh +5 -1
package/context/settings/flow-agents-settings.json +7 -0
package/docs/agent-system-guidebook.md +4 -5
package/docs/context-map.md +1 -0
package/docs/index.md +46 -6
package/docs/integrations/conformance.md +246 -0
package/docs/integrations/framework-adapter.md +275 -0
package/docs/integrations/harness-install.md +213 -0
package/docs/integrations/index.md +54 -0
package/docs/north-star.md +3 -3
package/docs/repository-structure.md +1 -1
package/docs/skills-map.md +10 -4
package/docs/spec/runtime-hook-surface.md +472 -0
package/docs/survey-utterance-check.md +308 -0
package/docs/vision.md +45 -0
package/docs/workflow-usage-guide.md +1 -1
package/evals/acceptance/run.sh +4 -2
package/evals/acceptance/test_opencode_harness.sh +121 -0
package/evals/acceptance/test_pi_harness.sh +98 -0
package/evals/integration/test_bundle_install.sh +226 -1
package/evals/integration/test_bundle_lifecycle.sh +641 -0
package/evals/integration/test_utterance_check.sh +518 -0
package/evals/run.sh +2 -0
package/evals/static/test_universal_bundles.sh +137 -2
package/integrations/strands/README.md +256 -0
package/integrations/strands/example.py +74 -0
package/integrations/strands/flow_agents_strands/__init__.py +27 -0
package/integrations/strands/flow_agents_strands/hooks.py +194 -0
package/integrations/strands/flow_agents_strands/policy.py +348 -0
package/integrations/strands/flow_agents_strands/steering.py +172 -0
package/integrations/strands/flow_agents_strands/telemetry.py +238 -0
package/integrations/strands/pyproject.toml +38 -0
package/integrations/strands/tests/__init__.py +0 -0
package/integrations/strands/tests/test_hooks.py +304 -0
package/integrations/strands/tests/test_policy.py +315 -0
package/integrations/strands/tests/test_telemetry.py +184 -0
package/integrations/strands-ts/README.md +224 -0
package/integrations/strands-ts/bin/conformance-shim.mjs +257 -0
package/integrations/strands-ts/package.json +53 -0
package/integrations/strands-ts/src/hooks.ts +208 -0
package/integrations/strands-ts/src/index.ts +22 -0
package/integrations/strands-ts/src/policy.ts +345 -0
package/integrations/strands-ts/src/telemetry.ts +251 -0
package/integrations/strands-ts/test/test-policy.ts +322 -0
package/integrations/strands-ts/test/test-telemetry.ts +226 -0
package/integrations/strands-ts/tsconfig.json +20 -0
package/package.json +7 -2
package/packaging/conformance/README.md +142 -0
package/packaging/conformance/fixtures/config-protection--allow-no-path.json +18 -0
package/packaging/conformance/fixtures/config-protection--allow-safe-file.json +20 -0
package/packaging/conformance/fixtures/config-protection--block-biome.json +20 -0
package/packaging/conformance/fixtures/config-protection--block-eslintrc.json +20 -0
package/packaging/conformance/fixtures/quality-gate--allow-no-path.json +17 -0
package/packaging/conformance/fixtures/quality-gate--allow-nonexistent-file.json +19 -0
package/packaging/conformance/fixtures/stop-goal-fit--allow-clean-cwd.json +17 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-strict-mode.json +23 -0
package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +21 -0
package/packaging/conformance/fixtures/workflow-steering--allow-no-state.json +16 -0
package/packaging/conformance/fixtures/workflow-steering--inject-active-state.json +29 -0
package/packaging/conformance/fixtures/workflow-steering--inject-subagent-steering.json +25 -0
package/packaging/conformance/package.json +4 -0
package/packaging/conformance/run-conformance.js +322 -0
package/packaging/manifest.json +59 -0
package/schemas/flow-agents-settings.schema.json +48 -0
package/scripts/README.md +5 -0
package/scripts/dogfood.js +16 -0
package/scripts/hooks/opencode-hook-adapter.js +123 -0
package/scripts/hooks/opencode-telemetry-hook.js +101 -0
package/scripts/hooks/pi-hook-adapter.js +123 -0
package/scripts/hooks/pi-telemetry-hook.js +105 -0
package/scripts/hooks/run-hook.js +8 -0
package/scripts/hooks/utterance-check.js +327 -0
package/scripts/telemetry/lib/config.sh +5 -1
package/skills/idea-to-backlog/SKILL.md +1 -1
package/src/cli/init.ts +219 -6
package/src/cli/utterance-check.ts +324 -0
package/src/cli.ts +3 -0
package/src/tools/build-universal-bundles.ts +266 -0
package/src/tools/filter-installed-packs.ts +3 -0
package/src/tools/validate-source-tree.ts +6 -1
package/build/src/cli/docs-preview.js +0 -39
package/build/src/cli/export-bookmarks.js +0 -38
package/build/src/cli/import-bookmarks.js +0 -50
package/build/src/cli/instinct-cli.js +0 -93

package/docs/survey-utterance-check.md ADDED Viewed

@@ -0,0 +1,308 @@
+---
+title: Survey Utterance Check Integration
+---
+# Survey Utterance Check Integration
+When an agent says something factual — "test coverage is 92%", "the API is backward-compatible", "no breaking changes in this release" — that claim either has evidence behind it or it doesn't. The utterance check feature bridges Flow Agents hooks to `@kontourai/survey` so that every factual statement in an agent response is compared against a trust bundle and tagged with a badge. Statements with no backing evidence are flagged inline so the agent can acknowledge the gap rather than assert silently.
+This document explains how to enable and configure the feature, what the workflow looks like end to end, and what to watch out for.
+---
+## What actually happens
+Here is a concrete walkthrough from agent response to badge guidance:
+```
+Agent says: "The test coverage for auth-service is 92%.
+             All critical paths have been verified."
+Flow Agents hook (PostToolUse):
+  1. Captures the agent response text from the PostToolUse event.
+  2. Invokes the utterance-check CLI with the response text and your trust bundle.
+@kontourai/survey (inside the CLI):
+  3. Extractor splits the response into factual statements:
+       - "test coverage for auth-service is 92%"
+       - "All critical paths have been verified"
+  4. Each statement is resolved against the trust bundle.
+  5. Neither statement has a matching verified claim → both resolve as "unsupported".
+Flow Agents hook injects guidance into the agent context:
+  UTTERANCE CHECK: 2 statement(s) in this response lack evidence coverage.
+  Summary: unsupported:2
+    - [unsupported] "test coverage for auth-service is 92%"
+    - [unsupported] "All critical paths have been verified"
+  Evidence note: unsupported = no matching claim in the trust bundle; ...
+```
+The agent sees honest gap disclosure rather than silent pass-through. It can then cite sources, note the gap explicitly, or record a coverage claim via `@kontourai/survey`.
+---
+## Deciding between report and strict mode
+The hook has two modes:
+| Mode | Effect |
+|------|--------|
+| `report` (default) | Appends badge guidance to the agent context. Never blocks. Agent decides next step. |
+| `strict` | If any statement is `unsupported`, `disputed`, or `rejected`, the hook exits 2, which routes the Stop event back to the agent for revision. |
+Use **report** when you want visibility without gate behavior — good for exploratory sessions, onboarding, or repos where the trust bundle is still being built out. Use **strict** when you want the agent to revise or cite sources before completing a turn — appropriate for regulated workflows, production deployments, or repos with a well-populated bundle.
+The empty-bundle caveat: if you enable the hook without a `bundlePath`, every factual statement the extractor finds will resolve as `unsupported` because there are no claims to match against. In strict mode this means every response with factual statements will be blocked. Make sure you either provide a `bundlePath` or use report mode until you have a bundle.
+---
+## The trust bundle
+The trust bundle is a JSON file with a `claims` array. It is the authoritative record of what is considered evidenced for your codebase. Three practical sources:
+- **Veritas-generated bundle**: if your repo uses `@veritas/veritas`, it can produce a `trust.bundle.json` from `.veritas/evidence`. Point `bundlePath` at that output.
+- **Surface report**: the `@kontourai/surface` package can generate a trust bundle from a surface verification run. If your repo runs surface checks, look for the generated bundle in the surface output directory (e.g. `dist/trust-bundle.json` or a named artifact).
+- **Hand-authored bundle**: a minimal bundle is just `{ "claims": [] }`. Add claims incrementally as you record evidence.
+An empty or missing bundle means everything is unsupported. That is not necessarily wrong — it is an honest starting state — but it is only useful in report mode.
+---
+## Choosing an extractor
+The extractor is responsible for splitting the agent utterance into discrete factual statements. Two are available:
+| Extractor | How it works | Requirements |
+|-----------|-------------|--------------|
+| `reference` (default) | Pattern-based heuristics. Fast, no API call, no key needed. Works offline. Lower recall on complex prose. | `@kontourai/survey` installed |
+| `anthropic` | Model-backed extraction via `@kontourai/survey/anthropic`. Higher recall, understands context and nuance, can split compound claims. | `@kontourai/survey` + `@anthropic-ai/sdk` installed, `ANTHROPIC_API_KEY` set |
+For most exploratory use, `reference` is sufficient. Switch to `anthropic` when you find the reference extractor is missing statements that matter for your domain.
+The `anthropic` extractor fails open: if `ANTHROPIC_API_KEY` is missing or `@anthropic-ai/sdk` is not installed, the CLI emits `status: "not_configured"` (with a clear explanation in `summary`) and exits 0. The hook treats this as a silent pass-through. You will see a message in stderr explaining what is missing, but the hook will not block.
+---
+## Per-repo configuration
+The canonical way to enable utterance checking is a `context/settings/flow-agents-settings.json` file in the consumer repo. This is a peer to `context/settings/backlog-provider-settings.json` — the same directory, the same convention.
+**Minimal example (report mode, reference extractor):**
+```json
+{
+  "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
+  "schema_version": "1.0",
+  "utteranceCheck": {
+    "enabled": true,
+    "mode": "report",
+    "extractor": "reference"
+  }
+}
+```
+**With a trust bundle and anthropic extractor:**
+```json
+{
+  "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
+  "schema_version": "1.0",
+  "utteranceCheck": {
+    "enabled": true,
+    "mode": "report",
+    "extractor": "anthropic",
+    "bundlePath": ".veritas/trust.bundle.json",
+    "model": "claude-haiku-4-5",
+    "agentId": "surface-agent"
+  }
+}
+```
+**Strict mode:**
+```json
+{
+  "$schema": "../../node_modules/@kontourai/flow-agents/schemas/flow-agents-settings.schema.json",
+  "schema_version": "1.0",
+  "utteranceCheck": {
+    "enabled": true,
+    "mode": "strict",
+    "extractor": "anthropic",
+    "bundlePath": "dist/trust-bundle.json"
+  }
+}
+```
+Config field reference:
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `enabled` | boolean | `false` | Whether utterance checking is active for this repo. |
+| `mode` | `"report"` \| `"strict"` | `"report"` | How to handle concerning badges. See above. |
+| `extractor` | `"reference"` \| `"anthropic"` | `"reference"` | Extractor to use. See above. |
+| `bundlePath` | string | — | Repo-relative or absolute path to the trust bundle JSON. Omit to use an empty bundle. |
+| `model` | string | — | Model for the anthropic extractor. Only used when `extractor` is `"anthropic"`. |
+| `agentId` | string | `"flow-agents-hook"` | Agent identifier for provenance in the trust report. |
+---
+## Environment variable overrides
+For one-off sessions or CI pipelines, you can override the config with environment variables. These take precedence over `flow-agents-settings.json`.
+| Variable | Effect |
+|----------|--------|
+| `FLOW_AGENTS_UTTERANCE_CHECK_ENABLED=true\|false` | Force the hook on or off, overriding the config `enabled` field. |
+| `FLOW_AGENTS_UTTERANCE_CHECK_STRICT=true` | Force strict mode. |
+| `FLOW_AGENTS_UTTERANCE_CHECK_BUNDLE_PATH=/path/to/bundle.json` | Override `bundlePath`. |
+| `FLOW_AGENTS_UTTERANCE_CHECK_AGENT_ID=my-agent` | Override `agentId`. |
+| `FLOW_AGENTS_UTTERANCE_CHECK_EXTRACTOR=anthropic\|reference` | Override `extractor`. |
+**When the config file is absent and no env vars are set**, the hook is disabled. This is the safe default — existing repos are not affected until they opt in.
+---
+## Registering the hook
+Add the utterance check to a Claude Code session via `.claude/settings.json`:
+```json
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": ".*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "node scripts/hooks/claude-hook-adapter.js PostToolUse post:utterance-check utterance-check.js standard,strict"
+          }
+        ]
+      }
+    ]
+  }
+}
+```
+Or run the hook directly (Kiro/Codex convention, exit 2 blocks):
+```bash
+node scripts/hooks/run-hook.js post:utterance-check utterance-check.js standard,strict
+```
+The hook reads `context/settings/flow-agents-settings.json` relative to the repo root it detects from the hook event `cwd` or `process.cwd()`. No configuration needed in the hook command itself.
+---
+## CLI reference
+The utterance check CLI is available as:
+```bash
+node build/src/cli.js utterance-check check \
+  --utterance "The coverage is 92% and all tests pass." \
+  --bundle-path .veritas/trust.bundle.json \
+  --extractor anthropic \
+  --model claude-haiku-4-5 \
+  --agent-id my-session
+```
+Options:
+```
+  --utterance TEXT      Utterance text to check (required unless --not-configured).
+  --bundle-path FILE    Trust bundle JSON file. Omit for an empty bundle (all unsupported).
+  --agent-id ID         Agent identifier for provenance (default: flow-agents-utterance-check).
+  --extractor NAME      'reference' (default) or 'anthropic'.
+  --model MODEL         Model for the anthropic extractor (e.g. claude-haiku-4-5).
+  --not-configured      Skip survey call; output not_configured without error.
+  --strict              Exit non-zero when any badge is disputed, rejected, or unsupported.
+  --help                Show this help.
+```
+The CLI outputs a JSON report to stdout:
+```json
+{
+  "status": "ok",
+  "agent_id": "my-session",
+  "utterance_excerpt": "The coverage is 92% and all tests pass.",
+  "statements": [
+    {
+      "excerpt": "coverage is 92%",
+      "badge": "unsupported",
+      "target": {
+        "subjectType": "unknown",
+        "subjectId": "coverage",
+        "fieldOrBehavior": "is"
+      }
+    }
+  ],
+  "summary": "unsupported:1"
+}
+```
+Badge values:
+| Badge | Meaning |
+|-------|---------|
+| `verified` | Matched a claim with verified status. |
+| `assumed` | Matched a claim with assumed status. |
+| `stale` | Matched a claim that is stale. |
+| `disputed` | Matched a claim with conflicting evidence. |
+| `rejected` | Matched a claim that was rejected. |
+| `unsupported` | No matching claim in the trust bundle. |
+Exit codes: `0` = pass (also returned when the anthropic extractor is not configured, because it fails open), `1` = survey unavailable, `2` = strict mode with concerning badges, `3` = usage error.
+---
+## Installing dependencies
+The CLI adapter uses dynamic imports so flow-agents itself does not list `@kontourai/survey` as a dependency. Install in the target workspace:
+```bash
+# Reference extractor only (default)
+npm install @kontourai/survey
+# Anthropic extractor (model-backed)
+npm install @kontourai/survey @anthropic-ai/sdk
+```
+---
+## Ownership split
+| Area | Flow Agents owns | Survey owns |
+|------|-----------------|-------------|
+| Hook wiring | PostToolUse/Stop hook, badge guidance format, config loading | None |
+| Extraction | Invoking the CLI, extractor selection, fail-open handling | Statement extraction, extractor interface, anthropic integration |
+| Resolution | Passing the trust bundle path | Inquiry pipeline, claim resolution |
+| Output | Guidance text injected into agent context | UtteranceTrustReport with per-statement badges |
+| Config | Per-repo `flow-agents-settings.json`, env var overrides | None |
+Flow Agents does not own trust claim models, inquiry semantics, or extractor implementations.
+---
+## Non-goals
+- Do not make `@kontourai/survey` a mandatory dependency of flow-agents.
+- Do not copy Survey's extraction or inquiry schemas into flow-agents.
+- Do not auto-register the hook in the default pack; it is opt-in only.
+- Do not make the hook blocking without explicit `mode: "strict"` or the env override.
+- Do not silently decide anything. The hook injects guidance; the agent decides next steps.
+---
+## Current integration shape
+The integration delivers:
+1. `src/cli/utterance-check.ts` — TypeScript CLI adapter. Accepts utterance text, optional bundle path, agent ID, extractor name, and model. Dynamically imports `@kontourai/survey` (and optionally `@kontourai/survey/anthropic`). Outputs a JSON badge report to stdout and human-readable guidance to stderr.
+2. `scripts/hooks/utterance-check.js` — CJS hook script. PostToolUse/Stop, non-blocking in report mode. Reads per-repo policy from `context/settings/flow-agents-settings.json`, uses env vars as overrides. Resolves repo root from hook event `cwd`. Always fails open.
+3. `schemas/flow-agents-settings.schema.json` — JSON Schema for the per-repo settings file.
+Survey source and API details: https://github.com/kontourai/survey

package/docs/vision.md ADDED Viewed

@@ -0,0 +1,45 @@
+---
+title: Flow Agents Vision and Direction
+---
+# Vision and Direction
+This page captures where Flow Agents is headed, clearly labeled as direction rather than shipped capability. Shipped artifacts are documented in the [Runtime Hook Surface spec](spec/runtime-hook-surface.html) and the [Runtime and support matrix](index.html#runtime-and-support-matrix) on the overview page.
+---
+## What ships today
+Flow Agents currently ships as a harness adapter layer: six core harness runtimes (base, Claude Code, Codex, Kiro, opencode, pi) receive bundled agents, skills, context, scripts, and hook wiring through the `npx @kontourai/flow-agents init` installer. The four canonical policy classes — workflow steering, quality gate, stop-goal-fit, and config protection — are implemented as canonical scripts under `scripts/hooks/` and wired to each host's native event surface at conformance levels L0, L1, or L2.
+One official framework adapter spike exists: `integrations/strands/` is a Python `HookProvider` for AWS Strands that emits the canonical telemetry taxonomy and enforces config protection via tool-call cancellation. It is preview-status with documented limitations.
+---
+## Direction
+The items below are direction, not committed delivery dates. They record the intended shape of where this work goes.
+### Kits beyond coding
+The process-discipline layer is not coding-specific. The canonical policies, sidecar state model, and evidence taxonomy are defined without reference to source code, build systems, or CI. The direction is deployable agentic workflows — Flow Kits for domains beyond software delivery: knowledge work, research, operations, sales contexts, and personal productivity. The [North Star](north-star.html) records the broader scope.
+### TypeScript framework adapters
+The Strands Python spike proves the thesis: the policy engine is not harness-specific. The direction is TypeScript framework adapters that consume the canonical policy engine natively via the published `@kontourai/flow-agents` npm package, rather than shelling out to bash scripts. Candidate frameworks include LangGraph, VoltAgent, and the OpenAI Agents SDK. The [Runtime Hook Surface spec](spec/runtime-hook-surface.html) documents the adapter contract and the framework event mapping tables for each.
+### Kontour Console as the unifying telemetry surface
+Today, telemetry writes to local JSONL files by default, with optional sinks to a local or hosted Kontour Console. The direction is Kontour Console as the unifying surface that spans both harness sessions (Claude Code, Codex, Kiro, opencode, pi) and deployed framework agents (Strands, LangGraph, etc.) — so the same workflow state, evidence, and hook telemetry are visible regardless of which runtime executed the work.
+### Conformance kit for community adapters
+The runtime matrix includes a "conformance-certified" tier for community and third-party adapters that self-certify at a declared L0, L1, or L2 level. A conformance kit — a test suite and declaration format — is in development. It does not yet ship.
+---
+## What this is not
+Flow Agents is not building another agent runtime, coding assistant, workflow engine, or orchestration control plane. The model, the runtime, the IDE, the agent UI, the workflow engine, and the repo governance engine are all deliberately out of scope. Flow Agents owns the glue: discovery, just-in-time guidance, scoped delegation, Flow-backed workflow state inside agent runtimes, evidence-backed completion, and feedback loops.
+See the [North Star](north-star.html) for the full design principles and the [Developer Architecture](developer-architecture.html) for the product boundary map.

package/docs/workflow-usage-guide.md CHANGED Viewed

@@ -378,7 +378,7 @@ Completion gate:
 The validator and stop hook enforce this shape for terminal workflows. If a delivery is terminal and neither the Markdown artifact nor `state.json.artifact_paths` points at durable docs, validation should fail unless the artifact records an explicit no-docs decision.
-## 10. Capture Learning
+## 11. Capture Learning
 Use `learning-review` after release, failed gates, incidents, repeated friction, or workflow gaps.

package/evals/acceptance/run.sh CHANGED Viewed

@@ -12,7 +12,7 @@ run_one() {
 }
 case "$TARGET" in
-  kiro|claude|codex)
+  kiro|claude|codex|opencode|pi)
     run_one "$TARGET"
     ;;
   all)
@@ -20,10 +20,12 @@ case "$TARGET" in
     run_one kiro || status=1
     run_one claude || status=1
     run_one codex || status=1
+    run_one opencode || status=1
+    run_one pi || status=1
     exit "$status"
     ;;
   *)
-    echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex]"
+    echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex|opencode|pi]"
     exit 1
     ;;
 esac

package/evals/acceptance/test_opencode_harness.sh ADDED Viewed

@@ -0,0 +1,121 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+source "$ROOT_DIR/evals/lib/node.sh"
+TMP_WORK=""
+pass=0
+fail=0
+skip=0
+cleanup() {
+  [[ -n "$TMP_WORK" ]] && rm -rf "$TMP_WORK"
+}
+trap cleanup EXIT
+_pass() { echo "  ✓ $1"; pass=$((pass + 1)); }
+_fail() { echo "  ✗ $1"; fail=$((fail + 1)); }
+_skip() { echo "  ○ $1"; skip=$((skip + 1)); }
+wait_for_telemetry() {
+  local file="$1"
+  local i=0
+  while [[ $i -lt 150 ]]; do
+    [[ -s "$file" ]] && return 0
+    sleep 0.1
+    i=$((i + 1))
+  done
+  return 1
+}
+echo "=== Harness Acceptance: opencode ==="
+echo ""
+# Skip gate: without the opencode CLI on PATH there is nothing to exercise,
+# so record a single skip, print the summary, and exit 0 (a skip, not a
+# failure).
+if ! command -v opencode >/dev/null 2>&1; then
+  _skip "opencode CLI not installed"
+  echo ""
+  echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
+  exit 0
+fi
+# Build the bundles from the repo root, then install the opencode bundle into
+# a fresh temporary workspace. Output of both steps is discarded.
+# NOTE(review): flow_agents_node is presumably defined by the sourced
+# evals/lib/node.sh — confirm.
+cd "$ROOT_DIR"
+flow_agents_node scripts/build-universal-bundles.js >/dev/null
+TMP_WORK="$(mktemp -d /tmp/opencode-acceptance-work.XXXXXX)"
+(cd dist/opencode && bash install.sh "$TMP_WORK") >/dev/null
+echo "--- Plugin Load + Telemetry ---"
+# Run from inside the workspace and start from a clean telemetry directory so
+# the assertions below only see events written by this run.
+cd "$TMP_WORK"
+rm -rf .telemetry
+# Optional model override: when FLOW_AGENTS_ACCEPT_OPENCODE_MODEL is set it is
+# passed to `opencode run` as `-m <model>`; otherwise no model flag is sent.
+MODEL_ARGS=()
+if [[ -n "${FLOW_AGENTS_ACCEPT_OPENCODE_MODEL:-}" ]]; then
+  MODEL_ARGS=(-m "$FLOW_AGENTS_ACCEPT_OPENCODE_MODEL")
+fi
+# Models sometimes answer without calling the tool (nondeterminism), which
+# would void the tool.invoke/tool.result assertions — force the tool call
+# and retry once if no tool events landed.
+ACCEPT_PROMPT="You MUST call the read tool before replying — answering from memory is a failure. Read the first 5 lines of README.md with the read tool, then reply: done"
+run_output=""
+provider_error=0
+_tool_log="$TMP_WORK/.telemetry/full.jsonl"
+for _attempt in 1 2; do
+  # ${arr[@]+...} keeps the empty-array expansion safe under `set -u` on bash
+  # releases older than 4.4, where a bare "${MODEL_ARGS[@]}" aborts with
+  # "unbound variable" when no model override is configured.
+  # `|| true` keeps a non-zero opencode exit from aborting under `set -e`.
+  run_output="$(opencode run ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} "$ACCEPT_PROMPT" 2>&1 || true)"
+  # Any "error" in the captured output is treated as a provider/auth problem
+  # and stops the retries. A here-string replaces `echo | grep -q` so that
+  # grep exiting early on a match cannot SIGPIPE the writer and, under
+  # `pipefail`, make a real match look like a miss.
+  if grep -qi "error" <<<"$run_output"; then
+    provider_error=1
+    break
+  fi
+  provider_error=0
+  # Wait up to ~15 s (50 checks, 0.3 s apart) for a tool.invoke event to land.
+  for ((_i = 0; _i < 50; _i++)); do
+    if [[ -s "$_tool_log" ]] && grep -q '"tool.invoke"' "$_tool_log" 2>/dev/null; then
+      break
+    fi
+    sleep 0.3
+  done
+  # Only retry the prompt when no tool.invoke event was recorded.
+  if grep -q '"tool.invoke"' "$_tool_log" 2>/dev/null; then
+    break
+  fi
+done
+# Plugin-load check: look for the plugin-loading line in the most recently
+# modified opencode log file.
+# NOTE(review): this assumes the newest log under ~/.local/share/opencode/log
+# belongs to the run above — confirm no other opencode session can write a
+# newer log at the same time.
+LATEST_LOG="$(ls -t ~/.local/share/opencode/log/*.log 2>/dev/null | head -1 || true)"
+if [[ -n "$LATEST_LOG" ]] && grep -q "plugins/flow-agents.js loading plugin" "$LATEST_LOG" 2>/dev/null; then
+  _pass "opencode log confirms flow-agents plugin loaded"
+else
+  _fail "opencode log did not confirm flow-agents plugin loaded"
+fi
+# Telemetry assertions. When the run output contained "error"
+# (provider_error=1) both checks are recorded as skipped rather than failed.
+telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
+if [[ "$provider_error" -eq 1 ]]; then
+  _skip "opencode telemetry assertions skipped (provider/auth error)"
+  _skip "opencode telemetry tool events skipped (provider/auth error)"
+else
+  if wait_for_telemetry "$telemetry_file"; then
+    _pass "opencode telemetry log was written"
+  else
+    _fail "opencode telemetry log was not written"
+  fi
+  # The inline node script parses every line of the log as JSON, collects the
+  # event_type values (unparseable lines count as ''), and exits 0 only when
+  # both tool.invoke and tool.result are present. Its stderr is discarded.
+  if [[ -f "$telemetry_file" ]] && \
+    node -e "
+const fs = require('fs');
+const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
+const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
+const hasInvoke = types.some(t => t === 'tool.invoke');
+const hasResult = types.some(t => t === 'tool.result');
+process.exit(hasInvoke && hasResult ? 0 : 1);
+" 2>/dev/null; then
+    _pass "opencode telemetry contains tool.invoke and tool.result events"
+  else
+    _fail "opencode telemetry missing tool.invoke or tool.result events"
+  fi
+fi
+# Leak check: telemetry must stay inside the workspace, not its parent.
+# NOTE(review): the workspace is created directly under /tmp, so this tests
+# for /tmp/.telemetry; a directory left there by something else would fail
+# this check — confirm that is acceptable.
+PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
+if [[ -d "$PARENT_TELEMETRY" ]]; then
+  _fail "opencode wrote .telemetry to workspace parent directory"
+else
+  _pass "no .telemetry leak to workspace parent directory"
+fi
+# Summary: skips are reported but excluded from the pass/total ratio; any
+# failure makes the script exit 1.
+echo ""
+echo "==========================="
+total=$((pass + fail))
+echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
+[[ "$fail" -gt 0 ]] && exit 1
+exit 0

package/evals/acceptance/test_pi_harness.sh ADDED Viewed

@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+source "$ROOT_DIR/evals/lib/node.sh"
+TMP_WORK=""
+pass=0
+fail=0
+skip=0
+cleanup() {
+  [[ -n "$TMP_WORK" ]] && rm -rf "$TMP_WORK"
+}
+trap cleanup EXIT
+_pass() { echo "  ✓ $1"; pass=$((pass + 1)); }
+_fail() { echo "  ✗ $1"; fail=$((fail + 1)); }
+_skip() { echo "  ○ $1"; skip=$((skip + 1)); }
+wait_for_telemetry() {
+  local file="$1"
+  local i=0
+  while [[ $i -lt 150 ]]; do
+    [[ -s "$file" ]] && return 0
+    sleep 0.1
+    i=$((i + 1))
+  done
+  return 1
+}
+echo "=== Harness Acceptance: pi ==="
+echo ""
+# Skip gate: without the pi CLI on PATH there is nothing to exercise, so
+# record a single skip, print the summary, and exit 0 (a skip, not a failure).
+if ! command -v pi >/dev/null 2>&1; then
+  _skip "pi CLI not installed"
+  echo ""
+  echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
+  exit 0
+fi
+# Build the bundles from the repo root, then install the pi bundle into a
+# fresh temporary workspace. Output of both steps is discarded.
+# NOTE(review): flow_agents_node is presumably defined by the sourced
+# evals/lib/node.sh — confirm.
+cd "$ROOT_DIR"
+flow_agents_node scripts/build-universal-bundles.js >/dev/null
+TMP_WORK="$(mktemp -d /tmp/pi-acceptance-work.XXXXXX)"
+(cd dist/pi && bash install.sh "$TMP_WORK") >/dev/null
+echo "--- Telemetry ---"
+# Run from inside the workspace and start from a clean telemetry directory so
+# the assertions below only see events written by this run.
+cd "$TMP_WORK"
+rm -rf .telemetry
+run_output="$(pi --approve -p \
+  "Use your read tool to read the first 5 lines of README.md, then reply: done" 2>&1 || true)"
+provider_error=0
+if echo "$run_output" | grep -qi "error"; then
+  provider_error=1
+fi
+# Telemetry assertions. When the run output contained "error"
+# (provider_error=1) the checks are recorded as skipped rather than failed.
+# NOTE(review): the skip branch records three skips while the else branch
+# performs two checks — confirm whether a third check is missing or the
+# extra skip line is stale.
+telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
+if [[ "$provider_error" -eq 1 ]]; then
+  _skip "pi telemetry assertions skipped (provider/auth error)"
+  _skip "pi telemetry event types skipped (provider/auth error)"
+  _skip "pi telemetry session events skipped (provider/auth error)"
+else
+  if wait_for_telemetry "$telemetry_file"; then
+    _pass "pi telemetry log was written"
+  else
+    _fail "pi telemetry log was not written"
+  fi
+  # The inline node script parses every line of the log as JSON, collects the
+  # event_type values (unparseable lines count as ''), and exits 1 if any of
+  # session.start, tool.invoke, tool.result, session.end is absent.
+  # The `2>/dev/null` on the node call also discards the script's own
+  # "missing: ..." message, so only the pass/fail line is shown.
+  if [[ -f "$telemetry_file" ]] && \
+    node -e "
+const fs = require('fs');
+const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
+const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
+const required = ['session.start', 'tool.invoke', 'tool.result', 'session.end'];
+const missing = required.filter(t => !types.includes(t));
+if (missing.length > 0) { process.stderr.write('missing: ' + missing.join(', ') + '\n'); process.exit(1); }
+process.exit(0);
+" 2>/dev/null; then
+    _pass "pi telemetry contains session.start, tool.invoke, tool.result, session.end"
+  else
+    _fail "pi telemetry missing one or more required event types (session.start, tool.invoke, tool.result, session.end)"
+  fi
+fi
+# Leak check: telemetry must stay inside the workspace, not its parent.
+# NOTE(review): the workspace is created directly under /tmp, so this tests
+# for /tmp/.telemetry; a directory left there by something else would fail
+# this check — confirm that is acceptable.
+PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
+if [[ -d "$PARENT_TELEMETRY" ]]; then
+  _fail "pi wrote .telemetry to workspace parent directory"
+else
+  _pass "no .telemetry leak to workspace parent directory"
+fi
+# Summary: skips are reported but excluded from the pass/total ratio; any
+# failure makes the script exit 1.
+echo ""
+echo "==========================="
+total=$((pass + fail))
+echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
+[[ "$fail" -gt 0 ]] && exit 1
+exit 0