@kontourai/flow-agents 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.github/workflows/ci.yml +6 -1
  2. package/.github/workflows/kit-gates-demo.yml +6 -2
  3. package/CHANGELOG.md +25 -0
  4. package/CONTRIBUTING.md +30 -0
  5. package/agents/dev.json +1 -1
  6. package/agents/tool-planner.json +1 -1
  7. package/build/src/cli/workflow-sidecar.js +70 -5
  8. package/build/src/flow-kit/validate.js +32 -1
  9. package/build/src/tools/build-universal-bundles.js +14 -0
  10. package/console.telemetry.json +1 -1
  11. package/docs/adr/0004-gates-expect-surface-claims.md +7 -7
  12. package/docs/kit-authoring-guide.md +99 -6
  13. package/docs/operating-layers.md +2 -2
  14. package/docs/veritas-integration.md +4 -4
  15. package/docs/workflow-eval-strategy.md +2 -2
  16. package/docs/workflow-usage-guide.md +1 -1
  17. package/evals/acceptance/test_opencode_harness.sh +18 -10
  18. package/evals/acceptance/test_pi_harness.sh +10 -6
  19. package/evals/ci/run-baseline.sh +1 -1
  20. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +4 -4
  21. package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +4 -4
  22. package/evals/fixtures/kit-conformance-levels/k0-flows-only/flows/review.flow.json +4 -4
  23. package/evals/fixtures/kit-conformance-levels/k1-agent-extension/flows/build.flow.json +4 -4
  24. package/evals/fixtures/kit-conformance-levels/k2-with-evals/flows/synthesize.flow.json +4 -4
  25. package/evals/fixtures/kit-conformance-levels/third-party-extension/flows/review.flow.json +4 -4
  26. package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +2 -2
  27. package/evals/fixtures/surface-trust/artifact-absent.json +2 -2
  28. package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +2 -2
  29. package/evals/fixtures/surface-trust/missing-authority-trust-report.json +2 -2
  30. package/evals/fixtures/surface-trust/provider-absent.json +2 -2
  31. package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +2 -2
  32. package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +2 -2
  33. package/evals/integration/test_console_learning_projection.sh +1 -1
  34. package/evals/integration/test_goal_fit_hook.sh +144 -0
  35. package/evals/integration/test_kit_conformance_levels.sh +55 -1
  36. package/evals/integration/test_workflow_sidecar_writer.sh +9 -9
  37. package/evals/static/test_package.sh +3 -3
  38. package/evals/static/test_workflow_skills.sh +4 -4
  39. package/kits/builder/flows/build.flow.json +48 -48
  40. package/kits/builder/flows/shape.flow.json +36 -36
  41. package/kits/knowledge/adapters/obsidian-store/index.js +137 -26
  42. package/kits/knowledge/evals/contract-suite/suite.test.js +90 -0
  43. package/kits/knowledge/flows/compile.flow.json +12 -12
  44. package/kits/knowledge/flows/consolidate.flow.json +16 -16
  45. package/kits/knowledge/flows/ingest.flow.json +12 -12
  46. package/kits/knowledge/flows/retire.flow.json +16 -16
  47. package/kits/knowledge/flows/store-contract.flow.json +12 -12
  48. package/kits/knowledge/flows/synthesize.flow.json +16 -16
  49. package/kits/release-evidence/flows/release-evidence.flow.json +3 -3
  50. package/package.json +5 -2
  51. package/schemas/workflow-evidence.schema.json +2 -1
  52. package/scripts/hooks/stop-goal-fit.js +66 -18
  53. package/src/cli/workflow-sidecar.ts +62 -4
  54. package/src/flow-kit/validate.ts +55 -1
  55. package/src/tools/build-universal-bundles.ts +14 -0
@@ -106,9 +106,9 @@ If Veritas is unavailable and the workflow expected it, record `not_verified` in
106
106
 
107
107
  ## Builder Kit Trust Evidence
108
108
 
109
- Builder Kit gates stay provider-neutral. The Builder Kit Flow Definition names gate expectations as `kind: "surface.claim"` and declares the claim type, subject, accepted statuses, and blocking behavior. It does not name Veritas or any other trust producer.
109
+ Builder Kit gates stay provider-neutral. The Builder Kit Flow Definition names gate expectations as `kind: "trust.bundle"` (the Hachure-aligned gate kind) and declares the claim type, subject, accepted statuses, and blocking behavior. It does not name Veritas or any other trust producer.
110
110
 
111
- When a trust-backed path is configured, Flow Agents may attach a compact Surface-shaped reference to the Builder Kit evidence gate. The reference points at a TrustReport or Trust Snapshot, carries the related gate id, Surface claim type, claim status, artifact ref, integrity summary, authority or trusted-producer summary, subject, and freshness state, and then maps to the normal Flow gate result. Flow owns the gate authority decision, route reason, trusted producer mapping, and accepted gap behavior. Surface owns the portable trust state represented by the Surface claim and the TrustReport / Trust Snapshot. A Probe can request or clarify the evidence needed before planning or before a later Builder Kit gate retries.
111
+ When a trust-backed path is configured, Flow Agents may attach a compact Hachure trust.bundle reference to the Builder Kit evidence gate. The reference uses `artifact_kind: "trust.bundle"` (the Hachure-aligned canonical value), carries the related gate id, domain claim type, claim status, artifact ref, integrity summary, authority or trusted-producer summary, subject, and freshness state, and then maps to the normal Flow gate result. When the `hachure` optional dependency is installed, referenced artifacts are validated against hachure's trust-bundle.schema.json at evidence-recording time. Flow owns the gate authority decision, route reason, trusted producer mapping, and accepted gap behavior. Surface owns the portable trust state represented by the Surface claim and the TrustReport / Trust Snapshot. A Probe can request or clarify the evidence needed before planning or before a later Builder Kit gate retries.
112
112
 
113
113
  Veritas is only one optional producer of those artifacts. A local Veritas readiness run can emit native Veritas evidence and, when configured, point Flow Agents at a Surface-shaped TrustReport or Trust Snapshot. Flow Agents records the reference; it does not copy Veritas rule models, readiness semantics, or provider-native fields into Builder Kit gates.
114
114
 
@@ -116,8 +116,8 @@ Provider and artifact absence are explicit:
116
116
 
117
117
  - If no trust provider is configured, ordinary Builder Kit activation, planning, verification, and evidence gates continue to work through the existing Flow Kit path.
118
118
  - If a trust-backed path was requested but no provider is configured, the trust check records `not_verified` with a clear gap instead of blocking unrelated Builder Kit usage.
119
- - If a provider is configured but the expected TrustReport or Trust Snapshot is absent or unreadable, only the requested trust-backed evidence check records `not_verified`; it does not silently pass and it does not make Veritas mandatory.
120
- - If a TrustReport or Trust Snapshot is present but has a rejected, stale, expired, missing-authority, or integrity-mismatched Surface claim, the Builder Kit evidence gate routes through the normal `fail` or `not_verified` path.
119
+ - If a provider is configured but the expected Hachure trust.bundle artifact is absent or unreadable, only the requested trust-backed evidence check records `not_verified`; it does not silently pass and it does not make Veritas mandatory.
120
+ - If a Hachure trust.bundle artifact is present but has a rejected, stale, expired, missing-authority, or integrity-mismatched claim, the Builder Kit evidence gate routes through the normal `fail` or `not_verified` path.
121
121
 
122
122
  ## Adoption Gate
123
123
 
@@ -6,7 +6,7 @@ title: Workflow Eval Strategy
6
6
 
7
7
  The Builder Kit workflow system now has concrete skill contracts for `idea-to-backlog`, `pull-work`, `plan-work`, `review-work`, `deliver`, `evidence-gate`, `release-readiness`, and `learning-review`, plus shared workflow contracts in `context/contracts/`. Evals should prove both the written contracts and the agent behavior around gates, artifacts, worktrees, Goal Fit, release readiness, final acceptance docs, and learning feedback.
8
8
 
9
- Flow Agents evals prove coordination, install, runtime adapter behavior, and artifact discipline. They should not redefine Flow gate authority: Flow Definitions use typed `expects` entries, Surface claim gates use `kind: "surface.claim"`, and Flow project config owns trusted producer mappings plus gate overrides.
9
+ Flow Agents evals prove coordination, install, runtime adapter behavior, and artifact discipline. They should not redefine Flow gate authority: Flow Definitions use typed `expects` entries, trust-bundle gates use `kind: "trust.bundle"`, and Flow project config owns trusted producer mappings plus gate overrides.
10
10
 
11
11
  ## Goals
12
12
 
@@ -161,7 +161,7 @@ Surface trust artifact attachment is covered by deterministic schema, runtime, a
161
161
  bash evals/integration/test_workflow_sidecar_writer.sh
162
162
  ```
163
163
 
164
- That eval exercises Builder Kit `surface.claim` evidence using provider-neutral TrustReport / Trust Snapshot fixtures for accepted, rejected, stale, missing-authority, integrity-mismatch, provider-absent, and artifact-absent cases. It proves Flow Agents can record compact Surface claim evidence in `evidence.json` and report pass, fail, or `NOT_VERIFIED` gaps without requiring provider-specific fields.
164
+ That eval exercises Builder Kit `trust.bundle` evidence using provider-neutral Hachure trust.bundle fixtures for accepted, rejected, stale, missing-authority, integrity-mismatch, provider-absent, and artifact-absent cases. It proves Flow Agents can record compact Surface claim evidence in `evidence.json` and report pass, fail, or `NOT_VERIFIED` gaps without requiring provider-specific fields.
165
165
 
166
166
  This coverage does not redefine Flow gate authority. Flow Definitions continue to express expectations, Flow project config owns trusted producer mappings and gate overrides, and Flow gate authority remains outside the local report writer. Runtime/provider gaps should be recorded as `NOT_VERIFIED` when a configured Surface claim path cannot be checked; ordinary Builder Kit workflows remain valid when no trust provider or trust artifact is configured.
167
167
 
@@ -6,7 +6,7 @@ title: Workflow Usage Guide
6
6
 
7
7
  This guide shows how to use the Builder Kit workflow skills in normal chats.
8
8
 
9
- > **Which doc do I want?** This page is the *driver's manual* — what to say at each stage and what should happen. If you want the conceptual map first — layers, sidecars, hooks, evidence, and why the system is shaped this way — read the [Agent System Guidebook](agent-system-guidebook.md). For a one-line summary of every skill and gate, use the [Skills Map](skills-map.md). Flow Agents coordinates the local runtime, installs Flow Kits, and records artifacts; Flow owns gate semantics, including typed `expects` entries with `kind: "surface.claim"`, trusted producer config, and gate overrides.
9
+ > **Which doc do I want?** This page is the *driver's manual* — what to say at each stage and what should happen. If you want the conceptual map first — layers, sidecars, hooks, evidence, and why the system is shaped this way — read the [Agent System Guidebook](agent-system-guidebook.md). For a one-line summary of every skill and gate, use the [Skills Map](skills-map.md). Flow Agents coordinates the local runtime, installs Flow Kits, and records artifacts; Flow owns gate semantics, including typed `expects` entries with `kind: "trust.bundle"`, trusted producer config, and gate overrides.
10
10
 
11
11
  The core pattern is:
12
12
 
@@ -21,7 +21,7 @@ wait_for_telemetry() {
21
21
  local file="$1"
22
22
  local i=0
23
23
  while [[ $i -lt 150 ]]; do
24
- [[ -s "$file" ]] && return 0
24
+ if [[ -s "$file" ]] && grep -q '"tool.invoke"' "$file" 2>/dev/null && grep -q '"tool.result"' "$file" 2>/dev/null; then return 0; fi
25
25
  sleep 0.1
26
26
  i=$((i + 1))
27
27
  done
@@ -73,23 +73,31 @@ for _attempt in 1 2; do
73
73
  grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break
74
74
  done
75
75
 
76
- LATEST_LOG="$(ls -t ~/.local/share/opencode/log/*.log 2>/dev/null | head -1 || true)"
77
- if [[ -n "$LATEST_LOG" ]] && grep -q "plugins/flow-agents.js loading plugin" "$LATEST_LOG" 2>/dev/null; then
78
- _pass "opencode log confirms flow-agents plugin loaded"
76
+ # Confirm load via the plugin's own marker file (written by the FlowAgentsPlugin
77
+ # factory at startup). This replaces grepping opencode's internal
78
+ # "plugins/flow-agents.js loading plugin" message, which opencode 1.17.x dropped
79
+ # and which opencode does not reliably surface to its log file — a stale-assertion
80
+ # false failure (#75). The factory runs regardless of provider, so this load
81
+ # signal is independent of whether a model turn completes.
82
+ if [[ -f "$TMP_WORK/.telemetry/opencode-plugin.loaded" ]]; then
83
+ _pass "flow-agents plugin loaded (factory marker present)"
79
84
  else
80
- _fail "opencode log did not confirm flow-agents plugin loaded"
85
+ _fail "flow-agents plugin did not load (factory marker absent)"
81
86
  fi
82
87
 
83
88
  telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
84
89
  if [[ "$provider_error" -eq 1 ]]; then
85
90
  _skip "opencode telemetry assertions skipped (provider/auth error)"
86
91
  _skip "opencode telemetry tool events skipped (provider/auth error)"
92
+ elif ! wait_for_telemetry "$telemetry_file"; then
93
+ # wait_for_telemetry timed out: the log is missing, empty, or lacks tool.invoke /
94
+ # tool.result events. Typically the agent never completed a model turn, which is
95
+ # expected without a provider (e.g. CI with no API key). The binary install, bundle,
96
+ # and hook chain are already covered; skip the live-model assertions, don't fail.
97
+ _skip "opencode telemetry assertions skipped (no telemetry — agent did not complete a turn, likely no provider)"
98
+ _skip "opencode telemetry tool events skipped (no turn)"
87
99
  else
88
- if wait_for_telemetry "$telemetry_file"; then
89
- _pass "opencode telemetry log was written"
90
- else
91
- _fail "opencode telemetry log was not written"
92
- fi
100
+ _pass "opencode telemetry log was written"
93
101
 
94
102
  if [[ -f "$telemetry_file" ]] && \
95
103
  node -e "
@@ -21,7 +21,7 @@ wait_for_telemetry() {
21
21
  local file="$1"
22
22
  local i=0
23
23
  while [[ $i -lt 150 ]]; do
24
- [[ -s "$file" ]] && return 0
24
+ if [[ -s "$file" ]] && grep -q '"session.start"' "$file" 2>/dev/null && grep -q '"tool.invoke"' "$file" 2>/dev/null && grep -q '"tool.result"' "$file" 2>/dev/null && grep -q '"session.end"' "$file" 2>/dev/null; then return 0; fi
25
25
  sleep 0.1
26
26
  i=$((i + 1))
27
27
  done
@@ -60,12 +60,16 @@ if [[ "$provider_error" -eq 1 ]]; then
60
60
  _skip "pi telemetry assertions skipped (provider/auth error)"
61
61
  _skip "pi telemetry event types skipped (provider/auth error)"
62
62
  _skip "pi telemetry session events skipped (provider/auth error)"
63
+ elif ! wait_for_telemetry "$telemetry_file"; then
64
+ # wait_for_telemetry timed out: the log is missing, empty, or lacks one of the
65
+ # session.start / tool.invoke / tool.result / session.end events. Typically the
66
+ # agent never completed a model turn, expected without a provider (e.g. CI with no
67
+ # API key). Install, bundle, and hook chain are covered elsewhere; skip, don't fail.
68
+ _skip "pi telemetry assertions skipped (no telemetry — agent did not complete a turn, likely no provider)"
69
+ _skip "pi telemetry event types skipped (no turn)"
70
+ _skip "pi telemetry session events skipped (no turn)"
63
71
  else
64
- if wait_for_telemetry "$telemetry_file"; then
65
- _pass "pi telemetry log was written"
66
- else
67
- _fail "pi telemetry log was not written"
68
- fi
72
+ _pass "pi telemetry log was written"
69
73
 
70
74
  if [[ -f "$telemetry_file" ]] && \
71
75
  node -e "
@@ -74,7 +74,7 @@ LANE_RUNTIME_AND_KIT=(
74
74
  "Kit conformance levels integration"
75
75
  "Local Flow Kit install integration"
76
76
  "Flow Kit install-git integration"
77
- # QUARANTINED (#74): passes on macOS, fails on Linux CI — not gating until triaged
77
+ "Console learning projection integration"
78
78
  "Context map integration"
79
79
  "Effective backlog settings integration"
80
80
  "Flow agents statusline integration"
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "runtime-evidence",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Runtime activation evidence exists.",
17
- "claim": {
18
- "type": "mixed.runtime.evidence",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "mixed.runtime.evidence",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "review-evidence",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Review evidence has been recorded.",
17
- "claim": {
18
- "type": "example.review.evidence",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "example.review.evidence",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "review-finding",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Review finding recorded.",
17
- "claim": {
18
- "type": "k0.review.finding",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "k0.review.finding",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "build-evidence",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Build evidence recorded.",
17
- "claim": {
18
- "type": "k1.build.evidence",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "k1.build.evidence",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "synthesis-evidence",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Synthesis evidence with provenance refs.",
17
- "claim": {
18
- "type": "k2.synthesize.evidence",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "k2.synthesize.evidence",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -11,12 +11,12 @@
11
11
  "expects": [
12
12
  {
13
13
  "id": "review-evidence",
14
- "kind": "surface.claim",
14
+ "kind": "trust.bundle",
15
15
  "required": true,
16
16
  "description": "Review evidence.",
17
- "claim": {
18
- "type": "third-party.review.evidence",
19
- "subject": "artifact",
17
+ "bundle_claim": {
18
+ "claimType": "third-party.review.evidence",
19
+ "subjectType": "artifact",
20
20
  "accepted_statuses": ["trusted", "accepted"]
21
21
  }
22
22
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
- "artifact_kind": "TrustReport",
3
+ "artifact_kind": "trust.bundle",
4
4
  "artifact_ref": "surface-trust://fixtures/accepted-claim-trust-report.json",
5
5
  "subject": {
6
6
  "type": "flow-step",
@@ -8,7 +8,7 @@
8
8
  },
9
9
  "gate": {
10
10
  "id": "tests-evidence",
11
- "kind": "surface.claim"
11
+ "kind": "trust.bundle"
12
12
  },
13
13
  "claim": {
14
14
  "type": "builder.verify.tests",
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
3
  "scenario": "artifact_absent",
4
- "artifact_kind": "TrustReport",
4
+ "artifact_kind": "trust.bundle",
5
5
  "artifact_ref": "surface-trust://fixtures/missing-trust-report.json",
6
6
  "gate": {
7
7
  "id": "implementation-plan",
8
- "kind": "surface.claim"
8
+ "kind": "trust.bundle"
9
9
  },
10
10
  "claim": {
11
11
  "type": "builder.plan.implementation",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
- "artifact_kind": "TrustReport",
3
+ "artifact_kind": "trust.bundle",
4
4
  "artifact_ref": "surface-trust://fixtures/integrity-mismatch-trust-report.json",
5
5
  "subject": {
6
6
  "type": "artifact",
@@ -8,7 +8,7 @@
8
8
  },
9
9
  "gate": {
10
10
  "id": "implementation-plan",
11
- "kind": "surface.claim"
11
+ "kind": "trust.bundle"
12
12
  },
13
13
  "claim": {
14
14
  "type": "builder.plan.implementation",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
- "artifact_kind": "TrustReport",
3
+ "artifact_kind": "trust.bundle",
4
4
  "artifact_ref": "surface-trust://fixtures/missing-authority-trust-report.json",
5
5
  "subject": {
6
6
  "type": "change",
@@ -8,7 +8,7 @@
8
8
  },
9
9
  "gate": {
10
10
  "id": "implementation-scope",
11
- "kind": "surface.claim"
11
+ "kind": "trust.bundle"
12
12
  },
13
13
  "claim": {
14
14
  "type": "builder.execute.scope",
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
3
  "scenario": "provider_absent",
4
- "artifact_kind": "Trust Snapshot",
4
+ "artifact_kind": "trust.bundle",
5
5
  "artifact_ref": null,
6
6
  "gate": {
7
7
  "id": "selected-work",
8
- "kind": "surface.claim"
8
+ "kind": "trust.bundle"
9
9
  },
10
10
  "claim": {
11
11
  "type": "builder.pull-work.selected",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
- "artifact_kind": "TrustReport",
3
+ "artifact_kind": "trust.bundle",
4
4
  "artifact_ref": "surface-trust://fixtures/rejected-claim-trust-report.json",
5
5
  "subject": {
6
6
  "type": "change",
@@ -8,7 +8,7 @@
8
8
  },
9
9
  "gate": {
10
10
  "id": "implementation-scope",
11
- "kind": "surface.claim"
11
+ "kind": "trust.bundle"
12
12
  },
13
13
  "claim": {
14
14
  "type": "builder.execute.scope",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "schema_version": "1.0",
3
- "artifact_kind": "Trust Snapshot",
3
+ "artifact_kind": "trust.bundle",
4
4
  "artifact_ref": "surface-trust://fixtures/stale-claim-trust-snapshot.json",
5
5
  "subject": {
6
6
  "type": "flow-step",
@@ -8,7 +8,7 @@
8
8
  },
9
9
  "gate": {
10
10
  "id": "tests-evidence",
11
- "kind": "surface.claim"
11
+ "kind": "trust.bundle"
12
12
  },
13
13
  "claim": {
14
14
  "type": "builder.verify.tests",
@@ -6,7 +6,7 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
6
6
  source "$ROOT/evals/lib/node.sh"
7
7
 
8
8
  FIXTURE_DIR="$ROOT/evals/fixtures/console-learning-projection"
9
- TMPDIR_EVAL="$(mktemp -d /private/tmp/eval-console-learning-projection.XXXXXX)"
9
+ TMPDIR_EVAL="$(cd "$(mktemp -d "${TMPDIR:-/tmp}/eval-console-learning-projection.XXXXXX")" && pwd -P)"
10
10
  ARTIFACT_ROOT="$TMPDIR_EVAL/artifacts"
11
11
  KONTOUR_ROOT="$TMPDIR_EVAL/.kontour"
12
12
  GENERATED_AT="2026-06-06T20:00:00Z"
@@ -473,6 +473,150 @@ else
473
473
  _fail "promoted doc is missing source or acceptance sections"
474
474
  fi
475
475
 
476
+ # --- npm-install regression: validator-environment errors must not block goal-fit ---
477
+ # Simulate the npm-installed condition: build/ is present (always shipped in package files)
478
+ # but tsc is absent from PATH, so `npm run workflow:validate-artifacts` (which rebuilds)
479
+ # would fail. The fix directly invokes node build/.../validate-workflow-artifacts.js instead.
480
+
481
+ NPM_INSTALL_REPO="$TMPDIR_EVAL/npm-install-repo"
482
+ mkdir -p "$NPM_INSTALL_REPO/.flow-agents/npm-install-task"
483
+ printf '# Test Repo\n' > "$NPM_INSTALL_REPO/AGENTS.md"
484
+
485
+ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/npm-install-task--deliver.md" <<'MARKDOWN'
486
+ # npm install test task
487
+
488
+ branch: main
489
+ worktree: main
490
+ created: 2026-06-01
491
+ status: delivered
492
+ type: deliver
493
+
494
+ ## Definition Of Done
495
+ - **User outcome:** Something works.
496
+ - **Acceptance criteria:**
497
+ - [x] Thing works - Evidence: tested
498
+
499
+ ## Goal Fit Gate
500
+ - [x] Original user goal restated
501
+ - [x] Every acceptance criterion has evidence
502
+
503
+ ## Verification Report
504
+
505
+ ### Verdict: PASS
506
+
507
+ ## Final Acceptance
508
+
509
+ - [ ] CI passed
510
+ MARKDOWN
511
+
512
+ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/state.json" <<'JSON'
513
+ {
514
+ "schema_version": "1.0",
515
+ "task_slug": "npm-install-task",
516
+ "status": "delivered",
517
+ "phase": "done",
518
+ "updated_at": "2026-06-01T00:00:00Z",
519
+ "next_action": { "status": "done", "summary": "Local delivery complete." }
520
+ }
521
+ JSON
522
+
523
+ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/acceptance.json" <<'JSON'
524
+ {
525
+ "schema_version": "1.0",
526
+ "task_slug": "npm-install-task",
527
+ "criteria": [
528
+ {
529
+ "id": "thing-works",
530
+ "description": "Thing works.",
531
+ "status": "pass",
532
+ "evidence_refs": [
533
+ { "kind": "artifact", "file": "npm-install-task--deliver.md", "summary": "Delivery artifact." }
534
+ ]
535
+ }
536
+ ],
537
+ "goal_fit": { "status": "pass", "summary": "User outcome achieved." }
538
+ }
539
+ JSON
540
+
541
+ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/evidence.json" <<'JSON'
542
+ {
543
+ "schema_version": "1.0",
544
+ "task_slug": "npm-install-task",
545
+ "verdict": "pass",
546
+ "checks": [
547
+ { "id": "build", "kind": "test", "status": "pass", "summary": "Build passed." }
548
+ ],
549
+ "not_verified_gaps": []
550
+ }
551
+ JSON
552
+
553
+ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/handoff.json" <<'JSON'
554
+ {
555
+ "schema_version": "1.0",
556
+ "task_slug": "npm-install-task",
557
+ "summary": "Local delivery complete.",
558
+ "current_state_ref": "state.json",
559
+ "next_steps": [],
560
+ "blockers": [],
561
+ "warnings": []
562
+ }
563
+ JSON
564
+
565
+ # Part 1 of fix: invoke the already-built validator directly (no tsc).
566
+ # Poison tsc so that any call to it fails; confirm the hook does not call it
567
+ # and validates clean sidecars successfully.
568
+ FAKE_TSC_DIR="$TMPDIR_EVAL/fake-tsc"
569
+ mkdir -p "$FAKE_TSC_DIR"
570
+ printf '#!/usr/bin/env bash\necho "error TS5023: tsc should not be called" >&2\nexit 1\n' > "$FAKE_TSC_DIR/tsc"
571
+ chmod +x "$FAKE_TSC_DIR/tsc"
572
+
573
+ if PATH="$FAKE_TSC_DIR:$PATH" FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true \
574
+ node "$ROOT/scripts/hooks/stop-goal-fit.js" \
575
+ >"$TMPDIR_EVAL/npm-install-valid.out" 2>"$TMPDIR_EVAL/npm-install-valid.err" <<JSON
576
+ {"hook_event_name":"Stop","cwd":"$NPM_INSTALL_REPO"}
577
+ JSON
578
+ then
579
+ _pass "strict hook with poisoned tsc uses built validator and does not block valid sidecars"
580
+ else
581
+ _fail "strict hook should not block valid sidecars even with tsc absent: $(cat "$TMPDIR_EVAL/npm-install-valid.err")"
582
+ fi
583
+
584
+ if ! rg -q 'tsc: command not found|TS5023|tsc should not be called' "$TMPDIR_EVAL/npm-install-valid.err"; then # rg uses Rust regex: bare | alternates, \| would be a literal pipe and never match
585
+ _pass "hook does not emit tsc error noise when using built validator"
586
+ else
587
+ _fail "hook leaked tsc error into goal-fit output"
588
+ fi
589
+
590
+ # Part 2 of fix: when the validator cannot run at all (build/ absent and npm fails),
591
+ # the hook must skip cleanly — never block in strict mode due to an env error.
592
+ mv "$ROOT/build" "$ROOT/build-absent"
593
+
594
+ SPAWN_FAIL_DIR="$TMPDIR_EVAL/spawn-fail"
595
+ mkdir -p "$SPAWN_FAIL_DIR"
596
+ printf '#!/usr/bin/env bash\necho "npm ERR! tsc: command not found" >&2\nexit 127\n' > "$SPAWN_FAIL_DIR/npm"
597
+ chmod +x "$SPAWN_FAIL_DIR/npm"
598
+
599
+ if PATH="$SPAWN_FAIL_DIR:$PATH" FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true \
600
+ node "$ROOT/scripts/hooks/stop-goal-fit.js" \
601
+ >"$TMPDIR_EVAL/npm-install-env-err.out" 2>"$TMPDIR_EVAL/npm-install-env-err.err" <<JSON
602
+ {"hook_event_name":"Stop","cwd":"$NPM_INSTALL_REPO"}
603
+ JSON
604
+ then
605
+ _pass "strict hook does not block when validator environment fails (build/ absent, tsc missing)"
606
+ else
607
+ _fail "strict hook must not block when validator env fails: $(cat "$TMPDIR_EVAL/npm-install-env-err.err")"
608
+ fi
609
+
610
+ if rg -q 'sidecar validation skipped' "$TMPDIR_EVAL/npm-install-env-err.err"; then
611
+ _pass "hook emits sidecar validation skipped warning for environment errors"
612
+ else
613
+ _fail "hook did not emit 'sidecar validation skipped' for environment errors"
614
+ fi
615
+
616
+ # Restore build/ so subsequent evals are unaffected.
617
+ mv "$ROOT/build-absent" "$ROOT/build"
618
+
619
+
476
620
  if [[ "$errors" -eq 0 ]]; then
477
621
  echo "Goal Fit hook integration passed."
478
622
  exit 0
@@ -164,7 +164,7 @@ run_inspect "$ROOT/kits/builder" "$out" || true
164
164
  if node -e "
165
165
  const d = require('fs').readFileSync('$out', 'utf8');
166
166
  const r = JSON.parse(d);
167
- const required = ['kit_id','kit_name','conformance','targets','third_party_extensions'];
167
+ const required = ['kit_id','kit_name','conformance','targets','third_party_extensions','trust'];
168
168
  for (const k of required) {
169
169
  if (!(k in r)) throw new Error('missing key: ' + k);
170
170
  }
@@ -199,6 +199,60 @@ else
199
199
  cat "$out"
200
200
  fi
201
201
 
202
+ # ===================================================================
203
+ echo ""
204
+ echo "=== 8. Trust axis: first-party allowlist (builder and knowledge) ==="
205
+ # ===================================================================
206
+
207
+ for kit_name in builder knowledge; do
208
+ out="$TMP_DIR/trust-${kit_name}.out"
209
+ run_inspect "$ROOT/kits/$kit_name" "$out" || true
210
+ trust=$(node -e "const d=require('fs').readFileSync('$out','utf8'); console.log(JSON.parse(d).trust)" 2>/dev/null)
211
+ if [[ "$trust" == "first-party" ]]; then
212
+ pass "$kit_name kit trust: first-party (in Kontour allowlist)"
213
+ else
214
+ fail "$kit_name kit trust: expected first-party, got '$trust'"
215
+ cat "$out"
216
+ fi
217
+ done
218
+
219
+ # ===================================================================
220
+ echo ""
221
+ echo "=== 9. Trust axis: unverified for third-party and fixture kits ==="
222
+ # ===================================================================
223
+
224
+ for fixture in k0-flows-only k1-agent-extension k2-with-evals third-party-extension; do
225
+ out="$TMP_DIR/trust-${fixture}.out"
226
+ run_inspect "$ROOT/evals/fixtures/kit-conformance-levels/$fixture" "$out" || true
227
+ trust=$(node -e "const d=require('fs').readFileSync('$out','utf8'); console.log(JSON.parse(d).trust)" 2>/dev/null)
228
+ if [[ "$trust" == "unverified" ]]; then
229
+ pass "$fixture fixture trust: unverified (not in first-party allowlist)"
230
+ else
231
+ fail "$fixture fixture trust: expected unverified, got '$trust'"
232
+ cat "$out"
233
+ fi
234
+ done
235
+
236
+ # ===================================================================
237
+ echo ""
238
+ echo "=== 10. Trust field present in inspect JSON schema ==="
239
+ # ===================================================================
240
+
241
+ out="$TMP_DIR/trust-schema.out"
242
+ run_inspect "$ROOT/kits/builder" "$out" || true
243
+ if node -e "
244
+ const d = require('fs').readFileSync('$out', 'utf8');
245
+ const r = JSON.parse(d);
246
+ if (!('trust' in r)) throw new Error('missing key: trust');
247
+ const valid = ['first-party', 'verified', 'unverified'];
248
+ if (!valid.includes(r.trust)) throw new Error('trust must be one of: ' + valid.join(', ') + '; got: ' + r.trust);
249
+ " 2>/dev/null; then
250
+ pass "inspect JSON output includes trust field with valid value"
251
+ else
252
+ fail "inspect JSON output is missing trust field or has invalid value"
253
+ cat "$out"
254
+ fi
255
+
202
256
  # ===================================================================
203
257
  echo ""
204
258
  if [[ "$errors" -eq 0 ]]; then