@kontourai/flow-agents 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +6 -1
- package/.github/workflows/kit-gates-demo.yml +6 -2
- package/CHANGELOG.md +25 -0
- package/CONTRIBUTING.md +30 -0
- package/agents/dev.json +1 -1
- package/agents/tool-planner.json +1 -1
- package/build/src/cli/workflow-sidecar.js +70 -5
- package/build/src/flow-kit/validate.js +32 -1
- package/build/src/tools/build-universal-bundles.js +14 -0
- package/console.telemetry.json +1 -1
- package/docs/adr/0004-gates-expect-surface-claims.md +7 -7
- package/docs/kit-authoring-guide.md +99 -6
- package/docs/operating-layers.md +2 -2
- package/docs/veritas-integration.md +4 -4
- package/docs/workflow-eval-strategy.md +2 -2
- package/docs/workflow-usage-guide.md +1 -1
- package/evals/acceptance/test_opencode_harness.sh +18 -10
- package/evals/acceptance/test_pi_harness.sh +10 -6
- package/evals/ci/run-baseline.sh +1 -1
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +4 -4
- package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +4 -4
- package/evals/fixtures/kit-conformance-levels/k0-flows-only/flows/review.flow.json +4 -4
- package/evals/fixtures/kit-conformance-levels/k1-agent-extension/flows/build.flow.json +4 -4
- package/evals/fixtures/kit-conformance-levels/k2-with-evals/flows/synthesize.flow.json +4 -4
- package/evals/fixtures/kit-conformance-levels/third-party-extension/flows/review.flow.json +4 -4
- package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +2 -2
- package/evals/fixtures/surface-trust/artifact-absent.json +2 -2
- package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +2 -2
- package/evals/fixtures/surface-trust/missing-authority-trust-report.json +2 -2
- package/evals/fixtures/surface-trust/provider-absent.json +2 -2
- package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +2 -2
- package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +2 -2
- package/evals/integration/test_console_learning_projection.sh +1 -1
- package/evals/integration/test_goal_fit_hook.sh +144 -0
- package/evals/integration/test_kit_conformance_levels.sh +55 -1
- package/evals/integration/test_workflow_sidecar_writer.sh +9 -9
- package/evals/static/test_package.sh +3 -3
- package/evals/static/test_workflow_skills.sh +4 -4
- package/kits/builder/flows/build.flow.json +48 -48
- package/kits/builder/flows/shape.flow.json +36 -36
- package/kits/knowledge/adapters/obsidian-store/index.js +137 -26
- package/kits/knowledge/evals/contract-suite/suite.test.js +90 -0
- package/kits/knowledge/flows/compile.flow.json +12 -12
- package/kits/knowledge/flows/consolidate.flow.json +16 -16
- package/kits/knowledge/flows/ingest.flow.json +12 -12
- package/kits/knowledge/flows/retire.flow.json +16 -16
- package/kits/knowledge/flows/store-contract.flow.json +12 -12
- package/kits/knowledge/flows/synthesize.flow.json +16 -16
- package/kits/release-evidence/flows/release-evidence.flow.json +3 -3
- package/package.json +5 -2
- package/schemas/workflow-evidence.schema.json +2 -1
- package/scripts/hooks/stop-goal-fit.js +66 -18
- package/src/cli/workflow-sidecar.ts +62 -4
- package/src/flow-kit/validate.ts +55 -1
- package/src/tools/build-universal-bundles.ts +14 -0
|
@@ -106,9 +106,9 @@ If Veritas is unavailable and the workflow expected it, record `not_verified` in
|
|
|
106
106
|
|
|
107
107
|
## Builder Kit Trust Evidence
|
|
108
108
|
|
|
109
|
-
Builder Kit gates stay provider-neutral. The Builder Kit Flow Definition names gate expectations as `kind: "
|
|
109
|
+
Builder Kit gates stay provider-neutral. The Builder Kit Flow Definition names gate expectations as `kind: "trust.bundle"` (the Hachure-aligned gate kind) and declares the claim type, subject, accepted statuses, and blocking behavior. It does not name Veritas or any other trust producer.
|
|
110
110
|
|
|
111
|
-
When a trust-backed path is configured, Flow Agents may attach a compact
|
|
111
|
+
When a trust-backed path is configured, Flow Agents may attach a compact Hachure trust.bundle reference to the Builder Kit evidence gate. The reference uses `artifact_kind: "trust.bundle"` (the Hachure-aligned canonical value), carries the related gate id, domain claim type, claim status, artifact ref, integrity summary, authority or trusted-producer summary, subject, and freshness state, and then maps to the normal Flow gate result. When the `hachure` optional dependency is installed, referenced artifacts are validated against hachure's trust-bundle.schema.json at evidence-recording time. Flow owns the gate authority decision, route reason, trusted producer mapping, and accepted gap behavior. Surface owns the portable trust state represented by the Surface claim and the TrustReport / Trust Snapshot. A Probe can request or clarify the evidence needed before planning or before a later Builder Kit gate retries.
|
|
112
112
|
|
|
113
113
|
Veritas is only one optional producer of those artifacts. A local Veritas readiness run can emit native Veritas evidence and, when configured, point Flow Agents at a Surface-shaped TrustReport or Trust Snapshot. Flow Agents records the reference; it does not copy Veritas rule models, readiness semantics, or provider-native fields into Builder Kit gates.
|
|
114
114
|
|
|
@@ -116,8 +116,8 @@ Provider and artifact absence are explicit:
|
|
|
116
116
|
|
|
117
117
|
- If no trust provider is configured, ordinary Builder Kit activation, planning, verification, and evidence gates continue to work through the existing Flow Kit path.
|
|
118
118
|
- If a trust-backed path was requested but no provider is configured, the trust check records `not_verified` with a clear gap instead of blocking unrelated Builder Kit usage.
|
|
119
|
-
- If a provider is configured but the expected
|
|
120
|
-
- If a
|
|
119
|
+
- If a provider is configured but the expected Hachure trust.bundle artifact is absent or unreadable, only the requested trust-backed evidence check records `not_verified`; it does not silently pass and it does not make Veritas mandatory.
|
|
120
|
+
- If a Hachure trust.bundle artifact is present but has a rejected, stale, expired, missing-authority, or integrity-mismatched claim, the Builder Kit evidence gate routes through the normal `fail` or `not_verified` path.
|
|
121
121
|
|
|
122
122
|
## Adoption Gate
|
|
123
123
|
|
|
@@ -6,7 +6,7 @@ title: Workflow Eval Strategy
|
|
|
6
6
|
|
|
7
7
|
The Builder Kit workflow system now has concrete skill contracts for `idea-to-backlog`, `pull-work`, `plan-work`, `review-work`, `deliver`, `evidence-gate`, `release-readiness`, and `learning-review`, plus shared workflow contracts in `context/contracts/`. Evals should prove both the written contracts and the agent behavior around gates, artifacts, worktrees, Goal Fit, release readiness, final acceptance docs, and learning feedback.
|
|
8
8
|
|
|
9
|
-
Flow Agents evals prove coordination, install, runtime adapter behavior, and artifact discipline. They should not redefine Flow gate authority: Flow Definitions use typed `expects` entries,
|
|
9
|
+
Flow Agents evals prove coordination, install, runtime adapter behavior, and artifact discipline. They should not redefine Flow gate authority: Flow Definitions use typed `expects` entries, trust-bundle gates use `kind: "trust.bundle"`, and Flow project config owns trusted producer mappings plus gate overrides.
|
|
10
10
|
|
|
11
11
|
## Goals
|
|
12
12
|
|
|
@@ -161,7 +161,7 @@ Surface trust artifact attachment is covered by deterministic schema, runtime, a
|
|
|
161
161
|
bash evals/integration/test_workflow_sidecar_writer.sh
|
|
162
162
|
```
|
|
163
163
|
|
|
164
|
-
That eval exercises Builder Kit `
|
|
164
|
+
That eval exercises Builder Kit `trust.bundle` evidence using provider-neutral Hachure trust.bundle fixtures for accepted, rejected, stale, missing-authority, integrity-mismatch, provider-absent, and artifact-absent cases. It proves Flow Agents can record compact Surface claim evidence in `evidence.json` and report pass, fail, or `NOT_VERIFIED` gaps without requiring provider-specific fields.
|
|
165
165
|
|
|
166
166
|
This coverage does not redefine Flow gate authority. Flow Definitions continue to express expectations, Flow project config owns trusted producer mappings and gate overrides, and Flow gate authority remains outside the local report writer. Runtime/provider gaps should be recorded as `NOT_VERIFIED` when a configured Surface claim path cannot be checked; ordinary Builder Kit workflows remain valid when no trust provider or trust artifact is configured.
|
|
167
167
|
|
|
@@ -6,7 +6,7 @@ title: Workflow Usage Guide
|
|
|
6
6
|
|
|
7
7
|
This guide shows how to use the Builder Kit workflow skills in normal chats.
|
|
8
8
|
|
|
9
|
-
> **Which doc do I want?** This page is the *driver's manual* — what to say at each stage and what should happen. If you want the conceptual map first — layers, sidecars, hooks, evidence, and why the system is shaped this way — read the [Agent System Guidebook](agent-system-guidebook.md). For a one-line summary of every skill and gate, use the [Skills Map](skills-map.md). Flow Agents coordinates the local runtime, installs Flow Kits, and records artifacts; Flow owns gate semantics, including typed `expects` entries with `kind: "
|
|
9
|
+
> **Which doc do I want?** This page is the *driver's manual* — what to say at each stage and what should happen. If you want the conceptual map first — layers, sidecars, hooks, evidence, and why the system is shaped this way — read the [Agent System Guidebook](agent-system-guidebook.md). For a one-line summary of every skill and gate, use the [Skills Map](skills-map.md). Flow Agents coordinates the local runtime, installs Flow Kits, and records artifacts; Flow owns gate semantics, including typed `expects` entries with `kind: "trust.bundle"`, trusted producer config, and gate overrides.
|
|
10
10
|
|
|
11
11
|
The core pattern is:
|
|
12
12
|
|
|
@@ -21,7 +21,7 @@ wait_for_telemetry() {
|
|
|
21
21
|
local file="$1"
|
|
22
22
|
local i=0
|
|
23
23
|
while [[ $i -lt 150 ]]; do
|
|
24
|
-
[[ -s "$file" ]] && return 0
|
|
24
|
+
if [[ -s "$file" ]] && grep -q '"tool.invoke"' "$file" 2>/dev/null && grep -q '"tool.result"' "$file" 2>/dev/null; then return 0; fi
|
|
25
25
|
sleep 0.1
|
|
26
26
|
i=$((i + 1))
|
|
27
27
|
done
|
|
@@ -73,23 +73,31 @@ for _attempt in 1 2; do
|
|
|
73
73
|
grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break
|
|
74
74
|
done
|
|
75
75
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
76
|
+
# Confirm load via the plugin's own marker file (written by the FlowAgentsPlugin
|
|
77
|
+
# factory at startup). This replaces grepping opencode's internal
|
|
78
|
+
# "plugins/flow-agents.js loading plugin" message, which opencode 1.17.x dropped
|
|
79
|
+
# and which opencode does not reliably surface to its log file — a stale-assertion
|
|
80
|
+
# false failure (#75). The factory runs regardless of provider, so this load
|
|
81
|
+
# signal is independent of whether a model turn completes.
|
|
82
|
+
if [[ -f "$TMP_WORK/.telemetry/opencode-plugin.loaded" ]]; then
|
|
83
|
+
_pass "flow-agents plugin loaded (factory marker present)"
|
|
79
84
|
else
|
|
80
|
-
_fail "
|
|
85
|
+
_fail "flow-agents plugin did not load (factory marker absent)"
|
|
81
86
|
fi
|
|
82
87
|
|
|
83
88
|
telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
|
|
84
89
|
if [[ "$provider_error" -eq 1 ]]; then
|
|
85
90
|
_skip "opencode telemetry assertions skipped (provider/auth error)"
|
|
86
91
|
_skip "opencode telemetry tool events skipped (provider/auth error)"
|
|
92
|
+
elif ! wait_for_telemetry "$telemetry_file"; then
|
|
93
|
+
# No telemetry was produced at all — the agent never completed a model turn,
|
|
94
|
+
# expected in a provider-less environment (e.g. CI with no API key). The binary
|
|
95
|
+
# install, bundle, and mechanical hook chain are already covered; skip the
|
|
96
|
+
# live-model-dependent telemetry assertions rather than fail on them.
|
|
97
|
+
_skip "opencode telemetry assertions skipped (no telemetry — agent did not complete a turn, likely no provider)"
|
|
98
|
+
_skip "opencode telemetry tool events skipped (no turn)"
|
|
87
99
|
else
|
|
88
|
-
|
|
89
|
-
_pass "opencode telemetry log was written"
|
|
90
|
-
else
|
|
91
|
-
_fail "opencode telemetry log was not written"
|
|
92
|
-
fi
|
|
100
|
+
_pass "opencode telemetry log was written"
|
|
93
101
|
|
|
94
102
|
if [[ -f "$telemetry_file" ]] && \
|
|
95
103
|
node -e "
|
|
@@ -21,7 +21,7 @@ wait_for_telemetry() {
|
|
|
21
21
|
local file="$1"
|
|
22
22
|
local i=0
|
|
23
23
|
while [[ $i -lt 150 ]]; do
|
|
24
|
-
[[ -s "$file" ]] && return 0
|
|
24
|
+
if [[ -s "$file" ]] && grep -q '"session.start"' "$file" 2>/dev/null && grep -q '"tool.invoke"' "$file" 2>/dev/null && grep -q '"tool.result"' "$file" 2>/dev/null && grep -q '"session.end"' "$file" 2>/dev/null; then return 0; fi
|
|
25
25
|
sleep 0.1
|
|
26
26
|
i=$((i + 1))
|
|
27
27
|
done
|
|
@@ -60,12 +60,16 @@ if [[ "$provider_error" -eq 1 ]]; then
|
|
|
60
60
|
_skip "pi telemetry assertions skipped (provider/auth error)"
|
|
61
61
|
_skip "pi telemetry event types skipped (provider/auth error)"
|
|
62
62
|
_skip "pi telemetry session events skipped (provider/auth error)"
|
|
63
|
+
elif ! wait_for_telemetry "$telemetry_file"; then
|
|
64
|
+
# No telemetry was produced at all — the agent never completed a model turn,
|
|
65
|
+
# which in a provider-less environment (e.g. CI with no API key) is expected.
|
|
66
|
+
# The binary install, bundle, and mechanical hook chain are already covered;
|
|
67
|
+
# skip the live-model-dependent telemetry assertions rather than fail on them.
|
|
68
|
+
_skip "pi telemetry assertions skipped (no telemetry — agent did not complete a turn, likely no provider)"
|
|
69
|
+
_skip "pi telemetry event types skipped (no turn)"
|
|
70
|
+
_skip "pi telemetry session events skipped (no turn)"
|
|
63
71
|
else
|
|
64
|
-
|
|
65
|
-
_pass "pi telemetry log was written"
|
|
66
|
-
else
|
|
67
|
-
_fail "pi telemetry log was not written"
|
|
68
|
-
fi
|
|
72
|
+
_pass "pi telemetry log was written"
|
|
69
73
|
|
|
70
74
|
if [[ -f "$telemetry_file" ]] && \
|
|
71
75
|
node -e "
|
package/evals/ci/run-baseline.sh
CHANGED
|
@@ -74,7 +74,7 @@ LANE_RUNTIME_AND_KIT=(
|
|
|
74
74
|
"Kit conformance levels integration"
|
|
75
75
|
"Local Flow Kit install integration"
|
|
76
76
|
"Flow Kit install-git integration"
|
|
77
|
-
|
|
77
|
+
"Console learning projection integration"
|
|
78
78
|
"Context map integration"
|
|
79
79
|
"Effective backlog settings integration"
|
|
80
80
|
"Flow agents statusline integration"
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "runtime-evidence",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Runtime activation evidence exists.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "mixed.runtime.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "review-evidence",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Review evidence has been recorded.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "example.review.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "review-finding",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Review finding recorded.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "k0.review.finding",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "build-evidence",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Build evidence recorded.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "k1.build.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "synthesis-evidence",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Synthesis evidence with provenance refs.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "k2.synthesize.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -11,12 +11,12 @@
|
|
|
11
11
|
"expects": [
|
|
12
12
|
{
|
|
13
13
|
"id": "review-evidence",
|
|
14
|
-
"kind": "
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
15
|
"required": true,
|
|
16
16
|
"description": "Review evidence.",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "third-party.review.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
20
|
"accepted_statuses": ["trusted", "accepted"]
|
|
21
21
|
}
|
|
22
22
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
|
-
"artifact_kind": "
|
|
3
|
+
"artifact_kind": "trust.bundle",
|
|
4
4
|
"artifact_ref": "surface-trust://fixtures/accepted-claim-trust-report.json",
|
|
5
5
|
"subject": {
|
|
6
6
|
"type": "flow-step",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
},
|
|
9
9
|
"gate": {
|
|
10
10
|
"id": "tests-evidence",
|
|
11
|
-
"kind": "
|
|
11
|
+
"kind": "trust.bundle"
|
|
12
12
|
},
|
|
13
13
|
"claim": {
|
|
14
14
|
"type": "builder.verify.tests",
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
3
|
"scenario": "artifact_absent",
|
|
4
|
-
"artifact_kind": "
|
|
4
|
+
"artifact_kind": "trust.bundle",
|
|
5
5
|
"artifact_ref": "surface-trust://fixtures/missing-trust-report.json",
|
|
6
6
|
"gate": {
|
|
7
7
|
"id": "implementation-plan",
|
|
8
|
-
"kind": "
|
|
8
|
+
"kind": "trust.bundle"
|
|
9
9
|
},
|
|
10
10
|
"claim": {
|
|
11
11
|
"type": "builder.plan.implementation",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
|
-
"artifact_kind": "
|
|
3
|
+
"artifact_kind": "trust.bundle",
|
|
4
4
|
"artifact_ref": "surface-trust://fixtures/integrity-mismatch-trust-report.json",
|
|
5
5
|
"subject": {
|
|
6
6
|
"type": "artifact",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
},
|
|
9
9
|
"gate": {
|
|
10
10
|
"id": "implementation-plan",
|
|
11
|
-
"kind": "
|
|
11
|
+
"kind": "trust.bundle"
|
|
12
12
|
},
|
|
13
13
|
"claim": {
|
|
14
14
|
"type": "builder.plan.implementation",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
|
-
"artifact_kind": "
|
|
3
|
+
"artifact_kind": "trust.bundle",
|
|
4
4
|
"artifact_ref": "surface-trust://fixtures/missing-authority-trust-report.json",
|
|
5
5
|
"subject": {
|
|
6
6
|
"type": "change",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
},
|
|
9
9
|
"gate": {
|
|
10
10
|
"id": "implementation-scope",
|
|
11
|
-
"kind": "
|
|
11
|
+
"kind": "trust.bundle"
|
|
12
12
|
},
|
|
13
13
|
"claim": {
|
|
14
14
|
"type": "builder.execute.scope",
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
3
|
"scenario": "provider_absent",
|
|
4
|
-
"artifact_kind": "
|
|
4
|
+
"artifact_kind": "trust.bundle",
|
|
5
5
|
"artifact_ref": null,
|
|
6
6
|
"gate": {
|
|
7
7
|
"id": "selected-work",
|
|
8
|
-
"kind": "
|
|
8
|
+
"kind": "trust.bundle"
|
|
9
9
|
},
|
|
10
10
|
"claim": {
|
|
11
11
|
"type": "builder.pull-work.selected",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
|
-
"artifact_kind": "
|
|
3
|
+
"artifact_kind": "trust.bundle",
|
|
4
4
|
"artifact_ref": "surface-trust://fixtures/rejected-claim-trust-report.json",
|
|
5
5
|
"subject": {
|
|
6
6
|
"type": "change",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
},
|
|
9
9
|
"gate": {
|
|
10
10
|
"id": "implementation-scope",
|
|
11
|
-
"kind": "
|
|
11
|
+
"kind": "trust.bundle"
|
|
12
12
|
},
|
|
13
13
|
"claim": {
|
|
14
14
|
"type": "builder.execute.scope",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schema_version": "1.0",
|
|
3
|
-
"artifact_kind": "
|
|
3
|
+
"artifact_kind": "trust.bundle",
|
|
4
4
|
"artifact_ref": "surface-trust://fixtures/stale-claim-trust-snapshot.json",
|
|
5
5
|
"subject": {
|
|
6
6
|
"type": "flow-step",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
},
|
|
9
9
|
"gate": {
|
|
10
10
|
"id": "tests-evidence",
|
|
11
|
-
"kind": "
|
|
11
|
+
"kind": "trust.bundle"
|
|
12
12
|
},
|
|
13
13
|
"claim": {
|
|
14
14
|
"type": "builder.verify.tests",
|
|
@@ -6,7 +6,7 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
|
6
6
|
source "$ROOT/evals/lib/node.sh"
|
|
7
7
|
|
|
8
8
|
FIXTURE_DIR="$ROOT/evals/fixtures/console-learning-projection"
|
|
9
|
-
TMPDIR_EVAL="$(mktemp -d
|
|
9
|
+
TMPDIR_EVAL="$(cd "$(mktemp -d "${TMPDIR:-/tmp}/eval-console-learning-projection.XXXXXX")" && pwd -P)"
|
|
10
10
|
ARTIFACT_ROOT="$TMPDIR_EVAL/artifacts"
|
|
11
11
|
KONTOUR_ROOT="$TMPDIR_EVAL/.kontour"
|
|
12
12
|
GENERATED_AT="2026-06-06T20:00:00Z"
|
|
@@ -473,6 +473,150 @@ else
|
|
|
473
473
|
_fail "promoted doc is missing source or acceptance sections"
|
|
474
474
|
fi
|
|
475
475
|
|
|
476
|
+
# --- npm-install regression: validator-environment errors must not block goal-fit ---
|
|
477
|
+
# Simulate the npm-installed condition: build/ is present (always shipped in package files)
|
|
478
|
+
# but tsc is absent from PATH, so `npm run workflow:validate-artifacts` (which rebuilds)
|
|
479
|
+
# would fail. The fix directly invokes node build/.../validate-workflow-artifacts.js instead.
|
|
480
|
+
|
|
481
|
+
NPM_INSTALL_REPO="$TMPDIR_EVAL/npm-install-repo"
|
|
482
|
+
mkdir -p "$NPM_INSTALL_REPO/.flow-agents/npm-install-task"
|
|
483
|
+
printf '# Test Repo\n' > "$NPM_INSTALL_REPO/AGENTS.md"
|
|
484
|
+
|
|
485
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/npm-install-task--deliver.md" <<'MARKDOWN'
|
|
486
|
+
# npm install test task
|
|
487
|
+
|
|
488
|
+
branch: main
|
|
489
|
+
worktree: main
|
|
490
|
+
created: 2026-06-01
|
|
491
|
+
status: delivered
|
|
492
|
+
type: deliver
|
|
493
|
+
|
|
494
|
+
## Definition Of Done
|
|
495
|
+
- **User outcome:** Something works.
|
|
496
|
+
- **Acceptance criteria:**
|
|
497
|
+
- [x] Thing works - Evidence: tested
|
|
498
|
+
|
|
499
|
+
## Goal Fit Gate
|
|
500
|
+
- [x] Original user goal restated
|
|
501
|
+
- [x] Every acceptance criterion has evidence
|
|
502
|
+
|
|
503
|
+
## Verification Report
|
|
504
|
+
|
|
505
|
+
### Verdict: PASS
|
|
506
|
+
|
|
507
|
+
## Final Acceptance
|
|
508
|
+
|
|
509
|
+
- [ ] CI passed
|
|
510
|
+
MARKDOWN
|
|
511
|
+
|
|
512
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/state.json" <<'JSON'
|
|
513
|
+
{
|
|
514
|
+
"schema_version": "1.0",
|
|
515
|
+
"task_slug": "npm-install-task",
|
|
516
|
+
"status": "delivered",
|
|
517
|
+
"phase": "done",
|
|
518
|
+
"updated_at": "2026-06-01T00:00:00Z",
|
|
519
|
+
"next_action": { "status": "done", "summary": "Local delivery complete." }
|
|
520
|
+
}
|
|
521
|
+
JSON
|
|
522
|
+
|
|
523
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/acceptance.json" <<'JSON'
|
|
524
|
+
{
|
|
525
|
+
"schema_version": "1.0",
|
|
526
|
+
"task_slug": "npm-install-task",
|
|
527
|
+
"criteria": [
|
|
528
|
+
{
|
|
529
|
+
"id": "thing-works",
|
|
530
|
+
"description": "Thing works.",
|
|
531
|
+
"status": "pass",
|
|
532
|
+
"evidence_refs": [
|
|
533
|
+
{ "kind": "artifact", "file": "npm-install-task--deliver.md", "summary": "Delivery artifact." }
|
|
534
|
+
]
|
|
535
|
+
}
|
|
536
|
+
],
|
|
537
|
+
"goal_fit": { "status": "pass", "summary": "User outcome achieved." }
|
|
538
|
+
}
|
|
539
|
+
JSON
|
|
540
|
+
|
|
541
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/evidence.json" <<'JSON'
|
|
542
|
+
{
|
|
543
|
+
"schema_version": "1.0",
|
|
544
|
+
"task_slug": "npm-install-task",
|
|
545
|
+
"verdict": "pass",
|
|
546
|
+
"checks": [
|
|
547
|
+
{ "id": "build", "kind": "test", "status": "pass", "summary": "Build passed." }
|
|
548
|
+
],
|
|
549
|
+
"not_verified_gaps": []
|
|
550
|
+
}
|
|
551
|
+
JSON
|
|
552
|
+
|
|
553
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/handoff.json" <<'JSON'
|
|
554
|
+
{
|
|
555
|
+
"schema_version": "1.0",
|
|
556
|
+
"task_slug": "npm-install-task",
|
|
557
|
+
"summary": "Local delivery complete.",
|
|
558
|
+
"current_state_ref": "state.json",
|
|
559
|
+
"next_steps": [],
|
|
560
|
+
"blockers": [],
|
|
561
|
+
"warnings": []
|
|
562
|
+
}
|
|
563
|
+
JSON
|
|
564
|
+
|
|
565
|
+
# Part 1 of fix: invoke the already-built validator directly (no tsc).
|
|
566
|
+
# Poison tsc so that any call to it fails; confirm the hook does not call it
|
|
567
|
+
# and validates clean sidecars successfully.
|
|
568
|
+
FAKE_TSC_DIR="$TMPDIR_EVAL/fake-tsc"
|
|
569
|
+
mkdir -p "$FAKE_TSC_DIR"
|
|
570
|
+
printf '#!/usr/bin/env bash\necho "error TS5023: tsc should not be called" >&2\nexit 1\n' > "$FAKE_TSC_DIR/tsc"
|
|
571
|
+
chmod +x "$FAKE_TSC_DIR/tsc"
|
|
572
|
+
|
|
573
|
+
if PATH="$FAKE_TSC_DIR:$PATH" FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true \
|
|
574
|
+
node "$ROOT/scripts/hooks/stop-goal-fit.js" \
|
|
575
|
+
>"$TMPDIR_EVAL/npm-install-valid.out" 2>"$TMPDIR_EVAL/npm-install-valid.err" <<JSON
|
|
576
|
+
{"hook_event_name":"Stop","cwd":"$NPM_INSTALL_REPO"}
|
|
577
|
+
JSON
|
|
578
|
+
then
|
|
579
|
+
_pass "strict hook with poisoned tsc uses built validator and does not block valid sidecars"
|
|
580
|
+
else
|
|
581
|
+
_fail "strict hook should not block valid sidecars even with tsc absent: $(cat "$TMPDIR_EVAL/npm-install-valid.err")"
|
|
582
|
+
fi
|
|
583
|
+
|
|
584
|
+
if ! rg -q 'tsc: command not found\|TS5023\|tsc should not be called' "$TMPDIR_EVAL/npm-install-valid.err"; then
|
|
585
|
+
_pass "hook does not emit tsc error noise when using built validator"
|
|
586
|
+
else
|
|
587
|
+
_fail "hook leaked tsc error into goal-fit output"
|
|
588
|
+
fi
|
|
589
|
+
|
|
590
|
+
# Part 2 of fix: when the validator cannot run at all (build/ absent and npm fails),
|
|
591
|
+
# the hook must skip cleanly — never block in strict mode due to an env error.
|
|
592
|
+
mv "$ROOT/build" "$ROOT/build-absent"
|
|
593
|
+
|
|
594
|
+
SPAWN_FAIL_DIR="$TMPDIR_EVAL/spawn-fail"
|
|
595
|
+
mkdir -p "$SPAWN_FAIL_DIR"
|
|
596
|
+
printf '#!/usr/bin/env bash\necho "npm ERR! tsc: command not found" >&2\nexit 127\n' > "$SPAWN_FAIL_DIR/npm"
|
|
597
|
+
chmod +x "$SPAWN_FAIL_DIR/npm"
|
|
598
|
+
|
|
599
|
+
if PATH="$SPAWN_FAIL_DIR:$PATH" FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true \
|
|
600
|
+
node "$ROOT/scripts/hooks/stop-goal-fit.js" \
|
|
601
|
+
>"$TMPDIR_EVAL/npm-install-env-err.out" 2>"$TMPDIR_EVAL/npm-install-env-err.err" <<JSON
|
|
602
|
+
{"hook_event_name":"Stop","cwd":"$NPM_INSTALL_REPO"}
|
|
603
|
+
JSON
|
|
604
|
+
then
|
|
605
|
+
_pass "strict hook does not block when validator environment fails (build/ absent, tsc missing)"
|
|
606
|
+
else
|
|
607
|
+
_fail "strict hook must not block when validator env fails: $(cat "$TMPDIR_EVAL/npm-install-env-err.err")"
|
|
608
|
+
fi
|
|
609
|
+
|
|
610
|
+
if rg -q 'sidecar validation skipped' "$TMPDIR_EVAL/npm-install-env-err.err"; then
|
|
611
|
+
_pass "hook emits sidecar validation skipped warning for environment errors"
|
|
612
|
+
else
|
|
613
|
+
_fail "hook did not emit 'sidecar validation skipped' for environment errors"
|
|
614
|
+
fi
|
|
615
|
+
|
|
616
|
+
# Restore build/ so subsequent evals are unaffected.
|
|
617
|
+
mv "$ROOT/build-absent" "$ROOT/build"
|
|
618
|
+
|
|
619
|
+
|
|
476
620
|
if [[ "$errors" -eq 0 ]]; then
|
|
477
621
|
echo "Goal Fit hook integration passed."
|
|
478
622
|
exit 0
|
|
@@ -164,7 +164,7 @@ run_inspect "$ROOT/kits/builder" "$out" || true
|
|
|
164
164
|
if node -e "
|
|
165
165
|
const d = require('fs').readFileSync('$out', 'utf8');
|
|
166
166
|
const r = JSON.parse(d);
|
|
167
|
-
const required = ['kit_id','kit_name','conformance','targets','third_party_extensions'];
|
|
167
|
+
const required = ['kit_id','kit_name','conformance','targets','third_party_extensions','trust'];
|
|
168
168
|
for (const k of required) {
|
|
169
169
|
if (!(k in r)) throw new Error('missing key: ' + k);
|
|
170
170
|
}
|
|
@@ -199,6 +199,60 @@ else
|
|
|
199
199
|
cat "$out"
|
|
200
200
|
fi
|
|
201
201
|
|
|
202
|
+
# ===================================================================
|
|
203
|
+
echo ""
|
|
204
|
+
echo "=== 8. Trust axis: first-party allowlist (builder and knowledge) ==="
|
|
205
|
+
# ===================================================================
|
|
206
|
+
|
|
207
|
+
for kit_name in builder knowledge; do
|
|
208
|
+
out="$TMP_DIR/trust-${kit_name}.out"
|
|
209
|
+
run_inspect "$ROOT/kits/$kit_name" "$out" || true
|
|
210
|
+
trust=$(node -e "const d=require('fs').readFileSync('$out','utf8'); console.log(JSON.parse(d).trust)" 2>/dev/null)
|
|
211
|
+
if [[ "$trust" == "first-party" ]]; then
|
|
212
|
+
pass "$kit_name kit trust: first-party (in Kontour allowlist)"
|
|
213
|
+
else
|
|
214
|
+
fail "$kit_name kit trust: expected first-party, got '$trust'"
|
|
215
|
+
cat "$out"
|
|
216
|
+
fi
|
|
217
|
+
done
|
|
218
|
+
|
|
219
|
+
# ===================================================================
|
|
220
|
+
echo ""
|
|
221
|
+
echo "=== 9. Trust axis: unverified for third-party and fixture kits ==="
|
|
222
|
+
# ===================================================================
|
|
223
|
+
|
|
224
|
+
for fixture in k0-flows-only k1-agent-extension k2-with-evals third-party-extension; do
|
|
225
|
+
out="$TMP_DIR/trust-${fixture}.out"
|
|
226
|
+
run_inspect "$ROOT/evals/fixtures/kit-conformance-levels/$fixture" "$out" || true
|
|
227
|
+
trust=$(node -e "const d=require('fs').readFileSync('$out','utf8'); console.log(JSON.parse(d).trust)" 2>/dev/null)
|
|
228
|
+
if [[ "$trust" == "unverified" ]]; then
|
|
229
|
+
pass "$fixture fixture trust: unverified (not in first-party allowlist)"
|
|
230
|
+
else
|
|
231
|
+
fail "$fixture fixture trust: expected unverified, got '$trust'"
|
|
232
|
+
cat "$out"
|
|
233
|
+
fi
|
|
234
|
+
done
|
|
235
|
+
|
|
236
|
+
# ===================================================================
|
|
237
|
+
echo ""
|
|
238
|
+
echo "=== 10. Trust field present in inspect JSON schema ==="
|
|
239
|
+
# ===================================================================
|
|
240
|
+
|
|
241
|
+
out="$TMP_DIR/trust-schema.out"
|
|
242
|
+
run_inspect "$ROOT/kits/builder" "$out" || true
|
|
243
|
+
if node -e "
|
|
244
|
+
const d = require('fs').readFileSync('$out', 'utf8');
|
|
245
|
+
const r = JSON.parse(d);
|
|
246
|
+
if (!('trust' in r)) throw new Error('missing key: trust');
|
|
247
|
+
const valid = ['first-party', 'verified', 'unverified'];
|
|
248
|
+
if (!valid.includes(r.trust)) throw new Error('trust must be one of: ' + valid.join(', ') + '; got: ' + r.trust);
|
|
249
|
+
" 2>/dev/null; then
|
|
250
|
+
pass "inspect JSON output includes trust field with valid value"
|
|
251
|
+
else
|
|
252
|
+
fail "inspect JSON output is missing trust field or has invalid value"
|
|
253
|
+
cat "$out"
|
|
254
|
+
fi
|
|
255
|
+
|
|
202
256
|
# ===================================================================
|
|
203
257
|
echo ""
|
|
204
258
|
if [[ "$errors" -eq 0 ]]; then
|