@kontourai/flow-agents 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/runtime-compat.yml +1 -1
- package/CHANGELOG.md +8 -0
- package/build/src/cli/workflow-sidecar.d.ts +16 -0
- package/build/src/cli/workflow-sidecar.js +64 -10
- package/build/src/lib/flow-resolver.d.ts +29 -0
- package/build/src/lib/flow-resolver.js +71 -0
- package/evals/ci/antigaming-suite.sh +1 -0
- package/evals/integration/test_command_log_fork_classification.sh +134 -0
- package/evals/integration/test_kit_identity_trust.sh +393 -0
- package/evals/run.sh +2 -0
- package/package.json +4 -4
- package/scripts/hooks/stop-goal-fit.js +76 -23
- package/scripts/repair-command-log.js +115 -0
- package/src/cli/workflow-sidecar.ts +65 -9
- package/src/lib/flow-resolver.ts +85 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_kit_identity_trust.sh — Regression eval for kit identity end-to-end in the trust chain.
|
|
3
|
+
#
|
|
4
|
+
# Proves Fix 1 and Fix 2 from the kit-identity task:
|
|
5
|
+
#
|
|
6
|
+
# Fix 1 (surfaceCheckFromArtifact reads kit from bundle, never hardcodes "builder"):
|
|
7
|
+
# 1a. KNOWLEDGE-TYPED bundle → kitIdentityFromBundle derives kitId="knowledge", subject="knowledge-kit"
|
|
8
|
+
# 1b. BUILDER-TYPED bundle → kitIdentityFromBundle derives kitId="builder", subject="builder-kit"
|
|
9
|
+
# 1c. WORKFLOW-ONLY bundle (no kit-typed claim, no current.json) → kitId="unknown", subject="unknown-kit"
|
|
10
|
+
# 1d. record-evidence --surface-trust-json <knowledge-fixture> completes without crash
|
|
11
|
+
#
|
|
12
|
+
# Fix 2 (route-back guard is FlowDefinition-driven, not hardcoded to builder.build):
|
|
13
|
+
# 2a. builder.build: verification→execution still enforced (identical behavior preserved)
|
|
14
|
+
# 2b. Custom non-builder flow WITH route_back_policy: verification→execution ENFORCED
|
|
15
|
+
# 2c. Custom flow WITHOUT route_back_policy: verification→execution NOT ENFORCED
|
|
16
|
+
#
|
|
17
|
+
# Deterministic, no model spend, self-cleaning.
|
|
18
|
+
# Usage: bash evals/integration/test_kit_identity_trust.sh
|
|
19
|
+
|
|
20
|
+
set -uo pipefail
|
|
21
|
+
|
|
22
|
+
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
23
|
+
source "$ROOT/evals/lib/node.sh"
|
|
24
|
+
|
|
25
|
+
TMP="$(mktemp -d)"
|
|
26
|
+
errors=0
|
|
27
|
+
|
|
28
|
+
# Report a passing check: print the given description after a check mark.
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
29
|
+
# Report a failing check: print the given description after a cross mark and
# increment the global `errors` counter that decides the script's exit status.
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$((errors + 1))
}
|
|
30
|
+
|
|
31
|
+
# Delete the scratch directory held in $TMP (and everything under it).
cleanup() {
  rm -rf "$TMP"
}
|
|
32
|
+
trap cleanup EXIT
|
|
33
|
+
|
|
34
|
+
SIDECAR_JS="${ROOT}/build/src/cli/workflow-sidecar.js"
|
|
35
|
+
SIDECAR_BUNDLE_WRITER="workflow-sidecar"
|
|
36
|
+
|
|
37
|
+
echo ""
|
|
38
|
+
echo "=== Fix 1: kitIdentityFromBundle reads kit from bundle claims (not hardcoded 'builder') ==="
|
|
39
|
+
|
|
40
|
+
# ─── Write fixture bundle files (note: argv[2] = file path since argv[1] = "-" for stdin) ─────────
|
|
41
|
+
|
|
42
|
+
node - "$TMP/knowledge.bundle" << 'NODE'
|
|
43
|
+
const fs = require('fs');
|
|
44
|
+
// argv[0]=node, argv[1]="-", argv[2]=file path
|
|
45
|
+
const bundlePath = process.argv[2];
|
|
46
|
+
const bundle = {
|
|
47
|
+
schemaVersion: 3, source: "test-fixture",
|
|
48
|
+
claims: [{
|
|
49
|
+
id: "c-knowledge-1", claimType: "knowledge.verify.tests",
|
|
50
|
+
subjectType: "flow-step", subjectId: "test-slug/knowledge-ev",
|
|
51
|
+
surface: "flow-agents.workflow", fieldOrBehavior: "knowledge verification",
|
|
52
|
+
value: "pass", status: "verified",
|
|
53
|
+
createdAt: "2026-06-27T00:00:00Z", updatedAt: "2026-06-27T00:00:00Z",
|
|
54
|
+
impactLevel: "high", verificationPolicyId: "policy:knowledge.verify.tests"
|
|
55
|
+
}],
|
|
56
|
+
evidence: [], policies: [], events: []
|
|
57
|
+
};
|
|
58
|
+
fs.writeFileSync(bundlePath, JSON.stringify(bundle, null, 2));
|
|
59
|
+
NODE
|
|
60
|
+
|
|
61
|
+
node - "$TMP/builder.bundle" << 'NODE'
|
|
62
|
+
const fs = require('fs');
|
|
63
|
+
const bundlePath = process.argv[2];
|
|
64
|
+
const bundle = {
|
|
65
|
+
schemaVersion: 3, source: "test-fixture",
|
|
66
|
+
claims: [{
|
|
67
|
+
id: "c-builder-1", claimType: "builder.verify.tests",
|
|
68
|
+
subjectType: "flow-step", subjectId: "test-slug/builder-ev",
|
|
69
|
+
surface: "flow-agents.workflow", fieldOrBehavior: "builder verification",
|
|
70
|
+
value: "pass", status: "verified",
|
|
71
|
+
createdAt: "2026-06-27T00:00:00Z", updatedAt: "2026-06-27T00:00:00Z",
|
|
72
|
+
impactLevel: "high", verificationPolicyId: "policy:builder.verify.tests"
|
|
73
|
+
}],
|
|
74
|
+
evidence: [], policies: [], events: []
|
|
75
|
+
};
|
|
76
|
+
fs.writeFileSync(bundlePath, JSON.stringify(bundle, null, 2));
|
|
77
|
+
NODE
|
|
78
|
+
|
|
79
|
+
node - "$TMP/workflow-only.bundle" << 'NODE'
|
|
80
|
+
const fs = require('fs');
|
|
81
|
+
const bundlePath = process.argv[2];
|
|
82
|
+
const bundle = {
|
|
83
|
+
schemaVersion: 3, source: "test-fixture",
|
|
84
|
+
claims: [{
|
|
85
|
+
id: "c-wf-1", claimType: "workflow.check.build",
|
|
86
|
+
subjectType: "workflow-check", subjectId: "test-slug/build",
|
|
87
|
+
surface: "flow-agents.workflow", fieldOrBehavior: "build check",
|
|
88
|
+
value: "pass", status: "verified",
|
|
89
|
+
createdAt: "2026-06-27T00:00:00Z", updatedAt: "2026-06-27T00:00:00Z",
|
|
90
|
+
impactLevel: "high", verificationPolicyId: "policy:workflow.check.build"
|
|
91
|
+
}],
|
|
92
|
+
evidence: [], policies: [], events: []
|
|
93
|
+
};
|
|
94
|
+
fs.writeFileSync(bundlePath, JSON.stringify(bundle, null, 2));
|
|
95
|
+
NODE
|
|
96
|
+
|
|
97
|
+
echo ""
|
|
98
|
+
echo "=== 1a. KNOWLEDGE-TYPED bundle → kitIdentityFromBundle derives knowledge kit ==="
|
|
99
|
+
KNOWLEDGE_BUNDLE="$TMP/knowledge.bundle"
|
|
100
|
+
SIDECAR_JS_PATH="$SIDECAR_JS"
|
|
101
|
+
node --input-type=module << JSEOF
|
|
102
|
+
import { kitIdentityFromBundle } from '${SIDECAR_JS_PATH}';
|
|
103
|
+
import { readFileSync } from 'node:fs';
|
|
104
|
+
const raw = JSON.parse(readFileSync('${KNOWLEDGE_BUNDLE}', 'utf8'));
|
|
105
|
+
const result = kitIdentityFromBundle(raw, '${KNOWLEDGE_BUNDLE}');
|
|
106
|
+
if (result.kitId !== 'knowledge') throw new Error('Expected kitId=knowledge, got: ' + result.kitId);
|
|
107
|
+
if (result.subject !== 'knowledge-kit') throw new Error('Expected subject=knowledge-kit, got: ' + result.subject);
|
|
108
|
+
if (!result.claimType.startsWith('knowledge.')) throw new Error('Expected claimType to start with knowledge., got: ' + result.claimType);
|
|
109
|
+
if (result.claimType === 'knowledge.trust.bundle') throw new Error('Should use the specific claim type, not the generic fallback, got: ' + result.claimType);
|
|
110
|
+
JSEOF
|
|
111
|
+
if [ $? -eq 0 ]; then
|
|
112
|
+
_pass "KNOWLEDGE bundle: kitId=knowledge, subject=knowledge-kit, claimType=knowledge.verify.tests (not builder)"
|
|
113
|
+
else
|
|
114
|
+
_fail "KNOWLEDGE bundle: expected kitId=knowledge and subject=knowledge-kit, not builder hardcode"
|
|
115
|
+
fi
|
|
116
|
+
|
|
117
|
+
echo ""
|
|
118
|
+
echo "=== 1b. BUILDER-TYPED bundle → kitIdentityFromBundle derives builder kit ==="
|
|
119
|
+
BUILDER_BUNDLE="$TMP/builder.bundle"
|
|
120
|
+
node --input-type=module << JSEOF
|
|
121
|
+
import { kitIdentityFromBundle } from '${SIDECAR_JS_PATH}';
|
|
122
|
+
import { readFileSync } from 'node:fs';
|
|
123
|
+
const raw = JSON.parse(readFileSync('${BUILDER_BUNDLE}', 'utf8'));
|
|
124
|
+
const result = kitIdentityFromBundle(raw, '${BUILDER_BUNDLE}');
|
|
125
|
+
if (result.kitId !== 'builder') throw new Error('Expected kitId=builder, got: ' + result.kitId);
|
|
126
|
+
if (result.subject !== 'builder-kit') throw new Error('Expected subject=builder-kit, got: ' + result.subject);
|
|
127
|
+
if (!result.claimType.startsWith('builder.')) throw new Error('Expected claimType to start with builder., got: ' + result.claimType);
|
|
128
|
+
JSEOF
|
|
129
|
+
if [ $? -eq 0 ]; then
|
|
130
|
+
_pass "BUILDER bundle: kitId=builder, subject=builder-kit (correctly derived from claims, not hardcoded)"
|
|
131
|
+
else
|
|
132
|
+
_fail "BUILDER bundle: expected kitId=builder and subject=builder-kit"
|
|
133
|
+
fi
|
|
134
|
+
|
|
135
|
+
echo ""
|
|
136
|
+
echo "=== 1c. WORKFLOW-ONLY bundle (no kit-typed claim, no current.json) → unknown identity ==="
|
|
137
|
+
ISOLATED_DIR="$TMP/isolated-session"
|
|
138
|
+
mkdir -p "$ISOLATED_DIR"
|
|
139
|
+
cp "$TMP/workflow-only.bundle" "$ISOLATED_DIR/workflow-only.bundle"
|
|
140
|
+
WORKFLOW_BUNDLE="$ISOLATED_DIR/workflow-only.bundle"
|
|
141
|
+
node --input-type=module << JSEOF
|
|
142
|
+
import { kitIdentityFromBundle } from '${SIDECAR_JS_PATH}';
|
|
143
|
+
import { readFileSync } from 'node:fs';
|
|
144
|
+
const raw = JSON.parse(readFileSync('${WORKFLOW_BUNDLE}', 'utf8'));
|
|
145
|
+
const result = kitIdentityFromBundle(raw, '${WORKFLOW_BUNDLE}');
|
|
146
|
+
if (result.kitId !== 'unknown') throw new Error('Expected kitId=unknown (no kit-typed claim, no active flow), got: ' + result.kitId);
|
|
147
|
+
if (result.subject !== 'unknown-kit') throw new Error('Expected subject=unknown-kit, got: ' + result.subject);
|
|
148
|
+
if (result.claimType !== 'unknown.trust.bundle') throw new Error('Expected claimType=unknown.trust.bundle, got: ' + result.claimType);
|
|
149
|
+
JSEOF
|
|
150
|
+
if [ $? -eq 0 ]; then
|
|
151
|
+
_pass "WORKFLOW-ONLY bundle: kitId=unknown, subject=unknown-kit (never falls back to builder)"
|
|
152
|
+
else
|
|
153
|
+
_fail "WORKFLOW-ONLY bundle: expected kitId=unknown (no hardcoded builder fallback)"
|
|
154
|
+
fi
|
|
155
|
+
|
|
156
|
+
echo ""
|
|
157
|
+
echo "=== 1d. Full pipeline: record-evidence --surface-trust-json with knowledge fixture ==="
|
|
158
|
+
PIPELINE_AROOT="$TMP/pipeline-test/.flow-agents"
|
|
159
|
+
PIPELINE_SLUG="pipeline-kit-identity"
|
|
160
|
+
PIPELINE_DIR="$PIPELINE_AROOT/$PIPELINE_SLUG"
|
|
161
|
+
mkdir -p "$PIPELINE_AROOT"
|
|
162
|
+
|
|
163
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" ensure-session \
|
|
164
|
+
--artifact-root "$PIPELINE_AROOT" \
|
|
165
|
+
--task-slug "$PIPELINE_SLUG" \
|
|
166
|
+
--title "Pipeline kit identity test" \
|
|
167
|
+
--summary "Proves record-evidence processes knowledge bundle without crashing." \
|
|
168
|
+
--criterion "Kit identity preserved" \
|
|
169
|
+
--timestamp "2026-06-27T10:00:00Z" > "$TMP/pipeline-ensure.out" 2>&1
|
|
170
|
+
|
|
171
|
+
KNOWLEDGE_BUNDLE_PATH="$TMP/knowledge.bundle"
|
|
172
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" record-evidence "$PIPELINE_DIR" \
|
|
173
|
+
--verdict not_verified \
|
|
174
|
+
--surface-trust-json "$KNOWLEDGE_BUNDLE_PATH" \
|
|
175
|
+
--timestamp "2026-06-27T10:01:00Z" > "$TMP/pipeline-evidence.out" 2>&1; then
|
|
176
|
+
if [[ -f "$PIPELINE_DIR/trust.bundle" ]]; then
|
|
177
|
+
_pass "record-evidence --surface-trust-json with knowledge bundle completes (pipeline proof: fix is in production code path)"
|
|
178
|
+
else
|
|
179
|
+
_fail "record-evidence --surface-trust-json with knowledge bundle did not write trust.bundle"
|
|
180
|
+
fi
|
|
181
|
+
else
|
|
182
|
+
_fail "record-evidence --surface-trust-json with knowledge bundle failed: $(cat "$TMP/pipeline-evidence.out")"
|
|
183
|
+
fi
|
|
184
|
+
|
|
185
|
+
echo ""
|
|
186
|
+
echo "=== Fix 2: FlowDefinition-driven route-back guard ==="
|
|
187
|
+
|
|
188
|
+
# ─── 2a. builder.build: verification→execution still enforced ─────────────────
|
|
189
|
+
echo ""
|
|
190
|
+
echo "=== 2a. builder.build route-back guard: still enforces verification→execution ==="
|
|
191
|
+
BUILDER_DIR="$TMP/fix2-builder/.flow-agents/builder-fix2"
|
|
192
|
+
mkdir -p "$TMP/fix2-builder/.flow-agents"
|
|
193
|
+
|
|
194
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" ensure-session \
|
|
195
|
+
--artifact-root "$TMP/fix2-builder/.flow-agents" \
|
|
196
|
+
--task-slug "builder-fix2" \
|
|
197
|
+
--title "Fix2 builder route-back test" \
|
|
198
|
+
--summary "Verify builder.build route-back still enforced." \
|
|
199
|
+
--timestamp "2026-06-27T10:00:00Z" > "$TMP/fix2-builder-ensure.out" 2>&1
|
|
200
|
+
|
|
201
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$BUILDER_DIR" \
|
|
202
|
+
--status verifying --phase verification \
|
|
203
|
+
--summary "Moving to verification." \
|
|
204
|
+
--flow-definition builder.build \
|
|
205
|
+
--timestamp "2026-06-27T10:01:00Z" > "$TMP/fix2-builder-verify.out" 2>&1
|
|
206
|
+
|
|
207
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$BUILDER_DIR" \
|
|
208
|
+
--status in_progress --phase execution \
|
|
209
|
+
--summary "Route back without reason." \
|
|
210
|
+
--flow-definition builder.build \
|
|
211
|
+
--timestamp "2026-06-27T10:02:00Z" > "$TMP/fix2-builder-noReason.out" 2>&1; then
|
|
212
|
+
_fail "builder.build route-back should require --route-back-reason"
|
|
213
|
+
elif grep -q 'route_back_reason_required' "$TMP/fix2-builder-noReason.out"; then
|
|
214
|
+
_pass "builder.build: verification→execution requires --route-back-reason (identical behavior preserved)"
|
|
215
|
+
else
|
|
216
|
+
_fail "builder.build route-back lacked expected diagnostic (got: $(cat "$TMP/fix2-builder-noReason.out"))"
|
|
217
|
+
fi
|
|
218
|
+
|
|
219
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$BUILDER_DIR" \
|
|
220
|
+
--status in_progress --phase execution \
|
|
221
|
+
--summary "Route back with reason." \
|
|
222
|
+
--flow-definition builder.build \
|
|
223
|
+
--route-back-reason implementation_defect \
|
|
224
|
+
--timestamp "2026-06-27T10:03:00Z" > "$TMP/fix2-builder-withReason.out" 2>&1; then
|
|
225
|
+
_pass "builder.build: verification→execution with reason succeeds (identical behavior preserved)"
|
|
226
|
+
else
|
|
227
|
+
_fail "builder.build route-back with reason should succeed (got: $(cat "$TMP/fix2-builder-withReason.out"))"
|
|
228
|
+
fi
|
|
229
|
+
|
|
230
|
+
# ─── 2b. Custom non-builder flow WITH route_back_policy: enforced ─────────────
|
|
231
|
+
echo ""
|
|
232
|
+
echo "=== 2b. Custom non-builder flow WITH route_back_policy: enforced ==="
|
|
233
|
+
|
|
234
|
+
CUSTOM_FLOWS_DIR="$TMP/custom-flows"
|
|
235
|
+
mkdir -p "$CUSTOM_FLOWS_DIR"
|
|
236
|
+
|
|
237
|
+
# Write acme.deliver flow with route_back_policy (using argv[2] correctly)
|
|
238
|
+
node - "$CUSTOM_FLOWS_DIR/acme.deliver.flow.json" << 'NODE'
|
|
239
|
+
const fs = require('fs');
|
|
240
|
+
const flowPath = process.argv[2];
|
|
241
|
+
const flow = {
|
|
242
|
+
id: "acme.deliver", version: "1.0",
|
|
243
|
+
phase_map: { execution: "execute", verification: "verify" },
|
|
244
|
+
steps: [{ id: "execute", next: "verify" }, { id: "verify", next: "done" }, { id: "done", next: null }],
|
|
245
|
+
gates: {
|
|
246
|
+
"execute-gate": {
|
|
247
|
+
step: "execute",
|
|
248
|
+
expects: [{ id: "execution-scope", kind: "trust.bundle", required: true,
|
|
249
|
+
bundle_claim: { claimType: "acme.execute.scope", subjectType: "change", accepted_statuses: ["trusted","accepted"] } }]
|
|
250
|
+
},
|
|
251
|
+
"verify-gate": {
|
|
252
|
+
step: "verify",
|
|
253
|
+
on_route_back: { implementation_defect: "execute", missing_evidence: "verify", default: "verify" },
|
|
254
|
+
route_back_policy: { max_attempts: 2, on_exceeded: "block" },
|
|
255
|
+
expects: [{ id: "verify-evidence", kind: "trust.bundle", required: true,
|
|
256
|
+
bundle_claim: { claimType: "acme.verify.tests", subjectType: "flow-step", accepted_statuses: ["trusted","accepted"] } }]
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
};
|
|
260
|
+
fs.writeFileSync(flowPath, JSON.stringify(flow, null, 2));
|
|
261
|
+
NODE
|
|
262
|
+
|
|
263
|
+
ACME_DIR="$TMP/fix2-acme/.flow-agents/acme-fix2"
|
|
264
|
+
mkdir -p "$TMP/fix2-acme/.flow-agents"
|
|
265
|
+
|
|
266
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" ensure-session \
|
|
267
|
+
--artifact-root "$TMP/fix2-acme/.flow-agents" \
|
|
268
|
+
--task-slug "acme-fix2" \
|
|
269
|
+
--title "Fix2 acme route-back test" \
|
|
270
|
+
--summary "Verify non-builder flow with route_back_policy is enforced." \
|
|
271
|
+
--timestamp "2026-06-27T10:00:00Z" > "$TMP/fix2-acme-ensure.out" 2>&1
|
|
272
|
+
|
|
273
|
+
# Set FLOW_AGENTS_FLOW_DEFS_DIR and export it for the duration of this block
|
|
274
|
+
export FLOW_AGENTS_FLOW_DEFS_DIR="$CUSTOM_FLOWS_DIR"
|
|
275
|
+
|
|
276
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
277
|
+
--status verifying --phase verification \
|
|
278
|
+
--summary "Moving acme to verification." \
|
|
279
|
+
--flow-definition acme.deliver \
|
|
280
|
+
--timestamp "2026-06-27T10:01:00Z" > "$TMP/fix2-acme-verify.out" 2>&1
|
|
281
|
+
|
|
282
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
283
|
+
--status in_progress --phase execution \
|
|
284
|
+
--summary "Acme route back without reason." \
|
|
285
|
+
--flow-definition acme.deliver \
|
|
286
|
+
--timestamp "2026-06-27T10:02:00Z" > "$TMP/fix2-acme-noReason.out" 2>&1; then
|
|
287
|
+
_fail "acme.deliver route-back should require --route-back-reason when route_back_policy is declared"
|
|
288
|
+
elif grep -q 'route_back_reason_required' "$TMP/fix2-acme-noReason.out"; then
|
|
289
|
+
_pass "acme.deliver (non-builder): verification→execution requires reason when route_back_policy declared"
|
|
290
|
+
else
|
|
291
|
+
_fail "acme.deliver route-back lacked expected diagnostic (got: $(cat "$TMP/fix2-acme-noReason.out"))"
|
|
292
|
+
fi
|
|
293
|
+
|
|
294
|
+
# Do 2 successful route-backs
|
|
295
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
296
|
+
--status in_progress --phase execution \
|
|
297
|
+
--summary "Acme route back 1." --flow-definition acme.deliver \
|
|
298
|
+
--route-back-reason implementation_defect \
|
|
299
|
+
--timestamp "2026-06-27T10:03:00Z" > "$TMP/fix2-acme-rb1.out" 2>&1
|
|
300
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
301
|
+
--status verifying --phase verification \
|
|
302
|
+
--summary "Back to verify." --flow-definition acme.deliver \
|
|
303
|
+
--timestamp "2026-06-27T10:04:00Z" > "$TMP/fix2-acme-fwd1.out" 2>&1
|
|
304
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
305
|
+
--status in_progress --phase execution \
|
|
306
|
+
--summary "Acme route back 2." --flow-definition acme.deliver \
|
|
307
|
+
--route-back-reason implementation_defect \
|
|
308
|
+
--timestamp "2026-06-27T10:05:00Z" > "$TMP/fix2-acme-rb2.out" 2>&1
|
|
309
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
310
|
+
--status verifying --phase verification \
|
|
311
|
+
--summary "Back to verify again." --flow-definition acme.deliver \
|
|
312
|
+
--timestamp "2026-06-27T10:06:00Z" > "$TMP/fix2-acme-fwd2.out" 2>&1
|
|
313
|
+
|
|
314
|
+
# Third attempt should exceed max_attempts=2 (flow declares max 2, not hardcoded 3)
|
|
315
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$ACME_DIR" \
|
|
316
|
+
--status in_progress --phase execution \
|
|
317
|
+
--summary "Acme exceeds route-back limit." --flow-definition acme.deliver \
|
|
318
|
+
--route-back-reason implementation_defect \
|
|
319
|
+
--timestamp "2026-06-27T10:07:00Z" > "$TMP/fix2-acme-exceeded.out" 2>&1; then
|
|
320
|
+
_fail "acme.deliver should block after flow-declared max_attempts=2 route-backs"
|
|
321
|
+
elif grep -q 'route_back_attempts_exceeded' "$TMP/fix2-acme-exceeded.out"; then
|
|
322
|
+
_pass "acme.deliver: blocks after flow-declared max_attempts=2 (not the hardcoded 3 from old builder code)"
|
|
323
|
+
else
|
|
324
|
+
_fail "acme.deliver exceeded max_attempts but wrong diagnostic (got: $(cat "$TMP/fix2-acme-exceeded.out"))"
|
|
325
|
+
fi
|
|
326
|
+
|
|
327
|
+
unset FLOW_AGENTS_FLOW_DEFS_DIR
|
|
328
|
+
|
|
329
|
+
# ─── 2c. Custom flow WITHOUT route_back_policy: NOT enforced ──────────────────
|
|
330
|
+
echo ""
|
|
331
|
+
echo "=== 2c. Custom flow WITHOUT route_back_policy: verification→execution NOT enforced ==="
|
|
332
|
+
|
|
333
|
+
CUSTOM_FLOWS_DIR_2="$TMP/custom-flows-2"
|
|
334
|
+
mkdir -p "$CUSTOM_FLOWS_DIR_2"
|
|
335
|
+
|
|
336
|
+
node - "$CUSTOM_FLOWS_DIR_2/acme.nodecl.flow.json" << 'NODE'
|
|
337
|
+
const fs = require('fs');
|
|
338
|
+
const flowPath = process.argv[2];
|
|
339
|
+
const flow = {
|
|
340
|
+
id: "acme.nodecl", version: "1.0",
|
|
341
|
+
phase_map: { execution: "execute", verification: "verify" },
|
|
342
|
+
steps: [{ id: "execute", next: "verify" }, { id: "verify", next: "done" }, { id: "done", next: null }],
|
|
343
|
+
gates: {
|
|
344
|
+
"verify-gate": {
|
|
345
|
+
step: "verify",
|
|
346
|
+
expects: [{ id: "verify-evidence", kind: "trust.bundle", required: true,
|
|
347
|
+
bundle_claim: { claimType: "acme.verify.tests", subjectType: "flow-step", accepted_statuses: ["trusted","accepted"] } }]
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
};
|
|
351
|
+
fs.writeFileSync(flowPath, JSON.stringify(flow, null, 2));
|
|
352
|
+
NODE
|
|
353
|
+
|
|
354
|
+
NODECL_DIR="$TMP/fix2-nodecl/.flow-agents/nodecl-fix2"
|
|
355
|
+
mkdir -p "$TMP/fix2-nodecl/.flow-agents"
|
|
356
|
+
|
|
357
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" ensure-session \
|
|
358
|
+
--artifact-root "$TMP/fix2-nodecl/.flow-agents" \
|
|
359
|
+
--task-slug "nodecl-fix2" \
|
|
360
|
+
--title "Fix2 nodecl route-back test" \
|
|
361
|
+
--summary "Verify flow without route_back_policy is not guarded." \
|
|
362
|
+
--timestamp "2026-06-27T10:00:00Z" > "$TMP/fix2-nodecl-ensure.out" 2>&1
|
|
363
|
+
|
|
364
|
+
export FLOW_AGENTS_FLOW_DEFS_DIR="$CUSTOM_FLOWS_DIR_2"
|
|
365
|
+
|
|
366
|
+
flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$NODECL_DIR" \
|
|
367
|
+
--status verifying --phase verification \
|
|
368
|
+
--summary "Moving nodecl to verification." \
|
|
369
|
+
--flow-definition acme.nodecl \
|
|
370
|
+
--timestamp "2026-06-27T10:01:00Z" > "$TMP/fix2-nodecl-verify.out" 2>&1
|
|
371
|
+
|
|
372
|
+
if flow_agents_node "$SIDECAR_BUNDLE_WRITER" advance-state "$NODECL_DIR" \
|
|
373
|
+
--status in_progress --phase execution \
|
|
374
|
+
--summary "Nodecl route back — should be free without reason." \
|
|
375
|
+
--flow-definition acme.nodecl \
|
|
376
|
+
--timestamp "2026-06-27T10:02:00Z" > "$TMP/fix2-nodecl-rb.out" 2>&1 \
|
|
377
|
+
&& [[ ! -f "$NODECL_DIR/transition-attempts.json" ]]; then
|
|
378
|
+
_pass "acme.nodecl (no route_back_policy): verification→execution freely allowed, no attempts file"
|
|
379
|
+
else
|
|
380
|
+
_fail "acme.nodecl without route_back_policy should allow route-back freely (got: $(cat "$TMP/fix2-nodecl-rb.out"))"
|
|
381
|
+
fi
|
|
382
|
+
|
|
383
|
+
unset FLOW_AGENTS_FLOW_DEFS_DIR
|
|
384
|
+
|
|
385
|
+
echo ""
|
|
386
|
+
echo "────────────────────────────────────────────"
|
|
387
|
+
if [[ "$errors" -eq 0 ]]; then
|
|
388
|
+
echo "test_kit_identity_trust: all checks passed."
|
|
389
|
+
exit 0
|
|
390
|
+
else
|
|
391
|
+
echo "test_kit_identity_trust: $errors check(s) FAILED."
|
|
392
|
+
exit 1
|
|
393
|
+
fi
|
package/evals/run.sh
CHANGED
|
@@ -242,6 +242,8 @@ run_integration() {
|
|
|
242
242
|
echo ""
|
|
243
243
|
bash "$EVAL_DIR/integration/test_verify_cli.sh" || result=1
|
|
244
244
|
echo ""
|
|
245
|
+
bash "$EVAL_DIR/integration/test_kit_identity_trust.sh" || result=1
|
|
246
|
+
echo ""
|
|
245
247
|
bash "$EVAL_DIR/acceptance/prove-capture-teeth-declared.sh" || result=1
|
|
246
248
|
return $result
|
|
247
249
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kontourai/flow-agents",
|
|
3
|
-
"version": "2.0.0",
|
|
3
|
+
"version": "2.0.1",
|
|
4
4
|
"description": "Flow Agents — a Kontour product that applies Flow and Veritas discipline as a portable process layer inside the agent tools you already use: Claude Code, Codex, Kiro, opencode, pi, and GitHub Actions — with framework adapters (AWS Strands preview) on the same policy-engine contract.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"agents",
|
|
@@ -136,15 +136,15 @@
|
|
|
136
136
|
"kit": "npm run build --silent && node build/src/cli.js kit"
|
|
137
137
|
},
|
|
138
138
|
"devDependencies": {
|
|
139
|
-
"@types/node": "^
|
|
140
|
-
"promptfoo": "^0.121.
|
|
139
|
+
"@types/node": "^26.0.1",
|
|
140
|
+
"promptfoo": "^0.121.17",
|
|
141
141
|
"typescript": "^6.0.3"
|
|
142
142
|
},
|
|
143
143
|
"dependencies": {
|
|
144
144
|
"@kontourai/flow": "~1.3.0"
|
|
145
145
|
},
|
|
146
146
|
"optionalDependencies": {
|
|
147
|
-
"hachure": "^0.
|
|
147
|
+
"hachure": "^0.5.1",
|
|
148
148
|
"@kontourai/surface": "^1.2.0"
|
|
149
149
|
}
|
|
150
150
|
}
|
|
@@ -803,17 +803,30 @@ function canonicalJsonForVerify(record) {
|
|
|
803
803
|
|
|
804
804
|
/**
|
|
805
805
|
* Verify the hash chain of command-log.jsonl.
|
|
806
|
-
* Returns { status, brokenAt } where:
|
|
807
|
-
* status = "ok" | "legacy" | "broken"
|
|
806
|
+
* Returns { status, brokenAt, forkAt } where:
|
|
807
|
+
* status = "ok" | "legacy" | "broken" | "forked"
|
|
808
808
|
* brokenAt = index (0-based) of the first broken entry, or null
|
|
809
|
+
* forkAt = index (0-based) of the first concurrent-fork sibling, or null
|
|
810
|
+
*
|
|
811
|
+
* "forked" is a BENIGN concurrent-append race, not tampering: two PostToolUse
|
|
812
|
+
* captures appended off the same parent tip (e.g. parallel agents sharing one
|
|
813
|
+
* log) before the writer lock (flow-agents#232) serialized them. It is
|
|
814
|
+
* distinguished from "broken" because:
|
|
815
|
+
* - every entry's hash is still self-consistent (no content was edited), and
|
|
816
|
+
* - every entry's parent is reachable (nothing was reordered or removed);
|
|
817
|
+
* - the only anomaly is a parent claimed by >1 capture-sourced sibling.
|
|
818
|
+
* Tamper — a content edit (self-hash mismatch), a reorder, or a deletion
|
|
819
|
+
* (unreachable parent) — still returns "broken". A fork cannot be used to
|
|
820
|
+
* launder a content edit: editing a record breaks its self-hash, which is
|
|
821
|
+
* checked before fork classification.
|
|
809
822
|
*/
|
|
810
823
|
function verifyCommandLogChain(artifactDir) {
|
|
811
824
|
const file = path.join(artifactDir, 'command-log.jsonl');
|
|
812
825
|
let raw = '';
|
|
813
|
-
try { raw = fs.readFileSync(file, 'utf8'); } catch { return { status: 'legacy', brokenAt: null }; }
|
|
826
|
+
try { raw = fs.readFileSync(file, 'utf8'); } catch { return { status: 'legacy', brokenAt: null, forkAt: null }; }
|
|
814
827
|
|
|
815
828
|
const lines = raw.split('\n').filter(l => l.trim());
|
|
816
|
-
if (lines.length === 0) return { status: 'legacy', brokenAt: null };
|
|
829
|
+
if (lines.length === 0) return { status: 'legacy', brokenAt: null, forkAt: null };
|
|
817
830
|
|
|
818
831
|
// Parse all entries, skipping unparseable lines (they are dropped and take no part in chain verification).
|
|
819
832
|
const entries = [];
|
|
@@ -823,18 +836,25 @@ function verifyCommandLogChain(artifactDir) {
|
|
|
823
836
|
if (entry && typeof entry === 'object') entries.push(entry);
|
|
824
837
|
} catch { /* skip malformed lines */ }
|
|
825
838
|
}
|
|
826
|
-
if (entries.length === 0) return { status: 'legacy', brokenAt: null };
|
|
839
|
+
if (entries.length === 0) return { status: 'legacy', brokenAt: null, forkAt: null };
|
|
827
840
|
|
|
828
841
|
// Classify: are there any chained entries?
|
|
829
842
|
const hasAnyChain = entries.some(e => e._chain && typeof e._chain.hash === 'string');
|
|
830
|
-
if (!hasAnyChain) return { status: 'legacy', brokenAt: null };
|
|
831
|
-
|
|
832
|
-
//
|
|
833
|
-
//
|
|
834
|
-
//
|
|
835
|
-
|
|
843
|
+
if (!hasAnyChain) return { status: 'legacy', brokenAt: null, forkAt: null };
|
|
844
|
+
|
|
845
|
+
// Walk in file order. A chained entry is ACCEPTED when both:
|
|
846
|
+
// (a) self-consistent: hash === sha256(prevHash + canonicalJson(record)),
|
|
847
|
+
// so a content edit (e.g. flipping exitCode) without rehashing fails; and
|
|
848
|
+
// (b) reachable: prevHash is genesis or the hash of any prior accepted entry.
|
|
849
|
+
// We track the SET of reachable hashes (not just the latest tip) so that
|
|
850
|
+
// concurrent-fork siblings — which share a still-reachable parent — are
|
|
851
|
+
// tolerated, while a reorder/deletion (parent not reachable) is caught.
|
|
852
|
+
const reachable = new Set([CHAIN_GENESIS_VERIFY]);
|
|
853
|
+
const parentSources = new Map(); // prevHash -> [source, ...] (fork detection)
|
|
836
854
|
let prevWasChained = false;
|
|
837
|
-
let
|
|
855
|
+
let forked = false;
|
|
856
|
+
let firstForkAt = null;
|
|
857
|
+
|
|
838
858
|
for (let i = 0; i < entries.length; i++) {
|
|
839
859
|
const entry = entries[i];
|
|
840
860
|
const chain = entry._chain;
|
|
@@ -842,26 +862,43 @@ function verifyCommandLogChain(artifactDir) {
|
|
|
842
862
|
// Legacy entry without _chain. If we have already seen a chained entry,
|
|
843
863
|
// a gap in the chain (a legacy entry in the middle) counts as broken
|
|
844
864
|
// (it could indicate a removed chained entry was replaced by a legacy one).
|
|
845
|
-
if (prevWasChained) return { status: 'broken', brokenAt: i };
|
|
865
|
+
if (prevWasChained) return { status: 'broken', brokenAt: i, forkAt: null };
|
|
846
866
|
// Before any chained entry: tolerate (legacy prefix).
|
|
847
867
|
continue;
|
|
848
868
|
}
|
|
849
869
|
|
|
850
|
-
//
|
|
851
|
-
|
|
852
|
-
|
|
870
|
+
// (a) Self-consistency. A content edit without rehashing fails here.
|
|
871
|
+
if (typeof chain.prevHash !== 'string') return { status: 'broken', brokenAt: i, forkAt: null };
|
|
872
|
+
const selfHash = crypto.createHash('sha256')
|
|
873
|
+
.update(chain.prevHash + canonicalJsonForVerify(entry), 'utf8')
|
|
853
874
|
.digest('hex');
|
|
854
|
-
if (chain.hash !==
|
|
855
|
-
|
|
856
|
-
//
|
|
857
|
-
|
|
875
|
+
if (chain.hash !== selfHash) return { status: 'broken', brokenAt: i, forkAt: null };
|
|
876
|
+
|
|
877
|
+
// (b) Reachability. An unreachable parent means a reorder or a removed
|
|
878
|
+
// predecessor — structural tamper, not a benign concurrent append.
|
|
879
|
+
if (!reachable.has(chain.prevHash)) return { status: 'broken', brokenAt: i, forkAt: null };
|
|
880
|
+
|
|
881
|
+
// Fork detection: a parent claimed by more than one entry is a fork. It is
|
|
882
|
+
// benign only when EVERY sibling on that parent is a PostToolUse capture
|
|
883
|
+
// (two captures racing on the same tip). Any non-capture sibling on a
|
|
884
|
+
// shared parent is treated as tamper (conservative).
|
|
885
|
+
const sources = parentSources.get(chain.prevHash) || [];
|
|
886
|
+
sources.push(entry.source);
|
|
887
|
+
parentSources.set(chain.prevHash, sources);
|
|
888
|
+
if (sources.length > 1) {
|
|
889
|
+
if (!sources.every(s => s === 'postToolUse-capture')) {
|
|
890
|
+
return { status: 'broken', brokenAt: i, forkAt: null };
|
|
891
|
+
}
|
|
892
|
+
if (firstForkAt === null) firstForkAt = i;
|
|
893
|
+
forked = true;
|
|
894
|
+
}
|
|
858
895
|
|
|
859
|
-
|
|
896
|
+
reachable.add(chain.hash);
|
|
860
897
|
prevWasChained = true;
|
|
861
|
-
chainedCount += 1;
|
|
862
898
|
}
|
|
863
899
|
|
|
864
|
-
return { status: '
|
|
900
|
+
if (forked) return { status: 'forked', brokenAt: null, forkAt: firstForkAt };
|
|
901
|
+
return { status: 'ok', brokenAt: null, forkAt: null };
|
|
865
902
|
}
|
|
866
903
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
867
904
|
|
|
@@ -1065,6 +1102,11 @@ function captureCrossReference(root, artifactDir, activeFlowStep) {
|
|
|
1065
1102
|
//
|
|
1066
1103
|
// ok → proceed normally (chain is valid, log is trustworthy).
|
|
1067
1104
|
// legacy → proceed normally (pre-B2 log, no chain to verify, existing behavior).
|
|
1105
|
+
// forked → benign concurrent-append race (not tampering): emit a loud but
|
|
1106
|
+
// NON-blocking advisory and keep trusting the records. The capture
|
|
1107
|
+
// contradiction teeth still run (the records are genuine, just not
|
|
1108
|
+
// linearly ordered); the operator can re-linearize with the repair
|
|
1109
|
+
// tool. This is what stops honest parallel work from being trapped.
|
|
1068
1110
|
// broken → emit a loud warning and treat ALL claimed-pass commands relying on
|
|
1069
1111
|
// this log as NOT_VERIFIED/blocking — do not let them sail through.
|
|
1070
1112
|
let chainBroken = false;
|
|
@@ -1079,6 +1121,17 @@ function captureCrossReference(root, artifactDir, activeFlowStep) {
|
|
|
1079
1121
|
'This is tamper-EVIDENCE (hash-chain broken); alteration, removal, or reordering detected. ' +
|
|
1080
1122
|
'NOT_VERIFIED: cannot confirm or deny claimed passes.'
|
|
1081
1123
|
);
|
|
1124
|
+
} else if (chainResult.status === 'forked') {
|
|
1125
|
+
// NOT a hard block: this string must not match HARD_BLOCK/FULL_BLOCK. A
|
|
1126
|
+
// concurrent fork is benign — no content was edited and nothing was
|
|
1127
|
+
// removed — so honest parallel work proceeds. We surface it loudly and
|
|
1128
|
+
// point at the deterministic repair.
|
|
1129
|
+
const forkIdx = chainResult.forkAt !== null ? ` (entry ${chainResult.forkAt})` : '';
|
|
1130
|
+
warnings.push(
|
|
1131
|
+
`${base} command-log shows a concurrent-capture fork${forkIdx} — two PostToolUse captures appended off the same parent ` +
|
|
1132
|
+
'(parallel writers before the writer lock). This is NOT tampering: every record is self-consistent and reachable. ' +
|
|
1133
|
+
'Records remain trusted; re-linearize with: node scripts/repair-command-log.js <artifact-dir>'
|
|
1134
|
+
);
|
|
1082
1135
|
}
|
|
1083
1136
|
}
|
|
1084
1137
|
|