@kontourai/flow-agents 2.0.1 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/.github/actions/trust-verify/action.yml +4 -2
  2. package/.github/workflows/ci.yml +16 -4
  3. package/.github/workflows/docs-pages.yml +1 -1
  4. package/.github/workflows/kit-gates-demo.yml +2 -2
  5. package/.github/workflows/publish-npm.yml +2 -2
  6. package/.github/workflows/runtime-compat.yml +2 -2
  7. package/.github/workflows/trust-reconcile.yml +1 -1
  8. package/CHANGELOG.md +28 -0
  9. package/README.md +3 -3
  10. package/build/src/cli/workflow-sidecar.js +8 -2
  11. package/context/scripts/telemetry/lib/config.sh +15 -0
  12. package/context/scripts/telemetry/telemetry.conf +4 -0
  13. package/context/scripts/telemetry/telemetry.sh +23 -1
  14. package/docs/design/flowrun-eventsourcing-design.md +216 -0
  15. package/docs/design/workflowrun-observability-design.md +431 -0
  16. package/evals/ci/antigaming-suite.sh +1 -0
  17. package/evals/ci/run-baseline.sh +2 -0
  18. package/evals/integration/test_command_log_concurrency.sh +114 -0
  19. package/evals/integration/test_gate_lockdown.sh +21 -6
  20. package/evals/integration/test_usage_cost.sh +119 -0
  21. package/evals/integration/test_verify_cli.sh +23 -0
  22. package/integrations/strands/flow_agents_strands/hooks.py +126 -1
  23. package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
  24. package/integrations/strands/tests/test_usage.py +129 -0
  25. package/integrations/strands-ts/src/hooks.ts +135 -1
  26. package/integrations/strands-ts/src/telemetry.ts +170 -0
  27. package/integrations/strands-ts/test/test-usage.ts +85 -0
  28. package/package.json +2 -2
  29. package/scripts/ci/trust-reconcile.js +7 -23
  30. package/scripts/hooks/evidence-capture.js +85 -50
  31. package/scripts/hooks/stop-goal-fit.js +18 -45
  32. package/scripts/lib/command-log-chain.js +73 -0
  33. package/scripts/repair-command-log.js +8 -15
  34. package/scripts/telemetry/lib/config.sh +15 -0
  35. package/scripts/telemetry/lib/pricing.sh +42 -0
  36. package/scripts/telemetry/lib/usage.sh +108 -0
  37. package/scripts/telemetry/pricing.golden.json +15 -0
  38. package/scripts/telemetry/pricing.json +31 -0
  39. package/scripts/telemetry/telemetry.conf +4 -0
  40. package/scripts/telemetry/telemetry.sh +23 -1
  41. package/src/cli/workflow-sidecar.ts +8 -2
@@ -125,6 +125,70 @@ function readKitFlows(flowAgentsDir: string): KitFlowEntry[] {
125
125
  return results;
126
126
  }
127
127
 
128
+ // ---------------------------------------------------------------------------
129
+ // Usage extraction — map a Strands model-call event onto the documented
130
+ // Anthropic usage object, defensively across SDK shapes.
131
+ // ---------------------------------------------------------------------------
132
+
133
+ function asRecord(value: unknown): Record<string, unknown> | undefined {
134
+ return value && typeof value === "object" ? (value as Record<string, unknown>) : undefined;
135
+ }
136
+
137
+ function numField(obj: Record<string, unknown> | undefined, ...keys: string[]): number {
138
+ if (!obj) return 0;
139
+ for (const key of keys) {
140
+ const v = obj[key];
141
+ if (typeof v === "number" && Number.isFinite(v)) return v;
142
+ }
143
+ return 0;
144
+ }
145
+
146
+ function strField(obj: Record<string, unknown> | undefined, ...keys: string[]): string | undefined {
147
+ if (!obj) return undefined;
148
+ for (const key of keys) {
149
+ const v = obj[key];
150
+ if (typeof v === "string" && v) return v;
151
+ }
152
+ return undefined;
153
+ }
154
+
155
+ export function extractModelUsage(
156
+ event: StrandsEvent
157
+ ): { model: string; input: number; output: number; cacheCreation: number; cacheRead: number } | null {
158
+ // Find the usage object wherever the event surfaces it.
159
+ const containers = [
160
+ event,
161
+ asRecord(event.usage),
162
+ asRecord(event.response),
163
+ asRecord(event.result),
164
+ asRecord(event.message),
165
+ asRecord(event.output),
166
+ asRecord(event.modelResponse),
167
+ ];
168
+ let usage: Record<string, unknown> | undefined;
169
+ let modelCarrier: Record<string, unknown> | undefined;
170
+ for (const container of containers) {
171
+ const c = asRecord(container);
172
+ if (!c) continue;
173
+ const candidate = asRecord(c.usage) ?? (("input_tokens" in c || "inputTokens" in c) ? c : undefined);
174
+ if (candidate && !usage) usage = candidate;
175
+ if (!modelCarrier && (typeof c.model === "string" || typeof c.modelId === "string")) modelCarrier = c;
176
+ }
177
+ if (!usage) return null;
178
+
179
+ const input = numField(usage, "input_tokens", "inputTokens");
180
+ const output = numField(usage, "output_tokens", "outputTokens");
181
+ const cacheCreation = numField(usage, "cache_creation_input_tokens", "cacheCreationInputTokens");
182
+ const cacheRead = numField(usage, "cache_read_input_tokens", "cacheReadInputTokens");
183
+ if (input === 0 && output === 0 && cacheCreation === 0 && cacheRead === 0) return null;
184
+
185
+ const model =
186
+ strField(modelCarrier, "model", "modelId") ??
187
+ strField(usage, "model") ??
188
+ "unknown";
189
+ return { model, input, output, cacheCreation, cacheRead };
190
+ }
191
+
128
192
  function buildKitFlowsHint(flows: KitFlowEntry[]): string {
129
193
  if (flows.length === 0) return "";
130
194
  const lines = ["KIT FLOWS: the following kit flows are activated for this workspace:"];
@@ -164,6 +228,11 @@ export class FlowAgentsHooks {
164
228
  private readonly policyGate: PolicyGate;
165
229
  private readonly _workspace: string;
166
230
  private _sessionStartMs: number | null = null;
231
+ // Per-model token accumulator, summed across model-call events for the session.
232
+ private _usageByModel = new Map<
233
+ string,
234
+ { input: number; output: number; cacheCreation: number; cacheRead: number }
235
+ >();
167
236
 
168
237
  constructor(options: FlowAgentsHooksOptions = {}) {
169
238
  this._workspace = findRepoRoot(options.workspace ?? process.cwd());
@@ -248,6 +317,15 @@ export class FlowAgentsHooks {
248
317
  registry.addCallback(AfterInvocationEvent, (event) => this.onAfterInvocation(event));
249
318
  registry.addCallback(BeforeToolCallEvent, (event) => this.onBeforeToolCall(event));
250
319
  registry.addCallback(AfterToolCallEvent, (event) => this.onAfterToolCall(event));
320
+
321
+ // AfterModelCallEvent carries per-call token usage (the SDK's documented
322
+ // usage source). Optional — only registered if the installed SDK exposes it,
323
+ // so older SDKs still work (usage is simply not collected there).
324
+ // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-explicit-any
325
+ const AfterModelCallEvent = (require("strands-agents") as any).AfterModelCallEvent as EventClass | undefined;
326
+ if (AfterModelCallEvent) {
327
+ registry.addCallback(AfterModelCallEvent, (event) => this.onAfterModelCall(event));
328
+ }
251
329
  }
252
330
 
253
331
  // --------------------------------------------------------------------------
@@ -262,13 +340,68 @@ export class FlowAgentsHooks {
262
340
  this.sink.emitUserPromptSubmit();
263
341
  }
264
342
 
265
- /** AfterInvocationEvent → stop / session.end */
343
+ /** AfterInvocationEvent → emit session.usage (if any) then stop / session.end */
266
344
  onAfterInvocation(_event: StrandsEvent): void {
267
345
  const durationMs =
268
346
  this._sessionStartMs !== null ? Date.now() - this._sessionStartMs : 0;
347
+
348
+ if (this._usageByModel.size > 0) {
349
+ const byModel = Array.from(this._usageByModel.entries()).map(([model, t]) => ({
350
+ model,
351
+ inputTokens: t.input,
352
+ outputTokens: t.output,
353
+ cacheCreationInputTokens: t.cacheCreation,
354
+ cacheReadInputTokens: t.cacheRead,
355
+ }));
356
+ const sum = byModel.reduce(
357
+ (acc, m) => ({
358
+ input: acc.input + m.inputTokens,
359
+ output: acc.output + m.outputTokens,
360
+ cacheCreation: acc.cacheCreation + m.cacheCreationInputTokens,
361
+ cacheRead: acc.cacheRead + m.cacheReadInputTokens,
362
+ }),
363
+ { input: 0, output: 0, cacheCreation: 0, cacheRead: 0 }
364
+ );
365
+ this.sink.emitUsage({
366
+ model: byModel.length === 1 ? byModel[0].model : undefined,
367
+ durationS: durationMs / 1000,
368
+ inputTokens: sum.input,
369
+ outputTokens: sum.output,
370
+ cacheCreationInputTokens: sum.cacheCreation,
371
+ cacheReadInputTokens: sum.cacheRead,
372
+ byModel,
373
+ });
374
+ this._usageByModel.clear();
375
+ }
376
+
269
377
  this.sink.emitSessionEnd(durationMs);
270
378
  }
271
379
 
380
+ /**
381
+ * AfterModelCallEvent → accumulate per-model token usage.
382
+ *
383
+ * Reads the documented Anthropic usage object (input_tokens, output_tokens,
384
+ * cache_creation_input_tokens, cache_read_input_tokens) from wherever the
385
+ * Strands event surfaces it. Defensive across SDK shapes — if no usage is
386
+ * found, the call is a no-op (tokens for that turn are simply not counted).
387
+ */
388
+ onAfterModelCall(event: StrandsEvent): void {
389
+ const extracted = extractModelUsage(event);
390
+ if (!extracted) return;
391
+ const { model, input, output, cacheCreation, cacheRead } = extracted;
392
+ const current = this._usageByModel.get(model) ?? {
393
+ input: 0,
394
+ output: 0,
395
+ cacheCreation: 0,
396
+ cacheRead: 0,
397
+ };
398
+ current.input += input;
399
+ current.output += output;
400
+ current.cacheCreation += cacheCreation;
401
+ current.cacheRead += cacheRead;
402
+ this._usageByModel.set(model, current);
403
+ }
404
+
272
405
  /**
273
406
  * BeforeToolCallEvent → preToolUse / tool.invoke + config-protection policy gate.
274
407
  *
@@ -307,6 +440,7 @@ export class FlowAgentsHooks {
307
440
  /** Call once after constructing / wiring to emit the agentSpawn event. */
308
441
  emitSessionStart(): void {
309
442
  this._sessionStartMs = Date.now();
443
+ this._usageByModel.clear();
310
444
  this.sink.emitSessionStart();
311
445
  }
312
446
  }
@@ -12,8 +12,13 @@
12
12
 
13
13
  import fs from "node:fs";
14
14
  import path from "node:path";
15
+ import { fileURLToPath } from "node:url";
15
16
  import { randomUUID } from "node:crypto";
16
17
 
18
+ // ESM has no __dirname; derive it (this package is "type":"module").
19
+ const __filename = fileURLToPath(import.meta.url);
20
+ const __dirname = path.dirname(__filename);
21
+
17
22
  // ---------------------------------------------------------------------------
18
23
  // Strands TS → canonical event-name mapping
19
24
  // Mirrors STRANDS_TO_CANONICAL in integrations/strands/flow_agents_strands/telemetry.py
@@ -248,4 +253,169 @@ export class TelemetrySink {
248
253
  emitUserPromptSubmit(extra?: Record<string, unknown>): TelemetryEvent {
249
254
  return this.emit("userPromptSubmit", extra);
250
255
  }
256
+
257
+ /**
258
+ * Emit a `session.usage` event with real token counts + derived cost.
259
+ *
260
+ * The Strands SDK surfaces per-invocation usage on AfterModelCall /
261
+ * AfterInvocation events; accumulate those and pass the totals here at
262
+ * session end. Tokens are the source of truth; estimated_cost_usd is derived
263
+ * from PRICING (the console recomputes it authoritatively, so a pricing
264
+ * change is retroactive). Mirrors the `session.usage` shape emitted by
265
+ * scripts/telemetry/telemetry.sh so the console aggregates both identically.
266
+ */
267
+ emitUsage(usage: UsageInput): TelemetryEvent {
268
+ const event = this.buildBaseEvent("session.usage");
269
+ event.event_id = `${event.event_id}-usage`;
270
+ event.hook = { ...event.hook, event_name: "usage" };
271
+
272
+ const byModel = (usage.byModel ?? []).map((entry) => {
273
+ const tokens = normalizeTokens(entry);
274
+ return {
275
+ model: entry.model,
276
+ input_tokens: tokens.input,
277
+ output_tokens: tokens.output,
278
+ cache_creation_input_tokens: tokens.cacheCreation,
279
+ cache_read_input_tokens: tokens.cacheRead,
280
+ estimated_cost_usd: costForModel(entry.model, tokens)
281
+ };
282
+ });
283
+
284
+ const flat = normalizeTokens(usage);
285
+ const cost = byModel.length
286
+ ? round6(byModel.reduce((sum, m) => sum + m.estimated_cost_usd, 0))
287
+ : costForModel(usage.model, flat);
288
+
289
+ event.usage = {
290
+ model: usage.model ?? this.runtime,
291
+ duration_s: usage.durationS ?? null,
292
+ input_tokens: flat.input,
293
+ output_tokens: flat.output,
294
+ cache_creation_input_tokens: flat.cacheCreation,
295
+ cache_read_input_tokens: flat.cacheRead,
296
+ estimated_cost_usd: cost,
297
+ pricing_version: pricingVersion(),
298
+ by_model: byModel.length ? byModel : null
299
+ };
300
+
301
+ try {
302
+ fs.appendFileSync(this.logFile, JSON.stringify(event) + "\n", "utf8");
303
+ } catch {
304
+ // fail-open: telemetry must never block agent work
305
+ }
306
+ return event;
307
+ }
308
+ }
309
+
310
+ // ---------------------------------------------------------------------------
311
+ // Usage / cost — mirror of scripts/telemetry/pricing.json (per 1M tokens, USD)
312
+ // ---------------------------------------------------------------------------
313
+
314
+ export interface TokenCounts {
315
+ inputTokens?: number;
316
+ outputTokens?: number;
317
+ cacheCreationInputTokens?: number;
318
+ cacheReadInputTokens?: number;
319
+ }
320
+
321
+ export interface UsageInput extends TokenCounts {
322
+ model?: string;
323
+ durationS?: number;
324
+ byModel?: Array<TokenCounts & { model: string }>;
325
+ }
326
+
327
+ interface NormalizedTokens {
328
+ input: number;
329
+ output: number;
330
+ cacheCreation: number;
331
+ cacheRead: number;
332
+ }
333
+
334
+ // Pricing is read from the single-source registry (scripts/telemetry/pricing.json),
335
+ // never hand-maintained here. Resolution: TELEMETRY_PRICING_FILE /
336
+ // FLOW_AGENTS_PRICING_FILE env path, else the repo-relative registry, else a
337
+ // minimal fallback. Tokens are exact regardless; the console recomputes cost
338
+ // authoritatively, so a missing file only degrades the sink's stamped estimate.
339
+ interface PricingVersionBlock {
340
+ cache_multipliers: { write_5m: number; write_1h: number; read: number };
341
+ models: Record<string, { input: number; output: number }>;
342
+ default: { input: number; output: number };
343
+ zero_cost_models: string[];
344
+ }
345
+ interface PricingRegistry {
346
+ current_version: string;
347
+ versions: Record<string, PricingVersionBlock>;
348
+ }
349
+
350
+ const FALLBACK_REGISTRY: PricingRegistry = {
351
+ current_version: "fallback",
352
+ versions: {
353
+ fallback: {
354
+ cache_multipliers: { write_5m: 1.25, write_1h: 2.0, read: 0.1 },
355
+ models: {},
356
+ default: { input: 5.0, output: 25.0 },
357
+ zero_cost_models: ["<synthetic>", "synthetic", "unknown", ""]
358
+ }
359
+ }
360
+ };
361
+
362
+ let cachedRegistry: PricingRegistry | null = null;
363
+ function loadRegistry(): PricingRegistry {
364
+ if (cachedRegistry) return cachedRegistry;
365
+ const candidates = [
366
+ process.env.TELEMETRY_PRICING_FILE,
367
+ process.env.FLOW_AGENTS_PRICING_FILE,
368
+ path.join(__dirname, "../../../scripts/telemetry/pricing.json"),
369
+ path.join(__dirname, "../../../../scripts/telemetry/pricing.json")
370
+ ].filter((p): p is string => Boolean(p));
371
+ for (const candidate of candidates) {
372
+ try {
373
+ const parsed = JSON.parse(fs.readFileSync(candidate, "utf8"));
374
+ if (parsed && typeof parsed.current_version === "string" && parsed.versions) {
375
+ cachedRegistry = parsed as PricingRegistry;
376
+ return cachedRegistry;
377
+ }
378
+ } catch {
379
+ // try next candidate
380
+ }
381
+ }
382
+ cachedRegistry = FALLBACK_REGISTRY;
383
+ return cachedRegistry;
384
+ }
385
+
386
+ function pricingVersion(): string {
387
+ return loadRegistry().current_version;
388
+ }
389
+
390
+ function num(value: number | undefined): number {
391
+ return typeof value === "number" && Number.isFinite(value) ? value : 0;
392
+ }
393
+
394
+ function round6(value: number): number {
395
+ return Math.round(value * 1_000_000) / 1_000_000;
396
+ }
397
+
398
+ function normalizeTokens(tokens: TokenCounts): NormalizedTokens {
399
+ return {
400
+ input: num(tokens.inputTokens),
401
+ output: num(tokens.outputTokens),
402
+ cacheCreation: num(tokens.cacheCreationInputTokens),
403
+ cacheRead: num(tokens.cacheReadInputTokens)
404
+ };
405
+ }
406
+
407
+ function costForModel(model: string | undefined, tokens: NormalizedTokens): number {
408
+ const registry = loadRegistry();
409
+ const block = registry.versions[registry.current_version] ?? FALLBACK_REGISTRY.versions.fallback;
410
+ const key = (model ?? "").trim();
411
+ if (block.zero_cost_models.includes(key)) return 0;
412
+ const rate = block.models[key] ?? block.default;
413
+ const cm = block.cache_multipliers;
414
+ return round6(
415
+ (tokens.input * rate.input +
416
+ tokens.output * rate.output +
417
+ tokens.cacheCreation * rate.input * cm.write_5m +
418
+ tokens.cacheRead * rate.input * cm.read) /
419
+ 1_000_000
420
+ );
251
421
  }
@@ -0,0 +1,85 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+ import os from "node:os";
6
+ import { fileURLToPath } from "node:url";
7
+ import { TelemetrySink } from "../src/telemetry.js";
8
+ import { extractModelUsage } from "../src/hooks.js";
9
+
10
+ const here = path.dirname(fileURLToPath(import.meta.url));
11
+ const tmpSink = () => new TelemetrySink({ workspace: fs.mkdtempSync(path.join(os.tmpdir(), "ts-usage-")) });
12
+
13
+ test("emitUsage writes tokens + cost + pricing_version + by_model", () => {
14
+ const ev = tmpSink().emitUsage({
15
+ model: "claude-opus-4-8",
16
+ inputTokens: 1000,
17
+ outputTokens: 2000,
18
+ cacheReadInputTokens: 500000,
19
+ byModel: [{ model: "claude-opus-4-8", inputTokens: 1000, outputTokens: 2000, cacheReadInputTokens: 500000 }]
20
+ } as any);
21
+ const u = ev.usage as any;
22
+ assert.equal(u.input_tokens, 1000);
23
+ assert.equal(u.output_tokens, 2000);
24
+ assert.equal(u.cache_read_input_tokens, 500000);
25
+ assert.equal(u.pricing_version, "2026-06-28");
26
+ assert.equal(u.estimated_cost_usd, 0.305); // (1000*5 + 2000*25 + 500000*5*0.1)/1e6
27
+ assert.equal(u.by_model[0].model, "claude-opus-4-8");
28
+ });
29
+
30
+ test("emitUsage multi-model sums + prices each", () => {
31
+ const ev = tmpSink().emitUsage({
32
+ outputTokens: 2000,
33
+ byModel: [
34
+ { model: "claude-opus-4-8", outputTokens: 1000 },
35
+ { model: "claude-haiku-4-5", outputTokens: 1000 }
36
+ ]
37
+ } as any);
38
+ const u = ev.usage as any;
39
+ const costs: Record<string, number> = Object.fromEntries(u.by_model.map((m: any) => [m.model, m.estimated_cost_usd]));
40
+ assert.equal(costs["claude-opus-4-8"], 0.025);
41
+ assert.equal(costs["claude-haiku-4-5"], 0.005);
42
+ assert.equal(u.estimated_cost_usd, 0.03);
43
+ });
44
+
45
+ test("extractModelUsage reads usage from varied event shapes", () => {
46
+ assert.deepEqual(
47
+ extractModelUsage({ model: "claude-opus-4-8", usage: { input_tokens: 10, output_tokens: 20, cache_read_input_tokens: 30 } } as any),
48
+ { model: "claude-opus-4-8", input: 10, output: 20, cacheCreation: 0, cacheRead: 30 }
49
+ );
50
+ // camelCase + modelId
51
+ const camel = extractModelUsage({ modelId: "claude-haiku-4-5", usage: { inputTokens: 5, outputTokens: 6 } } as any);
52
+ assert.equal(camel?.model, "claude-haiku-4-5");
53
+ assert.equal(camel?.input, 5);
54
+ // nested response carrier
55
+ const nested = extractModelUsage({ response: { model: "claude-fable-5", usage: { output_tokens: 100 } } } as any);
56
+ assert.equal(nested?.model, "claude-fable-5");
57
+ assert.equal(nested?.output, 100);
58
+ // no usage / all-zero → null
59
+ assert.equal(extractModelUsage({ model: "x" } as any), null);
60
+ assert.equal(extractModelUsage({ model: "x", usage: { input_tokens: 0, output_tokens: 0 } } as any), null);
61
+ });
62
+
63
+ test("cross-runtime golden vectors (TS sink prices identically)", () => {
64
+ const candidates = [
65
+ path.join(here, "../../../../scripts/telemetry/pricing.golden.json"),
66
+ path.join(here, "../../../scripts/telemetry/pricing.golden.json"),
67
+ path.join(process.cwd(), "../../scripts/telemetry/pricing.golden.json")
68
+ ];
69
+ const file = candidates.find((p) => fs.existsSync(p));
70
+ assert.ok(file, "pricing.golden.json not found");
71
+ const golden = JSON.parse(fs.readFileSync(file!, "utf8"));
72
+ const sink = tmpSink();
73
+ for (const c of golden.cases) {
74
+ const ev = sink.emitUsage({
75
+ byModel: [{
76
+ model: c.model,
77
+ inputTokens: c.tokens.input,
78
+ outputTokens: c.tokens.output,
79
+ cacheCreationInputTokens: c.tokens.cache_creation,
80
+ cacheReadInputTokens: c.tokens.cache_read
81
+ }]
82
+ } as any);
83
+ assert.equal((ev.usage as any).estimated_cost_usd, c.expected_cost_usd, `golden ${c.name} (${c.model})`);
84
+ }
85
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kontourai/flow-agents",
3
- "version": "2.0.1",
3
+ "version": "2.1.1",
4
4
  "description": "Flow Agents — a Kontour product that applies Flow and Veritas discipline as a portable process layer inside the agent tools you already use: Claude Code, Codex, Kiro, opencode, pi, and GitHub Actions — with framework adapters (AWS Strands preview) on the same policy-engine contract.",
5
5
  "keywords": [
6
6
  "agents",
@@ -141,7 +141,7 @@
141
141
  "typescript": "^6.0.3"
142
142
  },
143
143
  "dependencies": {
144
- "@kontourai/flow": "~1.3.0"
144
+ "@kontourai/flow": "~1.4.1"
145
145
  },
146
146
  "optionalDependencies": {
147
147
  "hachure": "^0.5.1",
@@ -61,6 +61,10 @@ const { spawnSync } = require('child_process');
61
61
  const fs = require('fs');
62
62
  const os = require('os');
63
63
  const path = require('path');
64
+ // One normative definition shared with scripts/hooks/stop-goal-fit.js — the local
65
+ // copy here had drifted (it was missing the trailing `/bin/true` check), which is
66
+ // exactly why this is now imported rather than duplicated.
67
+ const { hasLaunderingOperator } = require('../lib/command-log-chain.js');
64
68
 
65
69
  // ---------------------------------------------------------------------------
66
70
  // Helpers
@@ -80,29 +84,9 @@ function isPassingValue(v) {
80
84
  return v === true || v === 1 || v === 'true' || v === 'pass';
81
85
  }
82
86
 
83
- /**
84
- * Returns true when a command string contains an exit-code-laundering operator.
85
- * These operators mask real exit codes so the real sub-command may have failed silently.
86
- *
87
- * Rules (applied to claimed verification commands only):
88
- * - ANY || operator — verify commands must not contain ||. This catches:
89
- * || exit 0, || echo ok, || /bin/true, || true, || :, etc.
90
- * - ; or newline followed by true / : / exit 0 — trailing success injection
91
- *
92
- * NOTE: Logic must stay identical to scripts/hooks/stop-goal-fit.js hasLaunderingOperator.
93
- * Centralize into a shared module as a follow-up (coordinate-free duplication for now).
94
- */
95
- function hasLaunderingOperator(cmd) {
96
- // Flag ANY || operator — masks the exit code of the left-hand command.
97
- if (/\|\|/.test(cmd)) return true;
98
- // Flag ; or newline followed by true / : / exit 0
99
- if (/[;\n]\s*true\b/.test(cmd)) return true;
100
- if (/[;\n]\s*:\s*(?:$|\s|;|\n)/.test(cmd)) return true;
101
- if (/[;\n]\s*exit\s+0\b/.test(cmd)) return true;
102
- // Flag pipe to true (pipeline absorbs exit code)
103
- if (/\|\s*true\b/.test(cmd)) return true;
104
- return false;
105
- }
87
+ // hasLaunderingOperator is imported from ../lib/command-log-chain.js (above) so this
88
+ // CI reconciler and the stop-goal-fit verifier apply the identical exit-code-mask
89
+ // heuristic — see that module for the rules.
106
90
 
107
91
  /**
108
92
  * Run a single shell command under bash, capturing exit code.
@@ -62,43 +62,16 @@ const COMMAND_TOOL_NAME = /(^|[^a-z])(bash|shell|sh|exec|run|command|terminal|cm
62
62
 
63
63
  // ─── Hash-chain integrity (tamper-EVIDENCE) ───────────────────────────────────
64
64
  //
65
- // Genesis prevHash: a fixed arbitrary sentinel used when the log is empty or
66
- // the last entry has no _chain field (legacy record). This is NOT the SHA256 of
67
- // any specific input string — it is a fixed constant chosen for the original
68
- // implementation. (A previous comment incorrectly claimed it was
69
- // sha256("flow-agents:command-log:genesis"); that is wrong.)
70
- //
71
- // Writer (this file, CHAIN_GENESIS) and verifier (stop-goal-fit.js,
72
- // CHAIN_GENESIS_VERIFY) MUST use the same value. Do not change one without
73
- // changing the other — existing chained logs depend on this constant.
74
- //
75
- // HONEST FRAMING: this makes alteration DETECTABLE, not impossible. An agent
76
- // that rewrites all hashes can still forge the chain. The real tamper-proof
77
- // boundary is the signed checkpoint (B1). We do not oversell this boundary.
78
- const CHAIN_GENESIS = 'a3f9e2b7d5c84f1e6a0d2c3b9f7e1a4d8c6b5f2e9a0d3c7b1f4e8a2d6c0b9f3';
79
-
80
- /**
81
- * Stable canonical JSON for the chain input: the record WITHOUT the `_chain`
82
- * field, keys sorted alphabetically. This ensures the hash is independent of
83
- * key insertion order and that `_chain` itself does not contribute to its own
84
- * hash (circular dependency).
85
- */
86
- function canonicalJsonForChain(record) {
87
- // Strip _chain if present (should not be, but defensive).
88
- const keys = Object.keys(record).filter(k => k !== '_chain').sort();
89
- const obj = {};
90
- for (const k of keys) obj[k] = record[k];
91
- return JSON.stringify(obj);
92
- }
93
-
94
- /**
95
- * Compute the sha256 hex hash for this chain link.
96
- * hash = sha256(prevHash + canonicalJson(record))
97
- */
98
- function computeChainHash(prevHash, record) {
99
- const input = prevHash + canonicalJsonForChain(record);
100
- return crypto.createHash('sha256').update(input, 'utf8').digest('hex');
101
- }
65
+ // CHAIN_GENESIS is a fixed arbitrary sentinel — NOT the SHA256 of any specific
66
+ // input string (a previous comment incorrectly claimed sha256("…:genesis")). The
67
+ // writer here and the verifier in stop-goal-fit.js MUST canonicalize and seed
68
+ // identically, so the genesis constant and the canonicalJson/hash helpers live in
69
+ // ONE shared module that both import — divergence is structurally impossible.
70
+ const {
71
+ CHAIN_GENESIS,
72
+ canonicalJsonForChain,
73
+ computeChainHash,
74
+ } = require('../lib/command-log-chain.js');
102
75
 
103
76
  /**
104
77
  * Read the last entry from command-log.jsonl that has a `_chain` block.
@@ -121,6 +94,58 @@ function readLastChainState(logFile) {
121
94
  }
122
95
  return { seq: -1, hash: CHAIN_GENESIS };
123
96
  }
97
+
98
+ // ─── Concurrency-safe append (lockfile) ──────────────────────────────────────
99
+ //
100
+ // The chain link is a read-(last hash)→compute→append critical section. Without
101
+ // mutual exclusion, two capture processes writing to the SAME command-log
102
+ // concurrently (e.g. parallel agents in one workspace) can both read the same
103
+ // prevHash and append entries with an identical seq/prevHash — forking the chain
104
+ // and tripping the tamper-evidence verifier on a benign race. We serialize the
105
+ // section with an atomic create-exclusive lockfile.
106
+ //
107
+ // FAIL-OPEN, like the rest of this hook: if the lock cannot be acquired we still
108
+ // append (capture must NEVER block the agent or drop evidence), accepting the
109
+ // small residual race rather than losing the record. A crashed holder's stale
110
+ // lock is stolen after LOCK_STALE_MS so a dead process can't wedge capture.
111
+ const LOCK_RETRY_MS = 5; // backoff between attempts
112
+ const LOCK_MAX_TRIES = 200; // ~1s total acquisition budget
113
+ const LOCK_STALE_MS = 10000; // steal a lock older than this (crashed holder)
114
+
115
+ /** Synchronous sleep without busy-spinning. Best-effort; no-ops if unavailable. */
116
+ function sleepSync(ms) {
117
+ try { Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms); }
118
+ catch { /* SharedArrayBuffer/Atomics unavailable — skip the backoff */ }
119
+ }
120
+
121
+ /**
122
+ * Acquire an exclusive lock via atomic create-exclusive (O_CREAT|O_EXCL).
123
+ * Returns a file descriptor on success, or null on failure (caller fails open).
124
+ */
125
+ function acquireLock(lockFile) {
126
+ for (let i = 0; i < LOCK_MAX_TRIES; i++) {
127
+ try {
128
+ const fd = fs.openSync(lockFile, 'wx');
129
+ try { fs.writeSync(fd, String(process.pid)); } catch { /* pid is advisory only */ }
130
+ return fd;
131
+ } catch (err) {
132
+ if (!err || err.code !== 'EEXIST') return null; // unexpected — fail open
133
+ // Lock held: steal it if the holder appears dead (stale), else back off.
134
+ try {
135
+ const st = fs.statSync(lockFile);
136
+ if (Date.now() - st.mtimeMs > LOCK_STALE_MS) { fs.unlinkSync(lockFile); continue; }
137
+ } catch { continue; } // lock vanished between open and stat — retry immediately
138
+ sleepSync(LOCK_RETRY_MS);
139
+ }
140
+ }
141
+ return null;
142
+ }
143
+
144
+ /** Release a lock acquired by acquireLock. Best-effort. */
145
+ function releaseLock(fd, lockFile) {
146
+ try { fs.closeSync(fd); } catch { /* already closed */ }
147
+ try { fs.unlinkSync(lockFile); } catch { /* already removed */ }
148
+ }
124
149
  // ─────────────────────────────────────────────────────────────────────────────
125
150
 
126
151
  function parseJson(raw) {
@@ -305,20 +330,30 @@ function run(rawInput) {
305
330
  const logFile = path.join(artifactDir, 'command-log.jsonl');
306
331
  fs.mkdirSync(artifactDir, { recursive: true });
307
332
 
308
- // Hash-chain integrity: compute _chain before appending. Fail-open: any
309
- // error in chain computation falls back to the plain record (no _chain).
310
- // A chain failure must NEVER block capture or corrupt the log.
311
- let recordToWrite = record;
333
+ // Serialize the read→compute→append critical section so concurrent captures
334
+ // (parallel agents sharing this log) cannot fork the hash-chain. Fail-open:
335
+ // a null fd means we could not lock — we still append rather than drop the
336
+ // record. The lock is always released in finally.
337
+ const lockFile = logFile + '.lock';
338
+ const lockFd = acquireLock(lockFile);
312
339
  try {
313
- const { seq: prevSeq, hash: prevHash } = readLastChainState(logFile);
314
- const seq = prevSeq + 1;
315
- const hash = computeChainHash(prevHash, record);
316
- // Spread record fields then add _chain so the chain field is appended last
317
- // (cosmetic ordering; canonicalJsonForChain excludes it during hashing).
318
- recordToWrite = { ...record, _chain: { seq, prevHash, hash } };
319
- } catch { /* chain computation failed — write plain record, do not block */ }
320
-
321
- fs.appendFileSync(logFile, JSON.stringify(recordToWrite) + '\n');
340
+ // Hash-chain integrity: compute _chain before appending. Fail-open: any
341
+ // error in chain computation falls back to the plain record (no _chain).
342
+ // A chain failure must NEVER block capture or corrupt the log.
343
+ let recordToWrite = record;
344
+ try {
345
+ const { seq: prevSeq, hash: prevHash } = readLastChainState(logFile);
346
+ const seq = prevSeq + 1;
347
+ const hash = computeChainHash(prevHash, record);
348
+ // Spread record fields then add _chain so the chain field is appended last
349
+ // (cosmetic ordering; canonicalJsonForChain excludes it during hashing).
350
+ recordToWrite = { ...record, _chain: { seq, prevHash, hash } };
351
+ } catch { /* chain computation failed — write plain record, do not block */ }
352
+
353
+ fs.appendFileSync(logFile, JSON.stringify(recordToWrite) + '\n');
354
+ } finally {
355
+ if (lockFd !== null) releaseLock(lockFd, lockFile);
356
+ }
322
357
  } catch { /* fail-open: capture never blocks or corrupts */ }
323
358
  return rawInput;
324
359
  }