@kontourai/flow-agents 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/.github/actions/trust-verify/action.yml +4 -2
  2. package/.github/workflows/ci.yml +12 -0
  3. package/CHANGELOG.md +21 -0
  4. package/README.md +3 -3
  5. package/build/src/cli/workflow-sidecar.js +8 -2
  6. package/context/scripts/telemetry/lib/config.sh +15 -0
  7. package/context/scripts/telemetry/telemetry.conf +4 -0
  8. package/context/scripts/telemetry/telemetry.sh +23 -1
  9. package/docs/design/flowrun-eventsourcing-design.md +216 -0
  10. package/docs/design/workflowrun-observability-design.md +431 -0
  11. package/evals/ci/antigaming-suite.sh +1 -0
  12. package/evals/ci/run-baseline.sh +2 -0
  13. package/evals/integration/test_command_log_concurrency.sh +114 -0
  14. package/evals/integration/test_usage_cost.sh +119 -0
  15. package/evals/integration/test_verify_cli.sh +23 -0
  16. package/integrations/strands/flow_agents_strands/hooks.py +126 -1
  17. package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
  18. package/integrations/strands/tests/test_usage.py +129 -0
  19. package/integrations/strands-ts/src/hooks.ts +135 -1
  20. package/integrations/strands-ts/src/telemetry.ts +170 -0
  21. package/integrations/strands-ts/test/test-usage.ts +85 -0
  22. package/package.json +2 -2
  23. package/scripts/hooks/evidence-capture.js +75 -13
  24. package/scripts/telemetry/lib/config.sh +15 -0
  25. package/scripts/telemetry/lib/pricing.sh +42 -0
  26. package/scripts/telemetry/lib/usage.sh +108 -0
  27. package/scripts/telemetry/pricing.golden.json +15 -0
  28. package/scripts/telemetry/pricing.json +31 -0
  29. package/scripts/telemetry/telemetry.conf +4 -0
  30. package/scripts/telemetry/telemetry.sh +23 -1
  31. package/src/cli/workflow-sidecar.ts +8 -2
@@ -1,6 +1,12 @@
1
1
  #!/usr/bin/env bash
2
2
  # usage.sh — Session usage metric functions
3
3
 
4
+ # Module directory, resolved once at source time (cwd-independent).
5
+ USAGE_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
+
7
+ # Single-source pricing registry loader (local / remote / bundled).
8
+ source "${USAGE_LIB_DIR}/pricing.sh"
9
+
4
10
  # Resolve model from agent-spec.json
5
11
  usage_get_model() {
6
12
  local agent_name="$1"
@@ -27,3 +33,105 @@ usage_count_delegations() {
27
33
  [[ ! -f "$jsonl_path" ]] && echo 0 && return
28
34
  grep -c "\"session_id\":\"${session_id}\".*\"event_type\":\"agent.delegate\"" "$jsonl_path" 2>/dev/null || echo 0
29
35
  }
36
+
37
+ # Parse a runtime transcript (JSONL) into real per-model token + cost usage.
38
+ # Ground truth lives in each assistant message's `.message.usage` block:
39
+ # input_tokens (uncached), output_tokens, cache_creation_input_tokens,
40
+ # cache_read_input_tokens — plus `.message.model`.
41
+ # Cost is derived from the versioned pricing registry: cache writes bill at
42
+ # input*write_5m, cache reads at input*read. Cost uses the registry's
43
+ # current_version (override with arg $2) and the result stamps `pricing_version`
44
+ # so the console can reproduce or recompute it. Emits a compact JSON object:
45
+ # { by_model: [ {model, input_tokens, output_tokens,
46
+ # cache_creation_input_tokens, cache_read_input_tokens,
47
+ # estimated_cost_usd} ],
48
+ # input_tokens, output_tokens, cache_creation_input_tokens,
49
+ # cache_read_input_tokens, estimated_cost_usd, pricing_version }
50
+ # Prints nothing and returns non-zero when the transcript is missing/unparseable so the
51
+ # caller can fall back to null usage. Never blocks agent work.
52
+ # Expected transcript usage path (Claude Code / Anthropic usage object). Bumped
53
+ # if the on-disk schema changes so drift is logged rather than silently zeroed.
54
+ USAGE_TRANSCRIPT_SCHEMA="message.usage.input_tokens"
55
+
56
# Append a one-line schema-drift warning: the transcript carried usage data
# that could not be parsed at the expected path.
#
# Arguments:
#   $1  path of the transcript that triggered the warning (used in the message)
# Environment:
#   TELEMETRY_DRIFT_LOG  when set and non-empty, the warning is appended to this
#                        file; otherwise (or if the append fails) it is written
#                        to stderr.
# Returns:
#   Always 0 — drift logging must never fail the caller.
usage_log_drift() {
  local transcript="$1"
  local msg="[telemetry] pricing/usage drift: ${transcript} has usage data but expected path '${USAGE_TRANSCRIPT_SCHEMA}' parsed 0 tokens — transcript schema may have changed"
  if [[ -n "${TELEMETRY_DRIFT_LOG:-}" ]]; then
    # Redirections are applied left to right, so a trailing 2>/dev/null on the
    # same command does not hide a failure to open the log file. Grouping the
    # append puts the shell's own diagnostic under the redirect as well.
    { printf '%s\n' "$msg" >> "${TELEMETRY_DRIFT_LOG}"; } 2>/dev/null && return 0
  fi
  printf '%s\n' "$msg" >&2
  return 0
}
67
+
68
# Parse a JSONL runtime transcript into per-model token and cost usage.
#
# Arguments:
#   $1  transcript path; usage is read from each line's .message.usage and the
#       model from .message.model (falling back to "unknown")
#   $2  optional pricing version; defaults to the registry's current_version
# Output:
#   One JSON object on stdout: by_model[], the four token totals,
#   estimated_cost_usd (rounded to 6 decimals) and pricing_version.
# Returns:
#   1 (printing nothing) when the transcript is missing, jq is unavailable, the
#   registry or the requested version cannot be resolved, jq fails, or zero
#   tokens were parsed. In the zero-token case a drift warning is logged if the
#   file still mentions "input_tokens".
usage_parse_transcript() {
  local transcript="$1" version="${2:-}"
  [[ -z "$transcript" || ! -f "$transcript" ]] && return 1
  command -v jq >/dev/null 2>&1 || return 1
  local registry
  registry="$(pricing_registry)" || return 1
  [[ -z "$registry" ]] && return 1

  # Notes on the jq program:
  #  - Zero-cost models are matched by exact name. jq's inside/contains does
  #    substring matching on strings, which would make any model whose name is
  #    a substring of a zero-cost entry free.
  #  - A registry version without zero_cost_models is treated as "none free".
  #  - Lines that are not objects, or whose .message is not an object, carry
  #    no usage and are skipped rather than aborting the whole parse.
  local out
  out="$(jq -n --argjson registry "$registry" --arg version "$version" '
    $registry as $reg
    | (if $version == "" then ($reg.current_version) else $version end) as $ver
    | ($reg.versions[$ver]) as $p
    | if $p == null then empty else . end
    | ($p.cache_multipliers) as $cm
    | ($p.zero_cost_models // []) as $free
    | (reduce inputs as $l ({};
        (try ($l.message.usage) catch null) as $u
        | if $u then
            (($l.message.model) // "unknown") as $m
            | .[$m].input = ((.[$m].input // 0) + (($u.input_tokens) // 0))
            | .[$m].output = ((.[$m].output // 0) + (($u.output_tokens) // 0))
            | .[$m].cache_creation = ((.[$m].cache_creation // 0) + (($u.cache_creation_input_tokens) // 0))
            | .[$m].cache_read = ((.[$m].cache_read // 0) + (($u.cache_read_input_tokens) // 0))
          else . end)) as $agg
    | ($agg | to_entries
        | map(
            .key as $m | .value as $u
            | (($p.models[$m]) // $p.default) as $rate
            | (if any($free[]; . == $m) then 0 else 1 end) as $billable
            | {
                model: $m,
                input_tokens: ($u.input // 0),
                output_tokens: ($u.output // 0),
                cache_creation_input_tokens: ($u.cache_creation // 0),
                cache_read_input_tokens: ($u.cache_read // 0),
                estimated_cost_usd: (
                  $billable * (
                    ($u.input // 0) * $rate.input
                    + ($u.output // 0) * $rate.output
                    + ($u.cache_creation // 0) * $rate.input * $cm.write_5m
                    + ($u.cache_read // 0) * $rate.input * $cm.read
                  ) / 1000000
                )
              })) as $by_model
    | {
        by_model: $by_model,
        input_tokens: ([$by_model[].input_tokens] | add // 0),
        output_tokens: ([$by_model[].output_tokens] | add // 0),
        cache_creation_input_tokens: ([$by_model[].cache_creation_input_tokens] | add // 0),
        cache_read_input_tokens: ([$by_model[].cache_read_input_tokens] | add // 0),
        estimated_cost_usd: (([$by_model[].estimated_cost_usd] | add // 0) * 1000000 | round / 1000000),
        pricing_version: $ver
      }
  ' < "$transcript" 2>/dev/null)"

  [[ -z "$out" ]] && return 1

  # Drift / emptiness check: zero parsed tokens while the transcript still
  # mentions "input_tokens" means the schema moved — warn and report no usage.
  local total
  total="$(printf '%s' "$out" | jq -r '((.input_tokens // 0) + (.output_tokens // 0) + (.cache_creation_input_tokens // 0) + (.cache_read_input_tokens // 0))' 2>/dev/null)"
  if [[ -z "$total" || "$total" == "0" ]]; then
    if grep -q '"input_tokens"' "$transcript" 2>/dev/null; then
      usage_log_drift "$transcript"
    fi
    return 1
  fi

  printf '%s\n' "$out"
}
@@ -0,0 +1,15 @@
1
+ {
2
+ "_note": "Cross-runtime cost golden vectors. Keep IN SYNC with console-telemetry/test/golden-vectors.json (identical content). Asserted by the flow-agents bash usage tests, the Python sink tests, and the console-telemetry package so every runtime that prices tokens produces the SAME cost. If these drift between repos, a runtime's cost math has diverged.",
3
+ "pricing_version": "2026-06-28",
4
+ "cases": [
5
+ { "name": "opus cache-read-dominated", "model": "claude-opus-4-8", "tokens": { "input": 1000, "output": 2000, "cache_creation": 0, "cache_read": 500000 }, "expected_cost_usd": 0.305 },
6
+ { "name": "opus output only", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.025 },
7
+ { "name": "fable output", "model": "claude-fable-5", "tokens": { "input": 0, "output": 100, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
8
+ { "name": "haiku output", "model": "claude-haiku-4-5", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
9
+ { "name": "sonnet input 1M", "model": "claude-sonnet-4-6", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 3.0 },
10
+ { "name": "opus cache-write 5m tier", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 0, "cache_creation": 1000000, "cache_read": 0 }, "expected_cost_usd": 6.25 },
11
+ { "name": "opus billion-scale", "model": "claude-opus-4-8", "tokens": { "input": 200000, "output": 1600000, "cache_creation": 9000000, "cache_read": 1000000000 }, "expected_cost_usd": 597.25 },
12
+ { "name": "synthetic is free", "model": "<synthetic>", "tokens": { "input": 999, "output": 999, "cache_creation": 999, "cache_read": 999 }, "expected_cost_usd": 0 },
13
+ { "name": "unknown model uses default rate", "model": "some-unlisted-model", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 5.0 }
14
+ ]
15
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "schema_version": "2.0",
3
+ "current_version": "2026-06-28",
4
+ "source": "Anthropic public list pricing; cache multipliers per prompt-caching docs",
5
+ "versions": {
6
+ "2026-06-28": {
7
+ "effective_date": "2026-06-28",
8
+ "currency": "USD",
9
+ "unit": "per_1m_tokens",
10
+ "cache_multipliers": {
11
+ "write_5m": 1.25,
12
+ "write_1h": 2.0,
13
+ "read": 0.1
14
+ },
15
+ "models": {
16
+ "claude-fable-5": { "input": 10.0, "output": 50.0 },
17
+ "claude-mythos-5": { "input": 10.0, "output": 50.0 },
18
+ "claude-opus-4-8": { "input": 5.0, "output": 25.0 },
19
+ "claude-opus-4-7": { "input": 5.0, "output": 25.0 },
20
+ "claude-opus-4-6": { "input": 5.0, "output": 25.0 },
21
+ "claude-opus-4-5": { "input": 5.0, "output": 25.0 },
22
+ "claude-opus-4-1": { "input": 15.0, "output": 75.0 },
23
+ "claude-sonnet-4-6": { "input": 3.0, "output": 15.0 },
24
+ "claude-sonnet-4-5": { "input": 3.0, "output": 15.0 },
25
+ "claude-haiku-4-5": { "input": 1.0, "output": 5.0 }
26
+ },
27
+ "default": { "input": 5.0, "output": 25.0 },
28
+ "zero_cost_models": ["<synthetic>", "synthetic", "unknown", ""]
29
+ }
30
+ }
31
+ }
@@ -8,6 +8,10 @@ channel.analytics.redact=tool.input,tool.output,turn.prompt_text,delegation.targ
8
8
  # The transport derives /api/telemetry/records from console_telemetry_url.
9
9
  # console_telemetry_token=
10
10
  # console_tenant_id=
11
+ # Live pricing registry source. If unset, derived from console_telemetry_url as
12
+ # <console>/api/telemetry/pricing so bash/Python/TS runtimes read one live
13
+ # source; lib/pricing.sh caches it and falls back to bundled pricing.json.
14
+ # console_pricing_url=https://console.kontourai.io/api/telemetry/pricing
11
15
  enrich_system=true
12
16
  enrich_workspace=true
13
17
  enrich_auth=true
@@ -309,13 +309,35 @@ add_stop_data_and_emit_usage() {
309
309
  tool_count=$(usage_count_tool_calls "$session_id" "$full_log")
310
310
  delegation_count=$(usage_count_delegations "$session_id" "$full_log")
311
311
 
312
+ # Ground-truth token + cost usage from the runtime transcript, when the
313
+ # runtime exposes one (Claude Code, Codex, etc. set hook.transcript_path).
314
+ # Tokens are source-of-truth; estimated_cost_usd is derived from pricing.json
315
+ # (recomputed authoritatively console-side, so pricing updates are retroactive).
316
+ local transcript_path transcript_usage
317
+ transcript_path=$(echo "$event" | jq -r '.hook.transcript_path // ""')
318
+ transcript_usage=$(usage_parse_transcript "$transcript_path")
319
+ [[ -z "$transcript_usage" ]] && transcript_usage='null'
320
+
312
321
  local usage_event
313
322
  usage_event=$(echo "$event" | jq -c \
314
323
  --arg m "$model" \
315
324
  --argjson tc "$tool_count" \
316
325
  --argjson dc "$delegation_count" \
326
+ --argjson tu "$transcript_usage" \
317
327
  '.event_type = "session.usage" | .event_id = (.event_id + "-usage") | . + {
318
- usage: {model: $m, duration_s: .session.duration_s, tool_invocations: $tc, delegations: $dc, input_tokens: null, output_tokens: null, estimated_cost_usd: null}
328
+ usage: ({
329
+ model: $m,
330
+ duration_s: .session.duration_s,
331
+ tool_invocations: $tc,
332
+ delegations: $dc,
333
+ input_tokens: ($tu.input_tokens // null),
334
+ output_tokens: ($tu.output_tokens // null),
335
+ cache_creation_input_tokens: ($tu.cache_creation_input_tokens // null),
336
+ cache_read_input_tokens: ($tu.cache_read_input_tokens // null),
337
+ estimated_cost_usd: ($tu.estimated_cost_usd // null),
338
+ pricing_version: ($tu.pricing_version // null),
339
+ by_model: ($tu.by_model // null)
340
+ })
319
341
  }')
320
342
  transport_emit "$usage_event"
321
343
  fi
@@ -19,11 +19,17 @@ export const verdicts = new Set(["pass", "partial", "fail", "not_verified"]);
19
19
  function now(): string { return new Date().toISOString().replace(/\.\d{3}Z$/, "Z"); }
20
20
  function read(file: string): string { return fs.readFileSync(file, "utf8"); }
21
21
  export function writeJson(file: string, payload: AnyObj): void { fs.mkdirSync(path.dirname(file), { recursive: true }); fs.writeFileSync(file, `${JSON.stringify(payload, null, 2)}\n`); }
22
- function printJson(payload: AnyObj): void { console.log(JSON.stringify(payload).replace(/":/g, '": ').replace(/,"/g, ', "')); }
22
/**
 * Renders `payload` as single-line JSON with readable `"key": value` spacing.
 *
 * The payload is stringified with a one-space indent and every structural line
 * break (together with the indentation that follows it) is collapsed to a single
 * space. JSON.stringify escapes line breaks inside string values, so the contents
 * of strings are never rewritten — unlike a regex over `":` / `,"` sequences.
 * Non-empty containers are padded as well, e.g. `{ "a": [ 1, 2 ] }`.
 *
 * @param payload - Value to serialize.
 * @param replacer - Optional JSON.stringify property allow-list. It applies at
 *   every nesting depth, not just the top level, and also fixes the key order.
 * @returns The single-line JSON text.
 */
function spacedLine(payload: AnyObj, replacer?: (string | number)[]): string {
  // The array-replacer overload of JSON.stringify already accepts this type,
  // so no type assertion is needed.
  return JSON.stringify(payload, replacer, 1).replace(/\n\s*/g, " ");
}
28
/** Writes `payload` to stdout as one spaced JSON line. */
function printJson(payload: AnyObj): void {
  const rendered = spacedLine(payload);
  console.log(rendered);
}
23
29
  export function loadJson(file: string, fallback: AnyObj = {}): AnyObj { return fs.existsSync(file) ? JSON.parse(read(file)) : { ...fallback }; }
24
30
  export function appendJsonl(file: string, payload: AnyObj): void {
25
31
  fs.mkdirSync(path.dirname(file), { recursive: true });
26
- const line = JSON.stringify(payload, Object.keys(payload).sort()).replace(/":/g, '": ').replace(/,"/g, ', "');
32
+ const line = spacedLine(payload, Object.keys(payload).sort());
27
33
  fs.appendFileSync(file, `${line}\n`);
28
34
  }
29
35
  function die(message: string): never { throw new Error(message); }