@kontourai/flow-agents 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/actions/trust-verify/action.yml +4 -2
- package/.github/workflows/ci.yml +12 -0
- package/CHANGELOG.md +21 -0
- package/README.md +3 -3
- package/build/src/cli/workflow-sidecar.js +8 -2
- package/context/scripts/telemetry/lib/config.sh +15 -0
- package/context/scripts/telemetry/telemetry.conf +4 -0
- package/context/scripts/telemetry/telemetry.sh +23 -1
- package/docs/design/flowrun-eventsourcing-design.md +216 -0
- package/docs/design/workflowrun-observability-design.md +431 -0
- package/evals/ci/antigaming-suite.sh +1 -0
- package/evals/ci/run-baseline.sh +2 -0
- package/evals/integration/test_command_log_concurrency.sh +114 -0
- package/evals/integration/test_usage_cost.sh +119 -0
- package/evals/integration/test_verify_cli.sh +23 -0
- package/integrations/strands/flow_agents_strands/hooks.py +126 -1
- package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
- package/integrations/strands/tests/test_usage.py +129 -0
- package/integrations/strands-ts/src/hooks.ts +135 -1
- package/integrations/strands-ts/src/telemetry.ts +170 -0
- package/integrations/strands-ts/test/test-usage.ts +85 -0
- package/package.json +2 -2
- package/scripts/hooks/evidence-capture.js +75 -13
- package/scripts/telemetry/lib/config.sh +15 -0
- package/scripts/telemetry/lib/pricing.sh +42 -0
- package/scripts/telemetry/lib/usage.sh +108 -0
- package/scripts/telemetry/pricing.golden.json +15 -0
- package/scripts/telemetry/pricing.json +31 -0
- package/scripts/telemetry/telemetry.conf +4 -0
- package/scripts/telemetry/telemetry.sh +23 -1
- package/src/cli/workflow-sidecar.ts +8 -2
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
# usage.sh — Session usage metric functions
|
|
3
3
|
|
|
4
|
+
# Module directory, resolved once at source time (cwd-independent).
|
|
5
|
+
USAGE_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
6
|
+
|
|
7
|
+
# Single-source pricing registry loader (local / remote / bundled).
|
|
8
|
+
source "${USAGE_LIB_DIR}/pricing.sh"
|
|
9
|
+
|
|
4
10
|
# Resolve model from agent-spec.json
|
|
5
11
|
usage_get_model() {
|
|
6
12
|
local agent_name="$1"
|
|
@@ -27,3 +33,105 @@ usage_count_delegations() {
|
|
|
27
33
|
[[ ! -f "$jsonl_path" ]] && echo 0 && return
|
|
28
34
|
grep -c "\"session_id\":\"${session_id}\".*\"event_type\":\"agent.delegate\"" "$jsonl_path" 2>/dev/null || echo 0
|
|
29
35
|
}
|
|
36
|
+
|
|
37
|
+
# Parse a runtime transcript (JSONL) into real per-model token + cost usage.
|
|
38
|
+
# Ground truth lives in each assistant message's `.message.usage` block:
|
|
39
|
+
# input_tokens (uncached), output_tokens, cache_creation_input_tokens,
|
|
40
|
+
# cache_read_input_tokens — plus `.message.model`.
|
|
41
|
+
# Cost is derived from the versioned pricing registry: cache writes bill at
|
|
42
|
+
# input*write_5m, cache reads at input*read. Cost uses the registry's
|
|
43
|
+
# current_version (override with arg $2) and the result stamps `pricing_version`
|
|
44
|
+
# so the console can reproduce or recompute it. Emits a compact JSON object:
|
|
45
|
+
# { by_model: [ {model, input_tokens, output_tokens,
|
|
46
|
+
# cache_creation_input_tokens, cache_read_input_tokens,
|
|
47
|
+
# estimated_cost_usd} ],
|
|
48
|
+
# input_tokens, output_tokens, cache_creation_input_tokens,
|
|
49
|
+
# cache_read_input_tokens, estimated_cost_usd, pricing_version }
|
|
50
|
+
# Prints nothing and returns non-zero when the transcript is missing/unparseable so the
|
|
51
|
+
# caller can fall back to null usage. Never blocks agent work.
|
|
52
|
+
# Expected transcript usage path (Claude Code / Anthropic usage object). Bumped
|
|
53
|
+
# if the on-disk schema changes so drift is logged rather than silently zeroed.
|
|
54
|
+
USAGE_TRANSCRIPT_SCHEMA="message.usage.input_tokens"
|
|
55
|
+
|
|
56
|
+
# Append a one-line schema-drift warning (transcript carried usage data we could
|
|
57
|
+
# not parse). Goes to TELEMETRY_DRIFT_LOG if set, else stderr. Never fatal.
|
|
58
|
+
# Emit a single schema-drift warning line for the given transcript.
#   $1 — transcript path, interpolated into the message.
# When TELEMETRY_DRIFT_LOG is non-empty the line is appended to that file;
# if that append fails, or the variable is unset/empty, the line goes to
# stderr instead.
usage_log_drift() {
  local transcript="$1"
  local drift_log="${TELEMETRY_DRIFT_LOG:-}"
  local msg="[telemetry] pricing/usage drift: ${transcript} has usage data but expected path '${USAGE_TRANSCRIPT_SCHEMA}' parsed 0 tokens — transcript schema may have changed"

  # No log file configured: warn on stderr and stop.
  if [[ -z "$drift_log" ]]; then
    echo "$msg" >&2
    return
  fi

  # Log file configured: append, falling back to stderr if the write fails.
  echo "$msg" >> "${drift_log}" 2>/dev/null || echo "$msg" >&2
}
|
|
67
|
+
|
|
68
|
+
# Parse a JSONL transcript into aggregated per-model token counts and cost.
#   $1 — path to the transcript file (one JSON value per entry).
#   $2 — optional pricing version; empty means the registry's current_version.
# Prints one JSON object (by_model, token totals, estimated_cost_usd,
# pricing_version) on success. Prints nothing and returns 1 when the
# transcript is missing, jq is unavailable, the pricing registry cannot be
# loaded, the requested version is absent, jq fails, or zero tokens were parsed.
usage_parse_transcript() {
  local transcript="$1" version="${2:-}"
  [[ -z "$transcript" || ! -f "$transcript" ]] && return 1
  command -v jq >/dev/null 2>&1 || return 1
  local registry
  registry="$(pricing_registry)" || return 1
  [[ -z "$registry" ]] && return 1

  local out
  out="$(jq -n --argjson registry "$registry" --arg version "$version" '
    # Treat any non-numeric token field (missing, null, string) as 0.
    def num: if type == "number" then . else 0 end;
    # Type-checked lookup: entries whose "message" is a string (or that are
    # not objects at all) yield null instead of aborting the whole parse.
    def field($k): if type == "object" then .[$k] else null end;
    $registry as $reg
    | (if $version == "" then ($reg.current_version) else $version end) as $ver
    | ($reg.versions[$ver]) as $p
    | if $p == null then empty else . end
    | ($p.cache_multipliers) as $cm
    | (reduce inputs as $l ({};
        ($l | field("message") | field("usage")) as $u
        | if ($u | type) == "object" then
            ($l.message.model | if type == "string" then . else "unknown" end) as $m
            | .[$m].input = ((.[$m].input // 0) + ($u.input_tokens | num))
            | .[$m].output = ((.[$m].output // 0) + ($u.output_tokens | num))
            | .[$m].cache_creation = ((.[$m].cache_creation // 0) + ($u.cache_creation_input_tokens | num))
            | .[$m].cache_read = ((.[$m].cache_read // 0) + ($u.cache_read_input_tokens | num))
          else . end)) as $agg
    | ($agg | to_entries
        | map(
            .key as $m | .value as $u
            | (($p.models[$m]) // $p.default) as $rate
            # Exact-match membership. inside/contains compare strings by
            # substring, which would zero-price any model whose name happens
            # to be a fragment of a zero-cost entry.
            | (if any(($p.zero_cost_models // [])[]; . == $m) then 0 else 1 end) as $billable
            | {
                model: $m,
                input_tokens: ($u.input // 0),
                output_tokens: ($u.output // 0),
                cache_creation_input_tokens: ($u.cache_creation // 0),
                cache_read_input_tokens: ($u.cache_read // 0),
                estimated_cost_usd: (
                  $billable * (
                    ($u.input // 0) * $rate.input
                    + ($u.output // 0) * $rate.output
                    + ($u.cache_creation // 0) * $rate.input * $cm.write_5m
                    + ($u.cache_read // 0) * $rate.input * $cm.read
                  ) / 1000000
                )
              })) as $by_model
    | {
        by_model: $by_model,
        input_tokens: ([$by_model[].input_tokens] | add // 0),
        output_tokens: ([$by_model[].output_tokens] | add // 0),
        cache_creation_input_tokens: ([$by_model[].cache_creation_input_tokens] | add // 0),
        cache_read_input_tokens: ([$by_model[].cache_read_input_tokens] | add // 0),
        estimated_cost_usd: (([$by_model[].estimated_cost_usd] | add // 0) * 1000000 | round / 1000000),
        pricing_version: $ver
      }
  ' < "$transcript" 2>/dev/null)"

  [[ -z "$out" ]] && return 1

  # Drift / emptiness check: if we parsed zero tokens but the transcript clearly
  # contains usage data, the schema drifted — warn and fall back to null usage.
  local total
  total="$(printf '%s' "$out" | jq -r '((.input_tokens // 0) + (.output_tokens // 0) + (.cache_creation_input_tokens // 0) + (.cache_read_input_tokens // 0))' 2>/dev/null)"
  if [[ -z "$total" || "$total" == "0" ]]; then
    if grep -q '"input_tokens"' "$transcript" 2>/dev/null; then
      usage_log_drift "$transcript"
    fi
    return 1
  fi

  printf '%s\n' "$out"
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_note": "Cross-runtime cost golden vectors. Keep IN SYNC with console-telemetry/test/golden-vectors.json (identical content). Asserted by the flow-agents bash usage tests, the Python sink tests, and the console-telemetry package so every runtime that prices tokens produces the SAME cost. If these drift between repos, a runtime's cost math has diverged.",
|
|
3
|
+
"pricing_version": "2026-06-28",
|
|
4
|
+
"cases": [
|
|
5
|
+
{ "name": "opus cache-read-dominated", "model": "claude-opus-4-8", "tokens": { "input": 1000, "output": 2000, "cache_creation": 0, "cache_read": 500000 }, "expected_cost_usd": 0.305 },
|
|
6
|
+
{ "name": "opus output only", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.025 },
|
|
7
|
+
{ "name": "fable output", "model": "claude-fable-5", "tokens": { "input": 0, "output": 100, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
|
|
8
|
+
{ "name": "haiku output", "model": "claude-haiku-4-5", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
|
|
9
|
+
{ "name": "sonnet input 1M", "model": "claude-sonnet-4-6", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 3.0 },
|
|
10
|
+
{ "name": "opus cache-write 5m tier", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 0, "cache_creation": 1000000, "cache_read": 0 }, "expected_cost_usd": 6.25 },
|
|
11
|
+
{ "name": "opus billion-scale", "model": "claude-opus-4-8", "tokens": { "input": 200000, "output": 1600000, "cache_creation": 9000000, "cache_read": 1000000000 }, "expected_cost_usd": 597.25 },
|
|
12
|
+
{ "name": "synthetic is free", "model": "<synthetic>", "tokens": { "input": 999, "output": 999, "cache_creation": 999, "cache_read": 999 }, "expected_cost_usd": 0 },
|
|
13
|
+
{ "name": "unknown model uses default rate", "model": "some-unlisted-model", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 5.0 }
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": "2.0",
|
|
3
|
+
"current_version": "2026-06-28",
|
|
4
|
+
"source": "Anthropic public list pricing; cache multipliers per prompt-caching docs",
|
|
5
|
+
"versions": {
|
|
6
|
+
"2026-06-28": {
|
|
7
|
+
"effective_date": "2026-06-28",
|
|
8
|
+
"currency": "USD",
|
|
9
|
+
"unit": "per_1m_tokens",
|
|
10
|
+
"cache_multipliers": {
|
|
11
|
+
"write_5m": 1.25,
|
|
12
|
+
"write_1h": 2.0,
|
|
13
|
+
"read": 0.1
|
|
14
|
+
},
|
|
15
|
+
"models": {
|
|
16
|
+
"claude-fable-5": { "input": 10.0, "output": 50.0 },
|
|
17
|
+
"claude-mythos-5": { "input": 10.0, "output": 50.0 },
|
|
18
|
+
"claude-opus-4-8": { "input": 5.0, "output": 25.0 },
|
|
19
|
+
"claude-opus-4-7": { "input": 5.0, "output": 25.0 },
|
|
20
|
+
"claude-opus-4-6": { "input": 5.0, "output": 25.0 },
|
|
21
|
+
"claude-opus-4-5": { "input": 5.0, "output": 25.0 },
|
|
22
|
+
"claude-opus-4-1": { "input": 15.0, "output": 75.0 },
|
|
23
|
+
"claude-sonnet-4-6": { "input": 3.0, "output": 15.0 },
|
|
24
|
+
"claude-sonnet-4-5": { "input": 3.0, "output": 15.0 },
|
|
25
|
+
"claude-haiku-4-5": { "input": 1.0, "output": 5.0 }
|
|
26
|
+
},
|
|
27
|
+
"default": { "input": 5.0, "output": 25.0 },
|
|
28
|
+
"zero_cost_models": ["<synthetic>", "synthetic", "unknown", ""]
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -8,6 +8,10 @@ channel.analytics.redact=tool.input,tool.output,turn.prompt_text,delegation.targ
|
|
|
8
8
|
# The transport derives /api/telemetry/records from console_telemetry_url.
|
|
9
9
|
# console_telemetry_token=
|
|
10
10
|
# console_tenant_id=
|
|
11
|
+
# Live pricing registry source. If unset, derived from console_telemetry_url as
|
|
12
|
+
# <console>/api/telemetry/pricing so bash/Python/TS runtimes read one live
|
|
13
|
+
# source; lib/pricing.sh caches it and falls back to bundled pricing.json.
|
|
14
|
+
# console_pricing_url=https://console.kontourai.io/api/telemetry/pricing
|
|
11
15
|
enrich_system=true
|
|
12
16
|
enrich_workspace=true
|
|
13
17
|
enrich_auth=true
|
|
@@ -309,13 +309,35 @@ add_stop_data_and_emit_usage() {
|
|
|
309
309
|
tool_count=$(usage_count_tool_calls "$session_id" "$full_log")
|
|
310
310
|
delegation_count=$(usage_count_delegations "$session_id" "$full_log")
|
|
311
311
|
|
|
312
|
+
# Ground-truth token + cost usage from the runtime transcript, when the
|
|
313
|
+
# runtime exposes one (Claude Code, Codex, etc. set hook.transcript_path).
|
|
314
|
+
# Tokens are source-of-truth; estimated_cost_usd is derived from pricing.json
|
|
315
|
+
# (recomputed authoritatively console-side, so pricing updates are retroactive).
|
|
316
|
+
local transcript_path transcript_usage
|
|
317
|
+
transcript_path=$(echo "$event" | jq -r '.hook.transcript_path // ""')
|
|
318
|
+
transcript_usage=$(usage_parse_transcript "$transcript_path")
|
|
319
|
+
[[ -z "$transcript_usage" ]] && transcript_usage='null'
|
|
320
|
+
|
|
312
321
|
local usage_event
|
|
313
322
|
usage_event=$(echo "$event" | jq -c \
|
|
314
323
|
--arg m "$model" \
|
|
315
324
|
--argjson tc "$tool_count" \
|
|
316
325
|
--argjson dc "$delegation_count" \
|
|
326
|
+
--argjson tu "$transcript_usage" \
|
|
317
327
|
'.event_type = "session.usage" | .event_id = (.event_id + "-usage") | . + {
|
|
318
|
-
usage: {
|
|
328
|
+
usage: ({
|
|
329
|
+
model: $m,
|
|
330
|
+
duration_s: .session.duration_s,
|
|
331
|
+
tool_invocations: $tc,
|
|
332
|
+
delegations: $dc,
|
|
333
|
+
input_tokens: ($tu.input_tokens // null),
|
|
334
|
+
output_tokens: ($tu.output_tokens // null),
|
|
335
|
+
cache_creation_input_tokens: ($tu.cache_creation_input_tokens // null),
|
|
336
|
+
cache_read_input_tokens: ($tu.cache_read_input_tokens // null),
|
|
337
|
+
estimated_cost_usd: ($tu.estimated_cost_usd // null),
|
|
338
|
+
pricing_version: ($tu.pricing_version // null),
|
|
339
|
+
by_model: ($tu.by_model // null)
|
|
340
|
+
})
|
|
319
341
|
}')
|
|
320
342
|
transport_emit "$usage_event"
|
|
321
343
|
fi
|
|
@@ -19,11 +19,17 @@ export const verdicts = new Set(["pass", "partial", "fail", "not_verified"]);
|
|
|
19
19
|
/** Returns the current time as an ISO-8601 UTC string with the milliseconds stripped (ends in `Z`). */
function now(): string {
  const isoWithMillis = new Date().toISOString();
  return isoWithMillis.replace(/\.\d{3}Z$/, "Z");
}
|
|
20
20
|
/** Synchronously reads `file` and returns its contents decoded as UTF-8. */
function read(file: string): string {
  return fs.readFileSync(file, { encoding: "utf8" });
}
|
|
21
21
|
/**
 * Writes `payload` to `file` as 2-space-indented JSON followed by a newline.
 * Missing parent directories are created first.
 */
export function writeJson(file: string, payload: AnyObj): void {
  const parentDir = path.dirname(file);
  fs.mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(file, `${serialized}\n`);
}
|
|
22
|
-
|
|
22
|
+
// Single-line but readable "key": "value" form. Built by collapsing the
|
|
23
|
+
// structural whitespace from an indented stringify — corruption-proof, unlike a
|
|
24
|
+
// regex that would also rewrite ":"/"," sequences inside string values.
|
|
25
|
+
/**
 * Serialises `payload` as one line of JSON with a space after each `:` and `,`
 * and just inside non-empty brackets.
 *
 * The payload is stringified with a one-space indent, then every newline plus
 * the indentation after it is swapped for a single space. Newlines inside
 * string values are emitted by JSON.stringify as the escape `\n`, so only
 * structural line breaks are affected.
 *
 * @param payload  Value to serialise.
 * @param replacer Optional key allow-list forwarded to JSON.stringify.
 * @returns The single-line JSON text.
 */
function spacedLine(payload: AnyObj, replacer?: (string | number)[]): string {
  const indented = JSON.stringify(payload, replacer, 1);
  return indented.split(/\n\s*/).join(" ");
}
|
|
28
|
+
/** Prints `payload` to stdout as one line, formatted by `spacedLine`. */
function printJson(payload: AnyObj): void {
  const line = spacedLine(payload);
  console.log(line);
}
|
|
23
29
|
/**
 * Loads and parses the JSON file at `file`.
 *
 * @param file     Path of the JSON file.
 * @param fallback Value to use when `file` does not exist (defaults to `{}`).
 * @returns The parsed file contents, or a shallow copy of `fallback` when the
 *          file is absent.
 */
export function loadJson(file: string, fallback: AnyObj = {}): AnyObj {
  if (!fs.existsSync(file)) {
    return { ...fallback };
  }
  return JSON.parse(read(file));
}
|
|
24
30
|
/**
 * Appends `payload` to `file` as one line of JSON with keys in sorted order,
 * creating missing parent directories first.
 *
 * An array replacer passed to JSON.stringify is a key allow-list applied at
 * EVERY nesting level, not just the top one. Passing only the top-level keys
 * therefore silently drops nested properties whose names are not also
 * top-level keys. The allow-list here is the sorted union of keys found at
 * any depth, so nested data is kept and every level is emitted in sorted key
 * order. Output for flat payloads is unchanged.
 *
 * Keys are collected from own enumerable properties. A value whose `toJSON`
 * returns an object with different keys is not covered by this scan.
 *
 * @param file    Path of the JSONL file to append to.
 * @param payload Object to serialise as one line.
 */
export function appendJsonl(file: string, payload: AnyObj): void {
  fs.mkdirSync(path.dirname(file), { recursive: true });

  const allKeys = new Set<string>();
  // Guards against unbounded recursion on cyclic input; JSON.stringify then
  // reports the cycle with its own TypeError.
  const visited = new WeakSet<object>();
  const collectKeys = (value: unknown): void => {
    if (value === null || typeof value !== "object" || visited.has(value)) return;
    visited.add(value);
    if (Array.isArray(value)) {
      for (const item of value) collectKeys(item);
      return;
    }
    for (const [key, child] of Object.entries(value)) {
      allKeys.add(key);
      collectKeys(child);
    }
  };
  collectKeys(payload);

  const line = spacedLine(payload, Array.from(allKeys).sort());
  fs.appendFileSync(file, `${line}\n`);
}
|
|
29
35
|
/**
 * Aborts by throwing an `Error` carrying `message`.
 *
 * @throws Error always.
 */
function die(message: string): never {
  const failure = new Error(message);
  throw failure;
}
|