npm - @kontourai/flow-agents - Versions diffs - 2.0.1 → 2.1.0 - Mend

@kontourai/flow-agents 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/.github/actions/trust-verify/action.yml +4 -2
package/.github/workflows/ci.yml +12 -0
package/CHANGELOG.md +21 -0
package/README.md +3 -3
package/build/src/cli/workflow-sidecar.js +8 -2
package/context/scripts/telemetry/lib/config.sh +15 -0
package/context/scripts/telemetry/telemetry.conf +4 -0
package/context/scripts/telemetry/telemetry.sh +23 -1
package/docs/design/flowrun-eventsourcing-design.md +216 -0
package/docs/design/workflowrun-observability-design.md +431 -0
package/evals/ci/antigaming-suite.sh +1 -0
package/evals/ci/run-baseline.sh +2 -0
package/evals/integration/test_command_log_concurrency.sh +114 -0
package/evals/integration/test_usage_cost.sh +119 -0
package/evals/integration/test_verify_cli.sh +23 -0
package/integrations/strands/flow_agents_strands/hooks.py +126 -1
package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
package/integrations/strands/tests/test_usage.py +129 -0
package/integrations/strands-ts/src/hooks.ts +135 -1
package/integrations/strands-ts/src/telemetry.ts +170 -0
package/integrations/strands-ts/test/test-usage.ts +85 -0
package/package.json +2 -2
package/scripts/hooks/evidence-capture.js +75 -13
package/scripts/telemetry/lib/config.sh +15 -0
package/scripts/telemetry/lib/pricing.sh +42 -0
package/scripts/telemetry/lib/usage.sh +108 -0
package/scripts/telemetry/pricing.golden.json +15 -0
package/scripts/telemetry/pricing.json +31 -0
package/scripts/telemetry/telemetry.conf +4 -0
package/scripts/telemetry/telemetry.sh +23 -1
package/src/cli/workflow-sidecar.ts +8 -2

package/scripts/telemetry/lib/usage.sh CHANGED Viewed

@@ -1,6 +1,12 @@
 #!/usr/bin/env bash
 # usage.sh — Session usage metric functions
+# Module directory, resolved once at source time (cwd-independent).
+USAGE_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Single-source pricing registry loader (local / remote / bundled).
+source "${USAGE_LIB_DIR}/pricing.sh"
 # Resolve model from agent-spec.json
 usage_get_model() {
   local agent_name="$1"
@@ -27,3 +33,105 @@ usage_count_delegations() {
   [[ ! -f "$jsonl_path" ]] && echo 0 && return
   grep -c "\"session_id\":\"${session_id}\".*\"event_type\":\"agent.delegate\"" "$jsonl_path" 2>/dev/null || echo 0
 }
+# Parse a runtime transcript (JSONL) into real per-model token + cost usage.
+# Ground truth lives in each assistant message's `.message.usage` block:
+#   input_tokens (uncached), output_tokens, cache_creation_input_tokens,
+#   cache_read_input_tokens — plus `.message.model`.
+# Cost is derived from the versioned pricing registry: cache writes bill at
+# input*write_5m, cache reads at input*read. Cost uses the registry's
+# current_version (override with arg $2) and the result stamps `pricing_version`
+# so the console can reproduce or recompute it. Emits a compact JSON object:
+#   { by_model: [ {model, input_tokens, output_tokens,
+#                  cache_creation_input_tokens, cache_read_input_tokens,
+#                  estimated_cost_usd} ],
+#     input_tokens, output_tokens, cache_creation_input_tokens,
+#     cache_read_input_tokens, estimated_cost_usd, pricing_version }
+# Prints nothing and returns non-zero when the transcript is missing or
+# unparseable, so the caller can fall back to null usage. Never blocks agent work.
+# Expected transcript usage path (Claude Code / Anthropic usage object). Bumped
+# if the on-disk schema changes so drift is logged rather than silently zeroed.
+USAGE_TRANSCRIPT_SCHEMA="message.usage.input_tokens"
+# usage_log_drift <transcript>
+# Append a one-line schema-drift warning (transcript carried usage data we could
+# not parse). Goes to TELEMETRY_DRIFT_LOG if set, else stderr. Never fatal.
+#   $1  transcript path; only interpolated into the message, never opened here.
+usage_log_drift() {
+  local transcript="$1"
+  # The message names USAGE_TRANSCRIPT_SCHEMA so the log records which usage
+  # path was assumed when zero tokens were parsed.
+  local msg="[telemetry] pricing/usage drift: ${transcript} has usage data but expected path '${USAGE_TRANSCRIPT_SCHEMA}' parsed 0 tokens — transcript schema may have changed"
+  if [[ -n "${TELEMETRY_DRIFT_LOG:-}" ]]; then
+    # If the append to the drift log fails, fall back to stderr so the warning
+    # is still emitted somewhere.
+    echo "$msg" >> "${TELEMETRY_DRIFT_LOG}" 2>/dev/null || echo "$msg" >&2
+  else
+    echo "$msg" >&2
+  fi
+}
+# usage_parse_transcript <transcript> [pricing_version]
+#   $1  path to the JSONL transcript.
+#   $2  optional pricing version; when empty the registry's current_version is used.
+# Prints the compact usage JSON on stdout and returns 0. Prints nothing and
+# returns 1 when: the path is empty or not a regular file, jq is not installed,
+# the pricing registry cannot be loaded, jq produces no output (e.g. unknown
+# pricing version or a jq error), or the parsed token total is zero.
+usage_parse_transcript() {
+  local transcript="$1" version="${2:-}"
+  [[ -z "$transcript" || ! -f "$transcript" ]] && return 1
+  command -v jq >/dev/null 2>&1 || return 1
+  # pricing_registry is not defined in this file's visible portion; presumably it
+  # comes from the sourced pricing.sh and prints the registry JSON — confirm.
+  local registry
+  registry="$(pricing_registry)" || return 1
+  [[ -z "$registry" ]] && return 1
+  # The jq program below runs with -n and consumes the transcript from stdin via
+  # `inputs`:
+  #   1. Pick the pricing version ($2, else registry.current_version); an unknown
+  #      version yields `empty`, so $out stays blank and we return 1 below.
+  #   2. For every entry with a truthy `.message.usage`, sum the four token
+  #      counters per `.message.model` (missing model -> "unknown").
+  #   3. Price each model with its registry rate, else the version's `default`
+  #      rate; the sum is divided by 1000000, i.e. rates are per-1M-token prices.
+  #      Cache writes bill at input*write_5m and cache reads at input*read.
+  #   4. Emit per-model rows plus totals; only the total cost is rounded (to 6
+  #      decimal places), per-model costs are left unrounded.
+  # jq's stderr is discarded, so a jq parse/runtime error also shows up as empty
+  # output.
+  # NOTE(review): zero-cost detection uses `[$m] | inside($p.zero_cost_models)`.
+  # Per the jq manual, inside/contains compare strings by substring, so a model
+  # name that is a substring of a zero_cost_models entry would also be priced
+  # at 0 — confirm whether an exact match was intended.
+  local out
+  out="$(jq -n --argjson registry "$registry" --arg version "$version" '
+    $registry as $reg
+    | (if $version == "" then ($reg.current_version) else $version end) as $ver
+    | ($reg.versions[$ver]) as $p
+    | if $p == null then empty else . end
+    | ($p.cache_multipliers) as $cm
+    | (reduce inputs as $l ({};
+        ($l.message.usage) as $u
+        | if $u then
+            (($l.message.model) // "unknown") as $m
+            | .[$m].input          = ((.[$m].input // 0)          + (($u.input_tokens) // 0))
+            | .[$m].output         = ((.[$m].output // 0)         + (($u.output_tokens) // 0))
+            | .[$m].cache_creation = ((.[$m].cache_creation // 0) + (($u.cache_creation_input_tokens) // 0))
+            | .[$m].cache_read     = ((.[$m].cache_read // 0)     + (($u.cache_read_input_tokens) // 0))
+          else . end)) as $agg
+    | ($agg | to_entries
+        | map(
+            .key as $m | .value as $u
+            | (($p.models[$m]) // $p.default) as $rate
+            | (if ([$m] | inside($p.zero_cost_models)) then 0 else 1 end) as $billable
+            | {
+                model: $m,
+                input_tokens: ($u.input // 0),
+                output_tokens: ($u.output // 0),
+                cache_creation_input_tokens: ($u.cache_creation // 0),
+                cache_read_input_tokens: ($u.cache_read // 0),
+                estimated_cost_usd: (
+                  $billable * (
+                    ($u.input // 0)          * $rate.input
+                    + ($u.output // 0)         * $rate.output
+                    + ($u.cache_creation // 0) * $rate.input * $cm.write_5m
+                    + ($u.cache_read // 0)     * $rate.input * $cm.read
+                  ) / 1000000
+                )
+              })) as $by_model
+    | {
+        by_model: $by_model,
+        input_tokens: ([$by_model[].input_tokens] | add // 0),
+        output_tokens: ([$by_model[].output_tokens] | add // 0),
+        cache_creation_input_tokens: ([$by_model[].cache_creation_input_tokens] | add // 0),
+        cache_read_input_tokens: ([$by_model[].cache_read_input_tokens] | add // 0),
+        estimated_cost_usd: (([$by_model[].estimated_cost_usd] | add // 0) * 1000000 | round / 1000000),
+        pricing_version: $ver
+      }
+  ' < "$transcript" 2>/dev/null)"
+  [[ -z "$out" ]] && return 1
+  # Drift / emptiness check: if we parsed zero tokens but the transcript clearly
+  # contains usage data, the schema drifted — warn and fall back to null usage.
+  # The "clearly contains" test is a plain grep for the literal "input_tokens"
+  # key; a zero-token result returns 1 whether or not the warning fires.
+  local total
+  total="$(printf '%s' "$out" | jq -r '((.input_tokens // 0) + (.output_tokens // 0) + (.cache_creation_input_tokens // 0) + (.cache_read_input_tokens // 0))' 2>/dev/null)"
+  if [[ -z "$total" || "$total" == "0" ]]; then
+    if grep -q '"input_tokens"' "$transcript" 2>/dev/null; then
+      usage_log_drift "$transcript"
+    fi
+    return 1
+  fi
+  printf '%s\n' "$out"
+}

package/scripts/telemetry/pricing.golden.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "_note": "Cross-runtime cost golden vectors. Keep IN SYNC with console-telemetry/test/golden-vectors.json (identical content). Asserted by the flow-agents bash usage tests, the Python sink tests, and the console-telemetry package so every runtime that prices tokens produces the SAME cost. If these drift between repos, a runtime's cost math has diverged.",
+  "pricing_version": "2026-06-28",
+  "cases": [
+    { "name": "opus cache-read-dominated", "model": "claude-opus-4-8", "tokens": { "input": 1000, "output": 2000, "cache_creation": 0, "cache_read": 500000 }, "expected_cost_usd": 0.305 },
+    { "name": "opus output only", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.025 },
+    { "name": "fable output", "model": "claude-fable-5", "tokens": { "input": 0, "output": 100, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
+    { "name": "haiku output", "model": "claude-haiku-4-5", "tokens": { "input": 0, "output": 1000, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 0.005 },
+    { "name": "sonnet input 1M", "model": "claude-sonnet-4-6", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 3.0 },
+    { "name": "opus cache-write 5m tier", "model": "claude-opus-4-8", "tokens": { "input": 0, "output": 0, "cache_creation": 1000000, "cache_read": 0 }, "expected_cost_usd": 6.25 },
+    { "name": "opus billion-scale", "model": "claude-opus-4-8", "tokens": { "input": 200000, "output": 1600000, "cache_creation": 9000000, "cache_read": 1000000000 }, "expected_cost_usd": 597.25 },
+    { "name": "synthetic is free", "model": "<synthetic>", "tokens": { "input": 999, "output": 999, "cache_creation": 999, "cache_read": 999 }, "expected_cost_usd": 0 },
+    { "name": "unknown model uses default rate", "model": "some-unlisted-model", "tokens": { "input": 1000000, "output": 0, "cache_creation": 0, "cache_read": 0 }, "expected_cost_usd": 5.0 }
+  ]
+}

package/scripts/telemetry/pricing.json ADDED Viewed

@@ -0,0 +1,31 @@
+{
+  "schema_version": "2.0",
+  "current_version": "2026-06-28",
+  "source": "Anthropic public list pricing; cache multipliers per prompt-caching docs",
+  "versions": {
+    "2026-06-28": {
+      "effective_date": "2026-06-28",
+      "currency": "USD",
+      "unit": "per_1m_tokens",
+      "cache_multipliers": {
+        "write_5m": 1.25,
+        "write_1h": 2.0,
+        "read": 0.1
+      },
+      "models": {
+        "claude-fable-5": { "input": 10.0, "output": 50.0 },
+        "claude-mythos-5": { "input": 10.0, "output": 50.0 },
+        "claude-opus-4-8": { "input": 5.0, "output": 25.0 },
+        "claude-opus-4-7": { "input": 5.0, "output": 25.0 },
+        "claude-opus-4-6": { "input": 5.0, "output": 25.0 },
+        "claude-opus-4-5": { "input": 5.0, "output": 25.0 },
+        "claude-opus-4-1": { "input": 15.0, "output": 75.0 },
+        "claude-sonnet-4-6": { "input": 3.0, "output": 15.0 },
+        "claude-sonnet-4-5": { "input": 3.0, "output": 15.0 },
+        "claude-haiku-4-5": { "input": 1.0, "output": 5.0 }
+      },
+      "default": { "input": 5.0, "output": 25.0 },
+      "zero_cost_models": ["<synthetic>", "synthetic", "unknown", ""]
+    }
+  }
+}

package/scripts/telemetry/telemetry.conf CHANGED Viewed

@@ -8,6 +8,10 @@ channel.analytics.redact=tool.input,tool.output,turn.prompt_text,delegation.targ
 # The transport derives /api/telemetry/records from console_telemetry_url.
 # console_telemetry_token=
 # console_tenant_id=
+# Live pricing registry source. If unset, derived from console_telemetry_url as
+# <console>/api/telemetry/pricing so bash/Python/TS runtimes read one live
+# source; lib/pricing.sh caches it and falls back to bundled pricing.json.
+# console_pricing_url=https://console.kontourai.io/api/telemetry/pricing
 enrich_system=true
 enrich_workspace=true
 enrich_auth=true

package/scripts/telemetry/telemetry.sh CHANGED Viewed

@@ -309,13 +309,35 @@ add_stop_data_and_emit_usage() {
     tool_count=$(usage_count_tool_calls "$session_id" "$full_log")
     delegation_count=$(usage_count_delegations "$session_id" "$full_log")
+    # Ground-truth token + cost usage from the runtime transcript, when the
+    # runtime exposes one (Claude Code, Codex, etc. set hook.transcript_path).
+    # Tokens are source-of-truth; estimated_cost_usd is derived from pricing.json
+    # (recomputed authoritatively console-side, so pricing updates are retroactive).
+    local transcript_path transcript_usage
+    transcript_path=$(echo "$event" | jq -r '.hook.transcript_path // ""')
+    transcript_usage=$(usage_parse_transcript "$transcript_path")
+    [[ -z "$transcript_usage" ]] && transcript_usage='null'
     local usage_event
     usage_event=$(echo "$event" | jq -c \
       --arg m "$model" \
       --argjson tc "$tool_count" \
       --argjson dc "$delegation_count" \
+      --argjson tu "$transcript_usage" \
       '.event_type = "session.usage" | .event_id = (.event_id + "-usage") | . + {
-        usage: {model: $m, duration_s: .session.duration_s, tool_invocations: $tc, delegations: $dc, input_tokens: null, output_tokens: null, estimated_cost_usd: null}
+        usage: ({
+          model: $m,
+          duration_s: .session.duration_s,
+          tool_invocations: $tc,
+          delegations: $dc,
+          input_tokens: ($tu.input_tokens // null),
+          output_tokens: ($tu.output_tokens // null),
+          cache_creation_input_tokens: ($tu.cache_creation_input_tokens // null),
+          cache_read_input_tokens: ($tu.cache_read_input_tokens // null),
+          estimated_cost_usd: ($tu.estimated_cost_usd // null),
+          pricing_version: ($tu.pricing_version // null),
+          by_model: ($tu.by_model // null)
+        })
       }')
     transport_emit "$usage_event"
   fi

package/src/cli/workflow-sidecar.ts CHANGED Viewed

@@ -19,11 +19,17 @@ export const verdicts = new Set(["pass", "partial", "fail", "not_verified"]);
+/** Returns the current time as an ISO-8601 UTC string with the millisecond fraction removed (e.g. `2024-01-01T00:00:00Z`). */
 function now(): string { return new Date().toISOString().replace(/\.\d{3}Z$/, "Z"); }
+/** Synchronously reads `file` as UTF-8 text; any error from `fs.readFileSync` propagates to the caller. */
 function read(file: string): string { return fs.readFileSync(file, "utf8"); }
+/** Creates the parent directory of `file` if needed, then writes `payload` as 2-space-indented JSON followed by a newline. */
 export function writeJson(file: string, payload: AnyObj): void { fs.mkdirSync(path.dirname(file), { recursive: true }); fs.writeFileSync(file, `${JSON.stringify(payload, null, 2)}\n`); }
-function printJson(payload: AnyObj): void { console.log(JSON.stringify(payload).replace(/":/g, '": ').replace(/,"/g, ', "')); }
+// Single-line but readable "key": "value" form. Built by collapsing the
+// structural whitespace from an indented stringify — corruption-proof, unlike a
+// regex that would also rewrite ":"/"," sequences inside string values.
+// JSON.stringify escapes line feeds inside string values, so every raw "\n" in
+// its output is structural; each newline plus its indentation becomes one space.
+// Non-empty objects/arrays therefore get a space just inside their brackets,
+// e.g. `{ "a": 1, "b": [ 1, 2 ] }`.
+// `replacer`, when given, is forwarded to JSON.stringify as its property list;
+// the `as never` cast only satisfies the overloaded signature.
+function spacedLine(payload: AnyObj, replacer?: (string | number)[]): string {
+  return JSON.stringify(payload, replacer as never, 1).replace(/\n\s*/g, " ");
+}
+/** Prints `payload` to stdout as a single spaced JSON line (see `spacedLine`). */
+function printJson(payload: AnyObj): void { console.log(spacedLine(payload)); }
+/** Parses and returns the JSON in `file` when it exists; otherwise returns a shallow copy of `fallback`. Malformed JSON makes `JSON.parse` throw. */
 export function loadJson(file: string, fallback: AnyObj = {}): AnyObj { return fs.existsSync(file) ? JSON.parse(read(file)) : { ...fallback }; }
+/**
+ * Appends `payload` to `file` as one JSON line, creating the parent directory
+ * first. Top-level keys are emitted in sorted order because the sorted key list
+ * is passed as the JSON.stringify replacer.
+ *
+ * NOTE(review): an array replacer is an allow-list applied at every nesting
+ * level, so properties of nested objects whose names are not also top-level
+ * keys of `payload` are omitted from the line — confirm payloads are flat or
+ * that this filtering is intended.
+ */
 export function appendJsonl(file: string, payload: AnyObj): void {
   fs.mkdirSync(path.dirname(file), { recursive: true });
-  const line = JSON.stringify(payload, Object.keys(payload).sort()).replace(/":/g, '": ').replace(/,"/g, ', "');
+  const line = spacedLine(payload, Object.keys(payload).sort());
   fs.appendFileSync(file, `${line}\n`);
 }
+/** Aborts by throwing an `Error` carrying `message`; never returns. */
 function die(message: string): never { throw new Error(message); }