@kontourai/flow-agents 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/.github/actions/trust-verify/action.yml +4 -2
  2. package/.github/workflows/ci.yml +12 -0
  3. package/.github/workflows/runtime-compat.yml +1 -1
  4. package/CHANGELOG.md +29 -0
  5. package/README.md +3 -3
  6. package/build/src/cli/workflow-sidecar.d.ts +16 -0
  7. package/build/src/cli/workflow-sidecar.js +72 -12
  8. package/build/src/lib/flow-resolver.d.ts +29 -0
  9. package/build/src/lib/flow-resolver.js +71 -0
  10. package/context/scripts/telemetry/lib/config.sh +15 -0
  11. package/context/scripts/telemetry/telemetry.conf +4 -0
  12. package/context/scripts/telemetry/telemetry.sh +23 -1
  13. package/docs/design/flowrun-eventsourcing-design.md +216 -0
  14. package/docs/design/workflowrun-observability-design.md +431 -0
  15. package/evals/ci/antigaming-suite.sh +2 -0
  16. package/evals/ci/run-baseline.sh +2 -0
  17. package/evals/integration/test_command_log_concurrency.sh +114 -0
  18. package/evals/integration/test_command_log_fork_classification.sh +134 -0
  19. package/evals/integration/test_kit_identity_trust.sh +393 -0
  20. package/evals/integration/test_usage_cost.sh +119 -0
  21. package/evals/integration/test_verify_cli.sh +23 -0
  22. package/evals/run.sh +2 -0
  23. package/integrations/strands/flow_agents_strands/hooks.py +126 -1
  24. package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
  25. package/integrations/strands/tests/test_usage.py +129 -0
  26. package/integrations/strands-ts/src/hooks.ts +135 -1
  27. package/integrations/strands-ts/src/telemetry.ts +170 -0
  28. package/integrations/strands-ts/test/test-usage.ts +85 -0
  29. package/package.json +5 -5
  30. package/scripts/hooks/evidence-capture.js +75 -13
  31. package/scripts/hooks/stop-goal-fit.js +76 -23
  32. package/scripts/repair-command-log.js +115 -0
  33. package/scripts/telemetry/lib/config.sh +15 -0
  34. package/scripts/telemetry/lib/pricing.sh +42 -0
  35. package/scripts/telemetry/lib/usage.sh +108 -0
  36. package/scripts/telemetry/pricing.golden.json +15 -0
  37. package/scripts/telemetry/pricing.json +31 -0
  38. package/scripts/telemetry/telemetry.conf +4 -0
  39. package/scripts/telemetry/telemetry.sh +23 -1
  40. package/src/cli/workflow-sidecar.ts +73 -11
  41. package/src/lib/flow-resolver.ts +85 -0
@@ -196,6 +196,29 @@ else
196
196
  _fail "HELP-FLAG: expected usage text, got: $out4"
197
197
  fi
198
198
 
199
# ─── TEST 5: composite action path resolution ──────────────────────────────────
# Regression for the cross-repo path bug: the action at .github/actions/trust-verify/
# resolves node scripts relative to github.action_path. A wrong `../` depth makes the
# action fail with "Cannot find module" in a CONSUMER repo (it passes a local CLI test
# but breaks the actual adoption path). Assert every action_path-relative script ref
# resolves to a real file.
#
# The ref pattern stops at quotes AND whitespace, so an unquoted or single-quoted
# reference cannot swallow the rest of the line (or following lines) up to some later
# ".js", and it tolerates `${{github.action_path}}` written without inner spaces.
echo "=== TEST 5: trust-verify action node refs resolve to real scripts ==="
if node -e '
const fs=require("fs"), path=require("path");
const root=process.argv[1];
const actionDir=path.join(root,".github/actions/trust-verify");
const y=fs.readFileSync(path.join(actionDir,"action.yml"),"utf8");
// \x27 is a single quote; it cannot be written literally inside this shell string.
const refs=[...y.matchAll(/action_path\s*\}\}\/([^"\x27\s]+\.js)/g)].map(m=>m[1]);
if(refs.length===0){console.error("no action_path script refs found");process.exit(1);}
let ok=true;
for(const r of refs){ if(!fs.existsSync(path.resolve(actionDir,r))){console.error("UNRESOLVED: "+r);ok=false;} }
process.exit(ok?0:1);
' "$ROOT"; then
  _pass "ACTION-PATH: all trust-verify action.yml script refs resolve"
else
  _fail "ACTION-PATH: a trust-verify action.yml script ref does not resolve (wrong ../ depth?)"
fi
221
+
199
222
  # ─── Summary ──────────────────────────────────────────────────────────────────
200
223
  echo ""
201
224
  echo "────────────────────────────────────────────"
package/evals/run.sh CHANGED
@@ -242,6 +242,8 @@ run_integration() {
242
242
  echo ""
243
243
  bash "$EVAL_DIR/integration/test_verify_cli.sh" || result=1
244
244
  echo ""
245
+ bash "$EVAL_DIR/integration/test_kit_identity_trust.sh" || result=1
246
+ echo ""
245
247
  bash "$EVAL_DIR/acceptance/prove-capture-teeth-declared.sh" || result=1
246
248
  return $result
247
249
  }
@@ -81,6 +81,8 @@ class FlowAgentsHooks:
81
81
  self._policy = policy_gate if policy_gate is not None else PolicyGate()
82
82
  self._steering = SteeringContext(workspace=workspace)
83
83
  self._session_start_ts: Optional[float] = None
84
+ # Per-model token accumulator, summed across model-call events.
85
+ self._usage_by_model: Dict[str, Dict[str, int]] = {}
84
86
 
85
87
  # ------------------------------------------------------------------
86
88
  # Public API available WITHOUT strands installed
@@ -137,6 +139,21 @@ class FlowAgentsHooks:
137
139
  registry.add_callback(BeforeToolCallEvent, self._on_before_tool_call)
138
140
  registry.add_callback(AfterToolCallEvent, self._on_after_tool_call)
139
141
 
142
+ # Model-call event carries per-call token usage (the SDK's documented
143
+ # usage source). Optional — registered only if the installed SDK exposes
144
+ # it, under whichever name this SDK version uses.
145
+ try:
146
+ import strands.hooks as _sh # type: ignore[import]
147
+
148
+ model_event = (
149
+ getattr(_sh, "AfterModelCallEvent", None)
150
+ or getattr(_sh, "AfterModelInvocationEvent", None)
151
+ )
152
+ if model_event is not None:
153
+ registry.add_callback(model_event, self._on_after_model_call)
154
+ except ImportError:
155
+ pass
156
+
140
157
  # ------------------------------------------------------------------
141
158
  # Private callbacks
142
159
  # ------------------------------------------------------------------
@@ -144,6 +161,7 @@ class FlowAgentsHooks:
144
161
  def _on_agent_initialized(self, event: Any) -> None:
145
162
  """AgentInitializedEvent → agentSpawn / session.start"""
146
163
  self._session_start_ts = time.monotonic()
164
+ self._usage_by_model = {}
147
165
  self._sink.emit_session_start()
148
166
 
149
167
  def _on_before_invocation(self, event: Any) -> None:
@@ -153,12 +171,58 @@ class FlowAgentsHooks:
153
171
  self._sink.emit("userPromptSubmit")
154
172
 
155
173
def _on_after_invocation(self, event: Any) -> None:
    """Handle AfterInvocationEvent: flush accumulated usage, then end the session.

    When any per-model token counts were accumulated, a single usage event is
    emitted through the sink (carrying the summed totals plus one ``by_model``
    row per model) and the accumulator is cleared. The session-end event is
    emitted afterwards in every case.
    """
    started = self._session_start_ts
    duration_s = 0.0 if started is None else time.monotonic() - started

    usage = self._usage_by_model
    if usage:
        # One row per model, in the order the models were first seen.
        by_model = [
            {
                "model": name,
                "input_tokens": counts["input"],
                "output_tokens": counts["output"],
                "cache_creation_input_tokens": counts["cache_creation"],
                "cache_read_input_tokens": counts["cache_read"],
            }
            for name, counts in usage.items()
        ]
        totals = {
            bucket: sum(counts[bucket] for counts in usage.values())
            for bucket in ("input", "output", "cache_creation", "cache_read")
        }
        # A top-level model name is only meaningful when exactly one model ran.
        sole_model = next(iter(usage)) if len(usage) == 1 else None
        self._sink.emit_usage(
            model=sole_model,
            input_tokens=totals["input"],
            output_tokens=totals["output"],
            cache_creation_input_tokens=totals["cache_creation"],
            cache_read_input_tokens=totals["cache_read"],
            duration_s=duration_s,
            by_model=by_model,
        )
        self._usage_by_model = {}

    self._sink.emit_session_end(duration_s=duration_s)
161
206
 
207
def _on_after_model_call(self, event: Any) -> None:
    """Fold one model-call event's token usage into the per-model accumulator.

    Usage is pulled out of the event by ``_extract_model_usage``; when that
    yields nothing the event is ignored. Otherwise the four token counters
    are added to the running totals kept for the reported model, creating a
    zeroed entry the first time a model is seen.
    """
    usage = _extract_model_usage(event)
    if usage is None:
        return

    counters = ("input", "output", "cache_creation", "cache_read")
    bucket = self._usage_by_model.setdefault(usage["model"], dict.fromkeys(counters, 0))
    for counter in counters:
        bucket[counter] += usage[counter]
225
+
162
226
  def _on_before_tool_call(self, event: Any) -> None:
163
227
  """
164
228
  BeforeToolCallEvent → preToolUse / tool.invoke + policy gate.
@@ -192,3 +256,64 @@ class FlowAgentsHooks:
192
256
  tool_name = tool_use.get("name", "")
193
257
  result = getattr(event, "result", None)
194
258
  self._sink.emit_tool_result(tool_name=tool_name, tool_output=result)
259
+
260
+
261
+ # ----------------------------------------------------------------------------
262
+ # Usage extraction — map a Strands model-call event onto the documented
263
+ # Anthropic usage object, defensively across SDK shapes (object or dict).
264
+ # ----------------------------------------------------------------------------
265
+
266
+
267
def _attr(obj: Any, *keys: str) -> Any:
    """Return the first non-None value stored on *obj* under any of *keys*.

    Dicts are read by key, everything else by attribute. Keys are tried in the
    order given; ``None`` is returned when none of them yields a value.
    """
    for key in keys:
        value = obj.get(key) if isinstance(obj, dict) else getattr(obj, key, None)
        if value is not None:
            return value
    return None


def _num(obj: Any, *keys: str) -> int:
    """Return the numeric value found under *keys* on *obj*, else 0.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so a
    stray ``True`` would otherwise be counted as one token.
    """
    value = _attr(obj, *keys)
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    return value


def _extract_model_usage(event: Any) -> Optional[Dict[str, Any]]:
    """Extract ``{"model", "input", "output", "cache_creation", "cache_read"}``.

    The event itself and a fixed set of likely sub-objects are searched, in
    order, for a usage object (either a ``usage`` member, or a container that
    carries token counters directly) and for a model identifier. Both
    snake_case and camelCase counter names are accepted, including
    ``cacheWriteInputTokens`` for cache-creation tokens.

    Returns ``None`` when no usage object is found or every counter is zero.
    The model falls back to ``"unknown"`` when no identifier is present.
    """
    containers = [
        event,
        _attr(event, "usage"),
        _attr(event, "response"),
        _attr(event, "result"),
        _attr(event, "message"),
        _attr(event, "output"),
        _attr(event, "model_response"),
    ]
    usage = None
    model_carrier = None
    for container in containers:
        if container is None:
            continue
        candidate = _attr(container, "usage")
        if candidate is None and _attr(
            container, "input_tokens", "inputTokens", "output_tokens", "outputTokens"
        ) is not None:
            # The container carries the counters directly (output-only included).
            candidate = container
        if usage is None and candidate is not None:
            usage = candidate
        if model_carrier is None and _attr(container, "model", "model_id", "modelId") is not None:
            model_carrier = container
    if usage is None:
        return None

    tokens = {
        "input": _num(usage, "input_tokens", "inputTokens"),
        "output": _num(usage, "output_tokens", "outputTokens"),
        "cache_creation": _num(
            usage,
            "cache_creation_input_tokens",
            "cacheCreationInputTokens",
            "cacheWriteInputTokens",
        ),
        "cache_read": _num(usage, "cache_read_input_tokens", "cacheReadInputTokens"),
    }
    if not any(tokens.values()):
        return None

    model = _attr(model_carrier, "model", "model_id", "modelId") or _attr(usage, "model") or "unknown"
    return {"model": str(model), **tokens}
@@ -216,6 +216,90 @@ class TelemetrySink:
216
216
  {"turn": {"prompt_text": "", "steering_context": steering_text}},
217
217
  )
218
218
 
219
def emit_usage(
    self,
    *,
    model: Optional[str] = None,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cache_creation_input_tokens: int = 0,
    cache_read_input_tokens: int = 0,
    duration_s: Optional[float] = None,
    by_model: Optional[list] = None,
) -> Dict[str, Any]:
    """
    Build a ``session.usage`` event, append it to the log file, and return it.

    The caller passes session totals (and optionally one ``by_model`` entry
    per model). Token counts are recorded as given; ``estimated_cost_usd`` is
    derived here from the pricing registry — summed over the per-model rows
    when any are supplied, otherwise priced from the flat totals under
    ``model``. The active ``pricing_version`` is stamped alongside.

    A failure to write the log file (``OSError``) is swallowed on purpose so
    telemetry never blocks agent work; the event is still returned.
    """
    event = self._base_event("session.usage")
    event["event_id"] = "{}-usage".format(event["event_id"])
    event["hook"] = {
        "event_name": "usage",
        "runtime_session_id": "",
        "turn_id": "",
        "transcript_path": "",
        "model": model or "",
        "source": "strands",
        "stop_hook_active": None,
        "last_assistant_message": "",
        "raw_input": None,
    }

    def _priced_row(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Normalize one caller-supplied entry and attach its own cost.
        counts = _normalize_tokens(entry)
        name = entry.get("model", "unknown")
        return {
            "model": name,
            "input_tokens": counts["input"],
            "output_tokens": counts["output"],
            "cache_creation_input_tokens": counts["cache_creation"],
            "cache_read_input_tokens": counts["cache_read"],
            "estimated_cost_usd": _cost_for_model(name, counts),
        }

    per_model = [_priced_row(entry) for entry in (by_model or [])]

    flat = _normalize_tokens(
        {
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cache_creation_input_tokens": cache_creation_input_tokens,
            "cache_read_input_tokens": cache_read_input_tokens,
        }
    )
    if per_model:
        cost = round(sum(row["estimated_cost_usd"] for row in per_model), 6)
    else:
        cost = _cost_for_model(model, flat)

    event["usage"] = {
        "model": model or self.runtime,
        "duration_s": duration_s,
        "input_tokens": flat["input"],
        "output_tokens": flat["output"],
        "cache_creation_input_tokens": flat["cache_creation"],
        "cache_read_input_tokens": flat["cache_read"],
        "estimated_cost_usd": cost,
        "pricing_version": _pricing_version(),
        "by_model": per_model or None,
    }

    try:
        with self._log_file.open("a", encoding="utf-8") as log:
            log.write(json.dumps(event) + "\n")
    except OSError:
        pass  # fail-open: telemetry must never block agent work

    return event
302
+
219
303
 
220
304
  def _normalize_tool_name(name: str) -> str:
221
305
  """
@@ -236,3 +320,91 @@ def _normalize_tool_name(name: str) -> str:
236
320
  "use_subagent": "use_subagent",
237
321
  }
238
322
  return _MAP.get(name.lower(), name)
323
+
324
+
325
+ # ---------------------------------------------------------------------------
326
+ # Usage / cost — mirror of scripts/telemetry/pricing.json (per 1M tokens, USD)
327
+ # ---------------------------------------------------------------------------
328
+
329
+ # Pricing is read from the single-source registry (scripts/telemetry/pricing.json),
330
+ # never hand-maintained here. Resolution: TELEMETRY_PRICING_FILE /
331
+ # FLOW_AGENTS_PRICING_FILE env path, else the repo-relative registry, else a
332
+ # minimal fallback. Tokens are exact regardless; the console recomputes cost
333
+ # authoritatively, so a missing file only degrades the sink's stamped estimate.
334
_FALLBACK_REGISTRY = {
    "current_version": "fallback",
    "versions": {
        "fallback": {
            "cache_multipliers": {"write_5m": 1.25, "write_1h": 2.0, "read": 0.1},
            "models": {},
            "default": {"input": 5.0, "output": 25.0},
            "zero_cost_models": ["<synthetic>", "synthetic", "unknown", ""],
        }
    },
}
_REGISTRY_CACHE: Optional[Dict[str, Any]] = None


def _load_registry() -> Dict[str, Any]:
    """Return the pricing registry, loading and caching it on first use.

    Candidates are tried in order: the ``TELEMETRY_PRICING_FILE`` and
    ``FLOW_AGENTS_PRICING_FILE`` environment paths, then two paths relative to
    this module. The first file that parses as a JSON object with a dict under
    ``"versions"`` wins; unreadable or malformed files are skipped. When none
    qualifies, the built-in fallback registry is used.
    """
    global _REGISTRY_CACHE
    if _REGISTRY_CACHE is not None:
        return _REGISTRY_CACHE
    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("TELEMETRY_PRICING_FILE"),
        os.environ.get("FLOW_AGENTS_PRICING_FILE"),
        os.path.join(here, "..", "..", "..", "scripts", "telemetry", "pricing.json"),
        os.path.join(here, "..", "..", "..", "..", "scripts", "telemetry", "pricing.json"),
    ]
    for candidate in candidates:
        if not candidate:
            continue
        try:
            with open(candidate, "r", encoding="utf-8") as fh:
                parsed = json.load(fh)
        except (OSError, ValueError):
            continue
        if isinstance(parsed, dict) and isinstance(parsed.get("versions"), dict):
            _REGISTRY_CACHE = parsed
            return _REGISTRY_CACHE
    _REGISTRY_CACHE = _FALLBACK_REGISTRY
    return _REGISTRY_CACHE


def _pricing_version() -> str:
    """Return the registry's ``current_version`` as a string (``"fallback"`` if absent)."""
    return str(_load_registry().get("current_version", "fallback"))


def _version_block() -> Dict[str, Any]:
    """Return the pricing block for the registry's current version.

    Falls back to the built-in block when the current version is missing from
    ``versions`` or its entry is not a dict, so callers can always use ``.get``.
    """
    fallback = _FALLBACK_REGISTRY["versions"]["fallback"]
    reg = _load_registry()
    versions = reg.get("versions", {})
    current = reg.get("current_version")
    block = versions.get(current) if isinstance(versions, dict) and isinstance(current, str) else None
    return block if isinstance(block, dict) else fallback


def _num(value: Any) -> int:
    """Return *value* when it is a real number, else 0.

    ``bool`` is excluded explicitly because it is a subclass of ``int``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    return value


def _normalize_tokens(entry: Dict[str, Any]) -> Dict[str, int]:
    """Map an ``*_tokens`` usage dict onto the short internal counter names.

    Missing or non-numeric counters become 0.
    """
    return {
        "input": _num(entry.get("input_tokens")),
        "output": _num(entry.get("output_tokens")),
        "cache_creation": _num(entry.get("cache_creation_input_tokens")),
        "cache_read": _num(entry.get("cache_read_input_tokens")),
    }


def _rate_value(mapping: Any, key: str, fallback: float) -> float:
    """Return ``mapping[key]`` when it is a real number, otherwise *fallback*."""
    value = mapping.get(key) if isinstance(mapping, dict) else None
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return fallback
    return value


def _cost_for_model(model: Optional[str], tokens: Dict[str, int]) -> float:
    """Return the estimated USD cost of *tokens* for *model*, rounded to 6 places.

    Rates are per one million tokens. Cache-creation tokens are priced at the
    input rate times the ``write_5m`` multiplier, cache-read tokens at the
    input rate times the ``read`` multiplier. Models listed under
    ``zero_cost_models`` cost 0.0; models without their own entry use the
    block's ``default`` rate.

    This never raises on an incomplete registry: any rate or multiplier that
    is missing or non-numeric falls back field by field — first to the block's
    ``default`` rate, then to the built-in values — because the sink that
    stamps this estimate is meant to fail open.
    """
    builtin = _FALLBACK_REGISTRY["versions"]["fallback"]
    block = _version_block()
    key = str(model or "").strip()

    zero_cost = block.get("zero_cost_models", [])
    if isinstance(zero_cost, (list, tuple, set, frozenset)) and key in zero_cost:
        return 0.0

    default_rate = block.get("default")
    default_in = _rate_value(default_rate, "input", builtin["default"]["input"])
    default_out = _rate_value(default_rate, "output", builtin["default"]["output"])

    models = block.get("models")
    rate = models.get(key) if isinstance(models, dict) else None
    rate_in = _rate_value(rate, "input", default_in)
    rate_out = _rate_value(rate, "output", default_out)

    multipliers = block.get("cache_multipliers")
    write_mult = _rate_value(multipliers, "write_5m", builtin["cache_multipliers"]["write_5m"])
    read_mult = _rate_value(multipliers, "read", builtin["cache_multipliers"]["read"])

    cost = (
        _num(tokens.get("input")) * rate_in
        + _num(tokens.get("output")) * rate_out
        + _num(tokens.get("cache_creation")) * rate_in * write_mult
        + _num(tokens.get("cache_read")) * rate_in * read_mult
    ) / 1_000_000
    return round(cost, 6)
@@ -0,0 +1,129 @@
1
+ """Tests for usage + cost: emit_usage, _extract_model_usage, _cost_for_model.
2
+
3
+ Covers the Python sink's share of the telemetry usage/cost surface, plus the
4
+ cross-runtime golden vectors (scripts/telemetry/pricing.golden.json) which must
5
+ price identically across bash / Python / the console-telemetry package.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import tempfile
11
+ import unittest
12
+
13
+ from flow_agents_strands.telemetry import TelemetrySink, _cost_for_model, _normalize_tokens
14
+ from flow_agents_strands.hooks import _extract_model_usage
15
+
16
+ _HERE = os.path.dirname(os.path.abspath(__file__))
17
+ _GOLDEN = os.path.join(_HERE, "..", "..", "..", "scripts", "telemetry", "pricing.golden.json")
18
+
19
+
20
def _read_usage_event(sink_dir):
    """Return the ``usage`` payload of the first session.usage record under sink_dir.

    Every ``full.jsonl`` found while walking *sink_dir* is scanned line by
    line; ``None`` is returned when no such record exists.
    """
    for folder, _subdirs, names in os.walk(sink_dir):
        if "full.jsonl" not in names:
            continue
        with open(os.path.join(folder, "full.jsonl"), encoding="utf-8") as log:
            for raw in log:
                record = json.loads(raw)
                if record.get("event_type") == "session.usage":
                    return record["usage"]
    return None
31
+
32
+
33
class TestEmitUsage(unittest.TestCase):
    """emit_usage writes a session.usage record with tokens, cost and pricing version."""

    def _workspace(self):
        """Create a throwaway workspace directory that is removed after the test."""
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        return tmp.name

    def test_emit_usage_writes_tokens_cost_version_and_by_model(self):
        d = self._workspace()
        sink = TelemetrySink(workspace=d)
        sink.emit_usage(
            model="claude-opus-4-8",
            input_tokens=1000,
            output_tokens=2000,
            cache_read_input_tokens=500000,
            by_model=[
                {"model": "claude-opus-4-8", "input_tokens": 1000, "output_tokens": 2000, "cache_read_input_tokens": 500000}
            ],
        )
        usage = _read_usage_event(d)
        self.assertIsNotNone(usage)
        self.assertEqual(usage["input_tokens"], 1000)
        self.assertEqual(usage["output_tokens"], 2000)
        self.assertEqual(usage["cache_read_input_tokens"], 500000)
        self.assertEqual(usage["pricing_version"], "2026-06-28")
        # opus: (1000*5 + 2000*25 + 500000*5*0.1)/1e6 = 0.305
        self.assertAlmostEqual(usage["estimated_cost_usd"], 0.305, places=6)
        self.assertEqual(usage["by_model"][0]["model"], "claude-opus-4-8")

    def test_emit_usage_multi_model_sums_and_prices_each(self):
        d = self._workspace()
        sink = TelemetrySink(workspace=d)
        sink.emit_usage(
            input_tokens=0,
            output_tokens=2000,
            by_model=[
                {"model": "claude-opus-4-8", "output_tokens": 1000},
                {"model": "claude-haiku-4-5", "output_tokens": 1000},
            ],
        )
        usage = _read_usage_event(d)
        # Fail with a clear message instead of a TypeError if nothing was written.
        self.assertIsNotNone(usage)
        costs = {m["model"]: m["estimated_cost_usd"] for m in usage["by_model"]}
        self.assertAlmostEqual(costs["claude-opus-4-8"], 0.025, places=6)  # 1000*25/1e6
        self.assertAlmostEqual(costs["claude-haiku-4-5"], 0.005, places=6)  # 1000*5/1e6
        self.assertAlmostEqual(usage["estimated_cost_usd"], 0.03, places=6)
72
+
73
+
74
class TestExtractModelUsage(unittest.TestCase):
    """_extract_model_usage across object/dict, snake/camel and nested event shapes."""

    class _Ev:
        """Bare attribute bag standing in for a model-call event."""

    def _ev(self, **kw):
        """Build an event stub whose attributes are exactly the given keywords."""
        stub = self._Ev()
        for attr_name, attr_value in kw.items():
            setattr(stub, attr_name, attr_value)
        return stub

    def test_extract_from_object_with_usage_and_model(self):
        event = self._ev(
            model="claude-opus-4-8",
            usage={"input_tokens": 10, "output_tokens": 20, "cache_read_input_tokens": 30},
        )
        expected = {"model": "claude-opus-4-8", "input": 10, "output": 20, "cache_creation": 0, "cache_read": 30}
        self.assertEqual(_extract_model_usage(event), expected)

    def test_extract_from_dict_and_camelcase(self):
        event = self._ev(usage={"inputTokens": 5, "outputTokens": 6}, model_id="claude-haiku-4-5")
        result = _extract_model_usage(event)
        self.assertEqual(result["model"], "claude-haiku-4-5")
        self.assertEqual(result["input"], 5)
        self.assertEqual(result["output"], 6)

    def test_extract_from_nested_response(self):
        event = self._ev(response={"model": "claude-fable-5", "usage": {"output_tokens": 100}})
        result = _extract_model_usage(event)
        self.assertEqual(result["model"], "claude-fable-5")
        self.assertEqual(result["output"], 100)

    def test_extract_returns_none_when_no_usage(self):
        event = self._ev(model="x")
        self.assertIsNone(_extract_model_usage(event))

    def test_extract_returns_none_when_all_zero(self):
        event = self._ev(model="x", usage={"input_tokens": 0, "output_tokens": 0})
        self.assertIsNone(_extract_model_usage(event))
107
+
108
+
109
class TestGoldenVectors(unittest.TestCase):
    """Price every case in the shared golden-vector file and compare to its expected cost."""

    def test_cross_runtime_golden_vectors(self):
        with open(_GOLDEN, encoding="utf-8") as fh:
            golden = json.load(fh)
        for case in golden["cases"]:
            # One sub-test per vector, so a single mismatch does not hide the rest.
            with self.subTest(case=case["name"], model=case["model"]):
                t = case["tokens"]
                tokens = _normalize_tokens({
                    "input_tokens": t["input"],
                    "output_tokens": t["output"],
                    "cache_creation_input_tokens": t["cache_creation"],
                    "cache_read_input_tokens": t["cache_read"],
                })
                cost = _cost_for_model(case["model"], tokens)
                self.assertAlmostEqual(
                    cost, case["expected_cost_usd"], places=6,
                    msg=f"golden '{case['name']}' ({case['model']}): expected {case['expected_cost_usd']}, got {cost}",
                )
126
+
127
+
128
+ if __name__ == "__main__":
129
+ unittest.main()