npm - @kontourai/flow-agents - Versions diffs - 2.0.0 → 2.1.0 - Mend

@kontourai/flow-agents 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

package/.github/actions/trust-verify/action.yml +4 -2
package/.github/workflows/ci.yml +12 -0
package/.github/workflows/runtime-compat.yml +1 -1
package/CHANGELOG.md +29 -0
package/README.md +3 -3
package/build/src/cli/workflow-sidecar.d.ts +16 -0
package/build/src/cli/workflow-sidecar.js +72 -12
package/build/src/lib/flow-resolver.d.ts +29 -0
package/build/src/lib/flow-resolver.js +71 -0
package/context/scripts/telemetry/lib/config.sh +15 -0
package/context/scripts/telemetry/telemetry.conf +4 -0
package/context/scripts/telemetry/telemetry.sh +23 -1
package/docs/design/flowrun-eventsourcing-design.md +216 -0
package/docs/design/workflowrun-observability-design.md +431 -0
package/evals/ci/antigaming-suite.sh +2 -0
package/evals/ci/run-baseline.sh +2 -0
package/evals/integration/test_command_log_concurrency.sh +114 -0
package/evals/integration/test_command_log_fork_classification.sh +134 -0
package/evals/integration/test_kit_identity_trust.sh +393 -0
package/evals/integration/test_usage_cost.sh +119 -0
package/evals/integration/test_verify_cli.sh +23 -0
package/evals/run.sh +2 -0
package/integrations/strands/flow_agents_strands/hooks.py +126 -1
package/integrations/strands/flow_agents_strands/telemetry.py +172 -0
package/integrations/strands/tests/test_usage.py +129 -0
package/integrations/strands-ts/src/hooks.ts +135 -1
package/integrations/strands-ts/src/telemetry.ts +170 -0
package/integrations/strands-ts/test/test-usage.ts +85 -0
package/package.json +5 -5
package/scripts/hooks/evidence-capture.js +75 -13
package/scripts/hooks/stop-goal-fit.js +76 -23
package/scripts/repair-command-log.js +115 -0
package/scripts/telemetry/lib/config.sh +15 -0
package/scripts/telemetry/lib/pricing.sh +42 -0
package/scripts/telemetry/lib/usage.sh +108 -0
package/scripts/telemetry/pricing.golden.json +15 -0
package/scripts/telemetry/pricing.json +31 -0
package/scripts/telemetry/telemetry.conf +4 -0
package/scripts/telemetry/telemetry.sh +23 -1
package/src/cli/workflow-sidecar.ts +73 -11
package/src/lib/flow-resolver.ts +85 -0

package/evals/integration/test_verify_cli.sh CHANGED Viewed

@@ -196,6 +196,29 @@ else
   _fail "HELP-FLAG: expected usage text, got: $out4"
 fi
+# ─── TEST 5: composite action path resolution ──────────────────────────────────
+# Regression for the cross-repo path bug: the action at .github/actions/trust-verify/
+# resolves node scripts relative to github.action_path. A wrong `../` depth makes the
+# action fail with "Cannot find module" in a CONSUMER repo (it passes a local CLI test
+# but breaks the actual adoption path). Assert every action_path-relative script ref
+# resolves to a real file.
+echo "=== TEST 5: trust-verify action node refs resolve to real scripts ==="
+if node -e '
+  const fs=require("fs"), path=require("path");
+  const root=process.argv[1];
+  const actionDir=path.join(root,".github/actions/trust-verify");
+  const y=fs.readFileSync(path.join(actionDir,"action.yml"),"utf8");
+  const refs=[...y.matchAll(/action_path \}\}\/([^"]+\.js)/g)].map(m=>m[1]);
+  if(refs.length===0){console.error("no action_path script refs found");process.exit(1);}
+  let ok=true;
+  for(const r of refs){ if(!fs.existsSync(path.resolve(actionDir,r))){console.error("UNRESOLVED: "+r);ok=false;} }
+  process.exit(ok?0:1);
+' "$ROOT"; then
+  _pass "ACTION-PATH: all trust-verify action.yml script refs resolve"
+else
+  _fail "ACTION-PATH: a trust-verify action.yml script ref does not resolve (wrong ../ depth?)"
+fi
 # ─── Summary ──────────────────────────────────────────────────────────────────
 echo ""
 echo "────────────────────────────────────────────"

package/evals/run.sh CHANGED Viewed

@@ -242,6 +242,8 @@ run_integration() {
   echo ""
   bash "$EVAL_DIR/integration/test_verify_cli.sh" || result=1
   echo ""
+  bash "$EVAL_DIR/integration/test_kit_identity_trust.sh" || result=1
+  echo ""
   bash "$EVAL_DIR/acceptance/prove-capture-teeth-declared.sh" || result=1
   return $result
 }

package/integrations/strands/flow_agents_strands/hooks.py CHANGED Viewed

@@ -81,6 +81,8 @@ class FlowAgentsHooks:
         self._policy = policy_gate if policy_gate is not None else PolicyGate()
         self._steering = SteeringContext(workspace=workspace)
         self._session_start_ts: Optional[float] = None
+        # Per-model token accumulator, summed across model-call events.
+        self._usage_by_model: Dict[str, Dict[str, int]] = {}
     # ------------------------------------------------------------------
     # Public API available WITHOUT strands installed
@@ -137,6 +139,21 @@ class FlowAgentsHooks:
         registry.add_callback(BeforeToolCallEvent, self._on_before_tool_call)
         registry.add_callback(AfterToolCallEvent, self._on_after_tool_call)
+        # Model-call event carries per-call token usage (the SDK's documented
+        # usage source). Optional — registered only if the installed SDK exposes
+        # it, under whichever name this SDK version uses.
+        try:
+            import strands.hooks as _sh  # type: ignore[import]
+            model_event = (
+                getattr(_sh, "AfterModelCallEvent", None)
+                or getattr(_sh, "AfterModelInvocationEvent", None)
+            )
+            if model_event is not None:
+                registry.add_callback(model_event, self._on_after_model_call)
+        except ImportError:
+            pass
     # ------------------------------------------------------------------
     # Private callbacks
     # ------------------------------------------------------------------
@@ -144,6 +161,7 @@ class FlowAgentsHooks:
     def _on_agent_initialized(self, event: Any) -> None:
         """AgentInitializedEvent → agentSpawn / session.start"""
         self._session_start_ts = time.monotonic()
+        self._usage_by_model = {}
         self._sink.emit_session_start()
     def _on_before_invocation(self, event: Any) -> None:
@@ -153,12 +171,58 @@ class FlowAgentsHooks:
         self._sink.emit("userPromptSubmit")
     def _on_after_invocation(self, event: Any) -> None:
-        """AfterInvocationEvent → stop / session.end"""
+        """AfterInvocationEvent → emit session.usage (if any) then stop / session.end"""
         duration_s = 0.0
         if self._session_start_ts is not None:
             duration_s = time.monotonic() - self._session_start_ts
+        if self._usage_by_model:
+            by_model = []
+            totals = {"input": 0, "output": 0, "cache_creation": 0, "cache_read": 0}
+            for model, tok in self._usage_by_model.items():
+                by_model.append(
+                    {
+                        "model": model,
+                        "input_tokens": tok["input"],
+                        "output_tokens": tok["output"],
+                        "cache_creation_input_tokens": tok["cache_creation"],
+                        "cache_read_input_tokens": tok["cache_read"],
+                    }
+                )
+                for key in totals:
+                    totals[key] += tok[key]
+            self._sink.emit_usage(
+                model=next(iter(self._usage_by_model)) if len(self._usage_by_model) == 1 else None,
+                input_tokens=totals["input"],
+                output_tokens=totals["output"],
+                cache_creation_input_tokens=totals["cache_creation"],
+                cache_read_input_tokens=totals["cache_read"],
+                duration_s=duration_s,
+                by_model=by_model,
+            )
+            self._usage_by_model = {}
         self._sink.emit_session_end(duration_s=duration_s)
+    def _on_after_model_call(self, event: Any) -> None:
+        """Model-call event → accumulate per-model token usage.
+        Reads the documented Anthropic usage object (input_tokens, output_tokens,
+        cache_creation_input_tokens, cache_read_input_tokens) from wherever the
+        Strands event surfaces it. Defensive across SDK shapes; no-op if absent.
+        """
+        extracted = _extract_model_usage(event)
+        if extracted is None:
+            return
+        model = extracted["model"]
+        acc = self._usage_by_model.setdefault(
+            model, {"input": 0, "output": 0, "cache_creation": 0, "cache_read": 0}
+        )
+        acc["input"] += extracted["input"]
+        acc["output"] += extracted["output"]
+        acc["cache_creation"] += extracted["cache_creation"]
+        acc["cache_read"] += extracted["cache_read"]
     def _on_before_tool_call(self, event: Any) -> None:
         """
         BeforeToolCallEvent → preToolUse / tool.invoke + policy gate.
@@ -192,3 +256,64 @@ class FlowAgentsHooks:
         tool_name = tool_use.get("name", "")
         result = getattr(event, "result", None)
         self._sink.emit_tool_result(tool_name=tool_name, tool_output=result)
+# ----------------------------------------------------------------------------
+# Usage extraction — map a Strands model-call event onto the documented
+# Anthropic usage object, defensively across SDK shapes (object or dict).
+# ----------------------------------------------------------------------------
def _attr(obj: Any, *keys: str) -> Any:
    """Return the first non-``None`` value stored under any of *keys* on *obj*.

    Dicts are read by key and everything else by attribute, so one call works
    whether the SDK hands back a mapping or an object. Keys are tried in the
    order given; ``None`` is returned when none of them yields a value.
    """
    for key in keys:
        value = obj.get(key) if isinstance(obj, dict) else getattr(obj, key, None)
        if value is not None:
            return value
    return None


def _num(obj: Any, *keys: str) -> int:
    """Return the first value under *keys* on *obj* as an integer token count.

    Anything that is not a usable number yields ``0``:

    * ``bool`` is rejected explicitly (``True`` is an ``int`` in Python and
      would otherwise be counted as one token);
    * finite floats are truncated to ``int`` so the declared return type holds;
    * NaN / infinity yield ``0`` instead of poisoning the running totals.
    """
    value = _attr(obj, *keys)
    if isinstance(value, bool):
        return 0
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        try:
            return int(value)
        except (ValueError, OverflowError):  # NaN / +-inf
            return 0
    return 0


# Members of the event that may carry the usage object and/or the model id.
_USAGE_CONTAINER_KEYS = ("usage", "response", "result", "message", "output", "model_response")
# Any of these on a container marks the container itself as a flat usage object.
_FLAT_TOKEN_KEYS = (
    "input_tokens",
    "inputTokens",
    "output_tokens",
    "outputTokens",
    "cache_creation_input_tokens",
    "cacheCreationInputTokens",
    "cache_read_input_tokens",
    "cacheReadInputTokens",
)
_MODEL_KEYS = ("model", "model_id", "modelId")


def _extract_model_usage(event: Any) -> Optional[Dict[str, Any]]:
    """Pull token counts and the model id out of a model-call event.

    The event itself and its ``usage`` / ``response`` / ``result`` /
    ``message`` / ``output`` / ``model_response`` members are searched in that
    order. The first container that either has a ``usage`` member or carries
    token counts directly supplies the usage object; the first container with
    a non-empty model id supplies the model.

    Returns:
        ``{"model", "input", "output", "cache_creation", "cache_read"}``, or
        ``None`` when no usage object is found or every count is zero. The
        model falls back to ``usage["model"]`` and then to ``"unknown"``.
    """
    containers = [event]
    containers.extend(_attr(event, key) for key in _USAGE_CONTAINER_KEYS)

    usage = None
    model = None
    for container in containers:
        if container is None:
            continue
        if usage is None:
            candidate = _attr(container, "usage")
            # A container with no nested ``usage`` may itself be the usage
            # object; accept it when it carries any token field, not only
            # the input count.
            if candidate is None and _attr(container, *_FLAT_TOKEN_KEYS) is not None:
                candidate = container
            usage = candidate
        if not model:
            # Skip empty ids so a blank value on an early container cannot
            # shadow a real id further down.
            model = next(
                (found for found in (_attr(container, key) for key in _MODEL_KEYS) if found),
                None,
            )

    if usage is None:
        return None

    tokens = {
        "input": _num(usage, "input_tokens", "inputTokens"),
        "output": _num(usage, "output_tokens", "outputTokens"),
        "cache_creation": _num(usage, "cache_creation_input_tokens", "cacheCreationInputTokens"),
        "cache_read": _num(usage, "cache_read_input_tokens", "cacheReadInputTokens"),
    }
    if not any(tokens.values()):
        return None

    model = model or _attr(usage, "model") or "unknown"
    return {"model": str(model), **tokens}

package/integrations/strands/flow_agents_strands/telemetry.py CHANGED Viewed

@@ -216,6 +216,90 @@ class TelemetrySink:
             {"turn": {"prompt_text": "", "steering_context": steering_text}},
         )
+    def emit_usage(
+        self,
+        *,
+        model: Optional[str] = None,
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        cache_creation_input_tokens: int = 0,
+        cache_read_input_tokens: int = 0,
+        duration_s: Optional[float] = None,
+        by_model: Optional[list] = None,
+    ) -> Dict[str, Any]:
+        """
+        Emit a ``session.usage`` event with real token counts + derived cost.
+        The Strands SDK surfaces per-invocation usage on model-call events;
+        accumulate those and pass the totals here at session end. Tokens are the
+        source of truth; ``estimated_cost_usd`` is derived from PRICING (the
+        console recomputes it authoritatively, so a pricing change is
+        retroactive). Mirrors the ``session.usage`` shape emitted by
+        scripts/telemetry/telemetry.sh so the console aggregates both the same.
+        """
+        event = self._base_event("session.usage")
+        event["event_id"] = f"{event['event_id']}-usage"
+        event["hook"] = {
+            "event_name": "usage",
+            "runtime_session_id": "",
+            "turn_id": "",
+            "transcript_path": "",
+            "model": model or "",
+            "source": "strands",
+            "stop_hook_active": None,
+            "last_assistant_message": "",
+            "raw_input": None,
+        }
+        by_model_out = []
+        for entry in by_model or []:
+            tokens = _normalize_tokens(entry)
+            em = entry.get("model", "unknown")
+            by_model_out.append(
+                {
+                    "model": em,
+                    "input_tokens": tokens["input"],
+                    "output_tokens": tokens["output"],
+                    "cache_creation_input_tokens": tokens["cache_creation"],
+                    "cache_read_input_tokens": tokens["cache_read"],
+                    "estimated_cost_usd": _cost_for_model(em, tokens),
+                }
+            )
+        flat = _normalize_tokens(
+            {
+                "input_tokens": input_tokens,
+                "output_tokens": output_tokens,
+                "cache_creation_input_tokens": cache_creation_input_tokens,
+                "cache_read_input_tokens": cache_read_input_tokens,
+            }
+        )
+        cost = (
+            round(sum(m["estimated_cost_usd"] for m in by_model_out), 6)
+            if by_model_out
+            else _cost_for_model(model, flat)
+        )
+        event["usage"] = {
+            "model": model or self.runtime,
+            "duration_s": duration_s,
+            "input_tokens": flat["input"],
+            "output_tokens": flat["output"],
+            "cache_creation_input_tokens": flat["cache_creation"],
+            "cache_read_input_tokens": flat["cache_read"],
+            "estimated_cost_usd": cost,
+            "pricing_version": _pricing_version(),
+            "by_model": by_model_out or None,
+        }
+        try:
+            with self._log_file.open("a", encoding="utf-8") as fh:
+                fh.write(json.dumps(event) + "\n")
+        except OSError:
+            pass  # fail-open: telemetry must never block agent work
+        return event
 def _normalize_tool_name(name: str) -> str:
     """
@@ -236,3 +320,91 @@ def _normalize_tool_name(name: str) -> str:
         "use_subagent": "use_subagent",
     }
     return _MAP.get(name.lower(), name)
+# ---------------------------------------------------------------------------
+# Usage / cost — mirror of scripts/telemetry/pricing.json (per 1M tokens, USD)
+# ---------------------------------------------------------------------------
+# Pricing is read from the single-source registry (scripts/telemetry/pricing.json),
+# never hand-maintained here. Resolution: TELEMETRY_PRICING_FILE /
+# FLOW_AGENTS_PRICING_FILE env path, else the repo-relative registry, else a
+# minimal fallback. Tokens are exact regardless; the console recomputes cost
+# authoritatively, so a missing file only degrades the sink's stamped estimate.
# Minimal built-in pricing, used when no registry file can be loaded and as the
# source of per-field defaults when a loaded registry is incomplete.
_FALLBACK_REGISTRY = {
    "current_version": "fallback",
    "versions": {
        "fallback": {
            "cache_multipliers": {"write_5m": 1.25, "write_1h": 2.0, "read": 0.1},
            "models": {},
            "default": {"input": 5.0, "output": 25.0},
            "zero_cost_models": ["<synthetic>", "synthetic", "unknown", ""],
        }
    },
}
_FALLBACK_BLOCK = _FALLBACK_REGISTRY["versions"]["fallback"]
_REGISTRY_CACHE: Optional[Dict[str, Any]] = None


def _is_usable_registry(parsed: Any) -> bool:
    """Return True when *parsed* is a registry whose current version resolves.

    Requiring ``current_version`` to name a dict inside ``versions`` keeps the
    stamped ``pricing_version`` in step with the rates actually applied.
    """
    if not isinstance(parsed, dict):
        return False
    versions = parsed.get("versions")
    current = parsed.get("current_version")
    return (
        isinstance(versions, dict)
        and isinstance(current, str)
        and isinstance(versions.get(current), dict)
    )


def _load_registry() -> Dict[str, Any]:
    """Load the pricing registry once and cache it for the process lifetime.

    Candidates are tried in order: ``TELEMETRY_PRICING_FILE``,
    ``FLOW_AGENTS_PRICING_FILE``, then two paths relative to this module
    ending in ``scripts/telemetry/pricing.json``. Unreadable, unparsable or
    structurally unusable files are skipped; if none qualifies the built-in
    fallback registry is used.
    """
    global _REGISTRY_CACHE
    if _REGISTRY_CACHE is not None:
        return _REGISTRY_CACHE
    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("TELEMETRY_PRICING_FILE"),
        os.environ.get("FLOW_AGENTS_PRICING_FILE"),
        os.path.join(here, "..", "..", "..", "scripts", "telemetry", "pricing.json"),
        os.path.join(here, "..", "..", "..", "..", "scripts", "telemetry", "pricing.json"),
    ]
    for candidate in candidates:
        if not candidate:
            continue
        try:
            with open(candidate, "r", encoding="utf-8") as fh:
                parsed = json.load(fh)
        except (OSError, ValueError):
            continue
        if _is_usable_registry(parsed):
            _REGISTRY_CACHE = parsed
            return _REGISTRY_CACHE
    _REGISTRY_CACHE = _FALLBACK_REGISTRY
    return _REGISTRY_CACHE


def _pricing_version() -> str:
    """Return the ``current_version`` label of the loaded registry."""
    return str(_load_registry().get("current_version", "fallback"))


def _version_block() -> Dict[str, Any]:
    """Return the rate block for the registry's current version.

    Falls back to the built-in block if the cached registry does not hold a
    dict for that version (for example when the cache was replaced by hand).
    """
    reg = _load_registry()
    versions = reg.get("versions")
    current = reg.get("current_version")
    block = versions.get(current) if isinstance(versions, dict) and isinstance(current, str) else None
    return block if isinstance(block, dict) else _FALLBACK_BLOCK


def _num(value: Any) -> int:
    """Coerce *value* to an integer token count; anything unusable becomes 0.

    ``bool`` is rejected (it is an ``int`` subclass), finite floats are
    truncated, and NaN / infinity yield 0 so they never reach the JSON output.
    """
    if isinstance(value, bool):
        return 0
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        try:
            return int(value)
        except (ValueError, OverflowError):  # NaN / +-inf
            return 0
    return 0


def _normalize_tokens(entry: Dict[str, Any]) -> Dict[str, int]:
    """Map the four ``*_tokens`` keys of *entry* onto short internal names."""
    return {
        "input": _num(entry.get("input_tokens")),
        "output": _num(entry.get("output_tokens")),
        "cache_creation": _num(entry.get("cache_creation_input_tokens")),
        "cache_read": _num(entry.get("cache_read_input_tokens")),
    }


def _rate(source: Any, field: str, default: float) -> float:
    """Return ``source[field]`` when it is a real number, else *default*."""
    value = source.get(field) if isinstance(source, dict) else None
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return default
    return value


def _cost_for_model(model: Optional[str], tokens: Dict[str, int]) -> float:
    """Estimate the USD cost of *tokens* for *model*, rounded to 6 places.

    Rates are per one million tokens. Cache writes are billed at the input
    rate times the ``write_5m`` multiplier, cache reads at the input rate
    times the ``read`` multiplier. Models listed in ``zero_cost_models`` cost
    nothing; models absent from ``models`` use the block's ``default`` rate.

    Missing or non-numeric fields in the registry fall back to the block
    default and then to the built-in values rather than raising, so an
    incomplete pricing file only degrades the estimate.
    """
    block = _version_block()
    key = str(model or "").strip()

    zero_cost = block.get("zero_cost_models")
    if isinstance(zero_cost, (list, tuple)) and key in zero_cost:
        return 0.0

    default_rate = block.get("default")
    models = block.get("models")
    rate = models.get(key, default_rate) if isinstance(models, dict) else default_rate

    builtin_rate = _FALLBACK_BLOCK["default"]
    input_rate = _rate(rate, "input", _rate(default_rate, "input", builtin_rate["input"]))
    output_rate = _rate(rate, "output", _rate(default_rate, "output", builtin_rate["output"]))

    multipliers = block.get("cache_multipliers")
    builtin_mult = _FALLBACK_BLOCK["cache_multipliers"]
    write_mult = _rate(multipliers, "write_5m", builtin_mult["write_5m"])
    read_mult = _rate(multipliers, "read", builtin_mult["read"])

    cost = (
        tokens["input"] * input_rate
        + tokens["output"] * output_rate
        + tokens["cache_creation"] * input_rate * write_mult
        + tokens["cache_read"] * input_rate * read_mult
    ) / 1_000_000
    return round(cost, 6)

package/integrations/strands/tests/test_usage.py ADDED Viewed

@@ -0,0 +1,129 @@
+"""Tests for usage + cost: emit_usage, _extract_model_usage, _cost_for_model.
+Covers the Python sink's share of the telemetry usage/cost surface, plus the
+cross-runtime golden vectors (scripts/telemetry/pricing.golden.json) which must
+price identically across bash / Python / the console-telemetry package.
+"""
+import json
+import os
+import tempfile
+import unittest
+from flow_agents_strands.telemetry import TelemetrySink, _cost_for_model, _normalize_tokens
+from flow_agents_strands.hooks import _extract_model_usage
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_GOLDEN = os.path.join(_HERE, "..", "..", "..", "scripts", "telemetry", "pricing.golden.json")
def _read_usage_event(sink_dir):
    """Return the ``usage`` payload of the first session.usage record found.

    Every ``full.jsonl`` under *sink_dir* is scanned line by line in
    ``os.walk`` order; ``None`` is returned when no such record exists.
    """
    log_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(sink_dir)
        for filename in filenames
        if filename == "full.jsonl"
    )
    for log_path in log_paths:
        with open(log_path, encoding="utf-8") as stream:
            for record in map(json.loads, stream):
                if record.get("event_type") == "session.usage":
                    return record["usage"]
    return None
class TestEmitUsage(unittest.TestCase):
    """Checks on the ``session.usage`` record written by ``emit_usage``."""

    def _new_workspace(self):
        """Create a temp workspace that is removed when the test finishes.

        ``tempfile.mkdtemp()`` leaves the directory behind on every run;
        registering the cleanup keeps repeated test runs from piling up
        telemetry directories in the system temp folder.
        """
        workspace = tempfile.TemporaryDirectory()
        self.addCleanup(workspace.cleanup)
        return workspace.name

    def test_emit_usage_writes_tokens_cost_version_and_by_model(self):
        d = self._new_workspace()
        sink = TelemetrySink(workspace=d)
        sink.emit_usage(
            model="claude-opus-4-8",
            input_tokens=1000,
            output_tokens=2000,
            cache_read_input_tokens=500000,
            by_model=[
                {"model": "claude-opus-4-8", "input_tokens": 1000, "output_tokens": 2000, "cache_read_input_tokens": 500000}
            ],
        )
        usage = _read_usage_event(d)
        self.assertIsNotNone(usage)
        self.assertEqual(usage["input_tokens"], 1000)
        self.assertEqual(usage["output_tokens"], 2000)
        self.assertEqual(usage["cache_read_input_tokens"], 500000)
        self.assertEqual(usage["pricing_version"], "2026-06-28")
        # opus: (1000*5 + 2000*25 + 500000*5*0.1)/1e6 = 0.305
        self.assertAlmostEqual(usage["estimated_cost_usd"], 0.305, places=6)
        self.assertEqual(usage["by_model"][0]["model"], "claude-opus-4-8")

    def test_emit_usage_multi_model_sums_and_prices_each(self):
        d = self._new_workspace()
        sink = TelemetrySink(workspace=d)
        sink.emit_usage(
            input_tokens=0,
            output_tokens=2000,
            by_model=[
                {"model": "claude-opus-4-8", "output_tokens": 1000},
                {"model": "claude-haiku-4-5", "output_tokens": 1000},
            ],
        )
        usage = _read_usage_event(d)
        # Fail with a clear message rather than a TypeError on the next line.
        self.assertIsNotNone(usage)
        costs = {m["model"]: m["estimated_cost_usd"] for m in usage["by_model"]}
        self.assertAlmostEqual(costs["claude-opus-4-8"], 0.025, places=6)  # 1000*25/1e6
        self.assertAlmostEqual(costs["claude-haiku-4-5"], 0.005, places=6)  # 1000*5/1e6
        self.assertAlmostEqual(usage["estimated_cost_usd"], 0.03, places=6)
class TestExtractModelUsage(unittest.TestCase):
    """Shape coverage for ``_extract_model_usage``: objects, dicts, nesting."""

    class _Ev:
        """Bare attribute bag standing in for an SDK event object."""

    def _ev(self, **kw):
        """Return an ``_Ev`` carrying each keyword as an instance attribute."""
        stub = self._Ev()
        for attr_name, attr_value in kw.items():
            setattr(stub, attr_name, attr_value)
        return stub

    def test_extract_from_object_with_usage_and_model(self):
        event = self._ev(
            model="claude-opus-4-8",
            usage={"input_tokens": 10, "output_tokens": 20, "cache_read_input_tokens": 30},
        )
        expected = {"model": "claude-opus-4-8", "input": 10, "output": 20, "cache_creation": 0, "cache_read": 30}
        self.assertEqual(_extract_model_usage(event), expected)

    def test_extract_from_dict_and_camelcase(self):
        event = self._ev(usage={"inputTokens": 5, "outputTokens": 6}, model_id="claude-haiku-4-5")
        result = _extract_model_usage(event)
        self.assertEqual(result["model"], "claude-haiku-4-5")
        self.assertEqual(result["input"], 5)
        self.assertEqual(result["output"], 6)

    def test_extract_from_nested_response(self):
        event = self._ev(response={"model": "claude-fable-5", "usage": {"output_tokens": 100}})
        result = _extract_model_usage(event)
        self.assertEqual(result["model"], "claude-fable-5")
        self.assertEqual(result["output"], 100)

    def test_extract_returns_none_when_no_usage(self):
        event = self._ev(model="x")
        self.assertIsNone(_extract_model_usage(event))

    def test_extract_returns_none_when_all_zero(self):
        event = self._ev(model="x", usage={"input_tokens": 0, "output_tokens": 0})
        self.assertIsNone(_extract_model_usage(event))
class TestGoldenVectors(unittest.TestCase):
    """Prices every case in the shared golden file with ``_cost_for_model``."""

    def test_cross_runtime_golden_vectors(self):
        with open(_GOLDEN, encoding="utf-8") as fh:
            golden = json.load(fh)
        cases = golden["cases"]
        # An empty vector list would otherwise pass without checking anything.
        self.assertTrue(cases, "golden file contains no cases")
        for case in cases:
            # subTest reports every mismatching vector, not just the first.
            with self.subTest(case=case["name"]):
                t = case["tokens"]
                tokens = _normalize_tokens({
                    "input_tokens": t["input"],
                    "output_tokens": t["output"],
                    "cache_creation_input_tokens": t["cache_creation"],
                    "cache_read_input_tokens": t["cache_read"],
                })
                cost = _cost_for_model(case["model"], tokens)
                self.assertAlmostEqual(
                    cost, case["expected_cost_usd"], places=6,
                    msg=f"golden '{case['name']}' ({case['model']}): expected {case['expected_cost_usd']}, got {cost}",
                )
+if __name__ == "__main__":
+    unittest.main()