@kontourai/flow-agents 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. package/.githooks/pre-push +11 -0
  2. package/.github/workflows/ci.yml +210 -0
  3. package/.github/workflows/docs-pages.yml +52 -0
  4. package/.github/workflows/publish-npm.yml +104 -0
  5. package/AGENTS.md +26 -0
  6. package/CHANGELOG.md +66 -0
  7. package/CODE_OF_CONDUCT.md +25 -0
  8. package/CONTEXT.md +300 -0
  9. package/CONTRIBUTING.md +44 -0
  10. package/LICENSE +201 -0
  11. package/README.md +129 -0
  12. package/SECURITY.md +33 -0
  13. package/agent-cards/dev.json +19 -0
  14. package/agents/dev.json +127 -0
  15. package/agents/tool-code-reviewer.json +61 -0
  16. package/agents/tool-dependencies-updater.json +118 -0
  17. package/agents/tool-explore-config.json +92 -0
  18. package/agents/tool-explore-deps.json +92 -0
  19. package/agents/tool-explore-entry.json +92 -0
  20. package/agents/tool-explore-patterns.json +92 -0
  21. package/agents/tool-explore-structure.json +92 -0
  22. package/agents/tool-explore-tests.json +92 -0
  23. package/agents/tool-planner.json +57 -0
  24. package/agents/tool-playwright.json +145 -0
  25. package/agents/tool-security-reviewer.json +56 -0
  26. package/agents/tool-verifier.json +61 -0
  27. package/agents/tool-worker.json +58 -0
  28. package/build/src/cli/console-learning-projection.js +123 -0
  29. package/build/src/cli/docs-preview.js +39 -0
  30. package/build/src/cli/effective-backlog-settings.js +102 -0
  31. package/build/src/cli/export-bookmarks.js +38 -0
  32. package/build/src/cli/fixture-retirement-audit.js +140 -0
  33. package/build/src/cli/flow-kit.js +138 -0
  34. package/build/src/cli/import-bookmarks.js +50 -0
  35. package/build/src/cli/init.js +239 -0
  36. package/build/src/cli/instinct-cli.js +93 -0
  37. package/build/src/cli/promote-workflow-artifact.js +63 -0
  38. package/build/src/cli/publish-change-helper.js +154 -0
  39. package/build/src/cli/pull-work-provider.js +469 -0
  40. package/build/src/cli/runtime-adapter.js +23 -0
  41. package/build/src/cli/telemetry-doctor.js +221 -0
  42. package/build/src/cli/usage-feedback.js +443 -0
  43. package/build/src/cli/validate-hook-influence.js +152 -0
  44. package/build/src/cli/validate-source-tree.js +31 -0
  45. package/build/src/cli/validate-workflow-artifacts.js +486 -0
  46. package/build/src/cli/veritas-governance.js +262 -0
  47. package/build/src/cli/workflow-artifact-cleanup-audit.js +272 -0
  48. package/build/src/cli/workflow-sidecar.js +816 -0
  49. package/build/src/cli.js +89 -0
  50. package/build/src/flow-kit/validate.js +75 -0
  51. package/build/src/lib/args.js +45 -0
  52. package/build/src/lib/fs.js +62 -0
  53. package/build/src/lib/workflow-learning-projection.js +334 -0
  54. package/build/src/runtime-adapters.js +146 -0
  55. package/build/src/tools/build-universal-bundles.js +397 -0
  56. package/build/src/tools/common.js +56 -0
  57. package/build/src/tools/filter-installed-packs.js +132 -0
  58. package/build/src/tools/generate-context-map.js +198 -0
  59. package/build/src/tools/validate-package.js +64 -0
  60. package/build/src/tools/validate-source-tree.js +622 -0
  61. package/console.telemetry.json +176 -0
  62. package/context/base-rules.md +17 -0
  63. package/context/code-review-standards.md +62 -0
  64. package/context/coding-standards.md +42 -0
  65. package/context/common/orchestrators.md +12 -0
  66. package/context/common/subagents.md +28 -0
  67. package/context/contracts/artifact-contract.md +182 -0
  68. package/context/contracts/builder-kit-workflow-state-contract.md +319 -0
  69. package/context/contracts/delivery-contract.md +69 -0
  70. package/context/contracts/execution-contract.md +53 -0
  71. package/context/contracts/governance-adapter-contract.md +67 -0
  72. package/context/contracts/planning-contract.md +85 -0
  73. package/context/contracts/review-contract.md +104 -0
  74. package/context/contracts/sandbox-policy.md +52 -0
  75. package/context/contracts/verification-contract.md +134 -0
  76. package/context/contracts/work-item-contract.md +215 -0
  77. package/context/deferred/demo-mode.md +33 -0
  78. package/context/deferred/languages/go.md +31 -0
  79. package/context/deferred/languages/python.md +31 -0
  80. package/context/deferred/languages/typescript.md +34 -0
  81. package/context/deferred/parallelization.md +35 -0
  82. package/context/deferred/worktree-isolation.md +24 -0
  83. package/context/development-workflow.md +50 -0
  84. package/context/scripts/context-budget/budget-scan.sh +166 -0
  85. package/context/scripts/detect-tools.sh +3 -0
  86. package/context/scripts/discover-agents.sh +28 -0
  87. package/context/scripts/git-status.sh +49 -0
  88. package/context/scripts/hooks/config-protection.js +79 -0
  89. package/context/scripts/hooks/desktop-notify.sh +39 -0
  90. package/context/scripts/hooks/governance-audit.sh +135 -0
  91. package/context/scripts/hooks/lib/audit-transport.sh +40 -0
  92. package/context/scripts/hooks/lib/hook-flags.js +49 -0
  93. package/context/scripts/hooks/lib/patterns.sh +57 -0
  94. package/context/scripts/hooks/lib/resolve-formatter.js +80 -0
  95. package/context/scripts/hooks/post-edit-accumulator.js +66 -0
  96. package/context/scripts/hooks/pre-commit-quality.js +194 -0
  97. package/context/scripts/hooks/quality-gate.js +93 -0
  98. package/context/scripts/hooks/report-only-guard.js +21 -0
  99. package/context/scripts/hooks/run-hook.js +136 -0
  100. package/context/scripts/hooks/stop-format-typecheck.js +141 -0
  101. package/context/scripts/hooks/stop-goal-fit.js +337 -0
  102. package/context/scripts/hooks/workflow-steering.js +250 -0
  103. package/context/scripts/telemetry/console-presets.sh +14 -0
  104. package/context/scripts/telemetry/install-console-config.sh +214 -0
  105. package/context/scripts/telemetry/lib/config.sh +85 -0
  106. package/context/scripts/telemetry/lib/enrich.sh +115 -0
  107. package/context/scripts/telemetry/lib/redact.sh +22 -0
  108. package/context/scripts/telemetry/lib/session.sh +63 -0
  109. package/context/scripts/telemetry/lib/transport.sh +183 -0
  110. package/context/scripts/telemetry/lib/usage.sh +29 -0
  111. package/context/scripts/telemetry/sync-agents.sh +173 -0
  112. package/context/scripts/telemetry/telemetry.conf +23 -0
  113. package/context/scripts/telemetry/telemetry.sh +387 -0
  114. package/context/scripts/validate-package.sh +89 -0
  115. package/context/settings/backlog-provider-settings.json +54 -0
  116. package/context/templates/core/identity.md +26 -0
  117. package/context/templates/core/user.md +15 -0
  118. package/docs/_config.yml +15 -0
  119. package/docs/_layouts/default.html +87 -0
  120. package/docs/adr/0001-flow-agents-consumes-flow.md +77 -0
  121. package/docs/adr/0002-flow-kits-as-extension-unit.md +13 -0
  122. package/docs/adr/0003-flow-agents-coordinates-kits-and-adapters.md +13 -0
  123. package/docs/adr/0004-gates-expect-surface-claims.md +15 -0
  124. package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +48 -0
  125. package/docs/adr/0006-typescript-first-source-policy.md +98 -0
  126. package/docs/agent-system-guidebook.md +391 -0
  127. package/docs/agent-usage-feedback-loop.md +351 -0
  128. package/docs/assets/favicon.svg +13 -0
  129. package/docs/assets/og-image.png +0 -0
  130. package/docs/assets/site.css +774 -0
  131. package/docs/assets/site.js +139 -0
  132. package/docs/configurable-workflow-routing.md +174 -0
  133. package/docs/context-map.md +145 -0
  134. package/docs/developer-architecture.md +145 -0
  135. package/docs/developer-hook-setup.md +61 -0
  136. package/docs/fixture-ownership.md +44 -0
  137. package/docs/flow-kit-repository-contract.md +180 -0
  138. package/docs/index.md +129 -0
  139. package/docs/kontour-resource-contract.md +358 -0
  140. package/docs/migrations.md +64 -0
  141. package/docs/north-star.md +322 -0
  142. package/docs/operating-layers.md +110 -0
  143. package/docs/repository-structure.md +132 -0
  144. package/docs/sandbox-policy.md +56 -0
  145. package/docs/skills-map.md +203 -0
  146. package/docs/standards-register.md +96 -0
  147. package/docs/veritas-integration.md +165 -0
  148. package/docs/work-item-adapters.md +72 -0
  149. package/docs/workflow-artifact-lifecycle.md +141 -0
  150. package/docs/workflow-eval-strategy.md +295 -0
  151. package/docs/workflow-shared-contracts.md +51 -0
  152. package/docs/workflow-usage-guide.md +443 -0
  153. package/evals/ARCHITECTURE.md +143 -0
  154. package/evals/CONVENTIONS.md +58 -0
  155. package/evals/README.md +128 -0
  156. package/evals/acceptance/run.sh +29 -0
  157. package/evals/acceptance/test_claude_harness.sh +242 -0
  158. package/evals/acceptance/test_codex_harness.sh +108 -0
  159. package/evals/acceptance/test_kiro_harness.sh +128 -0
  160. package/evals/cases/dev/404.html +97 -0
  161. package/evals/cases/dev/code-review.yaml +44 -0
  162. package/evals/cases/dev/dashboard.html +300 -0
  163. package/evals/cases/dev/deliver.yaml +66 -0
  164. package/evals/cases/dev/dependency-update.yaml +16 -0
  165. package/evals/cases/dev/explore.yaml +20 -0
  166. package/evals/cases/dev/index.html +370 -0
  167. package/evals/cases/dev/package-lock.json +28 -0
  168. package/evals/cases/dev/package.json +16 -0
  169. package/evals/cases/dev/plan-work.yaml +20 -0
  170. package/evals/cases/dev/promptfooconfig.yaml +666 -0
  171. package/evals/cases/dev/search-first.yaml +20 -0
  172. package/evals/cases/dev/tdd-workflow.yaml +48 -0
  173. package/evals/cases/dev/verify-work.yaml +44 -0
  174. package/evals/cases/dev/workflow.yaml +34 -0
  175. package/evals/ci/run-baseline.sh +283 -0
  176. package/evals/fixtures/backlog-provider-settings/global-default.json +44 -0
  177. package/evals/fixtures/backlog-provider-settings/project-override.json +53 -0
  178. package/evals/fixtures/builder-kit-workflow-state/baseline-freshness-resolution-hint.json +139 -0
  179. package/evals/fixtures/builder-kit-workflow-state/direct-primitive-stop.json +59 -0
  180. package/evals/fixtures/builder-kit-workflow-state/empty-board-route-shape.json +55 -0
  181. package/evals/fixtures/builder-kit-workflow-state/happy-path.json +71 -0
  182. package/evals/fixtures/builder-kit-workflow-state/mid-work-resume.json +80 -0
  183. package/evals/fixtures/builder-kit-workflow-state/missing-prestep-recovery.json +65 -0
  184. package/evals/fixtures/builder-kit-workflow-state/product-build-chaining.json +60 -0
  185. package/evals/fixtures/builder-kit-workflow-state/stale-continuation-requires-new-probe.json +57 -0
  186. package/evals/fixtures/console-learning-projection/artifacts/console-learning-correction/learning.json +50 -0
  187. package/evals/fixtures/console-learning-projection/artifacts/console-learning-open-route/learning.json +41 -0
  188. package/evals/fixtures/flow-kit-repository/invalid-absolute-path/kit.json +8 -0
  189. package/evals/fixtures/flow-kit-repository/invalid-asset-section/flows/review.flow.json +6 -0
  190. package/evals/fixtures/flow-kit-repository/invalid-asset-section/kit.json +11 -0
  191. package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/flows/review.flow.json +6 -0
  192. package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/kit.json +9 -0
  193. package/evals/fixtures/flow-kit-repository/invalid-id/flows/review.flow.json +6 -0
  194. package/evals/fixtures/flow-kit-repository/invalid-id/kit.json +8 -0
  195. package/evals/fixtures/flow-kit-repository/invalid-malformed-json/kit.json +8 -0
  196. package/evals/fixtures/flow-kit-repository/invalid-missing-flow/kit.json +8 -0
  197. package/evals/fixtures/flow-kit-repository/invalid-missing-id/flows/review.flow.json +6 -0
  198. package/evals/fixtures/flow-kit-repository/invalid-missing-id/kit.json +7 -0
  199. package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/flows/review.flow.json +6 -0
  200. package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/kit.json +7 -0
  201. package/evals/fixtures/flow-kit-repository/invalid-name/flows/review.flow.json +6 -0
  202. package/evals/fixtures/flow-kit-repository/invalid-name/kit.json +8 -0
  203. package/evals/fixtures/flow-kit-repository/invalid-schema-version/flows/review.flow.json +6 -0
  204. package/evals/fixtures/flow-kit-repository/invalid-schema-version/kit.json +8 -0
  205. package/evals/fixtures/flow-kit-repository/invalid-traversal/kit.json +8 -0
  206. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/adapters/example.json +3 -0
  207. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/assets/example.txt +1 -0
  208. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/docs/README.md +3 -0
  209. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +26 -0
  210. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-evals/example.json +3 -0
  211. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-skills/mixed/SKILL.md +3 -0
  212. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit.json +44 -0
  213. package/evals/fixtures/flow-kit-repository/valid-local-kit/docs/README.md +3 -0
  214. package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +26 -0
  215. package/evals/fixtures/flow-kit-repository/valid-local-kit/kit.json +20 -0
  216. package/evals/fixtures/hook-influence/cases.json +336 -0
  217. package/evals/fixtures/pull-work-provider/github-issues.json +170 -0
  218. package/evals/fixtures/pull-work-wip-shepherding/global-wip-informs.json +43 -0
  219. package/evals/fixtures/pull-work-wip-shepherding/personal-wip-blocks.json +42 -0
  220. package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +31 -0
  221. package/evals/fixtures/surface-trust/artifact-absent.json +19 -0
  222. package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +32 -0
  223. package/evals/fixtures/surface-trust/missing-authority-trust-report.json +27 -0
  224. package/evals/fixtures/surface-trust/provider-absent.json +19 -0
  225. package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +30 -0
  226. package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +31 -0
  227. package/evals/fixtures/usage-feedback/sample-full.jsonl +11 -0
  228. package/evals/fixtures/usage-feedback/sample-outcomes.jsonl +1 -0
  229. package/evals/fixtures/veritas-governance-adapter/fake-veritas-pass.sh +18 -0
  230. package/evals/fixtures/veritas-governance-adapter/fake-veritas-secret-fail.sh +10 -0
  231. package/evals/fixtures/veritas-governance-adapter/fake-veritas-unconfigured.sh +4 -0
  232. package/evals/integration/test_bundle_install.sh +541 -0
  233. package/evals/integration/test_console_learning_projection.sh +192 -0
  234. package/evals/integration/test_context_map.sh +65 -0
  235. package/evals/integration/test_effective_backlog_settings.sh +58 -0
  236. package/evals/integration/test_fixture_retirement_audit.sh +58 -0
  237. package/evals/integration/test_flow_agents_statusline.sh +93 -0
  238. package/evals/integration/test_flow_kit_repository.sh +90 -0
  239. package/evals/integration/test_goal_fit_hook.sh +482 -0
  240. package/evals/integration/test_hook_category_behaviors.sh +190 -0
  241. package/evals/integration/test_hook_influence_cases.sh +69 -0
  242. package/evals/integration/test_local_flow_kit_install.sh +145 -0
  243. package/evals/integration/test_publish_change_helper.sh +176 -0
  244. package/evals/integration/test_pull_work_provider.sh +140 -0
  245. package/evals/integration/test_runtime_adapter_activation.sh +106 -0
  246. package/evals/integration/test_telemetry.sh +485 -0
  247. package/evals/integration/test_telemetry_doctor.sh +193 -0
  248. package/evals/integration/test_usage_feedback_dashboard.sh +169 -0
  249. package/evals/integration/test_usage_feedback_global.sh +117 -0
  250. package/evals/integration/test_usage_feedback_import.sh +227 -0
  251. package/evals/integration/test_usage_feedback_outcomes.sh +165 -0
  252. package/evals/integration/test_usage_feedback_report.sh +263 -0
  253. package/evals/integration/test_veritas_governance_adapter.sh +235 -0
  254. package/evals/integration/test_workflow_artifact_cleanup_audit.sh +287 -0
  255. package/evals/integration/test_workflow_artifacts.sh +1247 -0
  256. package/evals/integration/test_workflow_sidecar_writer.sh +2112 -0
  257. package/evals/integration/test_workflow_steering_hook.sh +337 -0
  258. package/evals/lib/assertions/delegated-to.js +40 -0
  259. package/evals/lib/assertions/max-tool-calls.js +15 -0
  260. package/evals/lib/assertions/no-write-tools.js +27 -0
  261. package/evals/lib/assertions/pass-at-k.js +39 -0
  262. package/evals/lib/assertions/telemetry-utils.js +105 -0
  263. package/evals/lib/assertions/tool-called.js +39 -0
  264. package/evals/lib/assertions/verify-after-fix.js +61 -0
  265. package/evals/lib/claude-judge.sh +40 -0
  266. package/evals/lib/claude-provider.sh +74 -0
  267. package/evals/lib/codex-judge.sh +39 -0
  268. package/evals/lib/codex-provider.sh +81 -0
  269. package/evals/lib/eval-dev.sh +5 -0
  270. package/evals/lib/eval-judge.sh +22 -0
  271. package/evals/lib/eval-provider.sh +26 -0
  272. package/evals/lib/eval-report.sh +73 -0
  273. package/evals/lib/kiro-dev.sh +4 -0
  274. package/evals/lib/kiro-judge.sh +17 -0
  275. package/evals/lib/kiro-provider.sh +62 -0
  276. package/evals/lib/node.sh +111 -0
  277. package/evals/promptfooconfig.yaml +70 -0
  278. package/evals/run.sh +309 -0
  279. package/evals/static/test_evidence_refs.sh +141 -0
  280. package/evals/static/test_package.sh +407 -0
  281. package/evals/static/test_repo_hooks.sh +68 -0
  282. package/evals/static/test_universal_bundles.sh +274 -0
  283. package/evals/static/test_workflow_skills.sh +1207 -0
  284. package/install.sh +64 -0
  285. package/integrations/veritas/flow-agents.adapter.json +138 -0
  286. package/integrations/veritas/flow-agents.authority-settings.json +26 -0
  287. package/integrations/veritas/flow-agents.repo-standards.json +82 -0
  288. package/kits/builder/flows/build.flow.json +218 -0
  289. package/kits/builder/flows/shape.flow.json +127 -0
  290. package/kits/builder/kit.json +19 -0
  291. package/kits/catalog.json +11 -0
  292. package/package.json +130 -0
  293. package/packaging/README.md +60 -0
  294. package/packaging/manifest.json +173 -0
  295. package/packaging/packs.json +69 -0
  296. package/powers/dependency-checker/POWER.md +20 -0
  297. package/powers/dependency-checker/mcp.json +20 -0
  298. package/powers/playwright/POWER.md +25 -0
  299. package/powers/playwright/mcp.json +12 -0
  300. package/prompts/code-audit.md +123 -0
  301. package/prompts/kcommit.md +88 -0
  302. package/schemas/backlog-provider-settings.schema.json +138 -0
  303. package/schemas/workflow-acceptance.schema.json +216 -0
  304. package/schemas/workflow-critique.schema.json +113 -0
  305. package/schemas/workflow-evidence.schema.json +357 -0
  306. package/schemas/workflow-handoff.schema.json +52 -0
  307. package/schemas/workflow-learning.schema.json +223 -0
  308. package/schemas/workflow-release.schema.json +172 -0
  309. package/schemas/workflow-state.schema.json +80 -0
  310. package/scripts/README.md +111 -0
  311. package/scripts/build-universal-bundles.js +3 -0
  312. package/scripts/check-content-boundary.cjs +99 -0
  313. package/scripts/context-budget/budget-scan.sh +166 -0
  314. package/scripts/detect-tools.sh +3 -0
  315. package/scripts/discover-agents.sh +28 -0
  316. package/scripts/effective-backlog-settings.js +2 -0
  317. package/scripts/filter-installed-packs.js +2 -0
  318. package/scripts/flow-kit.js +2 -0
  319. package/scripts/generate-context-map.js +2 -0
  320. package/scripts/git-status.sh +49 -0
  321. package/scripts/hooks/claude-hook-adapter.js +174 -0
  322. package/scripts/hooks/claude-telemetry-hook.js +115 -0
  323. package/scripts/hooks/codex-hook-adapter.js +176 -0
  324. package/scripts/hooks/codex-telemetry-hook.js +95 -0
  325. package/scripts/hooks/config-protection.js +79 -0
  326. package/scripts/hooks/desktop-notify.sh +39 -0
  327. package/scripts/hooks/governance-audit.sh +135 -0
  328. package/scripts/hooks/lib/audit-transport.sh +40 -0
  329. package/scripts/hooks/lib/hook-flags.js +49 -0
  330. package/scripts/hooks/lib/patterns.sh +57 -0
  331. package/scripts/hooks/lib/resolve-formatter.js +80 -0
  332. package/scripts/hooks/post-edit-accumulator.js +66 -0
  333. package/scripts/hooks/pre-commit-quality.js +194 -0
  334. package/scripts/hooks/quality-gate.js +93 -0
  335. package/scripts/hooks/report-only-guard.js +21 -0
  336. package/scripts/hooks/run-hook.js +136 -0
  337. package/scripts/hooks/stop-format-typecheck.js +141 -0
  338. package/scripts/hooks/stop-goal-fit.js +337 -0
  339. package/scripts/hooks/workflow-steering.js +250 -0
  340. package/scripts/install-codex-home.sh +106 -0
  341. package/scripts/package.json +3 -0
  342. package/scripts/promote-workflow-artifact.js +2 -0
  343. package/scripts/publish-change-helper.js +2 -0
  344. package/scripts/pull-work-provider.js +2 -0
  345. package/scripts/setup-repo-hooks.sh +8 -0
  346. package/scripts/statusline/flow-agents-statusline.js +157 -0
  347. package/scripts/telemetry/console-presets.sh +14 -0
  348. package/scripts/telemetry/install-console-config.sh +214 -0
  349. package/scripts/telemetry/lib/config.sh +85 -0
  350. package/scripts/telemetry/lib/enrich.sh +115 -0
  351. package/scripts/telemetry/lib/redact.sh +22 -0
  352. package/scripts/telemetry/lib/session.sh +63 -0
  353. package/scripts/telemetry/lib/transport.sh +183 -0
  354. package/scripts/telemetry/lib/usage.sh +29 -0
  355. package/scripts/telemetry/sync-agents.sh +173 -0
  356. package/scripts/telemetry/telemetry.conf +23 -0
  357. package/scripts/telemetry/telemetry.sh +387 -0
  358. package/scripts/usage-feedback.js +2 -0
  359. package/scripts/validate-hook-influence-cases.js +2 -0
  360. package/scripts/validate-package.sh +89 -0
  361. package/scripts/validate-source-tree.js +9 -0
  362. package/skills/agentic-engineering/SKILL.md +62 -0
  363. package/skills/browser-test/SKILL.md +51 -0
  364. package/skills/builder-shape/SKILL.md +76 -0
  365. package/skills/context-budget/SKILL.md +40 -0
  366. package/skills/deliver/SKILL.md +241 -0
  367. package/skills/dependency-update/SKILL.md +68 -0
  368. package/skills/design-probe/SKILL.md +107 -0
  369. package/skills/eval-rebuild/SKILL.md +39 -0
  370. package/skills/evidence-gate/SKILL.md +186 -0
  371. package/skills/execute-plan/SKILL.md +110 -0
  372. package/skills/explore/SKILL.md +137 -0
  373. package/skills/feedback-loop/SKILL.md +87 -0
  374. package/skills/fix-bug/SKILL.md +133 -0
  375. package/skills/frontend-design/SKILL.md +80 -0
  376. package/skills/github-cli/SKILL.md +63 -0
  377. package/skills/idea-to-backlog/SKILL.md +267 -0
  378. package/skills/knowledge-capture/SKILL.md +55 -0
  379. package/skills/learning-review/SKILL.md +115 -0
  380. package/skills/pickup-probe/SKILL.md +114 -0
  381. package/skills/plan-work/SKILL.md +176 -0
  382. package/skills/pull-work/SKILL.md +309 -0
  383. package/skills/release-readiness/SKILL.md +121 -0
  384. package/skills/review-work/SKILL.md +161 -0
  385. package/skills/search-first/SKILL.md +66 -0
  386. package/skills/tdd-workflow/SKILL.md +140 -0
  387. package/skills/verify-work/SKILL.md +109 -0
  388. package/src/cli/console-learning-projection.ts +140 -0
  389. package/src/cli/effective-backlog-settings.ts +99 -0
  390. package/src/cli/fixture-retirement-audit.ts +154 -0
  391. package/src/cli/flow-kit.ts +139 -0
  392. package/src/cli/init.ts +248 -0
  393. package/src/cli/promote-workflow-artifact.ts +64 -0
  394. package/src/cli/publish-change-helper.ts +143 -0
  395. package/src/cli/pull-work-provider.ts +481 -0
  396. package/src/cli/runtime-adapter.ts +24 -0
  397. package/src/cli/telemetry-doctor.ts +243 -0
  398. package/src/cli/usage-feedback.ts +418 -0
  399. package/src/cli/validate-hook-influence.ts +119 -0
  400. package/src/cli/validate-source-tree.ts +30 -0
  401. package/src/cli/validate-workflow-artifacts.ts +411 -0
  402. package/src/cli/veritas-governance.ts +322 -0
  403. package/src/cli/workflow-artifact-cleanup-audit.ts +281 -0
  404. package/src/cli/workflow-sidecar.ts +676 -0
  405. package/src/cli.ts +95 -0
  406. package/src/flow-kit/validate.ts +74 -0
  407. package/src/lib/args.ts +43 -0
  408. package/src/lib/fs.ts +62 -0
  409. package/src/lib/workflow-learning-projection.ts +491 -0
  410. package/src/runtime-adapters.ts +154 -0
  411. package/src/tools/build-universal-bundles.ts +366 -0
  412. package/src/tools/common.ts +61 -0
  413. package/src/tools/filter-installed-packs.ts +129 -0
  414. package/src/tools/generate-context-map.ts +199 -0
  415. package/src/tools/validate-package.ts +57 -0
  416. package/src/tools/validate-source-tree.ts +488 -0
  417. package/tsconfig.json +19 -0
  418. package/veritas.claims.json +6 -0
@@ -0,0 +1,128 @@
1
+ # Flow Agents Eval Suite
2
+
3
+ Evaluation coverage for the canonical Flow Agents source tree and generated universal bundles.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ npm install
9
+
10
+ # Run the fast local gate: source validation, static package checks, integration checks
11
+ bash evals/run.sh
12
+
13
+ # Run only source/static checks
14
+ bash evals/run.sh static
15
+
16
+ # Run only integration checks
17
+ bash evals/run.sh integration
18
+
19
+ # Run harness-native acceptance checks
20
+ bash evals/run.sh acceptance
21
+
22
+ # Claude acceptance is cheap by default. Opt in to prompt-mode Claude usage only when needed.
23
+ FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1 bash evals/run.sh acceptance claude
24
+
25
+ # Run behavioral evals through the default Kiro runtime
26
+ bash evals/run.sh llm
27
+
28
+ # Run one behavioral suite through Codex as subject runtime and judge
29
+ bash evals/run.sh llm dev --runtime codex
30
+
31
+ # Run Claude Code as the subject runtime while Codex judges rubrics
32
+ bash evals/run.sh llm dev --runtime claude --judge-runtime codex
33
+
34
+ # Run cheaper behavioral subsets
35
+ bash evals/run.sh llm dev --suite smoke
36
+ bash evals/run.sh llm dev --suite regression
37
+
38
+ # View promptfoo results
39
+ npm run promptfoo:view
40
+ ```
41
+
42
+ ## Layers
43
+
44
+ ### Layer 1: Static (`bash evals/run.sh static`)
45
+
46
+ Validates the source tree and generated bundle exports:
47
+ - canonical source validation via `npm run validate:source --`
48
+ - package shape, schemas, resources, hooks, routing, MCP server references, write-tool invariants, and agent cards
49
+ - universal bundle build/export checks for Kiro, Claude Code, and Codex
50
+
51
+ Runs in seconds and has no LLM cost.
52
+
53
+ ### Layer 2: Integration (`bash evals/run.sh integration`)
54
+
55
+ Validates runtime-adjacent contracts:
56
+ - telemetry event schemas, type mapping, field presence, prompt capture, tool capture, redaction, and agent discovery
57
+ - workflow artifact quality and deterministic end-to-end delivery chain fixtures
58
+ - bundle install smoke tests for Kiro, Claude Code, and Codex temp installs
59
+
60
+ Runs in seconds and has no LLM cost.
61
+
62
+ ### Layer 3: Behavioral (`bash evals/run.sh llm`)
63
+
64
+ Runs selected agents through an eval runtime and scores responses with deterministic telemetry assertions plus LLM rubrics. Kiro is the default subject runtime. Pass `--runtime codex` or `--runtime claude` to run Codex or Claude Code where supported.
65
+
66
+ Subject runtime and judge runtime are separate:
67
+
68
+ ```bash
69
+ bash evals/run.sh llm dev --runtime claude --judge-runtime codex
70
+ bash evals/run.sh llm dev --runtime claude --judge-runtime claude
71
+ ```
72
+
73
+ Use `--suite smoke`, `--suite regression`, or `--suite capability` to avoid running the full behavioral suite when a targeted gate is enough. `smoke` runs the first few cases, `regression` filters `metadata.type=regression`, and `capability` filters `metadata.type=capability`.
74
+
75
+ Current behavioral suites:
76
+ - `dev`
77
+
78
+ The root `evals/promptfooconfig.yaml` is a legacy combined promptfoo config for targeted manual runs. Prefer `bash evals/run.sh llm <agent>` or the per-agent configs in `evals/cases/<agent>/promptfooconfig.yaml`.
79
+
80
+ ### Layer 4: Acceptance (`bash evals/run.sh acceptance`)
81
+
82
+ Runs harness-native smoke tests against generated bundles:
83
+ - `kiro-cli` discovers workspace agents and can answer through `dev`
84
+ - `claude` lists project agents and verifies exported telemetry hook configuration without model usage by default
85
+ - `codex exec` loads the exported `.codex` bundle and returns a final response
86
+
87
+ This layer is environment-dependent and requires installed, authenticated CLIs.
88
+
89
+ Claude prompt-mode acceptance is opt-in with `FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1`. Real Claude CLI hook telemetry assertions are opt-in with `FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY=1`; deterministic integration tests cover the telemetry wrapper without spending Claude usage.
90
+
91
+ ## Coverage
92
+
93
+ Covered now:
94
+ - source/package drift and bundle export drift
95
+ - telemetry schema and redaction contracts
96
+ - install smoke tests for generated bundles
97
+ - normalized telemetry for Kiro, Codex, and Claude Code hook events
98
+ - behavioral routing and workflow checks for the supported per-agent suites
99
+
100
+ Deferred:
101
+ - multi-turn conversation evals
102
+ - adversarial/red-team evals
103
+ - behavioral coverage for every exported tool agent
104
+ - full LLM-driven end-to-end delivery runs on every edit; deterministic artifact-chain E2E coverage runs in integration
105
+ - direct token usage assertions, because CLI-backed exec providers do not expose reliable token counts today
106
+
107
+ ## Adding Eval Cases
108
+
109
+ Add behavioral cases to `evals/cases/<agent>/promptfooconfig.yaml`. Each test should include:
110
+ - `vars.prompt` with the user prompt
111
+ - `options.provider.id`, or rely on the suite's default provider
112
+ - deterministic assertions when telemetry can prove the behavior
113
+ - an `llm-rubric` for workflow quality when needed
114
+ - `metadata.type` set to `capability` or `regression`
115
+
116
+ Run the affected suite with:
117
+
118
+ ```bash
119
+ bash evals/run.sh llm <agent>
120
+ ```
121
+
122
+ ## Prerequisites
123
+
124
+ - `jq` for static and integration checks
125
+ - `kiro-cli` for default behavioral and Kiro acceptance checks
126
+ - `codex` for `--runtime codex`, `--judge-runtime codex`, and Codex acceptance checks
127
+ - `claude` for `--runtime claude`, `--judge-runtime claude`, and Claude Code acceptance checks
128
+ - `promptfoo` for behavioral evals and result viewing, installed with `npm install` from the repo root
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ ACCEPT_DIR="$ROOT_DIR/evals/acceptance"
6
+ TARGET="${1:-all}"
7
+
8
+ run_one() {
9
+ local name="$1"
10
+ echo ""
11
+ bash "$ACCEPT_DIR/test_${name}_harness.sh"
12
+ }
13
+
14
+ case "$TARGET" in
15
+ kiro|claude|codex)
16
+ run_one "$TARGET"
17
+ ;;
18
+ all)
19
+ status=0
20
+ run_one kiro || status=1
21
+ run_one claude || status=1
22
+ run_one codex || status=1
23
+ exit "$status"
24
+ ;;
25
+ *)
26
+ echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex]"
27
+ exit 1
28
+ ;;
29
+ esac
@@ -0,0 +1,242 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ source "$ROOT_DIR/evals/lib/node.sh"
6
+ TMP_WORK=""
7
+ pass=0
8
+ fail=0
9
+ skip=0
10
+
11
# EXIT-trap handler: remove the temporary workspace if one was created.
# Globals: TMP_WORK (read) - workspace path; empty until mktemp has run.
# Returns: always 0.
#
# This must not end on a bare `[[ ... ]] && rm ...`: when TMP_WORK is empty
# that list returns 1, the function returns 1, and because the trap runs
# under `set -e` the script's exit status becomes 1 -- turning the
# "CLI not installed, exit 0" skip path into a failure.
cleanup() {
  if [[ -n "$TMP_WORK" ]]; then
    # Best effort: a failed removal should not change the script's result.
    rm -rf -- "$TMP_WORK" || echo "warning: could not remove $TMP_WORK" >&2
  fi
  return 0
}
14
+ trap cleanup EXIT
15
+
16
# Result reporters: each prints one marked line for the check description in
# $1 and increments the matching global tally (pass / fail / skip).
_pass() {
  printf ' ✓ %s\n' "$1"
  pass=$((pass + 1))
}

_fail() {
  printf ' ✗ %s\n' "$1"
  fail=$((fail + 1))
}

_skip() {
  printf ' ○ %s\n' "$1"
  skip=$((skip + 1))
}
19
+
20
# Poll until a telemetry file exists and is non-empty.
# Arguments:
#   $1 - path of the file to wait for
#   $2 - optional number of polls before giving up (default 50)
#   $3 - optional delay between polls, as accepted by sleep (default 0.1)
# Returns: 0 as soon as the file is non-empty, 1 if it never becomes so.
wait_for_telemetry() {
  local file="$1"
  local attempts="${2:-50}"
  local interval="${3:-0.1}"
  local i
  for ((i = 0; i < attempts; i++)); do
    [[ -s "$file" ]] && return 0
    sleep "$interval"
  done
  # Re-check after the final sleep so a file written during the last
  # interval is not reported as missing.
  [[ -s "$file" ]]
}
30
+
31
+ echo "=== Harness Acceptance: Claude Code ==="
32
+ echo ""
33
+ echo "Set FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1 to run prompt-mode Claude checks."
34
+ echo "Set FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY=1 to require real Claude CLI hook telemetry."
35
+ echo ""
36
+
37
+ if ! command -v claude >/dev/null 2>&1; then
38
+ _skip "claude CLI not installed"
39
+ echo ""
40
+ echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
41
+ exit 0
42
+ fi
43
+
44
+ cd "$ROOT_DIR"
45
+ flow_agents_node scripts/build-universal-bundles.js >/dev/null
46
+
47
+ TMP_WORK="$(mktemp -d /tmp/claude-acceptance-work.XXXXXX)"
48
+ bash dist/claude-code/install.sh "$TMP_WORK" >/dev/null
49
+
50
+ echo "--- Agent List ---"
51
+ list_output="$(cd "$TMP_WORK" && claude agents --setting-sources local,project,user 2>&1 || true)"
52
+ if echo "$list_output" | grep -q "Project agents:"; then
53
+ _pass "claude lists project agents"
54
+ else
55
+ _fail "claude did not list project agents"
56
+ fi
57
+
58
+ if echo "$list_output" | grep -q "dev ·"; then
59
+ _pass "claude project agent list includes dev"
60
+ else
61
+ _fail "claude project agent list did not include dev"
62
+ fi
63
+
64
+ if [[ -f "$TMP_WORK/.claude/settings.json" ]] && grep -q "claude-telemetry-hook.js" "$TMP_WORK/.claude/settings.json"; then
65
+ _pass "claude project settings include telemetry hooks"
66
+ else
67
+ _fail "claude project settings missing telemetry hooks"
68
+ fi
69
+
70
+ if [[ "${FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM:-0}" != "1" ]]; then
71
+ _skip "Claude prompt-mode checks skipped to avoid model usage"
72
+ echo ""
73
+ echo "==========================="
74
+ total=$((pass + fail))
75
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
76
+ [[ "$fail" -gt 0 ]] && exit 1
77
+ exit 0
78
+ fi
79
+
80
+ echo ""
81
+ echo "--- Print Smoke ---"
82
+ print_output="$(cd "$TMP_WORK" && claude -p --agent dev --permission-mode bypassPermissions --add-dir "$TMP_WORK" --output-format text "Reply with READY only." 2>&1 || true)"
83
+ if echo "$print_output" | grep -qx "READY"; then
84
+ _pass "dev agent replied READY in print mode"
85
+ else
86
+ _fail "dev agent did not return plain READY in print mode"
87
+ fi
88
+
89
+ echo ""
90
+ echo "--- Behavioral Route ---"
91
+ route_output="$(cd "$TMP_WORK" && node - <<'NODE'
92
+ const { spawnSync } = require("node:child_process");
93
+ const result = spawnSync("claude", [
94
+ "-p",
95
+ "--agent",
96
+ "dev",
97
+ "--permission-mode",
98
+ "bypassPermissions",
99
+ "--add-dir",
100
+ ".",
101
+ "--output-format",
102
+ "text",
103
+ "A user asks: 'Explore the codebase and explain what it does.' Which skill should you activate first? Reply with only the skill name or NONE.",
104
+ ], { encoding: "utf8", timeout: 30000 });
105
+ process.stdout.write(result.stdout || "");
106
+ process.stdout.write(result.stderr || "");
107
+ NODE
108
+ )"
109
+ route_output_trimmed="$(printf '%s' "$route_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
110
+ if [[ "$route_output_trimmed" == "explore" ]]; then
111
+ _pass "claude dev selects explore for repository exploration"
112
+ else
113
+ _fail "claude dev did not select explore (got: $route_output_trimmed)"
114
+ fi
115
+
116
+ echo ""
117
+ echo "--- deliver Route ---"
118
+ sa_build_output="$(cd "$TMP_WORK" && node - <<'NODE'
119
+ const { spawnSync } = require("node:child_process");
120
+ const result = spawnSync("claude", [
121
+ "-p",
122
+ "--agent",
123
+ "dev",
124
+ "--permission-mode",
125
+ "bypassPermissions",
126
+ "--add-dir",
127
+ ".",
128
+ "--output-format",
129
+ "text",
130
+ "A user asks: 'Build a CLI tool that converts markdown files to HTML'. Which skill should you activate first? Reply with only the skill name or NONE.",
131
+ ], { encoding: "utf8", timeout: 30000 });
132
+ process.stdout.write(result.stdout || "");
133
+ process.stdout.write(result.stderr || "");
134
+ NODE
135
+ )"
136
+ sa_build_trimmed="$(printf '%s' "$sa_build_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
137
+ if [[ "$sa_build_trimmed" == "deliver" ]]; then
138
+ _pass "claude dev selects deliver for broad build requests"
139
+ else
140
+ _fail "claude dev did not select deliver (got: $sa_build_trimmed)"
141
+ fi
142
+
143
+ echo ""
144
+ echo "--- Live Hook Influence ---"
145
+ mkdir -p "$TMP_WORK/.flow-agents/live-hook" "$TMP_WORK/docs"
146
+ printf '# Context Map\n' > "$TMP_WORK/docs/context-map.md"
147
+ cat > "$TMP_WORK/.flow-agents/live-hook/state.json" <<'JSON'
148
+ {
149
+ "schema_version": "1.0",
150
+ "task_slug": "live-hook",
151
+ "status": "not_verified",
152
+ "phase": "verification",
153
+ "updated_at": "2026-05-10T00:00:00Z",
154
+ "next_action": {
155
+ "status": "needs_user",
156
+ "summary": "Acknowledge live hook guidance.",
157
+ "target_phase": "verification"
158
+ }
159
+ }
160
+ JSON
161
+ cat > "$TMP_WORK/.flow-agents/live-hook/critique.json" <<'JSON'
162
+ {
163
+ "schema_version": "1.0",
164
+ "task_slug": "live-hook",
165
+ "status": "fail",
166
+ "required": true,
167
+ "updated_at": "2026-05-10T00:01:00Z",
168
+ "critiques": [
169
+ {
170
+ "id": "live-hook-review",
171
+ "reviewer": "tool-code-reviewer",
172
+ "reviewed_at": "2026-05-10T00:01:00Z",
173
+ "verdict": "fail",
174
+ "summary": "Live hook guidance must be acknowledged.",
175
+ "findings": [
176
+ {
177
+ "id": "open-live-hook",
178
+ "severity": "high",
179
+ "status": "open",
180
+ "description": "Report the unfinished workflow state."
181
+ }
182
+ ]
183
+ }
184
+ ]
185
+ }
186
+ JSON
187
+ hook_output="$(cd "$TMP_WORK" && node - <<'NODE'
188
+ const { spawnSync } = require("node:child_process");
189
+ const result = spawnSync("claude", [
190
+ "-p",
191
+ "--agent",
192
+ "dev",
193
+ "--permission-mode",
194
+ "bypassPermissions",
195
+ "--add-dir",
196
+ ".",
197
+ "--output-format",
198
+ "text",
199
+ "Use a harmless tool first, such as listing the current directory. After that, if Flow Agents hook guidance mentions WORKFLOW STATE ATTENTION or task live-hook, reply exactly HOOK_GUIDANCE_SEEN live-hook. If no such guidance is visible, reply exactly HOOK_GUIDANCE_MISSING.",
200
+ ], { encoding: "utf8", timeout: 45000 });
201
+ process.stdout.write(result.stdout || "");
202
+ process.stdout.write(result.stderr || "");
203
+ NODE
204
+ )"
205
+ hook_output_trimmed="$(printf '%s' "$hook_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
206
+ if [[ "$hook_output_trimmed" == "HOOK_GUIDANCE_SEEN live-hook" ]]; then
207
+ _pass "claude live session responds to workflow hook guidance"
208
+ else
209
+ _fail "claude live session did not respond to workflow hook guidance (got: $hook_output_trimmed)"
210
+ fi
211
+
212
+ echo ""
213
+ echo "--- Telemetry ---"
214
+ telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
215
+ if [[ "${FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY:-0}" != "1" ]]; then
216
+ _skip "real Claude CLI telemetry assertion skipped"
217
+ else
218
+ if wait_for_telemetry "$telemetry_file"; then
219
+ _pass "claude telemetry log was written"
220
+ else
221
+ _fail "claude telemetry log was not written"
222
+ fi
223
+
224
+ if [[ -f "$telemetry_file" ]] && jq -e 'select(.agent.runtime == "claude-code")' "$telemetry_file" >/dev/null 2>&1; then
225
+ _pass "claude telemetry uses normalized claude-code runtime"
226
+ else
227
+ _fail "claude telemetry did not include claude-code runtime"
228
+ fi
229
+
230
+ if [[ -f "$telemetry_file" ]] && jq -e 'select(.event_type == "turn.user")' "$telemetry_file" >/dev/null 2>&1; then
231
+ _pass "claude telemetry captures user prompts"
232
+ else
233
+ _fail "claude telemetry did not capture user prompts"
234
+ fi
235
+ fi
236
+
237
+ echo ""
238
+ echo "==========================="
239
+ total=$((pass + fail))
240
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
241
+ [[ "$fail" -gt 0 ]] && exit 1
242
+ exit 0
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ source "$ROOT_DIR/evals/lib/node.sh"
6
+ TMP_WORK=""
7
+ TMP_LOG=""
8
+ TMP_LAST=""
9
+ pass=0
10
+ fail=0
11
+ skip=0
12
+
13
# EXIT-trap handler: remove every temporary path this harness created.
# Globals (read): TMP_WORK (directory), TMP_LOG, TMP_LAST, and -- when they
#   have been assigned -- TMP_ROUTE_LOG and TMP_BUILD_LOG (files).
# Returns: always 0.
#
# This must not end on a bare `[[ ... ]] && rm ...`: with an empty variable
# that list returns 1, the function returns 1, and because the trap runs
# under `set -e` the script's exit status becomes 1 -- turning the
# "codex CLI not installed, exit 0" skip path into a failure.
cleanup() {
  local scratch
  if [[ -n "$TMP_WORK" ]]; then
    rm -rf -- "$TMP_WORK" || true
  fi
  # The route/build logs are created later in the script and may be unset
  # here (hence the :- defaults under `set -u`); covering them in the trap
  # keeps them from leaking if the script stops before its own rm -f.
  for scratch in "$TMP_LOG" "$TMP_LAST" "${TMP_ROUTE_LOG:-}" "${TMP_BUILD_LOG:-}"; do
    if [[ -n "$scratch" ]]; then
      rm -f -- "$scratch" || true
    fi
  done
  return 0
}
18
+ trap cleanup EXIT
19
+
20
# Result reporters: each prints one marked line for the check description in
# $1 and increments the matching global tally (pass / fail / skip).
_pass() {
  printf ' ✓ %s\n' "$1"
  pass=$((pass + 1))
}

_fail() {
  printf ' ✗ %s\n' "$1"
  fail=$((fail + 1))
}

_skip() {
  printf ' ○ %s\n' "$1"
  skip=$((skip + 1))
}
23
+
24
+ echo "=== Harness Acceptance: Codex ==="
25
+ echo ""
26
+
27
+ if ! command -v codex >/dev/null 2>&1; then
28
+ _skip "codex CLI not installed"
29
+ echo ""
30
+ echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
31
+ exit 0
32
+ fi
33
+
34
+ cd "$ROOT_DIR"
35
+ flow_agents_node scripts/build-universal-bundles.js >/dev/null
36
+
37
+ TMP_WORK="$(mktemp -d /tmp/codex-acceptance-work.XXXXXX)"
38
+ TMP_LOG="$(mktemp /tmp/codex-acceptance-log.XXXXXX)"
39
+ TMP_LAST="$(mktemp /tmp/codex-acceptance-last.XXXXXX)"
40
+ bash dist/codex/install.sh "$TMP_WORK" >/dev/null
41
+
42
+ echo "--- Exec Smoke ---"
43
+ if codex exec --skip-git-repo-check -C "$TMP_WORK" --sandbox read-only --json --output-last-message "$TMP_LAST" "After any required startup checks, reply with READY only." >"$TMP_LOG" 2>&1; then
44
+ _pass "codex exec completed successfully"
45
+ else
46
+ _fail "codex exec exited non-zero"
47
+ fi
48
+
49
+ if grep -q "Ignoring malformed agent role definition" "$TMP_LOG"; then
50
+ _fail "codex reported malformed exported agent roles"
51
+ else
52
+ _pass "codex accepted exported local agent role files"
53
+ fi
54
+
55
+ if grep -q "failed to stat skills path" "$TMP_LOG"; then
56
+ _fail "codex could not stat exported skill paths"
57
+ else
58
+ _pass "codex resolved exported skill paths"
59
+ fi
60
+
61
+ if grep -q "READY" "$TMP_LAST"; then
62
+ _pass "codex returned READY in final message"
63
+ else
64
+ _fail "codex final message did not contain READY"
65
+ fi
66
+
67
+ echo ""
68
+ echo "--- Behavioral Route ---"
69
+ TMP_ROUTE_LOG="$(mktemp /tmp/codex-acceptance-route.XXXXXX)"
70
+ if node -e 'const fs=require("fs"); const cp=require("child_process"); const [work,log]=process.argv.slice(1); const r=cp.spawnSync("codex",["exec","--skip-git-repo-check","-C",work,"--sandbox","read-only","--json","Before doing anything else, state the exact skill you are activating if any, then explore the codebase and explain what it does."],{encoding:"utf8",timeout:45000}); fs.writeFileSync(log,(r.stdout||"")+(r.stderr||"")); process.exit(r.error?.code==="ETIMEDOUT" ? 0 : (r.status ?? 1));' "$TMP_WORK" "$TMP_ROUTE_LOG"
71
+ then
72
+ _pass "codex behavioral route command completed successfully"
73
+ else
74
+ _fail "codex behavioral route command exited non-zero"
75
+ fi
76
+
77
+ if grep -Fq 'Activating `$explore`' "$TMP_ROUTE_LOG" || grep -Fq 'Activating skill: `explore`' "$TMP_ROUTE_LOG" || grep -Fq 'Activating skill: explore' "$TMP_ROUTE_LOG"; then
78
+ _pass "codex dev activates explore on repository exploration"
79
+ else
80
+ _fail "codex dev did not activate explore on repository exploration"
81
+ fi
82
+
83
+ rm -f "$TMP_ROUTE_LOG"
84
+
85
+ echo ""
86
+ echo "--- deliver Route ---"
87
+ TMP_BUILD_LOG="$(mktemp /tmp/codex-acceptance-build.XXXXXX)"
88
+ if node -e 'const fs=require("fs"); const cp=require("child_process"); const [work,log]=process.argv.slice(1); const r=cp.spawnSync("codex",["exec","--skip-git-repo-check","-C",work,"--sandbox","read-only","--json","Before doing anything else, state the exact skill you are activating if any, then begin the deliver workflow for '\''Build a CLI tool that converts markdown files to HTML'\'', but stop after deciding the initial skill and first phase."],{encoding:"utf8",timeout:45000}); fs.writeFileSync(log,(r.stdout||"")+(r.stderr||"")); process.exit(r.error?.code==="ETIMEDOUT" ? 0 : (r.status ?? 1));' "$TMP_WORK" "$TMP_BUILD_LOG"
89
+ then
90
+ _pass "codex deliver route command completed successfully"
91
+ else
92
+ _fail "codex deliver route command exited non-zero"
93
+ fi
94
+
95
+ if grep -Fq 'Activating skill: `$deliver`' "$TMP_BUILD_LOG" || grep -Fq 'Activating skill: `deliver`' "$TMP_BUILD_LOG" || grep -Fq 'Activating skill: deliver' "$TMP_BUILD_LOG"; then
96
+ _pass "codex dev activates deliver for broad build requests"
97
+ else
98
+ _fail "codex dev did not activate deliver for broad build requests"
99
+ fi
100
+
101
+ rm -f "$TMP_BUILD_LOG"
102
+
103
+ echo ""
104
+ echo "==========================="
105
+ total=$((pass + fail))
106
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
107
+ [[ "$fail" -gt 0 ]] && exit 1
108
+ exit 0
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
5
+ source "$ROOT_DIR/evals/lib/node.sh"
6
+ TMP_HOME=""
7
+ TMP_WORK=""
8
+ TMP_TELEMETRY=""
9
+ pass=0
10
+ fail=0
11
+ skip=0
12
+
13
# EXIT-trap handler: remove the temporary directories this harness created.
# Globals (read): TMP_HOME, TMP_WORK, TMP_TELEMETRY - each empty until its
#   mktemp call has run.
# Returns: always 0.
#
# This must not end on a bare `[[ ... ]] && rm ...`: with an empty variable
# that list returns 1, the function returns 1, and because the trap runs
# under `set -e` the script's exit status becomes 1 -- turning the
# "kiro-cli not installed, exit 0" skip path into a failure.
cleanup() {
  local dir
  for dir in "$TMP_HOME" "$TMP_WORK" "$TMP_TELEMETRY"; do
    if [[ -n "$dir" ]]; then
      # Best effort: a failed removal should not change the script's result.
      rm -rf -- "$dir" || true
    fi
  done
  return 0
}
18
+ trap cleanup EXIT
19
+
20
# Result reporters: each prints one marked line for the check description in
# $1 and increments the matching global tally (pass / fail / skip).
_pass() {
  printf ' ✓ %s\n' "$1"
  pass=$((pass + 1))
}

_fail() {
  printf ' ✗ %s\n' "$1"
  fail=$((fail + 1))
}

_skip() {
  printf ' ○ %s\n' "$1"
  skip=$((skip + 1))
}
23
+ strip_ansi() {
24
+ perl -pe 's/\e\[[0-9;?]*[ -\/]*[@-~]//g; s/\e\(B//g'
25
+ }
26
+
27
+ echo "=== Harness Acceptance: Kiro ==="
28
+ echo ""
29
+
30
+ if ! command -v kiro-cli >/dev/null 2>&1; then
31
+ _skip "kiro-cli not installed"
32
+ echo ""
33
+ echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
34
+ exit 0
35
+ fi
36
+
37
+ cd "$ROOT_DIR"
38
+ flow_agents_node scripts/build-universal-bundles.js >/dev/null
39
+
40
+ TMP_HOME="$(mktemp -d /tmp/kiro-acceptance-home.XXXXXX)"
41
+ TMP_WORK="$(mktemp -d /tmp/kiro-acceptance-work.XXXXXX)"
42
+ TMP_TELEMETRY="$(mktemp -d /tmp/kiro-acceptance-telemetry.XXXXXX)"
43
+ bash dist/kiro/install.sh "$TMP_HOME" >/dev/null
44
+ mkdir -p "$TMP_WORK/.kiro"
45
+ ln -s "$TMP_HOME/agents" "$TMP_WORK/.kiro/agents"
46
+
47
+ echo "--- Agent List ---"
48
+ list_output="$(cd "$TMP_WORK" && kiro-cli agent list 2>&1 || true)"
49
+ if echo "$list_output" | grep -q "dev[[:space:]]\+Workspace"; then
50
+ _pass "workspace agent list includes dev"
51
+ else
52
+ _fail "workspace agent list did not include dev"
53
+ fi
54
+
55
+ echo ""
56
+ echo "--- Chat Smoke ---"
57
+ chat_output="$(cd "$TMP_WORK" && kiro-cli chat --agent dev --no-interactive "Reply with READY only." 2>&1 || true)"
58
+ if echo "$chat_output" | grep -q "READY"; then
59
+ _pass "dev agent replied to chat smoke prompt"
60
+ else
61
+ _fail "dev agent did not reply READY"
62
+ fi
63
+
64
+ echo ""
65
+ echo "--- Explore Behavior ---"
66
+ explore_output="$(cd "$TMP_WORK" && TELEMETRY_ENABLED=true TELEMETRY_DATA_DIR="$TMP_TELEMETRY" TELEMETRY_SESSION_DIR="$TMP_TELEMETRY/sessions" TELEMETRY_CHANNELS=full TELEMETRY_CHANNEL_FULL_LOG_FILE="$TMP_TELEMETRY/full.jsonl" node - <<'NODE'
67
+ const { spawnSync } = require("node:child_process");
68
+ const result = spawnSync("kiro-cli", [
69
+ "chat",
70
+ "--agent",
71
+ "dev",
72
+ "--no-interactive",
73
+ "--trust-all-tools",
74
+ "Explore the codebase and explain what it does.",
75
+ ], { encoding: "utf8", timeout: 30000 });
76
+ process.stdout.write(result.stdout || "");
77
+ process.stdout.write(result.stderr || "");
78
+ NODE
79
+ )"
80
# Strip terminal escape sequences before matching on the transcript text.
explore_clean="$(printf '%s' "$explore_output" | strip_ansi)"

# The checks below feed grep through a here-string rather than
# `echo ... | grep -q`: under `set -o pipefail`, grep -q exits on its first
# match, and on a long transcript the writer can then die of SIGPIPE, making
# the pipeline (and the check) fail even though the text matched.
if grep -q "Activating skill: explore" <<<"$explore_clean"; then
  _pass "dev activates the explore skill on a plain explore prompt"
else
  _fail "dev did not activate the explore skill on a plain explore prompt"
fi

if grep -q "Tool validation failed" <<<"$explore_clean"; then
  _fail "explore workflow exceeded harness delegation limits"
else
  _pass "explore workflow stayed within harness delegation limits"
fi

# Fixed-string grep instead of rg: ripgrep is not among the documented
# prerequisites, and a missing rg would be misreported as missing telemetry.
if [[ -f "$TMP_TELEMETRY/full.jsonl" ]] && grep -Fq '"event_type":"agent.delegate"' "$TMP_TELEMETRY/full.jsonl"; then
  _pass "telemetry confirms delegated explore execution"
else
  _fail "telemetry did not confirm delegated explore execution"
fi
98
+
99
+ echo ""
100
+ echo "--- Strict Stop Gate ---"
101
+ mkdir -p "$TMP_WORK/.flow-agents/live-stop"
102
+ cat > "$TMP_WORK/.flow-agents/live-stop/live-stop--deliver.md" <<'MARKDOWN'
103
+ # Live Stop Gate
104
+
105
+ status: executing
106
+ type: deliver
107
+
108
+ ## Plan
109
+
110
+ This delivery artifact is intentionally incomplete so the strict stop hook must surface Goal Fit guidance.
111
+ MARKDOWN
112
+
113
+ stop_output="$(cd "$TMP_WORK" && FLOW_AGENTS_GOAL_FIT_STRICT=true kiro-cli chat --agent dev --no-interactive "Reply with READY only." 2>&1 || true)"
114
+ stop_clean="$(printf '%s' "$stop_output" | strip_ansi)"
115
+ if echo "$stop_clean" | grep -q 'stop "node .*stop:goal-fit stop-goal-fit.js standard,strict" failed with exit code: 2' \
116
+ && echo "$stop_clean" | grep -q '\[Hook\] Goal Fit warning:' \
117
+ && echo "$stop_clean" | grep -q 'live-stop--deliver.md is still status:executing'; then
118
+ _pass "strict Goal Fit stop hook surfaces live Kiro stop gate"
119
+ else
120
+ _fail "strict Goal Fit stop hook did not surface live Kiro stop gate"
121
+ fi
122
+
123
+ echo ""
124
+ echo "==========================="
125
+ total=$((pass + fail))
126
+ echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
127
+ [[ "$fail" -gt 0 ]] && exit 1
128
+ exit 0