@kontourai/flow-agents 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. package/.githooks/pre-push +11 -0
  2. package/.github/workflows/ci.yml +210 -0
  3. package/.github/workflows/docs-pages.yml +52 -0
  4. package/.github/workflows/publish-npm.yml +104 -0
  5. package/AGENTS.md +26 -0
  6. package/CHANGELOG.md +66 -0
  7. package/CODE_OF_CONDUCT.md +25 -0
  8. package/CONTEXT.md +300 -0
  9. package/CONTRIBUTING.md +44 -0
  10. package/LICENSE +201 -0
  11. package/README.md +129 -0
  12. package/SECURITY.md +33 -0
  13. package/agent-cards/dev.json +19 -0
  14. package/agents/dev.json +127 -0
  15. package/agents/tool-code-reviewer.json +61 -0
  16. package/agents/tool-dependencies-updater.json +118 -0
  17. package/agents/tool-explore-config.json +92 -0
  18. package/agents/tool-explore-deps.json +92 -0
  19. package/agents/tool-explore-entry.json +92 -0
  20. package/agents/tool-explore-patterns.json +92 -0
  21. package/agents/tool-explore-structure.json +92 -0
  22. package/agents/tool-explore-tests.json +92 -0
  23. package/agents/tool-planner.json +57 -0
  24. package/agents/tool-playwright.json +145 -0
  25. package/agents/tool-security-reviewer.json +56 -0
  26. package/agents/tool-verifier.json +61 -0
  27. package/agents/tool-worker.json +58 -0
  28. package/build/src/cli/console-learning-projection.js +123 -0
  29. package/build/src/cli/docs-preview.js +39 -0
  30. package/build/src/cli/effective-backlog-settings.js +102 -0
  31. package/build/src/cli/export-bookmarks.js +38 -0
  32. package/build/src/cli/fixture-retirement-audit.js +140 -0
  33. package/build/src/cli/flow-kit.js +138 -0
  34. package/build/src/cli/import-bookmarks.js +50 -0
  35. package/build/src/cli/init.js +239 -0
  36. package/build/src/cli/instinct-cli.js +93 -0
  37. package/build/src/cli/promote-workflow-artifact.js +63 -0
  38. package/build/src/cli/publish-change-helper.js +154 -0
  39. package/build/src/cli/pull-work-provider.js +469 -0
  40. package/build/src/cli/runtime-adapter.js +23 -0
  41. package/build/src/cli/telemetry-doctor.js +221 -0
  42. package/build/src/cli/usage-feedback.js +443 -0
  43. package/build/src/cli/validate-hook-influence.js +152 -0
  44. package/build/src/cli/validate-source-tree.js +31 -0
  45. package/build/src/cli/validate-workflow-artifacts.js +486 -0
  46. package/build/src/cli/veritas-governance.js +262 -0
  47. package/build/src/cli/workflow-artifact-cleanup-audit.js +272 -0
  48. package/build/src/cli/workflow-sidecar.js +816 -0
  49. package/build/src/cli.js +89 -0
  50. package/build/src/flow-kit/validate.js +75 -0
  51. package/build/src/lib/args.js +45 -0
  52. package/build/src/lib/fs.js +62 -0
  53. package/build/src/lib/workflow-learning-projection.js +334 -0
  54. package/build/src/runtime-adapters.js +146 -0
  55. package/build/src/tools/build-universal-bundles.js +397 -0
  56. package/build/src/tools/common.js +56 -0
  57. package/build/src/tools/filter-installed-packs.js +132 -0
  58. package/build/src/tools/generate-context-map.js +198 -0
  59. package/build/src/tools/validate-package.js +64 -0
  60. package/build/src/tools/validate-source-tree.js +622 -0
  61. package/console.telemetry.json +176 -0
  62. package/context/base-rules.md +17 -0
  63. package/context/code-review-standards.md +62 -0
  64. package/context/coding-standards.md +42 -0
  65. package/context/common/orchestrators.md +12 -0
  66. package/context/common/subagents.md +28 -0
  67. package/context/contracts/artifact-contract.md +182 -0
  68. package/context/contracts/builder-kit-workflow-state-contract.md +319 -0
  69. package/context/contracts/delivery-contract.md +69 -0
  70. package/context/contracts/execution-contract.md +53 -0
  71. package/context/contracts/governance-adapter-contract.md +67 -0
  72. package/context/contracts/planning-contract.md +85 -0
  73. package/context/contracts/review-contract.md +104 -0
  74. package/context/contracts/sandbox-policy.md +52 -0
  75. package/context/contracts/verification-contract.md +134 -0
  76. package/context/contracts/work-item-contract.md +215 -0
  77. package/context/deferred/demo-mode.md +33 -0
  78. package/context/deferred/languages/go.md +31 -0
  79. package/context/deferred/languages/python.md +31 -0
  80. package/context/deferred/languages/typescript.md +34 -0
  81. package/context/deferred/parallelization.md +35 -0
  82. package/context/deferred/worktree-isolation.md +24 -0
  83. package/context/development-workflow.md +50 -0
  84. package/context/scripts/context-budget/budget-scan.sh +166 -0
  85. package/context/scripts/detect-tools.sh +3 -0
  86. package/context/scripts/discover-agents.sh +28 -0
  87. package/context/scripts/git-status.sh +49 -0
  88. package/context/scripts/hooks/config-protection.js +79 -0
  89. package/context/scripts/hooks/desktop-notify.sh +39 -0
  90. package/context/scripts/hooks/governance-audit.sh +135 -0
  91. package/context/scripts/hooks/lib/audit-transport.sh +40 -0
  92. package/context/scripts/hooks/lib/hook-flags.js +49 -0
  93. package/context/scripts/hooks/lib/patterns.sh +57 -0
  94. package/context/scripts/hooks/lib/resolve-formatter.js +80 -0
  95. package/context/scripts/hooks/post-edit-accumulator.js +66 -0
  96. package/context/scripts/hooks/pre-commit-quality.js +194 -0
  97. package/context/scripts/hooks/quality-gate.js +93 -0
  98. package/context/scripts/hooks/report-only-guard.js +21 -0
  99. package/context/scripts/hooks/run-hook.js +136 -0
  100. package/context/scripts/hooks/stop-format-typecheck.js +141 -0
  101. package/context/scripts/hooks/stop-goal-fit.js +337 -0
  102. package/context/scripts/hooks/workflow-steering.js +250 -0
  103. package/context/scripts/telemetry/console-presets.sh +14 -0
  104. package/context/scripts/telemetry/install-console-config.sh +214 -0
  105. package/context/scripts/telemetry/lib/config.sh +85 -0
  106. package/context/scripts/telemetry/lib/enrich.sh +115 -0
  107. package/context/scripts/telemetry/lib/redact.sh +22 -0
  108. package/context/scripts/telemetry/lib/session.sh +63 -0
  109. package/context/scripts/telemetry/lib/transport.sh +183 -0
  110. package/context/scripts/telemetry/lib/usage.sh +29 -0
  111. package/context/scripts/telemetry/sync-agents.sh +173 -0
  112. package/context/scripts/telemetry/telemetry.conf +23 -0
  113. package/context/scripts/telemetry/telemetry.sh +387 -0
  114. package/context/scripts/validate-package.sh +89 -0
  115. package/context/settings/backlog-provider-settings.json +54 -0
  116. package/context/templates/core/identity.md +26 -0
  117. package/context/templates/core/user.md +15 -0
  118. package/docs/_config.yml +15 -0
  119. package/docs/_layouts/default.html +87 -0
  120. package/docs/adr/0001-flow-agents-consumes-flow.md +77 -0
  121. package/docs/adr/0002-flow-kits-as-extension-unit.md +13 -0
  122. package/docs/adr/0003-flow-agents-coordinates-kits-and-adapters.md +13 -0
  123. package/docs/adr/0004-gates-expect-surface-claims.md +15 -0
  124. package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +48 -0
  125. package/docs/adr/0006-typescript-first-source-policy.md +98 -0
  126. package/docs/agent-system-guidebook.md +391 -0
  127. package/docs/agent-usage-feedback-loop.md +351 -0
  128. package/docs/assets/favicon.svg +13 -0
  129. package/docs/assets/og-image.png +0 -0
  130. package/docs/assets/site.css +774 -0
  131. package/docs/assets/site.js +139 -0
  132. package/docs/configurable-workflow-routing.md +174 -0
  133. package/docs/context-map.md +145 -0
  134. package/docs/developer-architecture.md +145 -0
  135. package/docs/developer-hook-setup.md +61 -0
  136. package/docs/fixture-ownership.md +44 -0
  137. package/docs/flow-kit-repository-contract.md +180 -0
  138. package/docs/index.md +129 -0
  139. package/docs/kontour-resource-contract.md +358 -0
  140. package/docs/migrations.md +64 -0
  141. package/docs/north-star.md +322 -0
  142. package/docs/operating-layers.md +110 -0
  143. package/docs/repository-structure.md +132 -0
  144. package/docs/sandbox-policy.md +56 -0
  145. package/docs/skills-map.md +203 -0
  146. package/docs/standards-register.md +96 -0
  147. package/docs/veritas-integration.md +165 -0
  148. package/docs/work-item-adapters.md +72 -0
  149. package/docs/workflow-artifact-lifecycle.md +141 -0
  150. package/docs/workflow-eval-strategy.md +295 -0
  151. package/docs/workflow-shared-contracts.md +51 -0
  152. package/docs/workflow-usage-guide.md +443 -0
  153. package/evals/ARCHITECTURE.md +143 -0
  154. package/evals/CONVENTIONS.md +58 -0
  155. package/evals/README.md +128 -0
  156. package/evals/acceptance/run.sh +29 -0
  157. package/evals/acceptance/test_claude_harness.sh +242 -0
  158. package/evals/acceptance/test_codex_harness.sh +108 -0
  159. package/evals/acceptance/test_kiro_harness.sh +128 -0
  160. package/evals/cases/dev/404.html +97 -0
  161. package/evals/cases/dev/code-review.yaml +44 -0
  162. package/evals/cases/dev/dashboard.html +300 -0
  163. package/evals/cases/dev/deliver.yaml +66 -0
  164. package/evals/cases/dev/dependency-update.yaml +16 -0
  165. package/evals/cases/dev/explore.yaml +20 -0
  166. package/evals/cases/dev/index.html +370 -0
  167. package/evals/cases/dev/package-lock.json +28 -0
  168. package/evals/cases/dev/package.json +16 -0
  169. package/evals/cases/dev/plan-work.yaml +20 -0
  170. package/evals/cases/dev/promptfooconfig.yaml +666 -0
  171. package/evals/cases/dev/search-first.yaml +20 -0
  172. package/evals/cases/dev/tdd-workflow.yaml +48 -0
  173. package/evals/cases/dev/verify-work.yaml +44 -0
  174. package/evals/cases/dev/workflow.yaml +34 -0
  175. package/evals/ci/run-baseline.sh +283 -0
  176. package/evals/fixtures/backlog-provider-settings/global-default.json +44 -0
  177. package/evals/fixtures/backlog-provider-settings/project-override.json +53 -0
  178. package/evals/fixtures/builder-kit-workflow-state/baseline-freshness-resolution-hint.json +139 -0
  179. package/evals/fixtures/builder-kit-workflow-state/direct-primitive-stop.json +59 -0
  180. package/evals/fixtures/builder-kit-workflow-state/empty-board-route-shape.json +55 -0
  181. package/evals/fixtures/builder-kit-workflow-state/happy-path.json +71 -0
  182. package/evals/fixtures/builder-kit-workflow-state/mid-work-resume.json +80 -0
  183. package/evals/fixtures/builder-kit-workflow-state/missing-prestep-recovery.json +65 -0
  184. package/evals/fixtures/builder-kit-workflow-state/product-build-chaining.json +60 -0
  185. package/evals/fixtures/builder-kit-workflow-state/stale-continuation-requires-new-probe.json +57 -0
  186. package/evals/fixtures/console-learning-projection/artifacts/console-learning-correction/learning.json +50 -0
  187. package/evals/fixtures/console-learning-projection/artifacts/console-learning-open-route/learning.json +41 -0
  188. package/evals/fixtures/flow-kit-repository/invalid-absolute-path/kit.json +8 -0
  189. package/evals/fixtures/flow-kit-repository/invalid-asset-section/flows/review.flow.json +6 -0
  190. package/evals/fixtures/flow-kit-repository/invalid-asset-section/kit.json +11 -0
  191. package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/flows/review.flow.json +6 -0
  192. package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/kit.json +9 -0
  193. package/evals/fixtures/flow-kit-repository/invalid-id/flows/review.flow.json +6 -0
  194. package/evals/fixtures/flow-kit-repository/invalid-id/kit.json +8 -0
  195. package/evals/fixtures/flow-kit-repository/invalid-malformed-json/kit.json +8 -0
  196. package/evals/fixtures/flow-kit-repository/invalid-missing-flow/kit.json +8 -0
  197. package/evals/fixtures/flow-kit-repository/invalid-missing-id/flows/review.flow.json +6 -0
  198. package/evals/fixtures/flow-kit-repository/invalid-missing-id/kit.json +7 -0
  199. package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/flows/review.flow.json +6 -0
  200. package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/kit.json +7 -0
  201. package/evals/fixtures/flow-kit-repository/invalid-name/flows/review.flow.json +6 -0
  202. package/evals/fixtures/flow-kit-repository/invalid-name/kit.json +8 -0
  203. package/evals/fixtures/flow-kit-repository/invalid-schema-version/flows/review.flow.json +6 -0
  204. package/evals/fixtures/flow-kit-repository/invalid-schema-version/kit.json +8 -0
  205. package/evals/fixtures/flow-kit-repository/invalid-traversal/kit.json +8 -0
  206. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/adapters/example.json +3 -0
  207. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/assets/example.txt +1 -0
  208. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/docs/README.md +3 -0
  209. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +26 -0
  210. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-evals/example.json +3 -0
  211. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-skills/mixed/SKILL.md +3 -0
  212. package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit.json +44 -0
  213. package/evals/fixtures/flow-kit-repository/valid-local-kit/docs/README.md +3 -0
  214. package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +26 -0
  215. package/evals/fixtures/flow-kit-repository/valid-local-kit/kit.json +20 -0
  216. package/evals/fixtures/hook-influence/cases.json +336 -0
  217. package/evals/fixtures/pull-work-provider/github-issues.json +170 -0
  218. package/evals/fixtures/pull-work-wip-shepherding/global-wip-informs.json +43 -0
  219. package/evals/fixtures/pull-work-wip-shepherding/personal-wip-blocks.json +42 -0
  220. package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +31 -0
  221. package/evals/fixtures/surface-trust/artifact-absent.json +19 -0
  222. package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +32 -0
  223. package/evals/fixtures/surface-trust/missing-authority-trust-report.json +27 -0
  224. package/evals/fixtures/surface-trust/provider-absent.json +19 -0
  225. package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +30 -0
  226. package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +31 -0
  227. package/evals/fixtures/usage-feedback/sample-full.jsonl +11 -0
  228. package/evals/fixtures/usage-feedback/sample-outcomes.jsonl +1 -0
  229. package/evals/fixtures/veritas-governance-adapter/fake-veritas-pass.sh +18 -0
  230. package/evals/fixtures/veritas-governance-adapter/fake-veritas-secret-fail.sh +10 -0
  231. package/evals/fixtures/veritas-governance-adapter/fake-veritas-unconfigured.sh +4 -0
  232. package/evals/integration/test_bundle_install.sh +541 -0
  233. package/evals/integration/test_console_learning_projection.sh +192 -0
  234. package/evals/integration/test_context_map.sh +65 -0
  235. package/evals/integration/test_effective_backlog_settings.sh +58 -0
  236. package/evals/integration/test_fixture_retirement_audit.sh +58 -0
  237. package/evals/integration/test_flow_agents_statusline.sh +93 -0
  238. package/evals/integration/test_flow_kit_repository.sh +90 -0
  239. package/evals/integration/test_goal_fit_hook.sh +482 -0
  240. package/evals/integration/test_hook_category_behaviors.sh +190 -0
  241. package/evals/integration/test_hook_influence_cases.sh +69 -0
  242. package/evals/integration/test_local_flow_kit_install.sh +145 -0
  243. package/evals/integration/test_publish_change_helper.sh +176 -0
  244. package/evals/integration/test_pull_work_provider.sh +140 -0
  245. package/evals/integration/test_runtime_adapter_activation.sh +106 -0
  246. package/evals/integration/test_telemetry.sh +485 -0
  247. package/evals/integration/test_telemetry_doctor.sh +193 -0
  248. package/evals/integration/test_usage_feedback_dashboard.sh +169 -0
  249. package/evals/integration/test_usage_feedback_global.sh +117 -0
  250. package/evals/integration/test_usage_feedback_import.sh +227 -0
  251. package/evals/integration/test_usage_feedback_outcomes.sh +165 -0
  252. package/evals/integration/test_usage_feedback_report.sh +263 -0
  253. package/evals/integration/test_veritas_governance_adapter.sh +235 -0
  254. package/evals/integration/test_workflow_artifact_cleanup_audit.sh +287 -0
  255. package/evals/integration/test_workflow_artifacts.sh +1247 -0
  256. package/evals/integration/test_workflow_sidecar_writer.sh +2112 -0
  257. package/evals/integration/test_workflow_steering_hook.sh +337 -0
  258. package/evals/lib/assertions/delegated-to.js +40 -0
  259. package/evals/lib/assertions/max-tool-calls.js +15 -0
  260. package/evals/lib/assertions/no-write-tools.js +27 -0
  261. package/evals/lib/assertions/pass-at-k.js +39 -0
  262. package/evals/lib/assertions/telemetry-utils.js +105 -0
  263. package/evals/lib/assertions/tool-called.js +39 -0
  264. package/evals/lib/assertions/verify-after-fix.js +61 -0
  265. package/evals/lib/claude-judge.sh +40 -0
  266. package/evals/lib/claude-provider.sh +74 -0
  267. package/evals/lib/codex-judge.sh +39 -0
  268. package/evals/lib/codex-provider.sh +81 -0
  269. package/evals/lib/eval-dev.sh +5 -0
  270. package/evals/lib/eval-judge.sh +22 -0
  271. package/evals/lib/eval-provider.sh +26 -0
  272. package/evals/lib/eval-report.sh +73 -0
  273. package/evals/lib/kiro-dev.sh +4 -0
  274. package/evals/lib/kiro-judge.sh +17 -0
  275. package/evals/lib/kiro-provider.sh +62 -0
  276. package/evals/lib/node.sh +111 -0
  277. package/evals/promptfooconfig.yaml +70 -0
  278. package/evals/run.sh +309 -0
  279. package/evals/static/test_evidence_refs.sh +141 -0
  280. package/evals/static/test_package.sh +407 -0
  281. package/evals/static/test_repo_hooks.sh +68 -0
  282. package/evals/static/test_universal_bundles.sh +274 -0
  283. package/evals/static/test_workflow_skills.sh +1207 -0
  284. package/install.sh +64 -0
  285. package/integrations/veritas/flow-agents.adapter.json +138 -0
  286. package/integrations/veritas/flow-agents.authority-settings.json +26 -0
  287. package/integrations/veritas/flow-agents.repo-standards.json +82 -0
  288. package/kits/builder/flows/build.flow.json +218 -0
  289. package/kits/builder/flows/shape.flow.json +127 -0
  290. package/kits/builder/kit.json +19 -0
  291. package/kits/catalog.json +11 -0
  292. package/package.json +130 -0
  293. package/packaging/README.md +60 -0
  294. package/packaging/manifest.json +173 -0
  295. package/packaging/packs.json +69 -0
  296. package/powers/dependency-checker/POWER.md +20 -0
  297. package/powers/dependency-checker/mcp.json +20 -0
  298. package/powers/playwright/POWER.md +25 -0
  299. package/powers/playwright/mcp.json +12 -0
  300. package/prompts/code-audit.md +123 -0
  301. package/prompts/kcommit.md +88 -0
  302. package/schemas/backlog-provider-settings.schema.json +138 -0
  303. package/schemas/workflow-acceptance.schema.json +216 -0
  304. package/schemas/workflow-critique.schema.json +113 -0
  305. package/schemas/workflow-evidence.schema.json +357 -0
  306. package/schemas/workflow-handoff.schema.json +52 -0
  307. package/schemas/workflow-learning.schema.json +223 -0
  308. package/schemas/workflow-release.schema.json +172 -0
  309. package/schemas/workflow-state.schema.json +80 -0
  310. package/scripts/README.md +111 -0
  311. package/scripts/build-universal-bundles.js +3 -0
  312. package/scripts/check-content-boundary.cjs +99 -0
  313. package/scripts/context-budget/budget-scan.sh +166 -0
  314. package/scripts/detect-tools.sh +3 -0
  315. package/scripts/discover-agents.sh +28 -0
  316. package/scripts/effective-backlog-settings.js +2 -0
  317. package/scripts/filter-installed-packs.js +2 -0
  318. package/scripts/flow-kit.js +2 -0
  319. package/scripts/generate-context-map.js +2 -0
  320. package/scripts/git-status.sh +49 -0
  321. package/scripts/hooks/claude-hook-adapter.js +174 -0
  322. package/scripts/hooks/claude-telemetry-hook.js +115 -0
  323. package/scripts/hooks/codex-hook-adapter.js +176 -0
  324. package/scripts/hooks/codex-telemetry-hook.js +95 -0
  325. package/scripts/hooks/config-protection.js +79 -0
  326. package/scripts/hooks/desktop-notify.sh +39 -0
  327. package/scripts/hooks/governance-audit.sh +135 -0
  328. package/scripts/hooks/lib/audit-transport.sh +40 -0
  329. package/scripts/hooks/lib/hook-flags.js +49 -0
  330. package/scripts/hooks/lib/patterns.sh +57 -0
  331. package/scripts/hooks/lib/resolve-formatter.js +80 -0
  332. package/scripts/hooks/post-edit-accumulator.js +66 -0
  333. package/scripts/hooks/pre-commit-quality.js +194 -0
  334. package/scripts/hooks/quality-gate.js +93 -0
  335. package/scripts/hooks/report-only-guard.js +21 -0
  336. package/scripts/hooks/run-hook.js +136 -0
  337. package/scripts/hooks/stop-format-typecheck.js +141 -0
  338. package/scripts/hooks/stop-goal-fit.js +337 -0
  339. package/scripts/hooks/workflow-steering.js +250 -0
  340. package/scripts/install-codex-home.sh +106 -0
  341. package/scripts/package.json +3 -0
  342. package/scripts/promote-workflow-artifact.js +2 -0
  343. package/scripts/publish-change-helper.js +2 -0
  344. package/scripts/pull-work-provider.js +2 -0
  345. package/scripts/setup-repo-hooks.sh +8 -0
  346. package/scripts/statusline/flow-agents-statusline.js +157 -0
  347. package/scripts/telemetry/console-presets.sh +14 -0
  348. package/scripts/telemetry/install-console-config.sh +214 -0
  349. package/scripts/telemetry/lib/config.sh +85 -0
  350. package/scripts/telemetry/lib/enrich.sh +115 -0
  351. package/scripts/telemetry/lib/redact.sh +22 -0
  352. package/scripts/telemetry/lib/session.sh +63 -0
  353. package/scripts/telemetry/lib/transport.sh +183 -0
  354. package/scripts/telemetry/lib/usage.sh +29 -0
  355. package/scripts/telemetry/sync-agents.sh +173 -0
  356. package/scripts/telemetry/telemetry.conf +23 -0
  357. package/scripts/telemetry/telemetry.sh +387 -0
  358. package/scripts/usage-feedback.js +2 -0
  359. package/scripts/validate-hook-influence-cases.js +2 -0
  360. package/scripts/validate-package.sh +89 -0
  361. package/scripts/validate-source-tree.js +9 -0
  362. package/skills/agentic-engineering/SKILL.md +62 -0
  363. package/skills/browser-test/SKILL.md +51 -0
  364. package/skills/builder-shape/SKILL.md +76 -0
  365. package/skills/context-budget/SKILL.md +40 -0
  366. package/skills/deliver/SKILL.md +241 -0
  367. package/skills/dependency-update/SKILL.md +68 -0
  368. package/skills/design-probe/SKILL.md +107 -0
  369. package/skills/eval-rebuild/SKILL.md +39 -0
  370. package/skills/evidence-gate/SKILL.md +186 -0
  371. package/skills/execute-plan/SKILL.md +110 -0
  372. package/skills/explore/SKILL.md +137 -0
  373. package/skills/feedback-loop/SKILL.md +87 -0
  374. package/skills/fix-bug/SKILL.md +133 -0
  375. package/skills/frontend-design/SKILL.md +80 -0
  376. package/skills/github-cli/SKILL.md +63 -0
  377. package/skills/idea-to-backlog/SKILL.md +267 -0
  378. package/skills/knowledge-capture/SKILL.md +55 -0
  379. package/skills/learning-review/SKILL.md +115 -0
  380. package/skills/pickup-probe/SKILL.md +114 -0
  381. package/skills/plan-work/SKILL.md +176 -0
  382. package/skills/pull-work/SKILL.md +309 -0
  383. package/skills/release-readiness/SKILL.md +121 -0
  384. package/skills/review-work/SKILL.md +161 -0
  385. package/skills/search-first/SKILL.md +66 -0
  386. package/skills/tdd-workflow/SKILL.md +140 -0
  387. package/skills/verify-work/SKILL.md +109 -0
  388. package/src/cli/console-learning-projection.ts +140 -0
  389. package/src/cli/effective-backlog-settings.ts +99 -0
  390. package/src/cli/fixture-retirement-audit.ts +154 -0
  391. package/src/cli/flow-kit.ts +139 -0
  392. package/src/cli/init.ts +248 -0
  393. package/src/cli/promote-workflow-artifact.ts +64 -0
  394. package/src/cli/publish-change-helper.ts +143 -0
  395. package/src/cli/pull-work-provider.ts +481 -0
  396. package/src/cli/runtime-adapter.ts +24 -0
  397. package/src/cli/telemetry-doctor.ts +243 -0
  398. package/src/cli/usage-feedback.ts +418 -0
  399. package/src/cli/validate-hook-influence.ts +119 -0
  400. package/src/cli/validate-source-tree.ts +30 -0
  401. package/src/cli/validate-workflow-artifacts.ts +411 -0
  402. package/src/cli/veritas-governance.ts +322 -0
  403. package/src/cli/workflow-artifact-cleanup-audit.ts +281 -0
  404. package/src/cli/workflow-sidecar.ts +676 -0
  405. package/src/cli.ts +95 -0
  406. package/src/flow-kit/validate.ts +74 -0
  407. package/src/lib/args.ts +43 -0
  408. package/src/lib/fs.ts +62 -0
  409. package/src/lib/workflow-learning-projection.ts +491 -0
  410. package/src/runtime-adapters.ts +154 -0
  411. package/src/tools/build-universal-bundles.ts +366 -0
  412. package/src/tools/common.ts +61 -0
  413. package/src/tools/filter-installed-packs.ts +129 -0
  414. package/src/tools/generate-context-map.ts +199 -0
  415. package/src/tools/validate-package.ts +57 -0
  416. package/src/tools/validate-source-tree.ts +488 -0
  417. package/tsconfig.json +19 -0
  418. package/veritas.claims.json +6 -0
@@ -0,0 +1,61 @@
1
+ // verify-after-fix.js — Assert that reporter agents never invoke write tools and that the last write-tool call is followed by a tool-verifier delegation
2
+ const { getNewEvents, filterByType, getToolInvocations, getSubagentCalls } = require('./telemetry-utils');
3
+
4
+ const WRITE_TOOLS = new Set(['write files', 'write', 'apply_patch', 'edit']); // compared against lower-cased tool names
5
+ const REVIEW_AGENTS = new Set(['tool-code-reviewer', 'tool-security-reviewer']);
6
+ const VERIFY_AGENTS = new Set(['tool-verifier', 'tool-playwright']);
7
+ const REPORTER_AGENTS = new Set([...REVIEW_AGENTS, ...VERIFY_AGENTS]); // union of both sets; Check 1 applies to all of them
8
+
9
+ module.exports = (output, { config }) => { // returns { pass, score, reason }; output and config are not read below
10
+ const events = getNewEvents();
11
+ const toolEvents = getToolInvocations(events);
12
+ const subagentCalls = getSubagentCalls(events); // NOTE(review): result is not referenced later in this function
13
+ const violations = []; // human-readable messages; empty means the assertion passes
14
+
15
+ // Check 1: Reviewers/verifiers must never invoke write tools
16
+ const reporterWrites = toolEvents.filter(e => {
17
+ const agent = e.agent && e.agent.name;
18
+ const tool = e.tool && e.tool.name && String(e.tool.name).toLowerCase(); // lower-cased name, or a falsy value when missing
19
+ return agent && REPORTER_AGENTS.has(agent) && WRITE_TOOLS.has(tool);
20
+ });
21
+
22
+ if (reporterWrites.length > 0) {
23
+ violations.push(
24
+ `Reporter agents wrote code: ${reporterWrites.map(e => `${e.agent.name} → ${e.tool.name}`).join('; ')}`
25
+ );
26
+ }
27
+
28
+ // Check 2: After any write tool call, there must be a subsequent tool-verifier delegation
29
+ const allEvents = events; // alias of events; indices below refer to positions in this array
30
+ let lastWriteIdx = -1; // -1: no write-tool call seen
31
+ let lastVerifyIdx = -1; // -1: no tool-verifier delegation seen
32
+
33
+ for (let i = 0; i < allEvents.length; i++) {
34
+ const e = allEvents[i];
35
+ const toolName = e.tool && e.tool.name && String(e.tool.name).toLowerCase();
36
+ if (e.event_type === 'tool.invoke' && e.tool && WRITE_TOOLS.has(toolName)) {
37
+ lastWriteIdx = i;
38
+ }
39
+ if (e.event_type === 'tool.invoke' && e.tool && toolName === 'delegate to a specialist agent' &&
40
+ e.tool.input && e.tool.input.command === 'InvokeSubagents') {
41
+ const subs = e.tool.input.content && e.tool.input.content.subagents;
42
+ if (subs && subs.some(s => s.agent_name === 'tool-verifier')) {
43
+ lastVerifyIdx = i;
44
+ }
45
+ }
46
+ if (e.event_type === 'tool.invoke' && e.tool && toolName === 'spawn_agent' &&
47
+ e.tool.input && e.tool.input.agent_type === 'tool-verifier') {
48
+ lastVerifyIdx = i;
49
+ }
50
+ }
51
+
52
+ if (lastWriteIdx > lastVerifyIdx) { // also true when a write occurred but no verifier delegation was ever seen
53
+ violations.push('Code was written after the last verification pass — missing re-verify');
54
+ }
55
+
56
+ if (violations.length === 0) {
57
+ return { pass: true, score: 1, reason: 'No code changes without subsequent verification' };
58
+ }
59
+
60
+ return { pass: false, score: 0, reason: `Re-verify violations: ${violations.join('; ')}` };
61
+ };
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env bash
2
+ # claude-judge.sh — Promptfoo exec provider for llm-rubric judging via Claude Code. Usage: claude-judge.sh PROMPT. Prints claude's text output on stdout; exits 2 when the claude CLI is missing and 1 when the claude run fails.
3
+ set -euo pipefail
4
+
5
+ PROMPT="${1:-}"  # $1: judge prompt; empty when omitted
6
+ TIMEOUT="${CLAUDE_EVAL_JUDGE_TIMEOUT:-180}"  # handed to timeout/gtimeout; unused when neither is installed
7
+ MAX_LEN=200000  # prompts longer than this many characters are truncated
8
+ if [[ ${#PROMPT} -gt $MAX_LEN ]]; then
9
+ PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval - output exceeded ${MAX_LEN} chars]"
10
+ fi
11
+
12
+ OUT="$(mktemp /tmp/flow-agents-claude-judge.XXXXXX)"  # receives claude's stdout
13
+ LOG="$(mktemp /tmp/flow-agents-claude-judge-log.XXXXXX)"  # receives claude's stderr
14
+ trap 'rm -f "$OUT" "$LOG"' EXIT  # remove both temp files on every exit path
15
+
16
+ if ! command -v claude >/dev/null 2>&1; then
17
+ echo "claude CLI is not installed or not on PATH" >&2
18
+ exit 2
19
+ fi
20
+
21
+ if command -v timeout >/dev/null 2>&1; then
22
+ TIMEOUT_CMD=(timeout "$TIMEOUT")
23
+ elif command -v gtimeout >/dev/null 2>&1; then
24
+ TIMEOUT_CMD=(gtimeout "$TIMEOUT")
25
+ else
26
+ TIMEOUT_CMD=()  # no timeout wrapper available; claude runs without a time limit
27
+ fi
28
+ # Guarded expansion: under `set -u`, bash older than 4.4 treats "${arr[@]}" on an empty array as an unbound variable, so expand the wrapper only when it has elements.
29
+ ${TIMEOUT_CMD[@]+"${TIMEOUT_CMD[@]}"} claude \
30
+ -p \
31
+ --permission-mode bypassPermissions \
32
+ --add-dir /tmp \
33
+ --output-format text \
34
+ "$PROMPT" >"$OUT" 2>"$LOG" || {
35
+ cat "$OUT" 2>/dev/null
36
+ sed -n '1,120p' "$LOG" >&2
37
+ exit 1
38
+ }
39
+
40
+ cat "$OUT"
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env bash
2
+ # claude-provider.sh — Promptfoo exec provider that runs Flow Agents through Claude Code. Arguments: $1 prompt, $2 optional JSON options whose config.agent selects the agent.
3
+ set -euo pipefail
4
+
5
+ PROMPT="${1:-}"  # $1: prompt text; empty when omitted
6
+ OPTIONS="${2:-}"  # $2: optional JSON string; only config.agent is read from it
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory of this script
8
+ ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"  # two directories above this script
9
+ TIMEOUT="${CLAUDE_EVAL_TIMEOUT:-300}"  # handed to timeout/gtimeout by run_claude when one is installed
10
+ FLUSH_SLEEP="${FLOW_AGENTS_EVAL_TELEMETRY_FLUSH_SLEEP:-0.5}"  # sleep duration applied after the claude run
11
+ SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"  # receives the telemetry line count taken before the run
12
+ TELEMETRY_FILE_MARKER="${FLOW_AGENTS_EVAL_TELEMETRY_FILE_MARKER:-/tmp/promptfoo-eval-telemetry-file.txt}"  # receives the telemetry file path
13
+
14
+ AGENT=""
15
+ if [[ -n "$OPTIONS" ]]; then  # best effort: a JSON parse failure or node error leaves AGENT empty
16
+ AGENT=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null || true)
17
+ fi
18
+ AGENT="${AGENT:-${FLOW_AGENTS_EVAL_AGENT:-dev}}"  # precedence: config.agent, then FLOW_AGENTS_EVAL_AGENT, then "dev"
19
+
20
+ if ! command -v claude >/dev/null 2>&1; then
21
+ echo "claude CLI is not installed or not on PATH" >&2
22
+ exit 2  # distinct exit code for a missing CLI
23
+ fi
24
+
25
+ run_claude() {  # runs the CLAUDE_CMD array, under a time limit when a timeout utility exists; returns that command's status
26
+ if command -v timeout >/dev/null 2>&1; then
27
+ timeout "$TIMEOUT" "${CLAUDE_CMD[@]}"
28
+ elif command -v gtimeout >/dev/null 2>&1; then  # fallback name for the same utility
29
+ gtimeout "$TIMEOUT" "${CLAUDE_CMD[@]}"
30
+ else
31
+ "${CLAUDE_CMD[@]}"  # neither utility found: no time limit is applied
32
+ fi
33
+ }
34
+
35
+ prepare_workdir() {  # recreates an empty per-agent work directory, installs the claude-code bundle into it, prints its path on stdout
36
+ local work_root="${CLAUDE_EVAL_WORK_ROOT:-/tmp/flow-agents-claude-eval}"
37
+ local work_dir="$work_root/$AGENT"  # one directory per agent name
38
+ rm -rf "$work_dir"  # discard anything left from a previous run
39
+ mkdir -p "$work_dir"
40
+ (cd "$ROOT_DIR" && flow_agents_node scripts/build-universal-bundles.js >/dev/null)  # NOTE(review): flow_agents_node is not defined in this script — presumably supplied by the calling environment; confirm
41
+ bash "$ROOT_DIR/dist/claude-code/install.sh" "$work_dir" >/dev/null  # NOTE(review): this function is called inside $(...), where bash by default does not apply errexit, so a failed build or install may go unnoticed — confirm
42
+ mkdir -p "$work_dir/.telemetry"
43
+ echo "$work_dir"  # the only stdout output; build and install stdout are discarded above
44
+ }
45
+
46
+ WORK_DIR="$(prepare_workdir)"  # path printed by prepare_workdir
47
+ TELEMETRY_FILE="$WORK_DIR/.telemetry/full.jsonl"
48
+ echo "$TELEMETRY_FILE" > "$TELEMETRY_FILE_MARKER"  # publish the telemetry path through the marker file
49
+ if [[ -f "$TELEMETRY_FILE" ]]; then
50
+ wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"  # line count before the run, spaces stripped
51
+ else
52
+ echo "0" > "$SNAPSHOT_FILE"  # no telemetry file yet
53
+ fi
54
+
55
+ CLAUDE_CMD=(
56
+ env
57
+ FLOW_AGENTS_CLAUDE_TELEMETRY_CHANNELS="${FLOW_AGENTS_CLAUDE_TELEMETRY_CHANNELS:-full,analytics}"
58
+ claude
59
+ -p
60
+ --agent "$AGENT"
61
+ --permission-mode bypassPermissions
62
+ --add-dir "$WORK_DIR"
63
+ --output-format text
64
+ "$PROMPT"
65
+ )
66
+
67
+ set +e  # capture claude's exit status instead of aborting on failure
68
+ RAW=$(cd "$WORK_DIR" && run_claude 2>&1)
69
+ STATUS=$?
70
+ set -e
71
+ sleep "$FLUSH_SLEEP"
72
+ printf '%s\n' "$RAW" | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g' \
73
+ | grep -v '^\s*$' || true  # grep exits 1 when every line is blank; with errexit and pipefail that would end the script before claude's status is reported
74
+ exit "$STATUS"
@@ -0,0 +1,39 @@
1
#!/usr/bin/env bash
# codex-judge.sh — Promptfoo exec provider for llm-rubric judging via Codex.
# Arguments: $1 - rubric prompt (truncated to MAX_LEN characters)
# Env:       CODEX_EVAL_JUDGE_TIMEOUT - seconds before the run is killed (default 180)
# Outputs:   the last Codex message on stdout; on failure, the first 120 log
#            lines on stderr and exit status 1.
set -euo pipefail

PROMPT="${1:-}"
TIMEOUT="${CODEX_EVAL_JUDGE_TIMEOUT:-180}"
MAX_LEN=200000
if [[ ${#PROMPT} -gt $MAX_LEN ]]; then
  PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval - output exceeded ${MAX_LEN} chars]"
fi

OUT="$(mktemp /tmp/flow-agents-codex-judge.XXXXXX)"
LOG="$(mktemp /tmp/flow-agents-codex-judge-log.XXXXXX)"
trap 'rm -f "$OUT" "$LOG"' EXIT

# Use whichever timeout binary exists; run unbounded when neither does.
if command -v timeout >/dev/null 2>&1; then
  TIMEOUT_CMD=(timeout "$TIMEOUT")
elif command -v gtimeout >/dev/null 2>&1; then
  TIMEOUT_CMD=(gtimeout "$TIMEOUT")
else
  TIMEOUT_CMD=()
fi

# `${arr[@]+...}` keeps an empty array from tripping `set -u` on bash < 4.4,
# where a plain "${TIMEOUT_CMD[@]}" aborts with "unbound variable".
${TIMEOUT_CMD[@]+"${TIMEOUT_CMD[@]}"} codex exec \
  --ignore-user-config \
  --skip-git-repo-check \
  -C /tmp \
  --sandbox read-only \
  --json \
  -c model='"gpt-5.5"' \
  -c model_reasoning_effort='"medium"' \
  --output-last-message "$OUT" \
  "$PROMPT" >"$LOG" 2>&1 || {
    # Emit any partial answer, surface the head of the log, then fail.
    cat "$OUT" 2>/dev/null
    sed -n '1,120p' "$LOG" >&2
    exit 1
  }

cat "$OUT"
@@ -0,0 +1,81 @@
1
#!/usr/bin/env bash
# codex-provider.sh — Promptfoo exec provider that runs Flow Agents through Codex.
set -euo pipefail

# Positional inputs: the prompt text and an options JSON string.
PROMPT="${1:-}"
OPTIONS="${2:-}"

# Locations: this script's directory and the directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Tunables, each overridable through the environment.
TIMEOUT="${CODEX_EVAL_TIMEOUT:-300}"
FLUSH_SLEEP="${FLOW_AGENTS_EVAL_TELEMETRY_FLUSH_SLEEP:-0.5}"
SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"
TELEMETRY_FILE_MARKER="${FLOW_AGENTS_EVAL_TELEMETRY_FILE_MARKER:-/tmp/promptfoo-eval-telemetry-file.txt}"

# Agent precedence: options JSON `config.agent`, then FLOW_AGENTS_EVAL_AGENT,
# then "dev". A parse failure leaves AGENT empty.
AGENT=""
[[ -z "$OPTIONS" ]] || AGENT=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null || true)
: "${AGENT:=${FLOW_AGENTS_EVAL_AGENT:-dev}}"
19
+
20
# Prints the Codex profile name for agent $1: "kdev" for "dev", and an
# empty line for every other agent.
profile_for_agent() {
  local profile=""
  if [[ "$1" == "dev" ]]; then
    profile="kdev"
  fi
  echo "$profile"
}
26
+
27
+
28
# Filters a JSON-lines event stream on stdin down to agent message text.
# - `agent_message` events with a string `text` print that text.
# - `item.completed` events whose item is an `agent_message` print the item text.
# - Other parseable events print nothing; empty lines are skipped.
# - Lines that fail to parse (or throw while inspected) are echoed unchanged.
strip_json_events() {
  node -e '
    const reader = require("readline").createInterface({ input: process.stdin });
    reader.on("line", (line) => {
      if (!line) return;
      try {
        const event = JSON.parse(line);
        if (event.type === "agent_message" && typeof event.text === "string") {
          console.log(event.text);
        } else if (
          event.type === "item.completed" &&
          event.item?.type === "agent_message" &&
          typeof event.item.text === "string"
        ) {
          console.log(event.item.text);
        }
      } catch {
        console.log(line);
      }
    });
  '
}
31
+
32
# Runs the CODEX_CMD array with $PROMPT appended as the final argument,
# bounded by $TIMEOUT seconds when `timeout` or `gtimeout` is available.
# Returns the exit status of whatever was executed.
run_codex() {
  local limiter
  for limiter in timeout gtimeout; do
    if command -v "$limiter" >/dev/null 2>&1; then
      "$limiter" "$TIMEOUT" "${CODEX_CMD[@]}" "$PROMPT"
      return
    fi
  done
  "${CODEX_CMD[@]}" "$PROMPT"
}
41
+
42
# Creates a fresh per-agent work directory seeded with the Codex bundle and
# prints the directory path on stdout.
# Globals:  AGENT, ROOT_DIR (read); CODEX_EVAL_WORK_ROOT and CODEX_REAL_HOME
#           (optional overrides)
# Outputs:  the work directory path
# Returns:  non-zero as soon as any preparation step fails
#
# This function is invoked through command substitution, where bash clears
# `set -e`, so every step is checked explicitly; otherwise a failed build or
# copy would still yield a path and the eval would run in a broken sandbox.
prepare_workdir() {
  local work_root="${CODEX_EVAL_WORK_ROOT:-/tmp/flow-agents-codex-eval}"
  # An empty agent name would make the rm below wipe the whole work root.
  local work_dir="$work_root/${AGENT:?agent name is required}"
  local real_home="${CODEX_REAL_HOME:-$HOME/.codex}"
  local auth_file

  rm -rf -- "$work_dir" || return
  mkdir -p -- "$work_dir" || return
  (cd "$ROOT_DIR" && flow_agents_node scripts/build-universal-bundles.js >/dev/null) || return
  cp -R "$ROOT_DIR/dist/codex/." "$work_dir/" || return
  cp "$work_dir/.codex/config.toml" "$work_dir/.codex/config-eval.toml" || return

  # Copy whichever of these files exist from the real Codex home into the
  # sandboxed CODEX_HOME.
  for auth_file in auth.json version.json installation_id; do
    if [[ -f "$real_home/$auth_file" ]]; then
      cp "$real_home/$auth_file" "$work_dir/.codex/$auth_file" || return
    fi
  done

  mkdir -p -- "$work_dir/.telemetry" || return
  printf '%s\n' "$work_dir"
}
58
+
59
# Build the sandbox, then publish the telemetry file path and its current
# line count to the marker/snapshot files.
WORK_DIR="$(prepare_workdir)"
TELEMETRY_FILE="$WORK_DIR/.telemetry/full.jsonl"
echo "$TELEMETRY_FILE" > "$TELEMETRY_FILE_MARKER"
if [[ -f "$TELEMETRY_FILE" ]]; then
  wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"
else
  echo "0" > "$SNAPSHOT_FILE"
fi

PROFILE="$(profile_for_agent "$AGENT")"
if [[ -n "$PROFILE" ]]; then
  CODEX_CMD=(env CODEX_HOME="$WORK_DIR/.codex" codex -p "$PROFILE" exec --skip-git-repo-check -C "$WORK_DIR" --sandbox read-only --json)
else
  # No named profile: lift the raw `developer_instructions = ...` value from
  # the agent's TOML file. The path is handed to node as an argument instead
  # of being spliced into the script text, so quotes or backslashes in
  # WORK_DIR/AGENT can neither break nor inject JavaScript.
  DEVELOPER_INSTRUCTIONS="$(node -e '
    const fs = require("fs");
    const file = process.argv[1];
    if (!fs.existsSync(file)) {
      process.stdout.write("\"\"");
      process.exit(0);
    }
    const found = fs.readFileSync(file, "utf8").match(/^developer_instructions\s*=\s*(.+)$/m);
    process.stdout.write(found ? found[1] : "\"\"");
  ' "$WORK_DIR/.codex/agents/$AGENT.toml" || true)"
  # Fall back to an empty TOML string if node produced nothing at all.
  CODEX_CMD=(env CODEX_HOME="$WORK_DIR/.codex" codex -c "developer_instructions=${DEVELOPER_INSTRUCTIONS:-\"\"}" exec --skip-git-repo-check -C "$WORK_DIR" --sandbox read-only --json)
fi

# Capture output and status without letting a failing run abort the script.
set +e
RAW=$(run_codex 2>&1)
STATUS=$?
set -e
sleep "$FLUSH_SLEEP"

# Reduce the JSON event stream to message text and strip ANSI escapes/bells.
printf '%s\n' "$RAW" | strip_json_events | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g'
exit "$STATUS"
@@ -0,0 +1,5 @@
1
#!/usr/bin/env bash
# Pins both agent selectors to "dev", then hands every argument to the
# eval-provider.sh that sits next to this script.
export FLOW_AGENTS_EVAL_AGENT=dev KIRO_EVAL_AGENT=dev
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
exec bash "$HERE/eval-provider.sh" "$@"
@@ -0,0 +1,22 @@
1
#!/usr/bin/env bash
# eval-judge.sh — Runtime-neutral promptfoo rubric judge provider.
# Selects a runtime-specific judge script and replaces this process with it.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Runtime precedence: judge-specific override, then the shared runtime
# selectors, then "kiro".
RUNTIME="${FLOW_AGENTS_EVAL_JUDGE_RUNTIME:-${FLOW_AGENTS_EVAL_RUNTIME:-${EVAL_RUNTIME:-kiro}}}"

case "$RUNTIME" in
  kiro|kiro-cli)      JUDGE_SCRIPT="kiro-judge.sh" ;;
  codex)              JUDGE_SCRIPT="codex-judge.sh" ;;
  claude|claude-code) JUDGE_SCRIPT="claude-judge.sh" ;;
  *)
    echo "Unsupported FLOW_AGENTS_EVAL_JUDGE_RUNTIME='$RUNTIME' (expected kiro, codex, or claude)" >&2
    exit 2
    ;;
esac

exec bash "$SCRIPT_DIR/$JUDGE_SCRIPT" "$@"
@@ -0,0 +1,26 @@
1
#!/usr/bin/env bash
# eval-provider.sh — Runtime-neutral promptfoo subject provider.
# Normalises the agent selectors, then replaces this process with the
# provider script for the selected runtime.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RUNTIME="${FLOW_AGENTS_EVAL_RUNTIME:-${EVAL_RUNTIME:-kiro}}"

# Export one agent value under both variable names so either lookup agrees.
AGENT="${FLOW_AGENTS_EVAL_AGENT:-${KIRO_EVAL_AGENT:-dev}}"
export FLOW_AGENTS_EVAL_AGENT="$AGENT" KIRO_EVAL_AGENT="$AGENT"

case "$RUNTIME" in
  kiro|kiro-cli)      PROVIDER_SCRIPT="kiro-provider.sh" ;;
  codex)              PROVIDER_SCRIPT="codex-provider.sh" ;;
  claude|claude-code) PROVIDER_SCRIPT="claude-provider.sh" ;;
  *)
    echo "Unsupported FLOW_AGENTS_EVAL_RUNTIME='$RUNTIME' (expected kiro, codex, or claude)" >&2
    exit 2
    ;;
esac

exec bash "$SCRIPT_DIR/$PROVIDER_SCRIPT" "$@"
@@ -0,0 +1,73 @@
1
#!/usr/bin/env bash
# eval-report.sh — Generate markdown eval report from promptfoo JSON output
# Usage: bash lib/eval-report.sh <results-json> [previous-json]
# Output: markdown report to stdout
# Requires: jq
set -uo pipefail

RESULTS="${1:?Usage: bash lib/eval-report.sh <results.json> [previous.json]}"
PREVIOUS="${2:-}"

if [[ ! -f "$RESULTS" ]]; then
  echo "Error: Results file not found: $RESULTS" >&2
  exit 1
fi

if ! command -v jq >/dev/null 2>&1; then
  echo "Error: jq is required to build the report" >&2
  exit 1
fi

# Agent name = results file name with everything from the first "-<digit>" removed.
AGENT=$(basename "$RESULTS" | sed 's/-[0-9].*$//')
DATE=$(date +%Y-%m-%d)

# Extract stats via jq; refuse to continue on anything that is not a count,
# since the arithmetic below would otherwise fail on empty values.
TOTAL=$(jq '.results.results | length' "$RESULTS") || TOTAL=""
PASSED=$(jq '[.results.results[] | select(.success == true)] | length' "$RESULTS") || PASSED=""
if ! [[ "$TOTAL" =~ ^[0-9]+$ && "$PASSED" =~ ^[0-9]+$ ]]; then
  echo "Error: could not read .results.results from $RESULTS" >&2
  exit 1
fi
FAILED=$((TOTAL - PASSED))

# Integer percentage, truncated; "N/A" when there are no cases (avoids a
# divide-by-zero).
if (( TOTAL > 0 )); then
  PASS_RATE=$((PASSED * 100 / TOTAL))
else
  PASS_RATE="N/A"
fi

# Check for repeat data
REPEAT=$(jq -r '.results.stats.repeatCount // 0' "$RESULTS" 2>/dev/null || echo "0")
[[ "$REPEAT" =~ ^[0-9]+$ ]] || REPEAT=0

cat <<EOF
# Eval Report: ${AGENT} — ${DATE}

## Summary
- Cases: ${TOTAL} total
- Passed: ${PASSED}/${TOTAL} (${PASS_RATE}%)
- Failed: ${FAILED}
EOF

if [[ "$REPEAT" -gt 1 ]]; then
  echo "- Repeat count: ${REPEAT} (pass@k computed per case)"
fi

echo ""
echo "## Results"
echo "| # | Prompt (truncated) | Pass | Assertions |"
echo "|---|-------------------|------|------------|"

# Rows are rendered entirely in jq so a "|" or newline inside a prompt cannot
# shift or split the table columns.
jq -r '
  .results.results
  | to_entries[]
  | (.value.vars.prompt // "N/A" | .[0:50] | gsub("[\\r\\n]+"; " ") | gsub("\\|"; "\\|")) as $prompt
  | (if .value.success == true then "✓" else "✗" end) as $icon
  | "| \(.key + 1) | \($prompt) | \($icon) | \(.value.gradingResult.componentResults // [] | length) checked |"
' "$RESULTS"

# Failures section
if [[ "$FAILED" -gt 0 ]]; then
  echo ""
  echo "## Failures"
  # Each failing component contributes its assertion type exactly once. The
  # filter must not be re-applied to the extracted type string, which would
  # make jq abort and leave this section empty.
  jq -r '
    .results.results
    | to_entries[]
    | select(.value.success == false)
    | "### Case \(.key + 1): \(.value.vars.prompt // "N/A" | .[0:60] | gsub("[\\r\\n]+"; " "))\n- Failing assertions: \([.value.gradingResult.componentResults[]? | select(.pass == false) | .assertion.type // "unknown"] | join(", "))\n"
  ' "$RESULTS"
fi

# Trend comparison
if [[ -n "$PREVIOUS" && -f "$PREVIOUS" ]]; then
  PREV_PASSED=$(jq '[.results.results[] | select(.success == true)] | length' "$PREVIOUS") || PREV_PASSED=""
  PREV_TOTAL=$(jq '.results.results | length' "$PREVIOUS") || PREV_TOTAL=""
  if [[ "$PREV_PASSED" =~ ^[0-9]+$ && "$PREV_TOTAL" =~ ^[0-9]+$ ]]; then
    echo ""
    echo "## Trend"
    echo "- Previous: ${PREV_PASSED}/${PREV_TOTAL}"
    echo "- Current: ${PASSED}/${TOTAL}"
    if [[ "$PASSED" -gt "$PREV_PASSED" ]]; then
      echo "- Direction: ↑ improved"
    elif [[ "$PASSED" -lt "$PREV_PASSED" ]]; then
      echo "- Direction: ↓ regressed"
    else
      echo "- Direction: → stable"
    fi
  else
    echo "Warning: could not read previous results from $PREVIOUS; skipping trend" >&2
  fi
fi
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
# Pins KIRO_EVAL_AGENT to "dev", then hands every argument to the
# kiro-provider.sh that sits next to this script.
export KIRO_EVAL_AGENT=dev
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
exec bash "$HERE/kiro-provider.sh" "$@"
@@ -0,0 +1,17 @@
1
#!/usr/bin/env bash
# kiro-judge.sh — Promptfoo exec provider for llm-rubric judging via kiro-cli
set -o pipefail
PROMPT="$1"

# Keep the prompt under the shell argument size limit (macOS limit ~262144 bytes)
MAX_LEN=200000
if (( ${#PROMPT} > MAX_LEN )); then
  PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval — output exceeded ${MAX_LEN} chars]"
fi

RAW=$(kiro-cli chat --no-interactive --trust-tools "" "$PROMPT" 2>/dev/null)

# One pass each for: ANSI/bell removal, dropping blank and kiro chrome
# lines, and stripping the "> " prefix plus leading whitespace.
echo "$RAW" \
  | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g' \
  | grep -v \
      -e '^\s*$' \
      -e 'hooks finished' \
      -e 'Credits:' \
      -e 'WARNING:' \
      -e 'All tools are now trusted' \
      -e 'Checkpoints are not' \
      -e 'Learn more at' \
  | sed -e 's/^> //' -e 's/^[[:space:]]*//'
@@ -0,0 +1,62 @@
1
#!/usr/bin/env bash
# kiro-provider.sh — Promptfoo exec provider that runs kiro-cli agents
# Usage: bash kiro-provider.sh <prompt> <options_json> <context_json>
# Agent is determined from the prompt's {{agent}} variable passed via options JSON
set -o pipefail

PROMPT="${1:-}"
OPTIONS="${2:-}"
# Honour the same snapshot override as the other eval providers; the default
# path is unchanged.
SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"
TIMEOUT="${KIRO_EVAL_TIMEOUT:-300}"

# Extract agent from options JSON or env var.
# AGENT is reset first so an unrelated AGENT variable inherited from the
# environment cannot override KIRO_EVAL_AGENT or the "dev" default.
AGENT=""
if [[ -n "$OPTIONS" ]]; then
  AGENT=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null)
fi
AGENT="${AGENT:-${KIRO_EVAL_AGENT:-dev}}"
17
+
18
# Auto-detect telemetry file from installed agent location.
# Arguments: $1 - agent name; matched against "$HOME/.kiro/agents/*-<agent>.json"
# Outputs:   path of the telemetry JSONL file to snapshot
#
# Each matching agent file is scanned for the first path that starts with
# "$HOME/.flow-agents"; everything from "/context/" onward is dropped to get
# the package root. The first root holding .telemetry/full.jsonl wins.
_find_telemetry() {
  local agent="$1"
  local f pkg_path
  for f in "$HOME/.kiro/agents/"*"-${agent}.json"; do
    [[ -f "$f" ]] || continue
    # Match up to (not including) the closing double quote of the JSON string.
    pkg_path=$(grep -o "$HOME/.flow-agents[^\"]*" "$f" 2>/dev/null | head -1 | sed 's|/context/.*||')
    if [[ -n "$pkg_path" && -f "$pkg_path/.telemetry/full.jsonl" ]]; then
      echo "$pkg_path/.telemetry/full.jsonl"
      return
    fi
  done
  # Fallback: the telemetry file under the default root, so the result is
  # always a file path rather than a bare directory.
  # NOTE(review): assumes the default install keeps the same
  # .telemetry/full.jsonl layout — confirm.
  echo "$HOME/.flow-agents/.telemetry/full.jsonl"
}
TELEMETRY_FILE="$(_find_telemetry "$AGENT")"
33
+
34
SAFE_TOOLS="read files,code,grep,glob,knowledge,web_search,web_fetch,delegate to a specialist agent,todo tool,thinking,session,report_issue"

# Snapshot telemetry line count before run
if [[ -f "$TELEMETRY_FILE" ]]; then
  wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"
else
  echo "0" > "$SNAPSHOT_FILE"
fi

# Runs kiro-cli under `timeout`/`gtimeout` when one exists, unbounded
# otherwise, so hosts without GNU coreutils still produce output instead of
# failing with "timeout: command not found".
run_kiro() {
  if command -v timeout >/dev/null 2>&1; then
    timeout "$TIMEOUT" kiro-cli "$@"
  elif command -v gtimeout >/dev/null 2>&1; then
    gtimeout "$TIMEOUT" kiro-cli "$@"
  else
    kiro-cli "$@"
  fi
}

# Run agent, capture output
RAW=$(run_kiro chat \
  --agent "$AGENT" \
  --no-interactive \
  --trust-tools "$SAFE_TOOLS" \
  "$PROMPT" 2>/dev/null)

# Strip ANSI escape codes and bell chars
CLEAN=$(echo "$RAW" | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g')

# Remove kiro chrome lines but keep the actual response content
echo "$CLEAN" \
  | grep -v \
      -e '^\s*$' \
      -e 'hooks finished' \
      -e 'Credits:' \
      -e 'WARNING:' \
      -e 'All tools are now trusted' \
      -e 'Checkpoints are not' \
      -e 'Learn more at' \
  | sed -e 's/^> //' -e 's/^[[:space:]]*//'
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env bash
2
+ # Shared command adapter for evals. Historical script entry paths are routed to TypeScript tools.
3
+
4
# Repository root used by the adapters below: $ROOT if set and non-empty,
# else $ROOT_DIR, else empty.
FLOW_AGENTS_EVAL_ROOT="${ROOT:-${ROOT_DIR:-}}"

# Runs `npm run build --silent` from the eval root inside a subshell, with
# stdout discarded. Returns the status of the failing `cd` or of npm.
flow_agents_build_ts() {
  (
    cd "$FLOW_AGENTS_EVAL_ROOT" || exit
    npm run build --silent >/dev/null
  )
}
9
+
10
# Routes a historical script path or tool name ($1) to its compiled
# TypeScript entry point under "$FLOW_AGENTS_EVAL_ROOT/build/src".
# Arguments: $1 - script path (with or without a leading directory) or tool
#            name; remaining arguments are forwarded unchanged.
# Returns:   64 (with a message on stderr) when no adapter is registered;
#            the build's status if the TypeScript build fails; otherwise
#            node's exit status.
flow_agents_node() {
  # entry: file relative to build/src; subcommand: extra first argument for
  # tools that are dispatched through the shared cli.js.
  local entry="" subcommand=""
  case "$1" in
    */scripts/build-universal-bundles.js|scripts/build-universal-bundles.js)
      entry="cli.js" subcommand="build-bundles" ;;
    */scripts/generate-context-map.js|scripts/generate-context-map.js)
      entry="cli.js" subcommand="context-map" ;;
    */scripts/filter-installed-packs.js|scripts/filter-installed-packs.js)
      entry="cli.js" subcommand="filter-installed-packs" ;;
    workflow-sidecar)
      entry="cli/workflow-sidecar.js" ;;
    validate-workflow-artifacts)
      entry="cli/validate-workflow-artifacts.js" ;;
    */scripts/validate-source-tree.js|scripts/validate-source-tree.js)
      entry="cli/validate-source-tree.js" ;;
    */scripts/flow-kit.js|scripts/flow-kit.js)
      entry="cli/flow-kit.js" ;;
    */scripts/effective-backlog-settings.js|scripts/effective-backlog-settings.js)
      entry="cli/effective-backlog-settings.js" ;;
    */scripts/pull-work-provider.js|scripts/pull-work-provider.js)
      entry="cli/pull-work-provider.js" ;;
    workflow-artifact-cleanup-audit)
      entry="cli.js" subcommand="workflow-artifact-cleanup-audit" ;;
    fixture-retirement-audit)
      entry="cli.js" subcommand="fixture-retirement-audit" ;;
    */scripts/publish-change-helper.js|scripts/publish-change-helper.js)
      entry="cli/publish-change-helper.js" ;;
    */scripts/promote-workflow-artifact.js|scripts/promote-workflow-artifact.js)
      entry="cli/promote-workflow-artifact.js" ;;
    */scripts/usage-feedback.js|scripts/usage-feedback.js)
      entry="cli/usage-feedback.js" ;;
    veritas-governance)
      entry="cli.js" subcommand="veritas-governance" ;;
    */scripts/validate-hook-influence-cases.js|scripts/validate-hook-influence-cases.js)
      entry="cli.js" subcommand="validate-hook-influence" ;;
    *)
      echo "flow_agents_node: no TypeScript adapter registered for $1" >&2
      return 64
      ;;
  esac

  # Drop the selector, rebuild the TypeScript output, then run the target.
  shift
  flow_agents_build_ts || return
  if [[ -n "$subcommand" ]]; then
    node "$FLOW_AGENTS_EVAL_ROOT/build/src/$entry" "$subcommand" "$@"
  else
    node "$FLOW_AGENTS_EVAL_ROOT/build/src/$entry" "$@"
  fi
}