@kontourai/flow-agents 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-push +11 -0
- package/.github/workflows/ci.yml +210 -0
- package/.github/workflows/docs-pages.yml +52 -0
- package/.github/workflows/publish-npm.yml +104 -0
- package/AGENTS.md +26 -0
- package/CHANGELOG.md +66 -0
- package/CODE_OF_CONDUCT.md +25 -0
- package/CONTEXT.md +300 -0
- package/CONTRIBUTING.md +44 -0
- package/LICENSE +201 -0
- package/README.md +129 -0
- package/SECURITY.md +33 -0
- package/agent-cards/dev.json +19 -0
- package/agents/dev.json +127 -0
- package/agents/tool-code-reviewer.json +61 -0
- package/agents/tool-dependencies-updater.json +118 -0
- package/agents/tool-explore-config.json +92 -0
- package/agents/tool-explore-deps.json +92 -0
- package/agents/tool-explore-entry.json +92 -0
- package/agents/tool-explore-patterns.json +92 -0
- package/agents/tool-explore-structure.json +92 -0
- package/agents/tool-explore-tests.json +92 -0
- package/agents/tool-planner.json +57 -0
- package/agents/tool-playwright.json +145 -0
- package/agents/tool-security-reviewer.json +56 -0
- package/agents/tool-verifier.json +61 -0
- package/agents/tool-worker.json +58 -0
- package/build/src/cli/console-learning-projection.js +123 -0
- package/build/src/cli/docs-preview.js +39 -0
- package/build/src/cli/effective-backlog-settings.js +102 -0
- package/build/src/cli/export-bookmarks.js +38 -0
- package/build/src/cli/fixture-retirement-audit.js +140 -0
- package/build/src/cli/flow-kit.js +138 -0
- package/build/src/cli/import-bookmarks.js +50 -0
- package/build/src/cli/init.js +239 -0
- package/build/src/cli/instinct-cli.js +93 -0
- package/build/src/cli/promote-workflow-artifact.js +63 -0
- package/build/src/cli/publish-change-helper.js +154 -0
- package/build/src/cli/pull-work-provider.js +469 -0
- package/build/src/cli/runtime-adapter.js +23 -0
- package/build/src/cli/telemetry-doctor.js +221 -0
- package/build/src/cli/usage-feedback.js +443 -0
- package/build/src/cli/validate-hook-influence.js +152 -0
- package/build/src/cli/validate-source-tree.js +31 -0
- package/build/src/cli/validate-workflow-artifacts.js +486 -0
- package/build/src/cli/veritas-governance.js +262 -0
- package/build/src/cli/workflow-artifact-cleanup-audit.js +272 -0
- package/build/src/cli/workflow-sidecar.js +816 -0
- package/build/src/cli.js +89 -0
- package/build/src/flow-kit/validate.js +75 -0
- package/build/src/lib/args.js +45 -0
- package/build/src/lib/fs.js +62 -0
- package/build/src/lib/workflow-learning-projection.js +334 -0
- package/build/src/runtime-adapters.js +146 -0
- package/build/src/tools/build-universal-bundles.js +397 -0
- package/build/src/tools/common.js +56 -0
- package/build/src/tools/filter-installed-packs.js +132 -0
- package/build/src/tools/generate-context-map.js +198 -0
- package/build/src/tools/validate-package.js +64 -0
- package/build/src/tools/validate-source-tree.js +622 -0
- package/console.telemetry.json +176 -0
- package/context/base-rules.md +17 -0
- package/context/code-review-standards.md +62 -0
- package/context/coding-standards.md +42 -0
- package/context/common/orchestrators.md +12 -0
- package/context/common/subagents.md +28 -0
- package/context/contracts/artifact-contract.md +182 -0
- package/context/contracts/builder-kit-workflow-state-contract.md +319 -0
- package/context/contracts/delivery-contract.md +69 -0
- package/context/contracts/execution-contract.md +53 -0
- package/context/contracts/governance-adapter-contract.md +67 -0
- package/context/contracts/planning-contract.md +85 -0
- package/context/contracts/review-contract.md +104 -0
- package/context/contracts/sandbox-policy.md +52 -0
- package/context/contracts/verification-contract.md +134 -0
- package/context/contracts/work-item-contract.md +215 -0
- package/context/deferred/demo-mode.md +33 -0
- package/context/deferred/languages/go.md +31 -0
- package/context/deferred/languages/python.md +31 -0
- package/context/deferred/languages/typescript.md +34 -0
- package/context/deferred/parallelization.md +35 -0
- package/context/deferred/worktree-isolation.md +24 -0
- package/context/development-workflow.md +50 -0
- package/context/scripts/context-budget/budget-scan.sh +166 -0
- package/context/scripts/detect-tools.sh +3 -0
- package/context/scripts/discover-agents.sh +28 -0
- package/context/scripts/git-status.sh +49 -0
- package/context/scripts/hooks/config-protection.js +79 -0
- package/context/scripts/hooks/desktop-notify.sh +39 -0
- package/context/scripts/hooks/governance-audit.sh +135 -0
- package/context/scripts/hooks/lib/audit-transport.sh +40 -0
- package/context/scripts/hooks/lib/hook-flags.js +49 -0
- package/context/scripts/hooks/lib/patterns.sh +57 -0
- package/context/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/context/scripts/hooks/post-edit-accumulator.js +66 -0
- package/context/scripts/hooks/pre-commit-quality.js +194 -0
- package/context/scripts/hooks/quality-gate.js +93 -0
- package/context/scripts/hooks/report-only-guard.js +21 -0
- package/context/scripts/hooks/run-hook.js +136 -0
- package/context/scripts/hooks/stop-format-typecheck.js +141 -0
- package/context/scripts/hooks/stop-goal-fit.js +337 -0
- package/context/scripts/hooks/workflow-steering.js +250 -0
- package/context/scripts/telemetry/console-presets.sh +14 -0
- package/context/scripts/telemetry/install-console-config.sh +214 -0
- package/context/scripts/telemetry/lib/config.sh +85 -0
- package/context/scripts/telemetry/lib/enrich.sh +115 -0
- package/context/scripts/telemetry/lib/redact.sh +22 -0
- package/context/scripts/telemetry/lib/session.sh +63 -0
- package/context/scripts/telemetry/lib/transport.sh +183 -0
- package/context/scripts/telemetry/lib/usage.sh +29 -0
- package/context/scripts/telemetry/sync-agents.sh +173 -0
- package/context/scripts/telemetry/telemetry.conf +23 -0
- package/context/scripts/telemetry/telemetry.sh +387 -0
- package/context/scripts/validate-package.sh +89 -0
- package/context/settings/backlog-provider-settings.json +54 -0
- package/context/templates/core/identity.md +26 -0
- package/context/templates/core/user.md +15 -0
- package/docs/_config.yml +15 -0
- package/docs/_layouts/default.html +87 -0
- package/docs/adr/0001-flow-agents-consumes-flow.md +77 -0
- package/docs/adr/0002-flow-kits-as-extension-unit.md +13 -0
- package/docs/adr/0003-flow-agents-coordinates-kits-and-adapters.md +13 -0
- package/docs/adr/0004-gates-expect-surface-claims.md +15 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +48 -0
- package/docs/adr/0006-typescript-first-source-policy.md +98 -0
- package/docs/agent-system-guidebook.md +391 -0
- package/docs/agent-usage-feedback-loop.md +351 -0
- package/docs/assets/favicon.svg +13 -0
- package/docs/assets/og-image.png +0 -0
- package/docs/assets/site.css +774 -0
- package/docs/assets/site.js +139 -0
- package/docs/configurable-workflow-routing.md +174 -0
- package/docs/context-map.md +145 -0
- package/docs/developer-architecture.md +145 -0
- package/docs/developer-hook-setup.md +61 -0
- package/docs/fixture-ownership.md +44 -0
- package/docs/flow-kit-repository-contract.md +180 -0
- package/docs/index.md +129 -0
- package/docs/kontour-resource-contract.md +358 -0
- package/docs/migrations.md +64 -0
- package/docs/north-star.md +322 -0
- package/docs/operating-layers.md +110 -0
- package/docs/repository-structure.md +132 -0
- package/docs/sandbox-policy.md +56 -0
- package/docs/skills-map.md +203 -0
- package/docs/standards-register.md +96 -0
- package/docs/veritas-integration.md +165 -0
- package/docs/work-item-adapters.md +72 -0
- package/docs/workflow-artifact-lifecycle.md +141 -0
- package/docs/workflow-eval-strategy.md +295 -0
- package/docs/workflow-shared-contracts.md +51 -0
- package/docs/workflow-usage-guide.md +443 -0
- package/evals/ARCHITECTURE.md +143 -0
- package/evals/CONVENTIONS.md +58 -0
- package/evals/README.md +128 -0
- package/evals/acceptance/run.sh +29 -0
- package/evals/acceptance/test_claude_harness.sh +242 -0
- package/evals/acceptance/test_codex_harness.sh +108 -0
- package/evals/acceptance/test_kiro_harness.sh +128 -0
- package/evals/cases/dev/404.html +97 -0
- package/evals/cases/dev/code-review.yaml +44 -0
- package/evals/cases/dev/dashboard.html +300 -0
- package/evals/cases/dev/deliver.yaml +66 -0
- package/evals/cases/dev/dependency-update.yaml +16 -0
- package/evals/cases/dev/explore.yaml +20 -0
- package/evals/cases/dev/index.html +370 -0
- package/evals/cases/dev/package-lock.json +28 -0
- package/evals/cases/dev/package.json +16 -0
- package/evals/cases/dev/plan-work.yaml +20 -0
- package/evals/cases/dev/promptfooconfig.yaml +666 -0
- package/evals/cases/dev/search-first.yaml +20 -0
- package/evals/cases/dev/tdd-workflow.yaml +48 -0
- package/evals/cases/dev/verify-work.yaml +44 -0
- package/evals/cases/dev/workflow.yaml +34 -0
- package/evals/ci/run-baseline.sh +283 -0
- package/evals/fixtures/backlog-provider-settings/global-default.json +44 -0
- package/evals/fixtures/backlog-provider-settings/project-override.json +53 -0
- package/evals/fixtures/builder-kit-workflow-state/baseline-freshness-resolution-hint.json +139 -0
- package/evals/fixtures/builder-kit-workflow-state/direct-primitive-stop.json +59 -0
- package/evals/fixtures/builder-kit-workflow-state/empty-board-route-shape.json +55 -0
- package/evals/fixtures/builder-kit-workflow-state/happy-path.json +71 -0
- package/evals/fixtures/builder-kit-workflow-state/mid-work-resume.json +80 -0
- package/evals/fixtures/builder-kit-workflow-state/missing-prestep-recovery.json +65 -0
- package/evals/fixtures/builder-kit-workflow-state/product-build-chaining.json +60 -0
- package/evals/fixtures/builder-kit-workflow-state/stale-continuation-requires-new-probe.json +57 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-correction/learning.json +50 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-open-route/learning.json +41 -0
- package/evals/fixtures/flow-kit-repository/invalid-absolute-path/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/kit.json +11 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/kit.json +9 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-malformed-json/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-flow/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-traversal/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/adapters/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/assets/example.txt +1 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-evals/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-skills/mixed/SKILL.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit.json +44 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/kit.json +20 -0
- package/evals/fixtures/hook-influence/cases.json +336 -0
- package/evals/fixtures/pull-work-provider/github-issues.json +170 -0
- package/evals/fixtures/pull-work-wip-shepherding/global-wip-informs.json +43 -0
- package/evals/fixtures/pull-work-wip-shepherding/personal-wip-blocks.json +42 -0
- package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +31 -0
- package/evals/fixtures/surface-trust/artifact-absent.json +19 -0
- package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +32 -0
- package/evals/fixtures/surface-trust/missing-authority-trust-report.json +27 -0
- package/evals/fixtures/surface-trust/provider-absent.json +19 -0
- package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +30 -0
- package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +31 -0
- package/evals/fixtures/usage-feedback/sample-full.jsonl +11 -0
- package/evals/fixtures/usage-feedback/sample-outcomes.jsonl +1 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-pass.sh +18 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-secret-fail.sh +10 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-unconfigured.sh +4 -0
- package/evals/integration/test_bundle_install.sh +541 -0
- package/evals/integration/test_console_learning_projection.sh +192 -0
- package/evals/integration/test_context_map.sh +65 -0
- package/evals/integration/test_effective_backlog_settings.sh +58 -0
- package/evals/integration/test_fixture_retirement_audit.sh +58 -0
- package/evals/integration/test_flow_agents_statusline.sh +93 -0
- package/evals/integration/test_flow_kit_repository.sh +90 -0
- package/evals/integration/test_goal_fit_hook.sh +482 -0
- package/evals/integration/test_hook_category_behaviors.sh +190 -0
- package/evals/integration/test_hook_influence_cases.sh +69 -0
- package/evals/integration/test_local_flow_kit_install.sh +145 -0
- package/evals/integration/test_publish_change_helper.sh +176 -0
- package/evals/integration/test_pull_work_provider.sh +140 -0
- package/evals/integration/test_runtime_adapter_activation.sh +106 -0
- package/evals/integration/test_telemetry.sh +485 -0
- package/evals/integration/test_telemetry_doctor.sh +193 -0
- package/evals/integration/test_usage_feedback_dashboard.sh +169 -0
- package/evals/integration/test_usage_feedback_global.sh +117 -0
- package/evals/integration/test_usage_feedback_import.sh +227 -0
- package/evals/integration/test_usage_feedback_outcomes.sh +165 -0
- package/evals/integration/test_usage_feedback_report.sh +263 -0
- package/evals/integration/test_veritas_governance_adapter.sh +235 -0
- package/evals/integration/test_workflow_artifact_cleanup_audit.sh +287 -0
- package/evals/integration/test_workflow_artifacts.sh +1247 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +2112 -0
- package/evals/integration/test_workflow_steering_hook.sh +337 -0
- package/evals/lib/assertions/delegated-to.js +40 -0
- package/evals/lib/assertions/max-tool-calls.js +15 -0
- package/evals/lib/assertions/no-write-tools.js +27 -0
- package/evals/lib/assertions/pass-at-k.js +39 -0
- package/evals/lib/assertions/telemetry-utils.js +105 -0
- package/evals/lib/assertions/tool-called.js +39 -0
- package/evals/lib/assertions/verify-after-fix.js +61 -0
- package/evals/lib/claude-judge.sh +40 -0
- package/evals/lib/claude-provider.sh +74 -0
- package/evals/lib/codex-judge.sh +39 -0
- package/evals/lib/codex-provider.sh +81 -0
- package/evals/lib/eval-dev.sh +5 -0
- package/evals/lib/eval-judge.sh +22 -0
- package/evals/lib/eval-provider.sh +26 -0
- package/evals/lib/eval-report.sh +73 -0
- package/evals/lib/kiro-dev.sh +4 -0
- package/evals/lib/kiro-judge.sh +17 -0
- package/evals/lib/kiro-provider.sh +62 -0
- package/evals/lib/node.sh +111 -0
- package/evals/promptfooconfig.yaml +70 -0
- package/evals/run.sh +309 -0
- package/evals/static/test_evidence_refs.sh +141 -0
- package/evals/static/test_package.sh +407 -0
- package/evals/static/test_repo_hooks.sh +68 -0
- package/evals/static/test_universal_bundles.sh +274 -0
- package/evals/static/test_workflow_skills.sh +1207 -0
- package/install.sh +64 -0
- package/integrations/veritas/flow-agents.adapter.json +138 -0
- package/integrations/veritas/flow-agents.authority-settings.json +26 -0
- package/integrations/veritas/flow-agents.repo-standards.json +82 -0
- package/kits/builder/flows/build.flow.json +218 -0
- package/kits/builder/flows/shape.flow.json +127 -0
- package/kits/builder/kit.json +19 -0
- package/kits/catalog.json +11 -0
- package/package.json +130 -0
- package/packaging/README.md +60 -0
- package/packaging/manifest.json +173 -0
- package/packaging/packs.json +69 -0
- package/powers/dependency-checker/POWER.md +20 -0
- package/powers/dependency-checker/mcp.json +20 -0
- package/powers/playwright/POWER.md +25 -0
- package/powers/playwright/mcp.json +12 -0
- package/prompts/code-audit.md +123 -0
- package/prompts/kcommit.md +88 -0
- package/schemas/backlog-provider-settings.schema.json +138 -0
- package/schemas/workflow-acceptance.schema.json +216 -0
- package/schemas/workflow-critique.schema.json +113 -0
- package/schemas/workflow-evidence.schema.json +357 -0
- package/schemas/workflow-handoff.schema.json +52 -0
- package/schemas/workflow-learning.schema.json +223 -0
- package/schemas/workflow-release.schema.json +172 -0
- package/schemas/workflow-state.schema.json +80 -0
- package/scripts/README.md +111 -0
- package/scripts/build-universal-bundles.js +3 -0
- package/scripts/check-content-boundary.cjs +99 -0
- package/scripts/context-budget/budget-scan.sh +166 -0
- package/scripts/detect-tools.sh +3 -0
- package/scripts/discover-agents.sh +28 -0
- package/scripts/effective-backlog-settings.js +2 -0
- package/scripts/filter-installed-packs.js +2 -0
- package/scripts/flow-kit.js +2 -0
- package/scripts/generate-context-map.js +2 -0
- package/scripts/git-status.sh +49 -0
- package/scripts/hooks/claude-hook-adapter.js +174 -0
- package/scripts/hooks/claude-telemetry-hook.js +115 -0
- package/scripts/hooks/codex-hook-adapter.js +176 -0
- package/scripts/hooks/codex-telemetry-hook.js +95 -0
- package/scripts/hooks/config-protection.js +79 -0
- package/scripts/hooks/desktop-notify.sh +39 -0
- package/scripts/hooks/governance-audit.sh +135 -0
- package/scripts/hooks/lib/audit-transport.sh +40 -0
- package/scripts/hooks/lib/hook-flags.js +49 -0
- package/scripts/hooks/lib/patterns.sh +57 -0
- package/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/scripts/hooks/post-edit-accumulator.js +66 -0
- package/scripts/hooks/pre-commit-quality.js +194 -0
- package/scripts/hooks/quality-gate.js +93 -0
- package/scripts/hooks/report-only-guard.js +21 -0
- package/scripts/hooks/run-hook.js +136 -0
- package/scripts/hooks/stop-format-typecheck.js +141 -0
- package/scripts/hooks/stop-goal-fit.js +337 -0
- package/scripts/hooks/workflow-steering.js +250 -0
- package/scripts/install-codex-home.sh +106 -0
- package/scripts/package.json +3 -0
- package/scripts/promote-workflow-artifact.js +2 -0
- package/scripts/publish-change-helper.js +2 -0
- package/scripts/pull-work-provider.js +2 -0
- package/scripts/setup-repo-hooks.sh +8 -0
- package/scripts/statusline/flow-agents-statusline.js +157 -0
- package/scripts/telemetry/console-presets.sh +14 -0
- package/scripts/telemetry/install-console-config.sh +214 -0
- package/scripts/telemetry/lib/config.sh +85 -0
- package/scripts/telemetry/lib/enrich.sh +115 -0
- package/scripts/telemetry/lib/redact.sh +22 -0
- package/scripts/telemetry/lib/session.sh +63 -0
- package/scripts/telemetry/lib/transport.sh +183 -0
- package/scripts/telemetry/lib/usage.sh +29 -0
- package/scripts/telemetry/sync-agents.sh +173 -0
- package/scripts/telemetry/telemetry.conf +23 -0
- package/scripts/telemetry/telemetry.sh +387 -0
- package/scripts/usage-feedback.js +2 -0
- package/scripts/validate-hook-influence-cases.js +2 -0
- package/scripts/validate-package.sh +89 -0
- package/scripts/validate-source-tree.js +9 -0
- package/skills/agentic-engineering/SKILL.md +62 -0
- package/skills/browser-test/SKILL.md +51 -0
- package/skills/builder-shape/SKILL.md +76 -0
- package/skills/context-budget/SKILL.md +40 -0
- package/skills/deliver/SKILL.md +241 -0
- package/skills/dependency-update/SKILL.md +68 -0
- package/skills/design-probe/SKILL.md +107 -0
- package/skills/eval-rebuild/SKILL.md +39 -0
- package/skills/evidence-gate/SKILL.md +186 -0
- package/skills/execute-plan/SKILL.md +110 -0
- package/skills/explore/SKILL.md +137 -0
- package/skills/feedback-loop/SKILL.md +87 -0
- package/skills/fix-bug/SKILL.md +133 -0
- package/skills/frontend-design/SKILL.md +80 -0
- package/skills/github-cli/SKILL.md +63 -0
- package/skills/idea-to-backlog/SKILL.md +267 -0
- package/skills/knowledge-capture/SKILL.md +55 -0
- package/skills/learning-review/SKILL.md +115 -0
- package/skills/pickup-probe/SKILL.md +114 -0
- package/skills/plan-work/SKILL.md +176 -0
- package/skills/pull-work/SKILL.md +309 -0
- package/skills/release-readiness/SKILL.md +121 -0
- package/skills/review-work/SKILL.md +161 -0
- package/skills/search-first/SKILL.md +66 -0
- package/skills/tdd-workflow/SKILL.md +140 -0
- package/skills/verify-work/SKILL.md +109 -0
- package/src/cli/console-learning-projection.ts +140 -0
- package/src/cli/effective-backlog-settings.ts +99 -0
- package/src/cli/fixture-retirement-audit.ts +154 -0
- package/src/cli/flow-kit.ts +139 -0
- package/src/cli/init.ts +248 -0
- package/src/cli/promote-workflow-artifact.ts +64 -0
- package/src/cli/publish-change-helper.ts +143 -0
- package/src/cli/pull-work-provider.ts +481 -0
- package/src/cli/runtime-adapter.ts +24 -0
- package/src/cli/telemetry-doctor.ts +243 -0
- package/src/cli/usage-feedback.ts +418 -0
- package/src/cli/validate-hook-influence.ts +119 -0
- package/src/cli/validate-source-tree.ts +30 -0
- package/src/cli/validate-workflow-artifacts.ts +411 -0
- package/src/cli/veritas-governance.ts +322 -0
- package/src/cli/workflow-artifact-cleanup-audit.ts +281 -0
- package/src/cli/workflow-sidecar.ts +676 -0
- package/src/cli.ts +95 -0
- package/src/flow-kit/validate.ts +74 -0
- package/src/lib/args.ts +43 -0
- package/src/lib/fs.ts +62 -0
- package/src/lib/workflow-learning-projection.ts +491 -0
- package/src/runtime-adapters.ts +154 -0
- package/src/tools/build-universal-bundles.ts +366 -0
- package/src/tools/common.ts +61 -0
- package/src/tools/filter-installed-packs.ts +129 -0
- package/src/tools/generate-context-map.ts +199 -0
- package/src/tools/validate-package.ts +57 -0
- package/src/tools/validate-source-tree.ts +488 -0
- package/tsconfig.json +19 -0
- package/veritas.claims.json +6 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
// verify-after-fix.js — Assert that any code change during review/verify is followed by a clean verification pass
|
|
2
|
+
const { getNewEvents, filterByType, getToolInvocations, getSubagentCalls } = require('./telemetry-utils');
|
|
3
|
+
|
|
4
|
+
const WRITE_TOOLS = new Set(['write files', 'write', 'apply_patch', 'edit']);
|
|
5
|
+
const REVIEW_AGENTS = new Set(['tool-code-reviewer', 'tool-security-reviewer']);
|
|
6
|
+
const VERIFY_AGENTS = new Set(['tool-verifier', 'tool-playwright']);
|
|
7
|
+
const REPORTER_AGENTS = new Set([...REVIEW_AGENTS, ...VERIFY_AGENTS]);
|
|
8
|
+
|
|
9
|
+
module.exports = (output, { config }) => {
|
|
10
|
+
const events = getNewEvents();
|
|
11
|
+
const toolEvents = getToolInvocations(events);
|
|
12
|
+
const subagentCalls = getSubagentCalls(events);
|
|
13
|
+
const violations = [];
|
|
14
|
+
|
|
15
|
+
// Check 1: Reviewers/verifiers must never invoke write tools
|
|
16
|
+
const reporterWrites = toolEvents.filter(e => {
|
|
17
|
+
const agent = e.agent && e.agent.name;
|
|
18
|
+
const tool = e.tool && e.tool.name && String(e.tool.name).toLowerCase();
|
|
19
|
+
return agent && REPORTER_AGENTS.has(agent) && WRITE_TOOLS.has(tool);
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
if (reporterWrites.length > 0) {
|
|
23
|
+
violations.push(
|
|
24
|
+
`Reporter agents wrote code: ${reporterWrites.map(e => `${e.agent.name} → ${e.tool.name}`).join('; ')}`
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Check 2: After any write tool call, there must be a subsequent tool-verifier delegation
|
|
29
|
+
const allEvents = events;
|
|
30
|
+
let lastWriteIdx = -1;
|
|
31
|
+
let lastVerifyIdx = -1;
|
|
32
|
+
|
|
33
|
+
for (let i = 0; i < allEvents.length; i++) {
|
|
34
|
+
const e = allEvents[i];
|
|
35
|
+
const toolName = e.tool && e.tool.name && String(e.tool.name).toLowerCase();
|
|
36
|
+
if (e.event_type === 'tool.invoke' && e.tool && WRITE_TOOLS.has(toolName)) {
|
|
37
|
+
lastWriteIdx = i;
|
|
38
|
+
}
|
|
39
|
+
if (e.event_type === 'tool.invoke' && e.tool && toolName === 'delegate to a specialist agent' &&
|
|
40
|
+
e.tool.input && e.tool.input.command === 'InvokeSubagents') {
|
|
41
|
+
const subs = e.tool.input.content && e.tool.input.content.subagents;
|
|
42
|
+
if (subs && subs.some(s => s.agent_name === 'tool-verifier')) {
|
|
43
|
+
lastVerifyIdx = i;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
if (e.event_type === 'tool.invoke' && e.tool && toolName === 'spawn_agent' &&
|
|
47
|
+
e.tool.input && e.tool.input.agent_type === 'tool-verifier') {
|
|
48
|
+
lastVerifyIdx = i;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if (lastWriteIdx > lastVerifyIdx) {
|
|
53
|
+
violations.push('Code was written after the last verification pass — missing re-verify');
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
if (violations.length === 0) {
|
|
57
|
+
return { pass: true, score: 1, reason: 'No code changes without subsequent verification' };
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
return { pass: false, score: 0, reason: `Re-verify violations: ${violations.join('; ')}` };
|
|
61
|
+
};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# claude-judge.sh — Promptfoo exec provider for llm-rubric judging via Claude Code.
#
# Usage:  claude-judge.sh PROMPT
# Env:    CLAUDE_EVAL_JUDGE_TIMEOUT  seconds before the judge run is killed
#                                    (default 180); only enforced when `timeout`
#                                    or `gtimeout` is on PATH.
# Output: the judge's text response on stdout; on failure, whatever was captured
#         on stdout plus the first 120 lines of stderr (to stderr).
# Exit:   0 on success, 2 when the claude CLI is missing, 1 when the run fails.
set -euo pipefail

PROMPT="${1:-}"
TIMEOUT="${CLAUDE_EVAL_JUDGE_TIMEOUT:-180}"
MAX_LEN=200000
# Cap very large prompts so the judge call stays within a bounded size.
if [[ ${#PROMPT} -gt $MAX_LEN ]]; then
  PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval - output exceeded ${MAX_LEN} chars]"
fi

OUT="$(mktemp /tmp/flow-agents-claude-judge.XXXXXX)"
LOG="$(mktemp /tmp/flow-agents-claude-judge-log.XXXXXX)"
trap 'rm -f "$OUT" "$LOG"' EXIT

if ! command -v claude >/dev/null 2>&1; then
  echo "claude CLI is not installed or not on PATH" >&2
  exit 2
fi

JUDGE_CMD=(
  claude
  -p
  --permission-mode bypassPermissions
  --add-dir /tmp
  --output-format text
  "$PROMPT"
)

# Prefix the command with a timeout wrapper when one exists. Building a single
# non-empty array avoids expanding an empty "${arr[@]}", which aborts under
# `set -u` on bash releases older than 4.4 (e.g. the stock macOS bash 3.2).
if command -v timeout >/dev/null 2>&1; then
  JUDGE_CMD=(timeout "$TIMEOUT" "${JUDGE_CMD[@]}")
elif command -v gtimeout >/dev/null 2>&1; then
  JUDGE_CMD=(gtimeout "$TIMEOUT" "${JUDGE_CMD[@]}")
fi

"${JUDGE_CMD[@]}" >"$OUT" 2>"$LOG" || {
  # Surface partial output and the head of the error log before failing.
  cat "$OUT" 2>/dev/null
  sed -n '1,120p' "$LOG" >&2
  exit 1
}

cat "$OUT"
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# claude-provider.sh — Promptfoo exec provider that runs Flow Agents through Claude Code.
#
# Positional arguments: $1 = prompt text, $2 = provider options as JSON (optional).
set -euo pipefail

PROMPT="${1:-}"
OPTIONS="${2:-}"

# Locations: this script's directory, and the directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Tunables, each overridable through the environment.
TIMEOUT="${CLAUDE_EVAL_TIMEOUT:-300}"
FLUSH_SLEEP="${FLOW_AGENTS_EVAL_TELEMETRY_FLUSH_SLEEP:-0.5}"
SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"
TELEMETRY_FILE_MARKER="${FLOW_AGENTS_EVAL_TELEMETRY_FILE_MARKER:-/tmp/promptfoo-eval-telemetry-file.txt}"

# Agent selection: options JSON (config.agent) first, then FLOW_AGENTS_EVAL_AGENT,
# then "dev". Parsing is best-effort: any node/JSON failure leaves AGENT empty.
AGENT=""
if [[ -n "$OPTIONS" ]]; then
  AGENT=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null || true)
fi
if [[ -z "$AGENT" ]]; then
  AGENT="${FLOW_AGENTS_EVAL_AGENT:-dev}"
fi

# Fail fast (exit 2) when the Claude CLI is unavailable.
command -v claude >/dev/null 2>&1 || {
  echo "claude CLI is not installed or not on PATH" >&2
  exit 2
}
|
|
24
|
+
|
|
25
|
+
#######################################
# Run the prepared Claude command, bounded by a timeout when possible.
# Globals:   CLAUDE_CMD (read) - command and arguments to execute
#            TIMEOUT (read)    - limit in seconds handed to timeout/gtimeout
# Returns:   exit status of the wrapped command (or of the timeout wrapper)
#######################################
run_claude() {
  local limiter
  # Prefer `timeout`, fall back to `gtimeout`; run unbounded if neither exists.
  for limiter in timeout gtimeout; do
    if command -v "$limiter" >/dev/null 2>&1; then
      "$limiter" "$TIMEOUT" "${CLAUDE_CMD[@]}"
      return
    fi
  done
  "${CLAUDE_CMD[@]}"
}
|
|
34
|
+
|
|
35
|
+
#######################################
# Create a fresh per-agent working directory with the Claude Code bundle installed.
# Globals:   AGENT (read)                 - agent name; used as the directory name
#            ROOT_DIR (read)              - repository root holding scripts/ and dist/
#            CLAUDE_EVAL_WORK_ROOT (read) - parent directory (default
#                                           /tmp/flow-agents-claude-eval)
# Outputs:   the prepared directory path on stdout
# Returns:   1 when AGENT is not a safe single path component; otherwise the
#            status of the last command
#######################################
prepare_workdir() {
  local work_root="${CLAUDE_EVAL_WORK_ROOT:-/tmp/flow-agents-claude-eval}"
  local work_dir

  # AGENT can come from provider options and is fed to `rm -rf` below, so it
  # must be exactly one path component: no separators, no "." or "..".
  case "$AGENT" in
    '' | . | .. | */*)
      printf 'refusing to prepare workdir for unsafe agent name: %s\n' "$AGENT" >&2
      return 1
      ;;
  esac

  work_dir="$work_root/$AGENT"
  rm -rf -- "${work_dir:?}"
  mkdir -p -- "$work_dir"
  # NOTE(review): flow_agents_node is not defined in this script; presumably it
  # is provided by the caller's environment or evals/lib/node.sh — confirm.
  (cd "$ROOT_DIR" && flow_agents_node scripts/build-universal-bundles.js >/dev/null)
  bash "$ROOT_DIR/dist/claude-code/install.sh" "$work_dir" >/dev/null
  mkdir -p -- "$work_dir/.telemetry"
  printf '%s\n' "$work_dir"
}
|
|
45
|
+
|
|
46
|
+
WORK_DIR="$(prepare_workdir)"
TELEMETRY_FILE="$WORK_DIR/.telemetry/full.jsonl"
# Publish the telemetry file path and its current line count.
# NOTE(review): presumably read by the eval assertions to locate events produced
# by this run — confirm against evals/lib/assertions/telemetry-utils.js.
printf '%s\n' "$TELEMETRY_FILE" > "$TELEMETRY_FILE_MARKER"
if [[ -f "$TELEMETRY_FILE" ]]; then
  wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"
else
  echo "0" > "$SNAPSHOT_FILE"
fi

CLAUDE_CMD=(
  env
  FLOW_AGENTS_CLAUDE_TELEMETRY_CHANNELS="${FLOW_AGENTS_CLAUDE_TELEMETRY_CHANNELS:-full,analytics}"
  claude
  -p
  --agent "$AGENT"
  --permission-mode bypassPermissions
  --add-dir "$WORK_DIR"
  --output-format text
  "$PROMPT"
)

# Capture combined stdout/stderr and the exit status without letting `set -e`
# abort before the output has been relayed.
set +e
RAW=$(cd "$WORK_DIR" && run_claude 2>&1)
STATUS=$?
set -e
# Pause before exiting (FLOW_AGENTS_EVAL_TELEMETRY_FLUSH_SLEEP seconds).
sleep "$FLUSH_SLEEP"
# Strip ANSI escape sequences and BEL characters, then drop blank lines.
# `grep -v` exits 1 when nothing survives the filter; with `set -e` and
# `pipefail` that would end the script here and replace the real exit status,
# so an empty result is explicitly tolerated.
printf '%s\n' "$RAW" \
  | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g' \
  | { grep -v '^[[:space:]]*$' || true; }
exit "$STATUS"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# codex-judge.sh — Promptfoo exec provider for llm-rubric judging via Codex.
#
# Usage:  codex-judge.sh PROMPT
# Env:    CODEX_EVAL_JUDGE_TIMEOUT  seconds before the judge run is killed
#                                   (default 180); only enforced when `timeout`
#                                   or `gtimeout` is on PATH.
# Output: the file written by `--output-last-message` on stdout; on failure,
#         that file (if any) plus the first 120 lines of the run log (to stderr).
# Exit:   0 on success, 1 when the codex run fails.
set -euo pipefail

PROMPT="${1:-}"
TIMEOUT="${CODEX_EVAL_JUDGE_TIMEOUT:-180}"
MAX_LEN=200000
# Cap very large prompts so the judge call stays within a bounded size.
if [[ ${#PROMPT} -gt $MAX_LEN ]]; then
  PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval - output exceeded ${MAX_LEN} chars]"
fi

OUT="$(mktemp /tmp/flow-agents-codex-judge.XXXXXX)"
LOG="$(mktemp /tmp/flow-agents-codex-judge-log.XXXXXX)"
trap 'rm -f "$OUT" "$LOG"' EXIT

JUDGE_CMD=(
  codex exec
  --ignore-user-config
  --skip-git-repo-check
  -C /tmp
  --sandbox read-only
  --json
  -c model='"gpt-5.5"'
  -c model_reasoning_effort='"medium"'
  --output-last-message "$OUT"
  "$PROMPT"
)

# Prefix the command with a timeout wrapper when one exists. Building a single
# non-empty array avoids expanding an empty "${arr[@]}", which aborts under
# `set -u` on bash releases older than 4.4 (e.g. the stock macOS bash 3.2).
if command -v timeout >/dev/null 2>&1; then
  JUDGE_CMD=(timeout "$TIMEOUT" "${JUDGE_CMD[@]}")
elif command -v gtimeout >/dev/null 2>&1; then
  JUDGE_CMD=(gtimeout "$TIMEOUT" "${JUDGE_CMD[@]}")
fi

# All of codex's own stdout/stderr goes to the log; the answer is read from $OUT.
"${JUDGE_CMD[@]}" >"$LOG" 2>&1 || {
  cat "$OUT" 2>/dev/null
  sed -n '1,120p' "$LOG" >&2
  exit 1
}

cat "$OUT"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# codex-provider.sh — Promptfoo exec provider that runs Flow Agents through Codex.
|
|
3
|
+
set -euo pipefail

# Positional inputs: the prompt text, then an optional provider-options JSON document.
PROMPT="${1:-}"
OPTIONS="${2:-}"

# Location of this script and of the directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Tunables; every one can be overridden from the environment.
TIMEOUT="${CODEX_EVAL_TIMEOUT:-300}"
FLUSH_SLEEP="${FLOW_AGENTS_EVAL_TELEMETRY_FLUSH_SLEEP:-0.5}"
SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"
TELEMETRY_FILE_MARKER="${FLOW_AGENTS_EVAL_TELEMETRY_FILE_MARKER:-/tmp/promptfoo-eval-telemetry-file.txt}"

# Agent precedence: config.agent from the options JSON, then
# FLOW_AGENTS_EVAL_AGENT, then "dev". A parse failure yields no agent.
AGENT="${FLOW_AGENTS_EVAL_AGENT:-dev}"
if [[ -n "$OPTIONS" ]]; then
  agent_from_options=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null || true)
  if [[ -n "$agent_from_options" ]]; then
    AGENT="$agent_from_options"
  fi
fi
|
|
19
|
+
|
|
20
|
+
# Map an agent name to its Codex profile name.
# Arguments: $1 - agent name
# Outputs:   "kdev" for the dev agent, an empty line for every other agent
profile_for_agent() {
  if [[ "$1" == "dev" ]]; then
    echo "kdev"
  else
    echo ""
  fi
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Reduce the JSON event stream read from stdin to plain agent text on stdout.
# Per input line:
#   - empty lines are skipped;
#   - a line that parses as JSON prints `text` when type is "agent_message",
#     or `item.text` when type is "item.completed" and the item is an
#     "agent_message"; any other JSON event prints nothing;
#   - a line that is not valid JSON is printed unchanged.
strip_json_events() {
  node -e "const rl=require('readline').createInterface({input:process.stdin});rl.on('line',l=>{if(!l)return;try{const o=JSON.parse(l);if(o.type==='agent_message'&&typeof o.text==='string')console.log(o.text);else if(o.type==='item.completed'&&o.item?.type==='agent_message'&&typeof o.item.text==='string')console.log(o.item.text)}catch{console.log(l)}})"
}
|
|
31
|
+
|
|
32
|
+
# Run the prepared Codex command with the prompt as its final argument.
# Globals: CODEX_CMD (command array), PROMPT, TIMEOUT (seconds) - all read.
# The command is wrapped in `timeout`, or `gtimeout` when only that exists;
# with neither installed it runs without a time limit.
# Returns: the exit status of the (possibly wrapped) command.
run_codex() {
  local wrapper
  for wrapper in timeout gtimeout; do
    if command -v "$wrapper" >/dev/null 2>&1; then
      "$wrapper" "$TIMEOUT" "${CODEX_CMD[@]}" "$PROMPT"
      return
    fi
  done
  "${CODEX_CMD[@]}" "$PROMPT"
}
|
|
41
|
+
|
|
42
|
+
#######################################
# Create a fresh Codex working directory for the current agent and print it.
# Globals:   AGENT, ROOT_DIR (read); CODEX_EVAL_WORK_ROOT, CODEX_REAL_HOME,
#            HOME (read, optional overrides / defaults)
# Outputs:   the prepared directory path on stdout; diagnostics on stderr
# Returns:   0 on success; non-zero when the agent name is unsafe or a
#            staging step (clean, copy, mkdir) fails
#######################################
prepare_workdir() {
  local work_root="${CODEX_EVAL_WORK_ROOT:-/tmp/flow-agents-codex-eval}"
  local codex_home="${CODEX_REAL_HOME:-$HOME/.codex}"
  local work_dir auth_file

  # AGENT may come from the options JSON and is used in an `rm -rf` path below,
  # so reject anything that could address a directory outside work_root.
  case "$AGENT" in
    '' | . | .. | */*)
      echo "prepare_workdir: refusing unsafe agent name '$AGENT'" >&2
      return 1
      ;;
  esac
  work_dir="$work_root/$AGENT"

  # errexit is not active inside the caller's $(...), so check each step.
  rm -rf -- "${work_dir:?}" || return
  mkdir -p -- "$work_dir" || return

  # Rebuild the bundles; a failed build is reported but staging continues with
  # whatever dist/codex already holds (the copy below fails if there is none).
  if ! (cd "$ROOT_DIR" && flow_agents_node scripts/build-universal-bundles.js >/dev/null); then
    echo "prepare_workdir: bundle build failed; using existing $ROOT_DIR/dist/codex if present" >&2
  fi

  cp -R "$ROOT_DIR/dist/codex/." "$work_dir/" || return
  cp "$work_dir/.codex/config.toml" "$work_dir/.codex/config-eval.toml" || return

  # Carry over the user's Codex credential/identity files when they exist.
  for auth_file in auth.json version.json installation_id; do
    if [[ -f "$codex_home/$auth_file" ]]; then
      cp "$codex_home/$auth_file" "$work_dir/.codex/$auth_file" || return
    fi
  done

  mkdir -p "$work_dir/.telemetry" || return
  echo "$work_dir"
}
|
|
58
|
+
|
|
59
|
+
# Stage the working directory and record the telemetry file path in the marker file.
WORK_DIR="$(prepare_workdir)"
TELEMETRY_FILE="$WORK_DIR/.telemetry/full.jsonl"
echo "$TELEMETRY_FILE" > "$TELEMETRY_FILE_MARKER"

# Snapshot the telemetry line count before the run (0 when the file does not exist).
if [[ -f "$TELEMETRY_FILE" ]]; then
  wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"
else
  echo "0" > "$SNAPSHOT_FILE"
fi

# Agents with a known profile run through `codex -p`; any other agent gets its
# developer_instructions value lifted from the agent's TOML file instead.
PROFILE="$(profile_for_agent "$AGENT")"
if [[ -n "$PROFILE" ]]; then
  CODEX_CMD=(env CODEX_HOME="$WORK_DIR/.codex" codex -p "$PROFILE" exec --skip-git-repo-check -C "$WORK_DIR" --sandbox read-only --json)
else
  # The TOML path is handed to node as an argument rather than spliced into the
  # script text, so quotes or backslashes in WORK_DIR/AGENT cannot alter the code.
  AGENT_TOML="$WORK_DIR/.codex/agents/$AGENT.toml"
  DEV_INSTRUCTIONS="$(node -e '
const fs = require("fs");
const p = process.argv[1];
if (!fs.existsSync(p)) { process.stdout.write("\"\""); process.exit(0); }
const m = fs.readFileSync(p, "utf8").match(/^developer_instructions\s*=\s*(.+)$/m);
process.stdout.write(m ? m[1] : "\"\"");
' "$AGENT_TOML")" || DEV_INSTRUCTIONS='""'
  CODEX_CMD=(env CODEX_HOME="$WORK_DIR/.codex" codex -c "developer_instructions=$DEV_INSTRUCTIONS" exec --skip-git-repo-check -C "$WORK_DIR" --sandbox read-only --json)
fi

# Run with errexit off so the exit status can be captured and reported last.
set +e
RAW=$(run_codex 2>&1)
STATUS=$?
set -e

# Pause before emitting output (FLUSH_SLEEP is the telemetry flush delay setting).
sleep "$FLUSH_SLEEP"

# printf instead of echo: the captured output is arbitrary text and must not be
# interpreted as echo options. Then keep agent text only and strip ANSI/bell bytes.
printf '%s\n' "$RAW" | strip_json_events | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g'
exit "$STATUS"
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# eval-judge.sh — Runtime-neutral promptfoo rubric judge provider.
|
|
3
|
+
set -euo pipefail

# Directory containing this dispatcher and the per-runtime judge scripts.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Runtime precedence: FLOW_AGENTS_EVAL_JUDGE_RUNTIME, then
# FLOW_AGENTS_EVAL_RUNTIME, then EVAL_RUNTIME, then "kiro".
RUNTIME="${EVAL_RUNTIME:-kiro}"
RUNTIME="${FLOW_AGENTS_EVAL_RUNTIME:-$RUNTIME}"
RUNTIME="${FLOW_AGENTS_EVAL_JUDGE_RUNTIME:-$RUNTIME}"

# Translate the runtime name into the judge script that implements it.
judge_script=""
case "$RUNTIME" in
  kiro | kiro-cli) judge_script="kiro-judge.sh" ;;
  codex) judge_script="codex-judge.sh" ;;
  claude | claude-code) judge_script="claude-judge.sh" ;;
esac

if [[ -z "$judge_script" ]]; then
  echo "Unsupported FLOW_AGENTS_EVAL_JUDGE_RUNTIME='$RUNTIME' (expected kiro, codex, or claude)" >&2
  exit 2
fi

# Replace this process with the selected judge, forwarding every argument.
exec bash "$SCRIPT_DIR/$judge_script" "$@"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# eval-provider.sh — Runtime-neutral promptfoo subject provider.
|
|
3
|
+
set -euo pipefail

# Directory containing this dispatcher and the per-runtime provider scripts.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Runtime precedence: FLOW_AGENTS_EVAL_RUNTIME, then EVAL_RUNTIME, then "kiro".
RUNTIME="${EVAL_RUNTIME:-kiro}"
RUNTIME="${FLOW_AGENTS_EVAL_RUNTIME:-$RUNTIME}"

# Agent precedence: FLOW_AGENTS_EVAL_AGENT, then KIRO_EVAL_AGENT, then "dev".
# The resolved name is exported under both variable names for the provider scripts.
AGENT="${KIRO_EVAL_AGENT:-dev}"
AGENT="${FLOW_AGENTS_EVAL_AGENT:-$AGENT}"
export FLOW_AGENTS_EVAL_AGENT="$AGENT"
export KIRO_EVAL_AGENT="$AGENT"

# Translate the runtime name into the provider script that implements it.
provider_script=""
case "$RUNTIME" in
  kiro | kiro-cli) provider_script="kiro-provider.sh" ;;
  codex) provider_script="codex-provider.sh" ;;
  claude | claude-code) provider_script="claude-provider.sh" ;;
esac

if [[ -z "$provider_script" ]]; then
  echo "Unsupported FLOW_AGENTS_EVAL_RUNTIME='$RUNTIME' (expected kiro, codex, or claude)" >&2
  exit 2
fi

# Replace this process with the selected provider, forwarding every argument.
exec bash "$SCRIPT_DIR/$provider_script" "$@"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# eval-report.sh — Generate markdown eval report from promptfoo JSON output
|
|
3
|
+
# Usage: bash lib/eval-report.sh <results-json> [previous-json]
|
|
4
|
+
# Output: markdown report to stdout
|
|
5
|
+
set -uo pipefail

RESULTS="${1:?Usage: bash lib/eval-report.sh <results.json> [previous.json]}"
PREVIOUS="${2:-}"

if [[ ! -f "$RESULTS" ]]; then
  echo "Error: Results file not found: $RESULTS" >&2
  exit 1
fi

# Agent label: the results file name with everything from the first "-<digit>" removed.
AGENT=$(basename "$RESULTS" | sed 's/-[0-9].*$//')
DATE=$(date +%Y-%m-%d)

# Extract stats via jq. Stop here when the file cannot be parsed: every later
# section would otherwise be computed from empty values.
TOTAL=$(jq '.results.results | length' "$RESULTS") || {
  echo "Error: Could not read eval results from: $RESULTS" >&2
  exit 1
}
PASSED=$(jq '[.results.results[] | select(.success == true)] | length' "$RESULTS") || {
  echo "Error: Could not read eval results from: $RESULTS" >&2
  exit 1
}
FAILED=$((TOTAL - PASSED))

# Integer percentage; guarded because an empty result set would divide by zero.
if ((TOTAL > 0)); then
  PASS_RATE=$((PASSED * 100 / TOTAL))
else
  PASS_RATE="N/A"
fi

# Check for repeat data
REPEAT=$(jq -r '.results.stats.repeatCount // 0' "$RESULTS" 2>/dev/null || echo "0")

cat <<EOF
# Eval Report: ${AGENT} — ${DATE}

## Summary
- Cases: ${TOTAL} total
- Passed: ${PASSED}/${TOTAL} (${PASS_RATE}%)
- Failed: ${FAILED}
EOF

if [[ "$REPEAT" -gt 1 ]]; then
  echo "- Repeat count: ${REPEAT} (pass@k computed per case)"
fi

echo ""
echo "## Results"
echo "| # | Prompt (truncated) | Pass | Assertions |"
echo "|---|-------------------|------|------------|"

# jq renders each table row itself. The truncated prompt has line breaks
# flattened and "|" escaped so a prompt can never split or add table cells.
jq -r '
  .results.results
  | to_entries[]
  | [
      (.key + 1 | tostring),
      (.value.vars.prompt // "N/A"
        | .[0:50]
        | split("\r") | join("")
        | split("\n") | join(" ")
        | split("|") | join("\\|")),
      (if .value.success == true then "✓" else "✗" end),
      "\(.value.gradingResult.componentResults // [] | length) checked"
    ]
  | "| " + join(" | ") + " |"
' "$RESULTS" 2>/dev/null

# Failures section
if [[ "$FAILED" -gt 0 ]]; then
  echo ""
  echo "## Failures"
  # One select + type lookup per component. Repeating the pair would apply
  # `.pass` to the type string, which makes jq error out and print nothing.
  jq -r '.results.results | to_entries[] | select(.value.success == false) | "### Case \(.key + 1): \(.value.vars.prompt // "N/A" | .[0:60])\n- Failing assertions: \([.value.gradingResult.componentResults[]? | select(.pass == false) | .assertion.type // "unknown"] | join(", "))\n"' "$RESULTS" 2>/dev/null
fi

# Trend comparison (only when a readable previous results file was supplied)
if [[ -n "$PREVIOUS" && -f "$PREVIOUS" ]]; then
  if PREV_PASSED=$(jq '[.results.results[] | select(.success == true)] | length' "$PREVIOUS") \
    && PREV_TOTAL=$(jq '.results.results | length' "$PREVIOUS"); then
    echo ""
    echo "## Trend"
    echo "- Previous: ${PREV_PASSED}/${PREV_TOTAL}"
    echo "- Current: ${PASSED}/${TOTAL}"
    if [[ "$PASSED" -gt "$PREV_PASSED" ]]; then
      echo "- Direction: ↑ improved"
    elif [[ "$PASSED" -lt "$PREV_PASSED" ]]; then
      echo "- Direction: ↓ regressed"
    else
      echo "- Direction: → stable"
    fi
  else
    echo "Warning: Could not read previous results from: $PREVIOUS" >&2
  fi
fi
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# kiro-judge.sh — Promptfoo exec provider for llm-rubric judging via kiro-cli
|
|
3
|
+
set -o pipefail
PROMPT="$1"

# Truncate if too large for shell args (macOS limit ~262144 bytes)
MAX_LEN=200000
if ((${#PROMPT} > MAX_LEN)); then
  PROMPT="${PROMPT:0:$MAX_LEN}... [truncated for eval — output exceeded ${MAX_LEN} chars]"
fi

# Ask kiro-cli for a verdict with no trusted tools; its stderr is discarded.
RAW=$(kiro-cli chat --no-interactive --trust-tools "" "$PROMPT" 2>/dev/null)

# Strip ANSI/bell bytes, drop blank lines and kiro status lines, then remove
# the leading "> " marker and any leading whitespace from what remains.
echo "$RAW" \
  | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g' \
  | grep -v \
    -e '^\s*$' \
    -e 'hooks finished' \
    -e 'Credits:' \
    -e 'WARNING:' \
    -e 'All tools are now trusted' \
    -e 'Checkpoints are not' \
    -e 'Learn more at' \
  | sed -e 's/^> //' -e 's/^[[:space:]]*//'
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# kiro-provider.sh — Promptfoo exec provider that runs kiro-cli agents
|
|
3
|
+
# Usage: bash kiro-provider.sh <prompt> <options_json> <context_json>
|
|
4
|
+
# Agent is determined from the prompt's {{agent}} variable passed via options JSON
|
|
5
|
+
set -o pipefail

# Positional inputs: the prompt text, then the provider-options JSON document.
PROMPT="$1"
OPTIONS="$2"
# Snapshot location; honours the same override variable as the Codex provider
# and falls back to the previous fixed path when it is not set.
SNAPSHOT_FILE="${FLOW_AGENTS_EVAL_TELEMETRY_SNAPSHOT:-/tmp/promptfoo-eval-telemetry-snapshot.txt}"
TIMEOUT="${KIRO_EVAL_TIMEOUT:-300}"

# Extract agent from options JSON or env var.
# Start from an empty value so an unrelated AGENT variable inherited from the
# environment cannot override KIRO_EVAL_AGENT when no options are supplied.
AGENT=""
if [[ -n "$OPTIONS" ]]; then
  AGENT=$(node -e "let d='';process.stdin.on('data',c=>d+=c).on('end',()=>{try{const j=JSON.parse(d);process.stdout.write(j.config?.agent||'')}catch{}})" <<<"$OPTIONS" 2>/dev/null)
fi
AGENT="${AGENT:-${KIRO_EVAL_AGENT:-dev}}"
|
|
17
|
+
|
|
18
|
+
# Auto-detect telemetry file from installed agent location
|
|
19
|
+
#######################################
# Locate the telemetry log of the package an installed kiro agent points at.
# Arguments: $1 - agent name; files matching ~/.kiro/agents/*-<agent>.json are scanned
# Outputs:   <package>/.telemetry/full.jsonl for the first agent file that
#            references a path under ~/.flow-agents whose telemetry log exists;
#            otherwise "$HOME/.flow-agents"
#######################################
_find_telemetry() {
  local agent="$1"
  local f pkg_path
  for f in "$HOME/.kiro/agents/"*"-${agent}.json"; do
    [[ -f "$f" ]] || continue
    # Take the first path under ~/.flow-agents, reading up to (not including)
    # the closing double quote, then cut it back to the package directory by
    # dropping everything from "/context/" onwards.
    pkg_path=$(grep -o "$HOME/.flow-agents[^\"]*" "$f" 2>/dev/null | head -n 1 | sed 's|/context/.*||')
    if [[ -n "$pkg_path" && -f "$pkg_path/.telemetry/full.jsonl" ]]; then
      echo "$pkg_path/.telemetry/full.jsonl"
      return
    fi
  done
  # NOTE(review): this fallback is a directory, not a file, so callers that test
  # it with -f treat telemetry as absent - confirm that is the intent.
  echo "$HOME/.flow-agents"
}
|
|
32
|
+
TELEMETRY_FILE="$(_find_telemetry "$AGENT")"

# Tools the agent may use without confirmation during an eval run.
SAFE_TOOLS="read files,code,grep,glob,knowledge,web_search,web_fetch,delegate to a specialist agent,todo tool,thinking,session,report_issue"

# Snapshot telemetry line count before run
if [[ -f "$TELEMETRY_FILE" ]]; then
  wc -l < "$TELEMETRY_FILE" | tr -d ' ' > "$SNAPSHOT_FILE"
else
  echo "0" > "$SNAPSHOT_FILE"
fi

# Build the agent command, wrapping it in `timeout` (or `gtimeout` when only
# that exists). Without either tool the agent runs unbounded instead of
# failing with "command not found" and producing empty output.
KIRO_CMD=(kiro-cli chat --agent "$AGENT" --no-interactive --trust-tools "$SAFE_TOOLS" "$PROMPT")
if command -v timeout >/dev/null 2>&1; then
  KIRO_CMD=(timeout "$TIMEOUT" "${KIRO_CMD[@]}")
elif command -v gtimeout >/dev/null 2>&1; then
  KIRO_CMD=(gtimeout "$TIMEOUT" "${KIRO_CMD[@]}")
fi

# Run agent, capture output
RAW=$("${KIRO_CMD[@]}" 2>/dev/null)

# Strip ANSI escape codes and bell chars
CLEAN=$(echo "$RAW" | sed $'s/\x1b\[[0-9;]*[a-zA-Z]//g; s/\x1b\[[0-9;]*m//g; s/\x07//g')

# Remove kiro chrome lines but keep the actual response content
echo "$CLEAN" | grep -v '^\s*$' \
  | grep -v 'hooks finished' \
  | grep -v 'Credits:' \
  | grep -v 'WARNING:' \
  | grep -v 'All tools are now trusted' \
  | grep -v 'Checkpoints are not' \
  | grep -v 'Learn more at' \
  | sed 's/^> //' \
  | sed 's/^[[:space:]]*//'
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Shared command adapter for evals. Historical script entry paths are routed to TypeScript tools.
|
|
3
|
+
|
|
4
|
+
# Project root used by the adapters below: ROOT, else ROOT_DIR, else empty.
FLOW_AGENTS_EVAL_ROOT="${ROOT:-${ROOT_DIR:-}}"

#######################################
# Run the project's `npm run build` quietly inside the project root.
# Globals: FLOW_AGENTS_EVAL_ROOT (read)
# Returns: 1 when the root is empty, otherwise the status of cd / npm
#######################################
flow_agents_build_ts() {
  # `cd ""` succeeds without changing directory, so an empty root would
  # silently build whatever project the caller happens to be standing in.
  if [[ -z "${FLOW_AGENTS_EVAL_ROOT:-}" ]]; then
    echo "flow_agents_build_ts: FLOW_AGENTS_EVAL_ROOT is empty; set ROOT or ROOT_DIR first" >&2
    return 1
  fi
  (cd "$FLOW_AGENTS_EVAL_ROOT" && npm run build --silent >/dev/null)
}
|
|
9
|
+
|
|
10
|
+
#######################################
# Route a historical script path or tool name to its compiled TypeScript entry.
# Arguments: $1 - script path (relative or with any directory prefix) or tool
#                 name; remaining arguments are forwarded unchanged
# Globals:   FLOW_AGENTS_EVAL_ROOT (read)
# Returns:   64 when $1 has no adapter; the build status when the build fails;
#            otherwise the status of the node process
#######################################
flow_agents_node() {
  # entry: file under build/src to execute.
  # subcommand: first argument passed to cli.js, empty for standalone entries.
  local entry="" subcommand=""

  case "$1" in
    */scripts/build-universal-bundles.js | scripts/build-universal-bundles.js)
      entry="cli.js" subcommand="build-bundles" ;;
    */scripts/generate-context-map.js | scripts/generate-context-map.js)
      entry="cli.js" subcommand="context-map" ;;
    */scripts/filter-installed-packs.js | scripts/filter-installed-packs.js)
      entry="cli.js" subcommand="filter-installed-packs" ;;
    workflow-sidecar)
      entry="cli/workflow-sidecar.js" ;;
    validate-workflow-artifacts)
      entry="cli/validate-workflow-artifacts.js" ;;
    */scripts/validate-source-tree.js | scripts/validate-source-tree.js)
      entry="cli/validate-source-tree.js" ;;
    */scripts/flow-kit.js | scripts/flow-kit.js)
      entry="cli/flow-kit.js" ;;
    */scripts/effective-backlog-settings.js | scripts/effective-backlog-settings.js)
      entry="cli/effective-backlog-settings.js" ;;
    */scripts/pull-work-provider.js | scripts/pull-work-provider.js)
      entry="cli/pull-work-provider.js" ;;
    workflow-artifact-cleanup-audit)
      entry="cli.js" subcommand="workflow-artifact-cleanup-audit" ;;
    fixture-retirement-audit)
      entry="cli.js" subcommand="fixture-retirement-audit" ;;
    */scripts/publish-change-helper.js | scripts/publish-change-helper.js)
      entry="cli/publish-change-helper.js" ;;
    */scripts/promote-workflow-artifact.js | scripts/promote-workflow-artifact.js)
      entry="cli/promote-workflow-artifact.js" ;;
    */scripts/usage-feedback.js | scripts/usage-feedback.js)
      entry="cli/usage-feedback.js" ;;
    veritas-governance)
      entry="cli.js" subcommand="veritas-governance" ;;
    */scripts/validate-hook-influence-cases.js | scripts/validate-hook-influence-cases.js)
      entry="cli.js" subcommand="validate-hook-influence" ;;
    *)
      echo "flow_agents_node: no TypeScript adapter registered for $1" >&2
      return 64
      ;;
  esac

  # Drop the selector, make sure the compiled output is current, then run the
  # entry with the optional subcommand placed ahead of the caller's arguments.
  shift
  flow_agents_build_ts || return
  if [[ -n "$subcommand" ]]; then
    set -- "$subcommand" "$@"
  fi
  node "$FLOW_AGENTS_EVAL_ROOT/build/src/$entry" "$@"
}
|