@kontourai/flow-agents 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-push +11 -0
- package/.github/workflows/ci.yml +210 -0
- package/.github/workflows/docs-pages.yml +52 -0
- package/.github/workflows/publish-npm.yml +104 -0
- package/AGENTS.md +26 -0
- package/CHANGELOG.md +66 -0
- package/CODE_OF_CONDUCT.md +25 -0
- package/CONTEXT.md +300 -0
- package/CONTRIBUTING.md +44 -0
- package/LICENSE +201 -0
- package/README.md +129 -0
- package/SECURITY.md +33 -0
- package/agent-cards/dev.json +19 -0
- package/agents/dev.json +127 -0
- package/agents/tool-code-reviewer.json +61 -0
- package/agents/tool-dependencies-updater.json +118 -0
- package/agents/tool-explore-config.json +92 -0
- package/agents/tool-explore-deps.json +92 -0
- package/agents/tool-explore-entry.json +92 -0
- package/agents/tool-explore-patterns.json +92 -0
- package/agents/tool-explore-structure.json +92 -0
- package/agents/tool-explore-tests.json +92 -0
- package/agents/tool-planner.json +57 -0
- package/agents/tool-playwright.json +145 -0
- package/agents/tool-security-reviewer.json +56 -0
- package/agents/tool-verifier.json +61 -0
- package/agents/tool-worker.json +58 -0
- package/build/src/cli/console-learning-projection.js +123 -0
- package/build/src/cli/docs-preview.js +39 -0
- package/build/src/cli/effective-backlog-settings.js +102 -0
- package/build/src/cli/export-bookmarks.js +38 -0
- package/build/src/cli/fixture-retirement-audit.js +140 -0
- package/build/src/cli/flow-kit.js +138 -0
- package/build/src/cli/import-bookmarks.js +50 -0
- package/build/src/cli/init.js +239 -0
- package/build/src/cli/instinct-cli.js +93 -0
- package/build/src/cli/promote-workflow-artifact.js +63 -0
- package/build/src/cli/publish-change-helper.js +154 -0
- package/build/src/cli/pull-work-provider.js +469 -0
- package/build/src/cli/runtime-adapter.js +23 -0
- package/build/src/cli/telemetry-doctor.js +221 -0
- package/build/src/cli/usage-feedback.js +443 -0
- package/build/src/cli/validate-hook-influence.js +152 -0
- package/build/src/cli/validate-source-tree.js +31 -0
- package/build/src/cli/validate-workflow-artifacts.js +486 -0
- package/build/src/cli/veritas-governance.js +262 -0
- package/build/src/cli/workflow-artifact-cleanup-audit.js +272 -0
- package/build/src/cli/workflow-sidecar.js +816 -0
- package/build/src/cli.js +89 -0
- package/build/src/flow-kit/validate.js +75 -0
- package/build/src/lib/args.js +45 -0
- package/build/src/lib/fs.js +62 -0
- package/build/src/lib/workflow-learning-projection.js +334 -0
- package/build/src/runtime-adapters.js +146 -0
- package/build/src/tools/build-universal-bundles.js +397 -0
- package/build/src/tools/common.js +56 -0
- package/build/src/tools/filter-installed-packs.js +132 -0
- package/build/src/tools/generate-context-map.js +198 -0
- package/build/src/tools/validate-package.js +64 -0
- package/build/src/tools/validate-source-tree.js +622 -0
- package/console.telemetry.json +176 -0
- package/context/base-rules.md +17 -0
- package/context/code-review-standards.md +62 -0
- package/context/coding-standards.md +42 -0
- package/context/common/orchestrators.md +12 -0
- package/context/common/subagents.md +28 -0
- package/context/contracts/artifact-contract.md +182 -0
- package/context/contracts/builder-kit-workflow-state-contract.md +319 -0
- package/context/contracts/delivery-contract.md +69 -0
- package/context/contracts/execution-contract.md +53 -0
- package/context/contracts/governance-adapter-contract.md +67 -0
- package/context/contracts/planning-contract.md +85 -0
- package/context/contracts/review-contract.md +104 -0
- package/context/contracts/sandbox-policy.md +52 -0
- package/context/contracts/verification-contract.md +134 -0
- package/context/contracts/work-item-contract.md +215 -0
- package/context/deferred/demo-mode.md +33 -0
- package/context/deferred/languages/go.md +31 -0
- package/context/deferred/languages/python.md +31 -0
- package/context/deferred/languages/typescript.md +34 -0
- package/context/deferred/parallelization.md +35 -0
- package/context/deferred/worktree-isolation.md +24 -0
- package/context/development-workflow.md +50 -0
- package/context/scripts/context-budget/budget-scan.sh +166 -0
- package/context/scripts/detect-tools.sh +3 -0
- package/context/scripts/discover-agents.sh +28 -0
- package/context/scripts/git-status.sh +49 -0
- package/context/scripts/hooks/config-protection.js +79 -0
- package/context/scripts/hooks/desktop-notify.sh +39 -0
- package/context/scripts/hooks/governance-audit.sh +135 -0
- package/context/scripts/hooks/lib/audit-transport.sh +40 -0
- package/context/scripts/hooks/lib/hook-flags.js +49 -0
- package/context/scripts/hooks/lib/patterns.sh +57 -0
- package/context/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/context/scripts/hooks/post-edit-accumulator.js +66 -0
- package/context/scripts/hooks/pre-commit-quality.js +194 -0
- package/context/scripts/hooks/quality-gate.js +93 -0
- package/context/scripts/hooks/report-only-guard.js +21 -0
- package/context/scripts/hooks/run-hook.js +136 -0
- package/context/scripts/hooks/stop-format-typecheck.js +141 -0
- package/context/scripts/hooks/stop-goal-fit.js +337 -0
- package/context/scripts/hooks/workflow-steering.js +250 -0
- package/context/scripts/telemetry/console-presets.sh +14 -0
- package/context/scripts/telemetry/install-console-config.sh +214 -0
- package/context/scripts/telemetry/lib/config.sh +85 -0
- package/context/scripts/telemetry/lib/enrich.sh +115 -0
- package/context/scripts/telemetry/lib/redact.sh +22 -0
- package/context/scripts/telemetry/lib/session.sh +63 -0
- package/context/scripts/telemetry/lib/transport.sh +183 -0
- package/context/scripts/telemetry/lib/usage.sh +29 -0
- package/context/scripts/telemetry/sync-agents.sh +173 -0
- package/context/scripts/telemetry/telemetry.conf +23 -0
- package/context/scripts/telemetry/telemetry.sh +387 -0
- package/context/scripts/validate-package.sh +89 -0
- package/context/settings/backlog-provider-settings.json +54 -0
- package/context/templates/core/identity.md +26 -0
- package/context/templates/core/user.md +15 -0
- package/docs/_config.yml +15 -0
- package/docs/_layouts/default.html +87 -0
- package/docs/adr/0001-flow-agents-consumes-flow.md +77 -0
- package/docs/adr/0002-flow-kits-as-extension-unit.md +13 -0
- package/docs/adr/0003-flow-agents-coordinates-kits-and-adapters.md +13 -0
- package/docs/adr/0004-gates-expect-surface-claims.md +15 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +48 -0
- package/docs/adr/0006-typescript-first-source-policy.md +98 -0
- package/docs/agent-system-guidebook.md +391 -0
- package/docs/agent-usage-feedback-loop.md +351 -0
- package/docs/assets/favicon.svg +13 -0
- package/docs/assets/og-image.png +0 -0
- package/docs/assets/site.css +774 -0
- package/docs/assets/site.js +139 -0
- package/docs/configurable-workflow-routing.md +174 -0
- package/docs/context-map.md +145 -0
- package/docs/developer-architecture.md +145 -0
- package/docs/developer-hook-setup.md +61 -0
- package/docs/fixture-ownership.md +44 -0
- package/docs/flow-kit-repository-contract.md +180 -0
- package/docs/index.md +129 -0
- package/docs/kontour-resource-contract.md +358 -0
- package/docs/migrations.md +64 -0
- package/docs/north-star.md +322 -0
- package/docs/operating-layers.md +110 -0
- package/docs/repository-structure.md +132 -0
- package/docs/sandbox-policy.md +56 -0
- package/docs/skills-map.md +203 -0
- package/docs/standards-register.md +96 -0
- package/docs/veritas-integration.md +165 -0
- package/docs/work-item-adapters.md +72 -0
- package/docs/workflow-artifact-lifecycle.md +141 -0
- package/docs/workflow-eval-strategy.md +295 -0
- package/docs/workflow-shared-contracts.md +51 -0
- package/docs/workflow-usage-guide.md +443 -0
- package/evals/ARCHITECTURE.md +143 -0
- package/evals/CONVENTIONS.md +58 -0
- package/evals/README.md +128 -0
- package/evals/acceptance/run.sh +29 -0
- package/evals/acceptance/test_claude_harness.sh +242 -0
- package/evals/acceptance/test_codex_harness.sh +108 -0
- package/evals/acceptance/test_kiro_harness.sh +128 -0
- package/evals/cases/dev/404.html +97 -0
- package/evals/cases/dev/code-review.yaml +44 -0
- package/evals/cases/dev/dashboard.html +300 -0
- package/evals/cases/dev/deliver.yaml +66 -0
- package/evals/cases/dev/dependency-update.yaml +16 -0
- package/evals/cases/dev/explore.yaml +20 -0
- package/evals/cases/dev/index.html +370 -0
- package/evals/cases/dev/package-lock.json +28 -0
- package/evals/cases/dev/package.json +16 -0
- package/evals/cases/dev/plan-work.yaml +20 -0
- package/evals/cases/dev/promptfooconfig.yaml +666 -0
- package/evals/cases/dev/search-first.yaml +20 -0
- package/evals/cases/dev/tdd-workflow.yaml +48 -0
- package/evals/cases/dev/verify-work.yaml +44 -0
- package/evals/cases/dev/workflow.yaml +34 -0
- package/evals/ci/run-baseline.sh +283 -0
- package/evals/fixtures/backlog-provider-settings/global-default.json +44 -0
- package/evals/fixtures/backlog-provider-settings/project-override.json +53 -0
- package/evals/fixtures/builder-kit-workflow-state/baseline-freshness-resolution-hint.json +139 -0
- package/evals/fixtures/builder-kit-workflow-state/direct-primitive-stop.json +59 -0
- package/evals/fixtures/builder-kit-workflow-state/empty-board-route-shape.json +55 -0
- package/evals/fixtures/builder-kit-workflow-state/happy-path.json +71 -0
- package/evals/fixtures/builder-kit-workflow-state/mid-work-resume.json +80 -0
- package/evals/fixtures/builder-kit-workflow-state/missing-prestep-recovery.json +65 -0
- package/evals/fixtures/builder-kit-workflow-state/product-build-chaining.json +60 -0
- package/evals/fixtures/builder-kit-workflow-state/stale-continuation-requires-new-probe.json +57 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-correction/learning.json +50 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-open-route/learning.json +41 -0
- package/evals/fixtures/flow-kit-repository/invalid-absolute-path/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/kit.json +11 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/kit.json +9 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-malformed-json/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-flow/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-traversal/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/adapters/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/assets/example.txt +1 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-evals/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-skills/mixed/SKILL.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit.json +44 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/kit.json +20 -0
- package/evals/fixtures/hook-influence/cases.json +336 -0
- package/evals/fixtures/pull-work-provider/github-issues.json +170 -0
- package/evals/fixtures/pull-work-wip-shepherding/global-wip-informs.json +43 -0
- package/evals/fixtures/pull-work-wip-shepherding/personal-wip-blocks.json +42 -0
- package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +31 -0
- package/evals/fixtures/surface-trust/artifact-absent.json +19 -0
- package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +32 -0
- package/evals/fixtures/surface-trust/missing-authority-trust-report.json +27 -0
- package/evals/fixtures/surface-trust/provider-absent.json +19 -0
- package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +30 -0
- package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +31 -0
- package/evals/fixtures/usage-feedback/sample-full.jsonl +11 -0
- package/evals/fixtures/usage-feedback/sample-outcomes.jsonl +1 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-pass.sh +18 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-secret-fail.sh +10 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-unconfigured.sh +4 -0
- package/evals/integration/test_bundle_install.sh +541 -0
- package/evals/integration/test_console_learning_projection.sh +192 -0
- package/evals/integration/test_context_map.sh +65 -0
- package/evals/integration/test_effective_backlog_settings.sh +58 -0
- package/evals/integration/test_fixture_retirement_audit.sh +58 -0
- package/evals/integration/test_flow_agents_statusline.sh +93 -0
- package/evals/integration/test_flow_kit_repository.sh +90 -0
- package/evals/integration/test_goal_fit_hook.sh +482 -0
- package/evals/integration/test_hook_category_behaviors.sh +190 -0
- package/evals/integration/test_hook_influence_cases.sh +69 -0
- package/evals/integration/test_local_flow_kit_install.sh +145 -0
- package/evals/integration/test_publish_change_helper.sh +176 -0
- package/evals/integration/test_pull_work_provider.sh +140 -0
- package/evals/integration/test_runtime_adapter_activation.sh +106 -0
- package/evals/integration/test_telemetry.sh +485 -0
- package/evals/integration/test_telemetry_doctor.sh +193 -0
- package/evals/integration/test_usage_feedback_dashboard.sh +169 -0
- package/evals/integration/test_usage_feedback_global.sh +117 -0
- package/evals/integration/test_usage_feedback_import.sh +227 -0
- package/evals/integration/test_usage_feedback_outcomes.sh +165 -0
- package/evals/integration/test_usage_feedback_report.sh +263 -0
- package/evals/integration/test_veritas_governance_adapter.sh +235 -0
- package/evals/integration/test_workflow_artifact_cleanup_audit.sh +287 -0
- package/evals/integration/test_workflow_artifacts.sh +1247 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +2112 -0
- package/evals/integration/test_workflow_steering_hook.sh +337 -0
- package/evals/lib/assertions/delegated-to.js +40 -0
- package/evals/lib/assertions/max-tool-calls.js +15 -0
- package/evals/lib/assertions/no-write-tools.js +27 -0
- package/evals/lib/assertions/pass-at-k.js +39 -0
- package/evals/lib/assertions/telemetry-utils.js +105 -0
- package/evals/lib/assertions/tool-called.js +39 -0
- package/evals/lib/assertions/verify-after-fix.js +61 -0
- package/evals/lib/claude-judge.sh +40 -0
- package/evals/lib/claude-provider.sh +74 -0
- package/evals/lib/codex-judge.sh +39 -0
- package/evals/lib/codex-provider.sh +81 -0
- package/evals/lib/eval-dev.sh +5 -0
- package/evals/lib/eval-judge.sh +22 -0
- package/evals/lib/eval-provider.sh +26 -0
- package/evals/lib/eval-report.sh +73 -0
- package/evals/lib/kiro-dev.sh +4 -0
- package/evals/lib/kiro-judge.sh +17 -0
- package/evals/lib/kiro-provider.sh +62 -0
- package/evals/lib/node.sh +111 -0
- package/evals/promptfooconfig.yaml +70 -0
- package/evals/run.sh +309 -0
- package/evals/static/test_evidence_refs.sh +141 -0
- package/evals/static/test_package.sh +407 -0
- package/evals/static/test_repo_hooks.sh +68 -0
- package/evals/static/test_universal_bundles.sh +274 -0
- package/evals/static/test_workflow_skills.sh +1207 -0
- package/install.sh +64 -0
- package/integrations/veritas/flow-agents.adapter.json +138 -0
- package/integrations/veritas/flow-agents.authority-settings.json +26 -0
- package/integrations/veritas/flow-agents.repo-standards.json +82 -0
- package/kits/builder/flows/build.flow.json +218 -0
- package/kits/builder/flows/shape.flow.json +127 -0
- package/kits/builder/kit.json +19 -0
- package/kits/catalog.json +11 -0
- package/package.json +130 -0
- package/packaging/README.md +60 -0
- package/packaging/manifest.json +173 -0
- package/packaging/packs.json +69 -0
- package/powers/dependency-checker/POWER.md +20 -0
- package/powers/dependency-checker/mcp.json +20 -0
- package/powers/playwright/POWER.md +25 -0
- package/powers/playwright/mcp.json +12 -0
- package/prompts/code-audit.md +123 -0
- package/prompts/kcommit.md +88 -0
- package/schemas/backlog-provider-settings.schema.json +138 -0
- package/schemas/workflow-acceptance.schema.json +216 -0
- package/schemas/workflow-critique.schema.json +113 -0
- package/schemas/workflow-evidence.schema.json +357 -0
- package/schemas/workflow-handoff.schema.json +52 -0
- package/schemas/workflow-learning.schema.json +223 -0
- package/schemas/workflow-release.schema.json +172 -0
- package/schemas/workflow-state.schema.json +80 -0
- package/scripts/README.md +111 -0
- package/scripts/build-universal-bundles.js +3 -0
- package/scripts/check-content-boundary.cjs +99 -0
- package/scripts/context-budget/budget-scan.sh +166 -0
- package/scripts/detect-tools.sh +3 -0
- package/scripts/discover-agents.sh +28 -0
- package/scripts/effective-backlog-settings.js +2 -0
- package/scripts/filter-installed-packs.js +2 -0
- package/scripts/flow-kit.js +2 -0
- package/scripts/generate-context-map.js +2 -0
- package/scripts/git-status.sh +49 -0
- package/scripts/hooks/claude-hook-adapter.js +174 -0
- package/scripts/hooks/claude-telemetry-hook.js +115 -0
- package/scripts/hooks/codex-hook-adapter.js +176 -0
- package/scripts/hooks/codex-telemetry-hook.js +95 -0
- package/scripts/hooks/config-protection.js +79 -0
- package/scripts/hooks/desktop-notify.sh +39 -0
- package/scripts/hooks/governance-audit.sh +135 -0
- package/scripts/hooks/lib/audit-transport.sh +40 -0
- package/scripts/hooks/lib/hook-flags.js +49 -0
- package/scripts/hooks/lib/patterns.sh +57 -0
- package/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/scripts/hooks/post-edit-accumulator.js +66 -0
- package/scripts/hooks/pre-commit-quality.js +194 -0
- package/scripts/hooks/quality-gate.js +93 -0
- package/scripts/hooks/report-only-guard.js +21 -0
- package/scripts/hooks/run-hook.js +136 -0
- package/scripts/hooks/stop-format-typecheck.js +141 -0
- package/scripts/hooks/stop-goal-fit.js +337 -0
- package/scripts/hooks/workflow-steering.js +250 -0
- package/scripts/install-codex-home.sh +106 -0
- package/scripts/package.json +3 -0
- package/scripts/promote-workflow-artifact.js +2 -0
- package/scripts/publish-change-helper.js +2 -0
- package/scripts/pull-work-provider.js +2 -0
- package/scripts/setup-repo-hooks.sh +8 -0
- package/scripts/statusline/flow-agents-statusline.js +157 -0
- package/scripts/telemetry/console-presets.sh +14 -0
- package/scripts/telemetry/install-console-config.sh +214 -0
- package/scripts/telemetry/lib/config.sh +85 -0
- package/scripts/telemetry/lib/enrich.sh +115 -0
- package/scripts/telemetry/lib/redact.sh +22 -0
- package/scripts/telemetry/lib/session.sh +63 -0
- package/scripts/telemetry/lib/transport.sh +183 -0
- package/scripts/telemetry/lib/usage.sh +29 -0
- package/scripts/telemetry/sync-agents.sh +173 -0
- package/scripts/telemetry/telemetry.conf +23 -0
- package/scripts/telemetry/telemetry.sh +387 -0
- package/scripts/usage-feedback.js +2 -0
- package/scripts/validate-hook-influence-cases.js +2 -0
- package/scripts/validate-package.sh +89 -0
- package/scripts/validate-source-tree.js +9 -0
- package/skills/agentic-engineering/SKILL.md +62 -0
- package/skills/browser-test/SKILL.md +51 -0
- package/skills/builder-shape/SKILL.md +76 -0
- package/skills/context-budget/SKILL.md +40 -0
- package/skills/deliver/SKILL.md +241 -0
- package/skills/dependency-update/SKILL.md +68 -0
- package/skills/design-probe/SKILL.md +107 -0
- package/skills/eval-rebuild/SKILL.md +39 -0
- package/skills/evidence-gate/SKILL.md +186 -0
- package/skills/execute-plan/SKILL.md +110 -0
- package/skills/explore/SKILL.md +137 -0
- package/skills/feedback-loop/SKILL.md +87 -0
- package/skills/fix-bug/SKILL.md +133 -0
- package/skills/frontend-design/SKILL.md +80 -0
- package/skills/github-cli/SKILL.md +63 -0
- package/skills/idea-to-backlog/SKILL.md +267 -0
- package/skills/knowledge-capture/SKILL.md +55 -0
- package/skills/learning-review/SKILL.md +115 -0
- package/skills/pickup-probe/SKILL.md +114 -0
- package/skills/plan-work/SKILL.md +176 -0
- package/skills/pull-work/SKILL.md +309 -0
- package/skills/release-readiness/SKILL.md +121 -0
- package/skills/review-work/SKILL.md +161 -0
- package/skills/search-first/SKILL.md +66 -0
- package/skills/tdd-workflow/SKILL.md +140 -0
- package/skills/verify-work/SKILL.md +109 -0
- package/src/cli/console-learning-projection.ts +140 -0
- package/src/cli/effective-backlog-settings.ts +99 -0
- package/src/cli/fixture-retirement-audit.ts +154 -0
- package/src/cli/flow-kit.ts +139 -0
- package/src/cli/init.ts +248 -0
- package/src/cli/promote-workflow-artifact.ts +64 -0
- package/src/cli/publish-change-helper.ts +143 -0
- package/src/cli/pull-work-provider.ts +481 -0
- package/src/cli/runtime-adapter.ts +24 -0
- package/src/cli/telemetry-doctor.ts +243 -0
- package/src/cli/usage-feedback.ts +418 -0
- package/src/cli/validate-hook-influence.ts +119 -0
- package/src/cli/validate-source-tree.ts +30 -0
- package/src/cli/validate-workflow-artifacts.ts +411 -0
- package/src/cli/veritas-governance.ts +322 -0
- package/src/cli/workflow-artifact-cleanup-audit.ts +281 -0
- package/src/cli/workflow-sidecar.ts +676 -0
- package/src/cli.ts +95 -0
- package/src/flow-kit/validate.ts +74 -0
- package/src/lib/args.ts +43 -0
- package/src/lib/fs.ts +62 -0
- package/src/lib/workflow-learning-projection.ts +491 -0
- package/src/runtime-adapters.ts +154 -0
- package/src/tools/build-universal-bundles.ts +366 -0
- package/src/tools/common.ts +61 -0
- package/src/tools/filter-installed-packs.ts +129 -0
- package/src/tools/generate-context-map.ts +199 -0
- package/src/tools/validate-package.ts +57 -0
- package/src/tools/validate-source-tree.ts +488 -0
- package/tsconfig.json +19 -0
- package/veritas.claims.json +6 -0
package/evals/README.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Flow Agents Eval Suite
|
|
2
|
+
|
|
3
|
+
Evaluation coverage for the canonical Flow Agents source tree and generated universal bundles.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install
|
|
9
|
+
|
|
10
|
+
# Run the fast local gate: source validation, static package checks, integration checks
|
|
11
|
+
bash evals/run.sh
|
|
12
|
+
|
|
13
|
+
# Run only source/static checks
|
|
14
|
+
bash evals/run.sh static
|
|
15
|
+
|
|
16
|
+
# Run only integration checks
|
|
17
|
+
bash evals/run.sh integration
|
|
18
|
+
|
|
19
|
+
# Run harness-native acceptance checks
|
|
20
|
+
bash evals/run.sh acceptance
|
|
21
|
+
|
|
22
|
+
# Claude acceptance is cheap by default. Opt in to prompt-mode Claude usage only when needed.
|
|
23
|
+
FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1 bash evals/run.sh acceptance claude
|
|
24
|
+
|
|
25
|
+
# Run behavioral evals through the default Kiro runtime
|
|
26
|
+
bash evals/run.sh llm
|
|
27
|
+
|
|
28
|
+
# Run one behavioral suite through Codex as subject runtime and judge
|
|
29
|
+
bash evals/run.sh llm dev --runtime codex
|
|
30
|
+
|
|
31
|
+
# Run Claude Code as the subject runtime while Codex judges rubrics
|
|
32
|
+
bash evals/run.sh llm dev --runtime claude --judge-runtime codex
|
|
33
|
+
|
|
34
|
+
# Run cheaper behavioral subsets
|
|
35
|
+
bash evals/run.sh llm dev --suite smoke
|
|
36
|
+
bash evals/run.sh llm dev --suite regression
|
|
37
|
+
|
|
38
|
+
# View promptfoo results
|
|
39
|
+
npm run promptfoo:view
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Layers
|
|
43
|
+
|
|
44
|
+
### Layer 1: Static (`bash evals/run.sh static`)
|
|
45
|
+
|
|
46
|
+
Validates the source tree and generated bundle exports:
|
|
47
|
+
- canonical source validation via `npm run validate:source --`
|
|
48
|
+
- package shape, schemas, resources, hooks, routing, MCP server references, write-tool invariants, and agent cards
|
|
49
|
+
- universal bundle build/export checks for Kiro, Claude Code, and Codex
|
|
50
|
+
|
|
51
|
+
Runs in seconds and has no LLM cost.
|
|
52
|
+
|
|
53
|
+
### Layer 2: Integration (`bash evals/run.sh integration`)
|
|
54
|
+
|
|
55
|
+
Validates runtime-adjacent contracts:
|
|
56
|
+
- telemetry event schemas, type mapping, field presence, prompt capture, tool capture, redaction, and agent discovery
|
|
57
|
+
- workflow artifact quality and deterministic end-to-end delivery chain fixtures
|
|
58
|
+
- bundle install smoke tests for Kiro, Claude Code, and Codex temp installs
|
|
59
|
+
|
|
60
|
+
Runs in seconds and has no LLM cost.
|
|
61
|
+
|
|
62
|
+
### Layer 3: Behavioral (`bash evals/run.sh llm`)
|
|
63
|
+
|
|
64
|
+
Runs selected agents through an eval runtime and scores responses with deterministic telemetry assertions plus LLM rubrics. Kiro is the default subject runtime. Pass `--runtime codex` or `--runtime claude` to run Codex or Claude Code where supported.
|
|
65
|
+
|
|
66
|
+
Subject runtime and judge runtime are separate:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
bash evals/run.sh llm dev --runtime claude --judge-runtime codex
|
|
70
|
+
bash evals/run.sh llm dev --runtime claude --judge-runtime claude
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Use `--suite smoke`, `--suite regression`, or `--suite capability` to avoid running the full behavioral suite when a targeted gate is enough. `smoke` runs the first few cases, `regression` filters `metadata.type=regression`, and `capability` filters `metadata.type=capability`.
|
|
74
|
+
|
|
75
|
+
Current behavioral suites:
|
|
76
|
+
- `dev`
|
|
77
|
+
|
|
78
|
+
The root `evals/promptfooconfig.yaml` is a legacy combined promptfoo config for targeted manual runs. Prefer `bash evals/run.sh llm <agent>` or the per-agent configs in `evals/cases/<agent>/promptfooconfig.yaml`.
|
|
79
|
+
|
|
80
|
+
### Layer 4: Acceptance (`bash evals/run.sh acceptance`)
|
|
81
|
+
|
|
82
|
+
Runs harness-native smoke tests against generated bundles:
|
|
83
|
+
- Kiro CLI discovers workspace agents and can answer through `dev`
|
|
84
|
+
- `claude` lists project agents and verifies exported telemetry hook configuration without model usage by default
|
|
85
|
+
- `codex exec` loads the exported `.codex` bundle and returns a final response
|
|
86
|
+
|
|
87
|
+
This layer is environment-dependent and requires installed, authenticated CLIs.
|
|
88
|
+
|
|
89
|
+
Claude prompt-mode acceptance is opt-in with `FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1`. Real Claude CLI hook telemetry assertions are opt-in with `FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY=1`; deterministic integration tests cover the telemetry wrapper without spending Claude usage.
|
|
90
|
+
|
|
91
|
+
## Coverage
|
|
92
|
+
|
|
93
|
+
Covered now:
|
|
94
|
+
- source/package drift and bundle export drift
|
|
95
|
+
- telemetry schema and redaction contracts
|
|
96
|
+
- install smoke tests for generated bundles
|
|
97
|
+
- normalized telemetry for Kiro, Codex, and Claude Code hook events
|
|
98
|
+
- behavioral routing and workflow checks for the supported per-agent suites
|
|
99
|
+
|
|
100
|
+
Deferred:
|
|
101
|
+
- multi-turn conversation evals
|
|
102
|
+
- adversarial/red-team evals
|
|
103
|
+
- behavioral coverage for every exported tool agent
|
|
104
|
+
- full LLM-driven end-to-end delivery runs on every edit; deterministic artifact-chain E2E coverage runs in integration
|
|
105
|
+
- direct token usage assertions, because CLI-backed exec providers do not expose reliable token counts today
|
|
106
|
+
|
|
107
|
+
## Adding Eval Cases
|
|
108
|
+
|
|
109
|
+
Add behavioral cases to `evals/cases/<agent>/promptfooconfig.yaml`. Each test should include:
|
|
110
|
+
- `vars.prompt` with the user prompt
|
|
111
|
+
- `options.provider.id` or suite default provider
|
|
112
|
+
- deterministic assertions when telemetry can prove the behavior
|
|
113
|
+
- an `llm-rubric` for workflow quality when needed
|
|
114
|
+
- `metadata.type` set to `capability` or `regression`
|
|
115
|
+
|
|
116
|
+
Run the affected suite with:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
bash evals/run.sh llm <agent>
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Prerequisites
|
|
123
|
+
|
|
124
|
+
- `jq` for static and integration checks
|
|
125
|
+
- Kiro CLI for default behavioral and Kiro acceptance checks
|
|
126
|
+
- `codex` for `--runtime codex`, `--judge-runtime codex`, and Codex acceptance checks
|
|
127
|
+
- `claude` for `--runtime claude`, `--judge-runtime claude`, and Claude Code acceptance checks
|
|
128
|
+
- `promptfoo` for behavioral evals and result viewing, installed with `npm install` from the repo root
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Acceptance runner: dispatches to the per-runtime harness scripts that live
# in evals/acceptance (test_<name>_harness.sh).
|
|
2
|
+
# Abort on command failure, on unset variables, and on failures inside pipelines.
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
# Repository root: two directories above the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
# Directory that contains the harness scripts.
ACCEPT_DIR="$ROOT_DIR/evals/acceptance"
|
|
6
|
+
# First CLI argument selects the harness to run; defaults to "all".
TARGET="${1:-all}"
|
|
7
|
+
|
|
8
|
+
# Run the acceptance harness for a single runtime.
# Globals:   ACCEPT_DIR (read)
# Arguments: $1 - runtime name; selects $ACCEPT_DIR/test_<name>_harness.sh
# Outputs:   a blank separator line, then whatever the harness script prints
# Returns:   the exit status of the harness script (it is the last command)
run_one() {
|
|
9
|
+
local name="$1"
|
|
10
|
+
echo ""
|
|
11
|
+
bash "$ACCEPT_DIR/test_${name}_harness.sh"
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
# Dispatch on the requested target.
case "$TARGET" in
|
|
15
|
+
# A single named runtime: run only its harness; a failure ends the script
# through `set -e` with that harness's status.
kiro|claude|codex)
|
|
16
|
+
run_one "$TARGET"
|
|
17
|
+
;;
|
|
18
|
+
# All runtimes: `|| status=1` keeps going after a failing harness so every
# harness still runs; the script exits 1 if any of them failed.
all)
|
|
19
|
+
status=0
|
|
20
|
+
run_one kiro || status=1
|
|
21
|
+
run_one claude || status=1
|
|
22
|
+
run_one codex || status=1
|
|
23
|
+
exit "$status"
|
|
24
|
+
;;
|
|
25
|
+
# Anything else: print the usage line and exit with status 1.
*)
|
|
26
|
+
echo "Usage: bash evals/acceptance/run.sh [all|kiro|claude|codex]"
|
|
27
|
+
exit 1
|
|
28
|
+
;;
|
|
29
|
+
esac
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
source "$ROOT_DIR/evals/lib/node.sh"
|
|
6
|
+
TMP_WORK=""
|
|
7
|
+
pass=0
|
|
8
|
+
fail=0
|
|
9
|
+
skip=0
|
|
10
|
+
|
|
11
|
+
# EXIT trap handler: remove the temporary workspace if one was created.
#
# Globals: TMP_WORK (read) - workspace directory, empty until mktemp has run.
# Returns: 0 whether or not there was anything to remove.
#
# This deliberately uses `if` instead of `[[ -n ... ]] && rm ...`: with the
# short-circuit form the function returns 1 while TMP_WORK is still empty, and
# under `set -e` a failing EXIT trap makes the shell exit with that status.
# That would turn the `exit 0` of the "claude CLI not installed" skip path
# into a failure.
cleanup() {
  if [[ -n "${TMP_WORK:-}" ]]; then
    rm -rf -- "$TMP_WORK"
  fi
}
|
|
14
|
+
trap cleanup EXIT
|
|
15
|
+
|
|
16
|
+
_pass() { echo " ✓ $1"; pass=$((pass + 1)); }
|
|
17
|
+
_fail() { echo " ✗ $1"; fail=$((fail + 1)); }
|
|
18
|
+
_skip() { echo " ○ $1"; skip=$((skip + 1)); }
|
|
19
|
+
|
|
20
|
+
# Poll until a file exists and is non-empty.
#
# Arguments:
#   $1 - path of the file to wait for
#   $2 - maximum number of polls (optional, default 50)
#   $3 - seconds to sleep between polls (optional, default 0.1)
# Returns: 0 as soon as the file is non-empty, 1 once all polls are used up.
#
# With the defaults this waits roughly five seconds before giving up.
wait_for_telemetry() {
  local file="$1"
  local max_attempts="${2:-50}"
  local interval="${3:-0.1}"
  local attempt
  for ((attempt = 0; attempt < max_attempts; attempt++)); do
    # `-s` is true only for an existing file with size > 0.
    [[ -s "$file" ]] && return 0
    sleep "$interval"
  done
  return 1
}
|
|
30
|
+
|
|
31
|
+
echo "=== Harness Acceptance: Claude Code ==="
|
|
32
|
+
echo ""
|
|
33
|
+
echo "Set FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM=1 to run prompt-mode Claude checks."
|
|
34
|
+
echo "Set FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY=1 to require real Claude CLI hook telemetry."
|
|
35
|
+
echo ""
|
|
36
|
+
|
|
37
|
+
if ! command -v claude >/dev/null 2>&1; then
|
|
38
|
+
_skip "claude CLI not installed"
|
|
39
|
+
echo ""
|
|
40
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
41
|
+
exit 0
|
|
42
|
+
fi
|
|
43
|
+
|
|
44
|
+
cd "$ROOT_DIR"
|
|
45
|
+
flow_agents_node scripts/build-universal-bundles.js >/dev/null
|
|
46
|
+
|
|
47
|
+
TMP_WORK="$(mktemp -d /tmp/claude-acceptance-work.XXXXXX)"
|
|
48
|
+
bash dist/claude-code/install.sh "$TMP_WORK" >/dev/null
|
|
49
|
+
|
|
50
|
+
echo "--- Agent List ---"
|
|
51
|
+
list_output="$(cd "$TMP_WORK" && claude agents --setting-sources local,project,user 2>&1 || true)"
|
|
52
|
+
if echo "$list_output" | grep -q "Project agents:"; then
|
|
53
|
+
_pass "claude lists project agents"
|
|
54
|
+
else
|
|
55
|
+
_fail "claude did not list project agents"
|
|
56
|
+
fi
|
|
57
|
+
|
|
58
|
+
if echo "$list_output" | grep -q "dev ·"; then
|
|
59
|
+
_pass "claude project agent list includes dev"
|
|
60
|
+
else
|
|
61
|
+
_fail "claude project agent list did not include dev"
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
if [[ -f "$TMP_WORK/.claude/settings.json" ]] && grep -q "claude-telemetry-hook.js" "$TMP_WORK/.claude/settings.json"; then
|
|
65
|
+
_pass "claude project settings include telemetry hooks"
|
|
66
|
+
else
|
|
67
|
+
_fail "claude project settings missing telemetry hooks"
|
|
68
|
+
fi
|
|
69
|
+
|
|
70
|
+
if [[ "${FLOW_AGENTS_ACCEPTANCE_CLAUDE_LLM:-0}" != "1" ]]; then
|
|
71
|
+
_skip "Claude prompt-mode checks skipped to avoid model usage"
|
|
72
|
+
echo ""
|
|
73
|
+
echo "==========================="
|
|
74
|
+
total=$((pass + fail))
|
|
75
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
76
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
77
|
+
exit 0
|
|
78
|
+
fi
|
|
79
|
+
|
|
80
|
+
echo ""
|
|
81
|
+
echo "--- Print Smoke ---"
|
|
82
|
+
print_output="$(cd "$TMP_WORK" && claude -p --agent dev --permission-mode bypassPermissions --add-dir "$TMP_WORK" --output-format text "Reply with READY only." 2>&1 || true)"
|
|
83
|
+
if echo "$print_output" | grep -qx "READY"; then
|
|
84
|
+
_pass "dev agent replied READY in print mode"
|
|
85
|
+
else
|
|
86
|
+
_fail "dev agent did not return plain READY in print mode"
|
|
87
|
+
fi
|
|
88
|
+
|
|
89
|
+
echo ""
|
|
90
|
+
echo "--- Behavioral Route ---"
|
|
91
|
+
route_output="$(cd "$TMP_WORK" && node - <<'NODE'
|
|
92
|
+
const { spawnSync } = require("node:child_process");
|
|
93
|
+
const result = spawnSync("claude", [
|
|
94
|
+
"-p",
|
|
95
|
+
"--agent",
|
|
96
|
+
"dev",
|
|
97
|
+
"--permission-mode",
|
|
98
|
+
"bypassPermissions",
|
|
99
|
+
"--add-dir",
|
|
100
|
+
".",
|
|
101
|
+
"--output-format",
|
|
102
|
+
"text",
|
|
103
|
+
"A user asks: 'Explore the codebase and explain what it does.' Which skill should you activate first? Reply with only the skill name or NONE.",
|
|
104
|
+
], { encoding: "utf8", timeout: 30000 });
|
|
105
|
+
process.stdout.write(result.stdout || "");
|
|
106
|
+
process.stdout.write(result.stderr || "");
|
|
107
|
+
NODE
|
|
108
|
+
)"
|
|
109
|
+
route_output_trimmed="$(printf '%s' "$route_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
|
110
|
+
if [[ "$route_output_trimmed" == "explore" ]]; then
|
|
111
|
+
_pass "claude dev selects explore for repository exploration"
|
|
112
|
+
else
|
|
113
|
+
_fail "claude dev did not select explore (got: $route_output_trimmed)"
|
|
114
|
+
fi
|
|
115
|
+
|
|
116
|
+
echo ""
|
|
117
|
+
echo "--- deliver Route ---"
|
|
118
|
+
sa_build_output="$(cd "$TMP_WORK" && node - <<'NODE'
|
|
119
|
+
const { spawnSync } = require("node:child_process");
|
|
120
|
+
const result = spawnSync("claude", [
|
|
121
|
+
"-p",
|
|
122
|
+
"--agent",
|
|
123
|
+
"dev",
|
|
124
|
+
"--permission-mode",
|
|
125
|
+
"bypassPermissions",
|
|
126
|
+
"--add-dir",
|
|
127
|
+
".",
|
|
128
|
+
"--output-format",
|
|
129
|
+
"text",
|
|
130
|
+
"A user asks: 'Build a CLI tool that converts markdown files to HTML'. Which skill should you activate first? Reply with only the skill name or NONE.",
|
|
131
|
+
], { encoding: "utf8", timeout: 30000 });
|
|
132
|
+
process.stdout.write(result.stdout || "");
|
|
133
|
+
process.stdout.write(result.stderr || "");
|
|
134
|
+
NODE
|
|
135
|
+
)"
|
|
136
|
+
sa_build_trimmed="$(printf '%s' "$sa_build_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
|
137
|
+
if [[ "$sa_build_trimmed" == "deliver" ]]; then
|
|
138
|
+
_pass "claude dev selects deliver for broad build requests"
|
|
139
|
+
else
|
|
140
|
+
_fail "claude dev did not select deliver (got: $sa_build_trimmed)"
|
|
141
|
+
fi
|
|
142
|
+
|
|
143
|
+
echo ""
|
|
144
|
+
echo "--- Live Hook Influence ---"
|
|
145
|
+
mkdir -p "$TMP_WORK/.flow-agents/live-hook" "$TMP_WORK/docs"
|
|
146
|
+
printf '# Context Map\n' > "$TMP_WORK/docs/context-map.md"
|
|
147
|
+
cat > "$TMP_WORK/.flow-agents/live-hook/state.json" <<'JSON'
|
|
148
|
+
{
|
|
149
|
+
"schema_version": "1.0",
|
|
150
|
+
"task_slug": "live-hook",
|
|
151
|
+
"status": "not_verified",
|
|
152
|
+
"phase": "verification",
|
|
153
|
+
"updated_at": "2026-05-10T00:00:00Z",
|
|
154
|
+
"next_action": {
|
|
155
|
+
"status": "needs_user",
|
|
156
|
+
"summary": "Acknowledge live hook guidance.",
|
|
157
|
+
"target_phase": "verification"
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
JSON
|
|
161
|
+
cat > "$TMP_WORK/.flow-agents/live-hook/critique.json" <<'JSON'
|
|
162
|
+
{
|
|
163
|
+
"schema_version": "1.0",
|
|
164
|
+
"task_slug": "live-hook",
|
|
165
|
+
"status": "fail",
|
|
166
|
+
"required": true,
|
|
167
|
+
"updated_at": "2026-05-10T00:01:00Z",
|
|
168
|
+
"critiques": [
|
|
169
|
+
{
|
|
170
|
+
"id": "live-hook-review",
|
|
171
|
+
"reviewer": "tool-code-reviewer",
|
|
172
|
+
"reviewed_at": "2026-05-10T00:01:00Z",
|
|
173
|
+
"verdict": "fail",
|
|
174
|
+
"summary": "Live hook guidance must be acknowledged.",
|
|
175
|
+
"findings": [
|
|
176
|
+
{
|
|
177
|
+
"id": "open-live-hook",
|
|
178
|
+
"severity": "high",
|
|
179
|
+
"status": "open",
|
|
180
|
+
"description": "Report the unfinished workflow state."
|
|
181
|
+
}
|
|
182
|
+
]
|
|
183
|
+
}
|
|
184
|
+
]
|
|
185
|
+
}
|
|
186
|
+
JSON
|
|
187
|
+
hook_output="$(cd "$TMP_WORK" && node - <<'NODE'
|
|
188
|
+
const { spawnSync } = require("node:child_process");
|
|
189
|
+
const result = spawnSync("claude", [
|
|
190
|
+
"-p",
|
|
191
|
+
"--agent",
|
|
192
|
+
"dev",
|
|
193
|
+
"--permission-mode",
|
|
194
|
+
"bypassPermissions",
|
|
195
|
+
"--add-dir",
|
|
196
|
+
".",
|
|
197
|
+
"--output-format",
|
|
198
|
+
"text",
|
|
199
|
+
"Use a harmless tool first, such as listing the current directory. After that, if Flow Agents hook guidance mentions WORKFLOW STATE ATTENTION or task live-hook, reply exactly HOOK_GUIDANCE_SEEN live-hook. If no such guidance is visible, reply exactly HOOK_GUIDANCE_MISSING.",
|
|
200
|
+
], { encoding: "utf8", timeout: 45000 });
|
|
201
|
+
process.stdout.write(result.stdout || "");
|
|
202
|
+
process.stdout.write(result.stderr || "");
|
|
203
|
+
NODE
|
|
204
|
+
)"
|
|
205
|
+
hook_output_trimmed="$(printf '%s' "$hook_output" | tr -d '\r' | tail -n 1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
|
206
|
+
if [[ "$hook_output_trimmed" == "HOOK_GUIDANCE_SEEN live-hook" ]]; then
|
|
207
|
+
_pass "claude live session responds to workflow hook guidance"
|
|
208
|
+
else
|
|
209
|
+
_fail "claude live session did not respond to workflow hook guidance (got: $hook_output_trimmed)"
|
|
210
|
+
fi
|
|
211
|
+
|
|
212
|
+
echo ""
|
|
213
|
+
echo "--- Telemetry ---"
|
|
214
|
+
telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
|
|
215
|
+
if [[ "${FLOW_AGENTS_ACCEPTANCE_REQUIRE_CLAUDE_TELEMETRY:-0}" != "1" ]]; then
|
|
216
|
+
_skip "real Claude CLI telemetry assertion skipped"
|
|
217
|
+
else
|
|
218
|
+
if wait_for_telemetry "$telemetry_file"; then
|
|
219
|
+
_pass "claude telemetry log was written"
|
|
220
|
+
else
|
|
221
|
+
_fail "claude telemetry log was not written"
|
|
222
|
+
fi
|
|
223
|
+
|
|
224
|
+
if [[ -f "$telemetry_file" ]] && jq -e 'select(.agent.runtime == "claude-code")' "$telemetry_file" >/dev/null 2>&1; then
|
|
225
|
+
_pass "claude telemetry uses normalized claude-code runtime"
|
|
226
|
+
else
|
|
227
|
+
_fail "claude telemetry did not include claude-code runtime"
|
|
228
|
+
fi
|
|
229
|
+
|
|
230
|
+
if [[ -f "$telemetry_file" ]] && jq -e 'select(.event_type == "turn.user")' "$telemetry_file" >/dev/null 2>&1; then
|
|
231
|
+
_pass "claude telemetry captures user prompts"
|
|
232
|
+
else
|
|
233
|
+
_fail "claude telemetry did not capture user prompts"
|
|
234
|
+
fi
|
|
235
|
+
fi
|
|
236
|
+
|
|
237
|
+
echo ""
|
|
238
|
+
echo "==========================="
|
|
239
|
+
total=$((pass + fail))
|
|
240
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
241
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
242
|
+
exit 0
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
source "$ROOT_DIR/evals/lib/node.sh"
|
|
6
|
+
TMP_WORK=""
|
|
7
|
+
TMP_LOG=""
|
|
8
|
+
TMP_LAST=""
|
|
9
|
+
pass=0
|
|
10
|
+
fail=0
|
|
11
|
+
skip=0
|
|
12
|
+
|
|
13
|
+
# EXIT trap handler: remove every temporary path this script may have created.
#
# Globals (read): TMP_WORK (directory), TMP_LOG, TMP_LAST, TMP_ROUTE_LOG,
#                 TMP_BUILD_LOG (files). Any of them may be empty or unset.
# Returns: 0 whether or not there was anything to remove.
#
# `if` is used instead of `[[ -n ... ]] && rm ...`: the short-circuit form makes
# the function return 1 while the last variable is still empty, and under
# `set -e` a failing EXIT trap makes the shell exit with that status. That would
# turn the `exit 0` of the "codex CLI not installed" skip path into a failure.
#
# TMP_ROUTE_LOG and TMP_BUILD_LOG are normally deleted inline after use; they
# are covered here as well so they do not leak if the script stops early.
cleanup() {
  if [[ -n "${TMP_WORK:-}" ]]; then
    rm -rf -- "$TMP_WORK"
  fi
  local tmp_file
  for tmp_file in "${TMP_LOG:-}" "${TMP_LAST:-}" "${TMP_ROUTE_LOG:-}" "${TMP_BUILD_LOG:-}"; do
    if [[ -n "$tmp_file" ]]; then
      rm -f -- "$tmp_file"
    fi
  done
}
|
|
18
|
+
trap cleanup EXIT
|
|
19
|
+
|
|
20
|
+
_pass() { echo " ✓ $1"; pass=$((pass + 1)); }
|
|
21
|
+
_fail() { echo " ✗ $1"; fail=$((fail + 1)); }
|
|
22
|
+
_skip() { echo " ○ $1"; skip=$((skip + 1)); }
|
|
23
|
+
|
|
24
|
+
echo "=== Harness Acceptance: Codex ==="
|
|
25
|
+
echo ""
|
|
26
|
+
|
|
27
|
+
if ! command -v codex >/dev/null 2>&1; then
|
|
28
|
+
_skip "codex CLI not installed"
|
|
29
|
+
echo ""
|
|
30
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
31
|
+
exit 0
|
|
32
|
+
fi
|
|
33
|
+
|
|
34
|
+
cd "$ROOT_DIR"
|
|
35
|
+
flow_agents_node scripts/build-universal-bundles.js >/dev/null
|
|
36
|
+
|
|
37
|
+
TMP_WORK="$(mktemp -d /tmp/codex-acceptance-work.XXXXXX)"
|
|
38
|
+
TMP_LOG="$(mktemp /tmp/codex-acceptance-log.XXXXXX)"
|
|
39
|
+
TMP_LAST="$(mktemp /tmp/codex-acceptance-last.XXXXXX)"
|
|
40
|
+
bash dist/codex/install.sh "$TMP_WORK" >/dev/null
|
|
41
|
+
|
|
42
|
+
echo "--- Exec Smoke ---"
|
|
43
|
+
if codex exec --skip-git-repo-check -C "$TMP_WORK" --sandbox read-only --json --output-last-message "$TMP_LAST" "After any required startup checks, reply with READY only." >"$TMP_LOG" 2>&1; then
|
|
44
|
+
_pass "codex exec completed successfully"
|
|
45
|
+
else
|
|
46
|
+
_fail "codex exec exited non-zero"
|
|
47
|
+
fi
|
|
48
|
+
|
|
49
|
+
if grep -q "Ignoring malformed agent role definition" "$TMP_LOG"; then
|
|
50
|
+
_fail "codex reported malformed exported agent roles"
|
|
51
|
+
else
|
|
52
|
+
_pass "codex accepted exported local agent role files"
|
|
53
|
+
fi
|
|
54
|
+
|
|
55
|
+
if grep -q "failed to stat skills path" "$TMP_LOG"; then
|
|
56
|
+
_fail "codex could not stat exported skill paths"
|
|
57
|
+
else
|
|
58
|
+
_pass "codex resolved exported skill paths"
|
|
59
|
+
fi
|
|
60
|
+
|
|
61
|
+
if grep -q "READY" "$TMP_LAST"; then
|
|
62
|
+
_pass "codex returned READY in final message"
|
|
63
|
+
else
|
|
64
|
+
_fail "codex final message did not contain READY"
|
|
65
|
+
fi
|
|
66
|
+
|
|
67
|
+
echo ""
|
|
68
|
+
echo "--- Behavioral Route ---"
|
|
69
|
+
TMP_ROUTE_LOG="$(mktemp /tmp/codex-acceptance-route.XXXXXX)"
|
|
70
|
+
if node -e 'const fs=require("fs"); const cp=require("child_process"); const [work,log]=process.argv.slice(1); const r=cp.spawnSync("codex",["exec","--skip-git-repo-check","-C",work,"--sandbox","read-only","--json","Before doing anything else, state the exact skill you are activating if any, then explore the codebase and explain what it does."],{encoding:"utf8",timeout:45000}); fs.writeFileSync(log,(r.stdout||"")+(r.stderr||"")); process.exit(r.error?.code==="ETIMEDOUT" ? 0 : (r.status ?? 1));' "$TMP_WORK" "$TMP_ROUTE_LOG"
|
|
71
|
+
then
|
|
72
|
+
_pass "codex behavioral route command completed successfully"
|
|
73
|
+
else
|
|
74
|
+
_fail "codex behavioral route command exited non-zero"
|
|
75
|
+
fi
|
|
76
|
+
|
|
77
|
+
if grep -Fq 'Activating `$explore`' "$TMP_ROUTE_LOG" || grep -Fq 'Activating skill: `explore`' "$TMP_ROUTE_LOG" || grep -Fq 'Activating skill: explore' "$TMP_ROUTE_LOG"; then
|
|
78
|
+
_pass "codex dev activates explore on repository exploration"
|
|
79
|
+
else
|
|
80
|
+
_fail "codex dev did not activate explore on repository exploration"
|
|
81
|
+
fi
|
|
82
|
+
|
|
83
|
+
rm -f "$TMP_ROUTE_LOG"
|
|
84
|
+
|
|
85
|
+
echo ""
|
|
86
|
+
echo "--- deliver Route ---"
|
|
87
|
+
TMP_BUILD_LOG="$(mktemp /tmp/codex-acceptance-build.XXXXXX)"
|
|
88
|
+
if node -e 'const fs=require("fs"); const cp=require("child_process"); const [work,log]=process.argv.slice(1); const r=cp.spawnSync("codex",["exec","--skip-git-repo-check","-C",work,"--sandbox","read-only","--json","Before doing anything else, state the exact skill you are activating if any, then begin the deliver workflow for '\''Build a CLI tool that converts markdown files to HTML'\'', but stop after deciding the initial skill and first phase."],{encoding:"utf8",timeout:45000}); fs.writeFileSync(log,(r.stdout||"")+(r.stderr||"")); process.exit(r.error?.code==="ETIMEDOUT" ? 0 : (r.status ?? 1));' "$TMP_WORK" "$TMP_BUILD_LOG"
|
|
89
|
+
then
|
|
90
|
+
_pass "codex deliver route command completed successfully"
|
|
91
|
+
else
|
|
92
|
+
_fail "codex deliver route command exited non-zero"
|
|
93
|
+
fi
|
|
94
|
+
|
|
95
|
+
if grep -Fq 'Activating skill: `$deliver`' "$TMP_BUILD_LOG" || grep -Fq 'Activating skill: `deliver`' "$TMP_BUILD_LOG" || grep -Fq 'Activating skill: deliver' "$TMP_BUILD_LOG"; then
|
|
96
|
+
_pass "codex dev activates deliver for broad build requests"
|
|
97
|
+
else
|
|
98
|
+
_fail "codex dev did not activate deliver for broad build requests"
|
|
99
|
+
fi
|
|
100
|
+
|
|
101
|
+
rm -f "$TMP_BUILD_LOG"
|
|
102
|
+
|
|
103
|
+
echo ""
|
|
104
|
+
echo "==========================="
|
|
105
|
+
total=$((pass + fail))
|
|
106
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
107
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
108
|
+
exit 0
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
source "$ROOT_DIR/evals/lib/node.sh"
|
|
6
|
+
TMP_HOME=""
|
|
7
|
+
TMP_WORK=""
|
|
8
|
+
TMP_TELEMETRY=""
|
|
9
|
+
pass=0
|
|
10
|
+
fail=0
|
|
11
|
+
skip=0
|
|
12
|
+
|
|
13
|
+
# EXIT trap handler: remove the temporary directories this script created.
#
# Globals (read): TMP_HOME, TMP_WORK, TMP_TELEMETRY - each is empty until its
#                 mktemp call has run.
# Returns: 0 whether or not there was anything to remove.
#
# `if` is used instead of `[[ -n ... ]] && rm ...`: the short-circuit form makes
# the function return 1 while the last variable is still empty, and under
# `set -e` a failing EXIT trap makes the shell exit with that status. That would
# turn the `exit 0` of the "kiro-cli not installed" skip path into a failure.
cleanup() {
  local tmp_dir
  for tmp_dir in "${TMP_HOME:-}" "${TMP_WORK:-}" "${TMP_TELEMETRY:-}"; do
    if [[ -n "$tmp_dir" ]]; then
      rm -rf -- "$tmp_dir"
    fi
  done
}
|
|
18
|
+
trap cleanup EXIT
|
|
19
|
+
|
|
20
|
+
_pass() { echo " ✓ $1"; pass=$((pass + 1)); }
|
|
21
|
+
_fail() { echo " ✗ $1"; fail=$((fail + 1)); }
|
|
22
|
+
_skip() { echo " ○ $1"; skip=$((skip + 1)); }
|
|
23
|
+
strip_ansi() {
|
|
24
|
+
perl -pe 's/\e\[[0-9;?]*[ -\/]*[@-~]//g; s/\e\(B//g'
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
echo "=== Harness Acceptance: Kiro ==="
|
|
28
|
+
echo ""
|
|
29
|
+
|
|
30
|
+
if ! command -v kiro-cli >/dev/null 2>&1; then
|
|
31
|
+
_skip "kiro-cli not installed"
|
|
32
|
+
echo ""
|
|
33
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
34
|
+
exit 0
|
|
35
|
+
fi
|
|
36
|
+
|
|
37
|
+
cd "$ROOT_DIR"
|
|
38
|
+
flow_agents_node scripts/build-universal-bundles.js >/dev/null
|
|
39
|
+
|
|
40
|
+
TMP_HOME="$(mktemp -d /tmp/kiro-acceptance-home.XXXXXX)"
|
|
41
|
+
TMP_WORK="$(mktemp -d /tmp/kiro-acceptance-work.XXXXXX)"
|
|
42
|
+
TMP_TELEMETRY="$(mktemp -d /tmp/kiro-acceptance-telemetry.XXXXXX)"
|
|
43
|
+
bash dist/kiro/install.sh "$TMP_HOME" >/dev/null
|
|
44
|
+
mkdir -p "$TMP_WORK/.kiro"
|
|
45
|
+
ln -s "$TMP_HOME/agents" "$TMP_WORK/.kiro/agents"
|
|
46
|
+
|
|
47
|
+
echo "--- Agent List ---"
|
|
48
|
+
list_output="$(cd "$TMP_WORK" && kiro-cli agent list 2>&1 || true)"
|
|
49
|
+
# Pass when the listing contains "dev" followed by whitespace and "Workspace".
# -E with `+` is used because `\+` in a basic regex is a GNU extension and is
# not guaranteed by other grep implementations. The here-string replaces
# `echo ... | grep -q`, which under `set -o pipefail` can report a miss when
# grep exits on the first match and the writer is killed by SIGPIPE.
if grep -Eq "dev[[:space:]]+Workspace" <<<"$list_output"; then
  _pass "workspace agent list includes dev"
else
  _fail "workspace agent list did not include dev"
fi
|
|
54
|
+
|
|
55
|
+
echo ""
|
|
56
|
+
echo "--- Chat Smoke ---"
|
|
57
|
+
chat_output="$(cd "$TMP_WORK" && kiro-cli chat --agent dev --no-interactive "Reply with READY only." 2>&1 || true)"
|
|
58
|
+
if echo "$chat_output" | grep -q "READY"; then
|
|
59
|
+
_pass "dev agent replied to chat smoke prompt"
|
|
60
|
+
else
|
|
61
|
+
_fail "dev agent did not reply READY"
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
echo ""
|
|
65
|
+
echo "--- Explore Behavior ---"
|
|
66
|
+
explore_output="$(cd "$TMP_WORK" && TELEMETRY_ENABLED=true TELEMETRY_DATA_DIR="$TMP_TELEMETRY" TELEMETRY_SESSION_DIR="$TMP_TELEMETRY/sessions" TELEMETRY_CHANNELS=full TELEMETRY_CHANNEL_FULL_LOG_FILE="$TMP_TELEMETRY/full.jsonl" node - <<'NODE'
|
|
67
|
+
const { spawnSync } = require("node:child_process");
|
|
68
|
+
const result = spawnSync("kiro-cli", [
|
|
69
|
+
"chat",
|
|
70
|
+
"--agent",
|
|
71
|
+
"dev",
|
|
72
|
+
"--no-interactive",
|
|
73
|
+
"--trust-all-tools",
|
|
74
|
+
"Explore the codebase and explain what it does.",
|
|
75
|
+
], { encoding: "utf8", timeout: 30000 });
|
|
76
|
+
process.stdout.write(result.stdout || "");
|
|
77
|
+
process.stdout.write(result.stderr || "");
|
|
78
|
+
NODE
|
|
79
|
+
)"
|
|
80
|
+
explore_clean="$(printf '%s' "$explore_output" | strip_ansi)"
|
|
81
|
+
# Pass when the ANSI-stripped agent output announces the explore skill.
# The output of a full explore run can be large, so the text is fed to grep via
# a here-string rather than `echo ... | grep -q`: with `set -o pipefail`, grep
# exiting on the first match can kill the writer with SIGPIPE and make the
# whole pipeline fail, turning a real match into a reported miss.
if grep -q "Activating skill: explore" <<<"$explore_clean"; then
  _pass "dev activates the explore skill on a plain explore prompt"
else
  _fail "dev did not activate the explore skill on a plain explore prompt"
fi
|
|
86
|
+
|
|
87
|
+
# Fail when the ANSI-stripped agent output contains "Tool validation failed".
# This check is inverted (a match means failure), so a pipeline that fails for
# an unrelated reason would be counted as a PASS. A here-string avoids the
# `echo ... | grep -q` SIGPIPE failure that `set -o pipefail` exposes on large
# outputs.
if grep -q "Tool validation failed" <<<"$explore_clean"; then
  _fail "explore workflow exceeded harness delegation limits"
else
  _pass "explore workflow stayed within harness delegation limits"
fi
|
|
92
|
+
|
|
93
|
+
# Pass when the telemetry log exists and records an agent.delegate event.
# grep is used here, as in the rest of this harness, so that ripgrep (`rg`) is
# not an extra, undeclared prerequisite. -F matches the text literally, so the
# dot in "agent.delegate" cannot match an arbitrary character.
if [[ -f "$TMP_TELEMETRY/full.jsonl" ]] && grep -Fq '"event_type":"agent.delegate"' "$TMP_TELEMETRY/full.jsonl"; then
  _pass "telemetry confirms delegated explore execution"
else
  _fail "telemetry did not confirm delegated explore execution"
fi
|
|
98
|
+
|
|
99
|
+
echo ""
|
|
100
|
+
echo "--- Strict Stop Gate ---"
|
|
101
|
+
mkdir -p "$TMP_WORK/.flow-agents/live-stop"
|
|
102
|
+
cat > "$TMP_WORK/.flow-agents/live-stop/live-stop--deliver.md" <<'MARKDOWN'
|
|
103
|
+
# Live Stop Gate
|
|
104
|
+
|
|
105
|
+
status: executing
|
|
106
|
+
type: deliver
|
|
107
|
+
|
|
108
|
+
## Plan
|
|
109
|
+
|
|
110
|
+
This delivery artifact is intentionally incomplete so the strict stop hook must surface Goal Fit guidance.
|
|
111
|
+
MARKDOWN
|
|
112
|
+
|
|
113
|
+
stop_output="$(cd "$TMP_WORK" && FLOW_AGENTS_GOAL_FIT_STRICT=true kiro-cli chat --agent dev --no-interactive "Reply with READY only." 2>&1 || true)"
|
|
114
|
+
stop_clean="$(printf '%s' "$stop_output" | strip_ansi)"
|
|
115
|
+
# Pass only when all three markers appear in the ANSI-stripped output: the stop
# hook failing with exit code 2, the Goal Fit warning banner, and the message
# naming the still-executing live-stop artifact.
# Each grep reads the text from a here-string instead of `echo ... | grep -q`,
# because with `set -o pipefail` grep exiting on the first match can kill the
# writer with SIGPIPE and make a successful match look like a miss.
if grep -q 'stop "node .*stop:goal-fit stop-goal-fit.js standard,strict" failed with exit code: 2' <<<"$stop_clean" \
  && grep -q '\[Hook\] Goal Fit warning:' <<<"$stop_clean" \
  && grep -q 'live-stop--deliver.md is still status:executing' <<<"$stop_clean"; then
  _pass "strict Goal Fit stop hook surfaces live Kiro stop gate"
else
  _fail "strict Goal Fit stop hook did not surface live Kiro stop gate"
fi
|
|
122
|
+
|
|
123
|
+
echo ""
|
|
124
|
+
echo "==========================="
|
|
125
|
+
total=$((pass + fail))
|
|
126
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
127
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
128
|
+
exit 0
|