@kontourai/flow-agents 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-push +11 -0
- package/.github/workflows/ci.yml +210 -0
- package/.github/workflows/docs-pages.yml +52 -0
- package/.github/workflows/publish-npm.yml +104 -0
- package/AGENTS.md +26 -0
- package/CHANGELOG.md +66 -0
- package/CODE_OF_CONDUCT.md +25 -0
- package/CONTEXT.md +300 -0
- package/CONTRIBUTING.md +44 -0
- package/LICENSE +201 -0
- package/README.md +129 -0
- package/SECURITY.md +33 -0
- package/agent-cards/dev.json +19 -0
- package/agents/dev.json +127 -0
- package/agents/tool-code-reviewer.json +61 -0
- package/agents/tool-dependencies-updater.json +118 -0
- package/agents/tool-explore-config.json +92 -0
- package/agents/tool-explore-deps.json +92 -0
- package/agents/tool-explore-entry.json +92 -0
- package/agents/tool-explore-patterns.json +92 -0
- package/agents/tool-explore-structure.json +92 -0
- package/agents/tool-explore-tests.json +92 -0
- package/agents/tool-planner.json +57 -0
- package/agents/tool-playwright.json +145 -0
- package/agents/tool-security-reviewer.json +56 -0
- package/agents/tool-verifier.json +61 -0
- package/agents/tool-worker.json +58 -0
- package/build/src/cli/console-learning-projection.js +123 -0
- package/build/src/cli/docs-preview.js +39 -0
- package/build/src/cli/effective-backlog-settings.js +102 -0
- package/build/src/cli/export-bookmarks.js +38 -0
- package/build/src/cli/fixture-retirement-audit.js +140 -0
- package/build/src/cli/flow-kit.js +138 -0
- package/build/src/cli/import-bookmarks.js +50 -0
- package/build/src/cli/init.js +239 -0
- package/build/src/cli/instinct-cli.js +93 -0
- package/build/src/cli/promote-workflow-artifact.js +63 -0
- package/build/src/cli/publish-change-helper.js +154 -0
- package/build/src/cli/pull-work-provider.js +469 -0
- package/build/src/cli/runtime-adapter.js +23 -0
- package/build/src/cli/telemetry-doctor.js +221 -0
- package/build/src/cli/usage-feedback.js +443 -0
- package/build/src/cli/validate-hook-influence.js +152 -0
- package/build/src/cli/validate-source-tree.js +31 -0
- package/build/src/cli/validate-workflow-artifacts.js +486 -0
- package/build/src/cli/veritas-governance.js +262 -0
- package/build/src/cli/workflow-artifact-cleanup-audit.js +272 -0
- package/build/src/cli/workflow-sidecar.js +816 -0
- package/build/src/cli.js +89 -0
- package/build/src/flow-kit/validate.js +75 -0
- package/build/src/lib/args.js +45 -0
- package/build/src/lib/fs.js +62 -0
- package/build/src/lib/workflow-learning-projection.js +334 -0
- package/build/src/runtime-adapters.js +146 -0
- package/build/src/tools/build-universal-bundles.js +397 -0
- package/build/src/tools/common.js +56 -0
- package/build/src/tools/filter-installed-packs.js +132 -0
- package/build/src/tools/generate-context-map.js +198 -0
- package/build/src/tools/validate-package.js +64 -0
- package/build/src/tools/validate-source-tree.js +622 -0
- package/console.telemetry.json +176 -0
- package/context/base-rules.md +17 -0
- package/context/code-review-standards.md +62 -0
- package/context/coding-standards.md +42 -0
- package/context/common/orchestrators.md +12 -0
- package/context/common/subagents.md +28 -0
- package/context/contracts/artifact-contract.md +182 -0
- package/context/contracts/builder-kit-workflow-state-contract.md +319 -0
- package/context/contracts/delivery-contract.md +69 -0
- package/context/contracts/execution-contract.md +53 -0
- package/context/contracts/governance-adapter-contract.md +67 -0
- package/context/contracts/planning-contract.md +85 -0
- package/context/contracts/review-contract.md +104 -0
- package/context/contracts/sandbox-policy.md +52 -0
- package/context/contracts/verification-contract.md +134 -0
- package/context/contracts/work-item-contract.md +215 -0
- package/context/deferred/demo-mode.md +33 -0
- package/context/deferred/languages/go.md +31 -0
- package/context/deferred/languages/python.md +31 -0
- package/context/deferred/languages/typescript.md +34 -0
- package/context/deferred/parallelization.md +35 -0
- package/context/deferred/worktree-isolation.md +24 -0
- package/context/development-workflow.md +50 -0
- package/context/scripts/context-budget/budget-scan.sh +166 -0
- package/context/scripts/detect-tools.sh +3 -0
- package/context/scripts/discover-agents.sh +28 -0
- package/context/scripts/git-status.sh +49 -0
- package/context/scripts/hooks/config-protection.js +79 -0
- package/context/scripts/hooks/desktop-notify.sh +39 -0
- package/context/scripts/hooks/governance-audit.sh +135 -0
- package/context/scripts/hooks/lib/audit-transport.sh +40 -0
- package/context/scripts/hooks/lib/hook-flags.js +49 -0
- package/context/scripts/hooks/lib/patterns.sh +57 -0
- package/context/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/context/scripts/hooks/post-edit-accumulator.js +66 -0
- package/context/scripts/hooks/pre-commit-quality.js +194 -0
- package/context/scripts/hooks/quality-gate.js +93 -0
- package/context/scripts/hooks/report-only-guard.js +21 -0
- package/context/scripts/hooks/run-hook.js +136 -0
- package/context/scripts/hooks/stop-format-typecheck.js +141 -0
- package/context/scripts/hooks/stop-goal-fit.js +337 -0
- package/context/scripts/hooks/workflow-steering.js +250 -0
- package/context/scripts/telemetry/console-presets.sh +14 -0
- package/context/scripts/telemetry/install-console-config.sh +214 -0
- package/context/scripts/telemetry/lib/config.sh +85 -0
- package/context/scripts/telemetry/lib/enrich.sh +115 -0
- package/context/scripts/telemetry/lib/redact.sh +22 -0
- package/context/scripts/telemetry/lib/session.sh +63 -0
- package/context/scripts/telemetry/lib/transport.sh +183 -0
- package/context/scripts/telemetry/lib/usage.sh +29 -0
- package/context/scripts/telemetry/sync-agents.sh +173 -0
- package/context/scripts/telemetry/telemetry.conf +23 -0
- package/context/scripts/telemetry/telemetry.sh +387 -0
- package/context/scripts/validate-package.sh +89 -0
- package/context/settings/backlog-provider-settings.json +54 -0
- package/context/templates/core/identity.md +26 -0
- package/context/templates/core/user.md +15 -0
- package/docs/_config.yml +15 -0
- package/docs/_layouts/default.html +87 -0
- package/docs/adr/0001-flow-agents-consumes-flow.md +77 -0
- package/docs/adr/0002-flow-kits-as-extension-unit.md +13 -0
- package/docs/adr/0003-flow-agents-coordinates-kits-and-adapters.md +13 -0
- package/docs/adr/0004-gates-expect-surface-claims.md +15 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +48 -0
- package/docs/adr/0006-typescript-first-source-policy.md +98 -0
- package/docs/agent-system-guidebook.md +391 -0
- package/docs/agent-usage-feedback-loop.md +351 -0
- package/docs/assets/favicon.svg +13 -0
- package/docs/assets/og-image.png +0 -0
- package/docs/assets/site.css +774 -0
- package/docs/assets/site.js +139 -0
- package/docs/configurable-workflow-routing.md +174 -0
- package/docs/context-map.md +145 -0
- package/docs/developer-architecture.md +145 -0
- package/docs/developer-hook-setup.md +61 -0
- package/docs/fixture-ownership.md +44 -0
- package/docs/flow-kit-repository-contract.md +180 -0
- package/docs/index.md +129 -0
- package/docs/kontour-resource-contract.md +358 -0
- package/docs/migrations.md +64 -0
- package/docs/north-star.md +322 -0
- package/docs/operating-layers.md +110 -0
- package/docs/repository-structure.md +132 -0
- package/docs/sandbox-policy.md +56 -0
- package/docs/skills-map.md +203 -0
- package/docs/standards-register.md +96 -0
- package/docs/veritas-integration.md +165 -0
- package/docs/work-item-adapters.md +72 -0
- package/docs/workflow-artifact-lifecycle.md +141 -0
- package/docs/workflow-eval-strategy.md +295 -0
- package/docs/workflow-shared-contracts.md +51 -0
- package/docs/workflow-usage-guide.md +443 -0
- package/evals/ARCHITECTURE.md +143 -0
- package/evals/CONVENTIONS.md +58 -0
- package/evals/README.md +128 -0
- package/evals/acceptance/run.sh +29 -0
- package/evals/acceptance/test_claude_harness.sh +242 -0
- package/evals/acceptance/test_codex_harness.sh +108 -0
- package/evals/acceptance/test_kiro_harness.sh +128 -0
- package/evals/cases/dev/404.html +97 -0
- package/evals/cases/dev/code-review.yaml +44 -0
- package/evals/cases/dev/dashboard.html +300 -0
- package/evals/cases/dev/deliver.yaml +66 -0
- package/evals/cases/dev/dependency-update.yaml +16 -0
- package/evals/cases/dev/explore.yaml +20 -0
- package/evals/cases/dev/index.html +370 -0
- package/evals/cases/dev/package-lock.json +28 -0
- package/evals/cases/dev/package.json +16 -0
- package/evals/cases/dev/plan-work.yaml +20 -0
- package/evals/cases/dev/promptfooconfig.yaml +666 -0
- package/evals/cases/dev/search-first.yaml +20 -0
- package/evals/cases/dev/tdd-workflow.yaml +48 -0
- package/evals/cases/dev/verify-work.yaml +44 -0
- package/evals/cases/dev/workflow.yaml +34 -0
- package/evals/ci/run-baseline.sh +283 -0
- package/evals/fixtures/backlog-provider-settings/global-default.json +44 -0
- package/evals/fixtures/backlog-provider-settings/project-override.json +53 -0
- package/evals/fixtures/builder-kit-workflow-state/baseline-freshness-resolution-hint.json +139 -0
- package/evals/fixtures/builder-kit-workflow-state/direct-primitive-stop.json +59 -0
- package/evals/fixtures/builder-kit-workflow-state/empty-board-route-shape.json +55 -0
- package/evals/fixtures/builder-kit-workflow-state/happy-path.json +71 -0
- package/evals/fixtures/builder-kit-workflow-state/mid-work-resume.json +80 -0
- package/evals/fixtures/builder-kit-workflow-state/missing-prestep-recovery.json +65 -0
- package/evals/fixtures/builder-kit-workflow-state/product-build-chaining.json +60 -0
- package/evals/fixtures/builder-kit-workflow-state/stale-continuation-requires-new-probe.json +57 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-correction/learning.json +50 -0
- package/evals/fixtures/console-learning-projection/artifacts/console-learning-open-route/learning.json +41 -0
- package/evals/fixtures/flow-kit-repository/invalid-absolute-path/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-asset-section/kit.json +11 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-duplicate-flow/kit.json +9 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-id/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-malformed-json/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-flow/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-id/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-schema-version/kit.json +7 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-name/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/flows/review.flow.json +6 -0
- package/evals/fixtures/flow-kit-repository/invalid-schema-version/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/invalid-traversal/kit.json +8 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/adapters/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/assets/example.txt +1 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/flows/runtime.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-evals/example.json +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit-skills/mixed/SKILL.md +3 -0
- package/evals/fixtures/flow-kit-repository/mixed-runtime-kit/kit.json +44 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/docs/README.md +3 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-local-kit/kit.json +20 -0
- package/evals/fixtures/hook-influence/cases.json +336 -0
- package/evals/fixtures/pull-work-provider/github-issues.json +170 -0
- package/evals/fixtures/pull-work-wip-shepherding/global-wip-informs.json +43 -0
- package/evals/fixtures/pull-work-wip-shepherding/personal-wip-blocks.json +42 -0
- package/evals/fixtures/surface-trust/accepted-claim-trust-report.json +31 -0
- package/evals/fixtures/surface-trust/artifact-absent.json +19 -0
- package/evals/fixtures/surface-trust/integrity-mismatch-trust-report.json +32 -0
- package/evals/fixtures/surface-trust/missing-authority-trust-report.json +27 -0
- package/evals/fixtures/surface-trust/provider-absent.json +19 -0
- package/evals/fixtures/surface-trust/rejected-claim-trust-report.json +30 -0
- package/evals/fixtures/surface-trust/stale-claim-trust-snapshot.json +31 -0
- package/evals/fixtures/usage-feedback/sample-full.jsonl +11 -0
- package/evals/fixtures/usage-feedback/sample-outcomes.jsonl +1 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-pass.sh +18 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-secret-fail.sh +10 -0
- package/evals/fixtures/veritas-governance-adapter/fake-veritas-unconfigured.sh +4 -0
- package/evals/integration/test_bundle_install.sh +541 -0
- package/evals/integration/test_console_learning_projection.sh +192 -0
- package/evals/integration/test_context_map.sh +65 -0
- package/evals/integration/test_effective_backlog_settings.sh +58 -0
- package/evals/integration/test_fixture_retirement_audit.sh +58 -0
- package/evals/integration/test_flow_agents_statusline.sh +93 -0
- package/evals/integration/test_flow_kit_repository.sh +90 -0
- package/evals/integration/test_goal_fit_hook.sh +482 -0
- package/evals/integration/test_hook_category_behaviors.sh +190 -0
- package/evals/integration/test_hook_influence_cases.sh +69 -0
- package/evals/integration/test_local_flow_kit_install.sh +145 -0
- package/evals/integration/test_publish_change_helper.sh +176 -0
- package/evals/integration/test_pull_work_provider.sh +140 -0
- package/evals/integration/test_runtime_adapter_activation.sh +106 -0
- package/evals/integration/test_telemetry.sh +485 -0
- package/evals/integration/test_telemetry_doctor.sh +193 -0
- package/evals/integration/test_usage_feedback_dashboard.sh +169 -0
- package/evals/integration/test_usage_feedback_global.sh +117 -0
- package/evals/integration/test_usage_feedback_import.sh +227 -0
- package/evals/integration/test_usage_feedback_outcomes.sh +165 -0
- package/evals/integration/test_usage_feedback_report.sh +263 -0
- package/evals/integration/test_veritas_governance_adapter.sh +235 -0
- package/evals/integration/test_workflow_artifact_cleanup_audit.sh +287 -0
- package/evals/integration/test_workflow_artifacts.sh +1247 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +2112 -0
- package/evals/integration/test_workflow_steering_hook.sh +337 -0
- package/evals/lib/assertions/delegated-to.js +40 -0
- package/evals/lib/assertions/max-tool-calls.js +15 -0
- package/evals/lib/assertions/no-write-tools.js +27 -0
- package/evals/lib/assertions/pass-at-k.js +39 -0
- package/evals/lib/assertions/telemetry-utils.js +105 -0
- package/evals/lib/assertions/tool-called.js +39 -0
- package/evals/lib/assertions/verify-after-fix.js +61 -0
- package/evals/lib/claude-judge.sh +40 -0
- package/evals/lib/claude-provider.sh +74 -0
- package/evals/lib/codex-judge.sh +39 -0
- package/evals/lib/codex-provider.sh +81 -0
- package/evals/lib/eval-dev.sh +5 -0
- package/evals/lib/eval-judge.sh +22 -0
- package/evals/lib/eval-provider.sh +26 -0
- package/evals/lib/eval-report.sh +73 -0
- package/evals/lib/kiro-dev.sh +4 -0
- package/evals/lib/kiro-judge.sh +17 -0
- package/evals/lib/kiro-provider.sh +62 -0
- package/evals/lib/node.sh +111 -0
- package/evals/promptfooconfig.yaml +70 -0
- package/evals/run.sh +309 -0
- package/evals/static/test_evidence_refs.sh +141 -0
- package/evals/static/test_package.sh +407 -0
- package/evals/static/test_repo_hooks.sh +68 -0
- package/evals/static/test_universal_bundles.sh +274 -0
- package/evals/static/test_workflow_skills.sh +1207 -0
- package/install.sh +64 -0
- package/integrations/veritas/flow-agents.adapter.json +138 -0
- package/integrations/veritas/flow-agents.authority-settings.json +26 -0
- package/integrations/veritas/flow-agents.repo-standards.json +82 -0
- package/kits/builder/flows/build.flow.json +218 -0
- package/kits/builder/flows/shape.flow.json +127 -0
- package/kits/builder/kit.json +19 -0
- package/kits/catalog.json +11 -0
- package/package.json +130 -0
- package/packaging/README.md +60 -0
- package/packaging/manifest.json +173 -0
- package/packaging/packs.json +69 -0
- package/powers/dependency-checker/POWER.md +20 -0
- package/powers/dependency-checker/mcp.json +20 -0
- package/powers/playwright/POWER.md +25 -0
- package/powers/playwright/mcp.json +12 -0
- package/prompts/code-audit.md +123 -0
- package/prompts/kcommit.md +88 -0
- package/schemas/backlog-provider-settings.schema.json +138 -0
- package/schemas/workflow-acceptance.schema.json +216 -0
- package/schemas/workflow-critique.schema.json +113 -0
- package/schemas/workflow-evidence.schema.json +357 -0
- package/schemas/workflow-handoff.schema.json +52 -0
- package/schemas/workflow-learning.schema.json +223 -0
- package/schemas/workflow-release.schema.json +172 -0
- package/schemas/workflow-state.schema.json +80 -0
- package/scripts/README.md +111 -0
- package/scripts/build-universal-bundles.js +3 -0
- package/scripts/check-content-boundary.cjs +99 -0
- package/scripts/context-budget/budget-scan.sh +166 -0
- package/scripts/detect-tools.sh +3 -0
- package/scripts/discover-agents.sh +28 -0
- package/scripts/effective-backlog-settings.js +2 -0
- package/scripts/filter-installed-packs.js +2 -0
- package/scripts/flow-kit.js +2 -0
- package/scripts/generate-context-map.js +2 -0
- package/scripts/git-status.sh +49 -0
- package/scripts/hooks/claude-hook-adapter.js +174 -0
- package/scripts/hooks/claude-telemetry-hook.js +115 -0
- package/scripts/hooks/codex-hook-adapter.js +176 -0
- package/scripts/hooks/codex-telemetry-hook.js +95 -0
- package/scripts/hooks/config-protection.js +79 -0
- package/scripts/hooks/desktop-notify.sh +39 -0
- package/scripts/hooks/governance-audit.sh +135 -0
- package/scripts/hooks/lib/audit-transport.sh +40 -0
- package/scripts/hooks/lib/hook-flags.js +49 -0
- package/scripts/hooks/lib/patterns.sh +57 -0
- package/scripts/hooks/lib/resolve-formatter.js +80 -0
- package/scripts/hooks/post-edit-accumulator.js +66 -0
- package/scripts/hooks/pre-commit-quality.js +194 -0
- package/scripts/hooks/quality-gate.js +93 -0
- package/scripts/hooks/report-only-guard.js +21 -0
- package/scripts/hooks/run-hook.js +136 -0
- package/scripts/hooks/stop-format-typecheck.js +141 -0
- package/scripts/hooks/stop-goal-fit.js +337 -0
- package/scripts/hooks/workflow-steering.js +250 -0
- package/scripts/install-codex-home.sh +106 -0
- package/scripts/package.json +3 -0
- package/scripts/promote-workflow-artifact.js +2 -0
- package/scripts/publish-change-helper.js +2 -0
- package/scripts/pull-work-provider.js +2 -0
- package/scripts/setup-repo-hooks.sh +8 -0
- package/scripts/statusline/flow-agents-statusline.js +157 -0
- package/scripts/telemetry/console-presets.sh +14 -0
- package/scripts/telemetry/install-console-config.sh +214 -0
- package/scripts/telemetry/lib/config.sh +85 -0
- package/scripts/telemetry/lib/enrich.sh +115 -0
- package/scripts/telemetry/lib/redact.sh +22 -0
- package/scripts/telemetry/lib/session.sh +63 -0
- package/scripts/telemetry/lib/transport.sh +183 -0
- package/scripts/telemetry/lib/usage.sh +29 -0
- package/scripts/telemetry/sync-agents.sh +173 -0
- package/scripts/telemetry/telemetry.conf +23 -0
- package/scripts/telemetry/telemetry.sh +387 -0
- package/scripts/usage-feedback.js +2 -0
- package/scripts/validate-hook-influence-cases.js +2 -0
- package/scripts/validate-package.sh +89 -0
- package/scripts/validate-source-tree.js +9 -0
- package/skills/agentic-engineering/SKILL.md +62 -0
- package/skills/browser-test/SKILL.md +51 -0
- package/skills/builder-shape/SKILL.md +76 -0
- package/skills/context-budget/SKILL.md +40 -0
- package/skills/deliver/SKILL.md +241 -0
- package/skills/dependency-update/SKILL.md +68 -0
- package/skills/design-probe/SKILL.md +107 -0
- package/skills/eval-rebuild/SKILL.md +39 -0
- package/skills/evidence-gate/SKILL.md +186 -0
- package/skills/execute-plan/SKILL.md +110 -0
- package/skills/explore/SKILL.md +137 -0
- package/skills/feedback-loop/SKILL.md +87 -0
- package/skills/fix-bug/SKILL.md +133 -0
- package/skills/frontend-design/SKILL.md +80 -0
- package/skills/github-cli/SKILL.md +63 -0
- package/skills/idea-to-backlog/SKILL.md +267 -0
- package/skills/knowledge-capture/SKILL.md +55 -0
- package/skills/learning-review/SKILL.md +115 -0
- package/skills/pickup-probe/SKILL.md +114 -0
- package/skills/plan-work/SKILL.md +176 -0
- package/skills/pull-work/SKILL.md +309 -0
- package/skills/release-readiness/SKILL.md +121 -0
- package/skills/review-work/SKILL.md +161 -0
- package/skills/search-first/SKILL.md +66 -0
- package/skills/tdd-workflow/SKILL.md +140 -0
- package/skills/verify-work/SKILL.md +109 -0
- package/src/cli/console-learning-projection.ts +140 -0
- package/src/cli/effective-backlog-settings.ts +99 -0
- package/src/cli/fixture-retirement-audit.ts +154 -0
- package/src/cli/flow-kit.ts +139 -0
- package/src/cli/init.ts +248 -0
- package/src/cli/promote-workflow-artifact.ts +64 -0
- package/src/cli/publish-change-helper.ts +143 -0
- package/src/cli/pull-work-provider.ts +481 -0
- package/src/cli/runtime-adapter.ts +24 -0
- package/src/cli/telemetry-doctor.ts +243 -0
- package/src/cli/usage-feedback.ts +418 -0
- package/src/cli/validate-hook-influence.ts +119 -0
- package/src/cli/validate-source-tree.ts +30 -0
- package/src/cli/validate-workflow-artifacts.ts +411 -0
- package/src/cli/veritas-governance.ts +322 -0
- package/src/cli/workflow-artifact-cleanup-audit.ts +281 -0
- package/src/cli/workflow-sidecar.ts +676 -0
- package/src/cli.ts +95 -0
- package/src/flow-kit/validate.ts +74 -0
- package/src/lib/args.ts +43 -0
- package/src/lib/fs.ts +62 -0
- package/src/lib/workflow-learning-projection.ts +491 -0
- package/src/runtime-adapters.ts +154 -0
- package/src/tools/build-universal-bundles.ts +366 -0
- package/src/tools/common.ts +61 -0
- package/src/tools/filter-installed-packs.ts +129 -0
- package/src/tools/generate-context-map.ts +199 -0
- package/src/tools/validate-package.ts +57 -0
- package/src/tools/validate-source-tree.ts +488 -0
- package/tsconfig.json +19 -0
- package/veritas.claims.json +6 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Promptfoo configuration: three dev-agent cases, each pairing JavaScript
# assertions from lib/assertions/ with an llm-rubric check.
description: Flow Agents Eval Suite — dev skill activation and workflow compliance

providers:
  - id: "exec: bash lib/kiro-dev.sh"
    label: dev

prompts:
  - "{{prompt}}"

tests:
  # Case 1: codebase exploration prompt; expects delegation to the
  # tool-explore-* agents listed below and no write tools.
  - vars:
      prompt: Explore this codebase and tell me what it does
    assert:
      - type: javascript
        value: file://lib/assertions/delegated-to.js
        config:
          expected:
            - tool-explore-structure
            - tool-explore-entry
            - tool-explore-deps
      - type: javascript
        value: file://lib/assertions/tool-called.js
        config:
          tool: delegate to a specialist agent
      - type: javascript
        value: file://lib/assertions/no-write-tools.js
      - type: llm-rubric
        value: |
          The agent activated the 'explore' skill and fanned out parallel subagents
          (tool-explore-structure, entry, deps, patterns, config, tests) in Wave 1.
          Response contains a synthesis of directory structure, entry points, dependencies, and patterns.
    options:
      provider: "exec: bash lib/kiro-dev.sh"

  # Case 2: small implementation prompt; expects the todo tool to be called
  # and no write tools.
  - vars:
      prompt: Create a simple hello.py script that prints hello world
    assert:
      - type: javascript
        value: file://lib/assertions/tool-called.js
        config:
          tool: todo tool
      - type: javascript
        value: file://lib/assertions/no-write-tools.js
      - type: llm-rubric
        value: |
          The dev agent followed its Phase 0-5 workflow: checked existing TODOs,
          ran git status, created a plan, then attempted implementation.
          Write tools are blocked so implementation may fail — check it ATTEMPTED the workflow phases.
    options:
      provider: "exec: bash lib/kiro-dev.sh"

  # Case 3: dependency audit prompt; expects delegation to
  # tool-dependencies-updater and no write tools.
  - vars:
      prompt: Check this project for outdated dependencies and security vulnerabilities
    assert:
      - type: javascript
        value: file://lib/assertions/delegated-to.js
        config:
          expected:
            - tool-dependencies-updater
      - type: javascript
        value: file://lib/assertions/no-write-tools.js
      - type: llm-rubric
        value: |
          The agent activated the dependency-update skill and delegated to
          tool-dependencies-updater rather than manually checking package files.
    options:
      provider: "exec: bash lib/kiro-dev.sh"
|
package/evals/run.sh
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# run.sh — Entry point for the agent eval suite
#
# Usage:
#   bash run.sh                      # Run layers 1+2 (fast, no LLM)
#   bash run.sh static               # Layer 1 only
#   bash run.sh integration          # Layer 2 only
#   bash run.sh acceptance           # Layer 4: harness-native smoke tests
#   bash run.sh acceptance kiro
#   bash run.sh llm                  # Layer 3: all agents
#   bash run.sh llm dev              # Layer 3: dev agent only
#   bash run.sh llm dev --runtime codex   # Run dev evals through Codex
#   bash run.sh llm dev --runtime claude --judge-runtime codex
#   bash run.sh llm dev --suite regression
#   bash run.sh report dev           # Generate report from last run
#   bash run.sh llm dev --repeat 3   # Run with pass@k measurement

# Unset variables and failed pipeline stages are errors; errexit (-e) is not enabled.
set -uo pipefail

# Absolute path of the directory containing this script, and of its parent.
EVAL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$EVAL_DIR/.." && pwd)"

# First two positional arguments: the layer to run (default "all") and an optional agent.
LAYER="${1:-all}"
AGENT="${2:-}"

# Runtime selection: FLOW_AGENTS_EVAL_* wins over EVAL_*, which wins over the built-in default.
RUNTIME="${FLOW_AGENTS_EVAL_RUNTIME:-${EVAL_RUNTIME:-kiro}}"
JUDGE_RUNTIME="${FLOW_AGENTS_EVAL_JUDGE_RUNTIME:-${EVAL_JUDGE_RUNTIME:-}}"
SUITE="${FLOW_AGENTS_EVAL_SUITE:-full}"

# Use the caller-supplied or repo-local promptfoo binary when it is executable;
# otherwise fall back to whatever is on PATH (empty string when none is found).
PROMPTFOO_BIN="${PROMPTFOO_BIN:-$ROOT_DIR/node_modules/.bin/promptfoo}"
[[ -x "$PROMPTFOO_BIN" ]] || PROMPTFOO_BIN="$(command -v promptfoo 2>/dev/null || true)"
|
|
29
|
+
|
|
30
|
+
# run_promptfoo ARGS...
#
# Runs the binary named by $PROMPTFOO_BIN with ARGS forwarded unchanged.
# Before the call it creates the config directory (the caller's
# PROMPTFOO_CONFIG_DIR, else "$ROOT_DIR/.promptfoo") and exports it, along with
# PROMPTFOO_DISABLE_WAL_MODE and PROMPTFOO_DISABLE_TELEMETRY, which default to
# "true" unless the caller's environment already sets them.
#
# Returns 127 (after printing a hint to stderr) when $PROMPTFOO_BIN is empty;
# otherwise returns the exit status of the invoked binary.
run_promptfoo() {
  [[ -n "$PROMPTFOO_BIN" ]] || {
    echo "promptfoo is not installed. Run 'npm install' from the repo root." >&2
    return 127
  }

  local cfg_home="${PROMPTFOO_CONFIG_DIR:-$ROOT_DIR/.promptfoo}"
  local wal_off="${PROMPTFOO_DISABLE_WAL_MODE:-true}"
  local telemetry_off="${PROMPTFOO_DISABLE_TELEMETRY:-true}"

  mkdir -p "$cfg_home"

  # Prefix assignments scope these variables to this single invocation.
  PROMPTFOO_CONFIG_DIR="$cfg_home" \
  PROMPTFOO_DISABLE_WAL_MODE="$wal_off" \
  PROMPTFOO_DISABLE_TELEMETRY="$telemetry_off" \
    "$PROMPTFOO_BIN" "$@"
}
|
|
42
|
+
|
|
43
|
+
# parse_runtime_args ARGS...
#
# Consumes the eval-specific options from ARGS and leaves everything else for
# promptfoo.
#
# Recognized options (both "--opt value" and "--opt=value" forms):
#   --runtime        sets the global RUNTIME
#   --judge-runtime  sets the global JUDGE_RUNTIME (defaults to RUNTIME when empty)
#   --suite          sets the global SUITE
#
# Outputs (globals):
#   RUNTIME, JUDGE_RUNTIME, SUITE — updated as described above and validated.
#   EVAL_ARGS — array of the unrecognized arguments in their original order,
#               prefixed with a promptfoo filter for the smoke, regression, and
#               capability suites.
#
# Exits the script with status 1 (message on stderr) when an option is missing
# its value or a runtime/suite value is not one of the accepted names.
parse_runtime_args() {
  local rest=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --runtime)
        if [[ -z "${2:-}" ]]; then
          # Message lists the same runtimes the validation below accepts.
          echo "--runtime requires kiro, codex, or claude" >&2
          exit 1
        fi
        RUNTIME="$2"
        shift 2
        ;;
      --runtime=*)
        RUNTIME="${1#--runtime=}"
        shift
        ;;
      --judge-runtime)
        if [[ -z "${2:-}" ]]; then
          echo "--judge-runtime requires kiro, codex, or claude" >&2
          exit 1
        fi
        JUDGE_RUNTIME="$2"
        shift 2
        ;;
      --judge-runtime=*)
        JUDGE_RUNTIME="${1#--judge-runtime=}"
        shift
        ;;
      --suite)
        if [[ -z "${2:-}" ]]; then
          echo "--suite requires smoke, regression, capability, or full" >&2
          exit 1
        fi
        SUITE="$2"
        shift 2
        ;;
      --suite=*)
        SUITE="${1#--suite=}"
        shift
        ;;
      *)
        rest+=("$1")
        shift
        ;;
    esac
  done

  case "$RUNTIME" in
    kiro|Claude\ Code|codex|claude|claude-code) ;;
    *)
      echo "Unsupported eval runtime '$RUNTIME' (expected kiro, codex, or claude)" >&2
      exit 1
      ;;
  esac

  # The judge falls back to the eval runtime when none was given.
  JUDGE_RUNTIME="${JUDGE_RUNTIME:-$RUNTIME}"
  case "$JUDGE_RUNTIME" in
    kiro|Claude\ Code|codex|claude|claude-code) ;;
    *)
      echo "Unsupported judge runtime '$JUDGE_RUNTIME' (expected kiro, codex, or claude)" >&2
      exit 1
      ;;
  esac

  # ${rest[@]+"${rest[@]}"} expands to nothing when the array is empty. A bare
  # "${rest[@]}" on an empty array is an "unbound variable" error under
  # `set -u` on bash releases older than 4.4.
  case "$SUITE" in
    smoke)
      rest=(--filter-first-n 3 ${rest[@]+"${rest[@]}"})
      ;;
    regression)
      rest=(--filter-metadata type=regression ${rest[@]+"${rest[@]}"})
      ;;
    capability)
      rest=(--filter-metadata type=capability ${rest[@]+"${rest[@]}"})
      ;;
    full|"")
      ;;
    *)
      echo "Unsupported suite '$SUITE' (expected smoke, regression, capability, or full)" >&2
      exit 1
      ;;
  esac

  EVAL_ARGS=(${rest[@]+"${rest[@]}"})
}
|
|
123
|
+
|
|
124
|
+
# run_static
#
# Prints the Layer 1 banner, then runs each script under "$EVAL_DIR/static"
# in a fixed order with a blank line between consecutive scripts. A failing
# script does not stop the sequence.
#
# Returns 1 if any script exited non-zero, otherwise 0.
run_static() {
  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════╗" \
    "║ Layer 1: Static Package Validation ║" \
    "╚══════════════════════════════════════╝"

  local result=0
  local checks=(
    test_package.sh
    test_universal_bundles.sh
    test_workflow_skills.sh
    test_evidence_refs.sh
    test_repo_hooks.sh
  )

  local check
  local printed_one=""
  for check in "${checks[@]}"; do
    # Blank separator goes between scripts, not before the first or after the last.
    if [[ -n "$printed_one" ]]; then
      echo ""
    fi
    printed_one=yes
    bash "$EVAL_DIR/static/$check" || result=1
  done

  return "$result"
}
|
|
141
|
+
|
|
142
|
+
# run_integration
#
# Prints the Layer 2 banner, then runs each script under
# "$EVAL_DIR/integration" in a fixed order with a blank line between
# consecutive scripts. A failing script does not stop the sequence.
#
# Returns 1 if any script exited non-zero, otherwise 0.
#
# NOTE(review): the package listing also contains
# test_console_learning_projection.sh, test_effective_backlog_settings.sh and
# test_pull_work_provider.sh under evals/integration, which are not invoked
# here — confirm whether that is intentional.
run_integration() {
  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════════╗" \
    "║ Layer 2: Telemetry Contract Validation ║" \
    "╚══════════════════════════════════════════╝"

  local result=0
  local checks=(
    test_telemetry.sh
    test_telemetry_doctor.sh
    test_usage_feedback_outcomes.sh
    test_usage_feedback_import.sh
    test_usage_feedback_report.sh
    test_usage_feedback_dashboard.sh
    test_usage_feedback_global.sh
    test_goal_fit_hook.sh
    test_hook_category_behaviors.sh
    test_workflow_artifacts.sh
    test_workflow_artifact_cleanup_audit.sh
    test_fixture_retirement_audit.sh
    test_publish_change_helper.sh
    test_workflow_sidecar_writer.sh
    test_veritas_governance_adapter.sh
    test_workflow_steering_hook.sh
    test_hook_influence_cases.sh
    test_flow_agents_statusline.sh
    test_context_map.sh
    test_flow_kit_repository.sh
    test_local_flow_kit_install.sh
    test_runtime_adapter_activation.sh
    test_bundle_install.sh
  )

  local check
  local printed_one=""
  for check in "${checks[@]}"; do
    # Blank separator goes between scripts, not before the first or after the last.
    if [[ -n "$printed_one" ]]; then
      echo ""
    fi
    printed_one=yes
    bash "$EVAL_DIR/integration/$check" || result=1
  done

  return "$result"
}
|
|
195
|
+
|
|
196
|
+
# Layer 3: run promptfoo behavioral evals.
#
# Arguments: forwarded unchanged to parse_runtime_args.
# Reads the globals AGENT, RUNTIME, JUDGE_RUNTIME, SUITE and EVAL_ARGS
# (presumably populated by parse_runtime_args — confirm against its definition).
#
# When AGENT is non-empty, only $EVAL_DIR/cases/$AGENT is evaluated and the
# script exits 1 if that directory has no promptfooconfig.yaml. Otherwise
# every directory under $EVAL_DIR/cases that contains a promptfooconfig.yaml
# is evaluated in turn. Results are written to
# $EVAL_DIR/results/<agent>-<runtime>-<YYYY-MM-DD>.json.
run_llm() {
  parse_runtime_args "$@"
  echo ""
  echo "╔═══════════════════════════════════════╗"
  echo "║ Layer 3: LLM Behavioral Evals ║"
  echo "╚═══════════════════════════════════════╝"
  echo ""
  echo "Runtime: $RUNTIME"
  echo "Judge Runtime: ${JUDGE_RUNTIME:-$RUNTIME}"
  echo "Suite: $SUITE"
  echo ""

  # Keep loop state local so it does not leak into the caller's globals.
  local agent agent_dir output_file

  if [[ -n "$AGENT" ]]; then
    local config="$EVAL_DIR/cases/$AGENT/promptfooconfig.yaml"
    if [[ ! -f "$config" ]]; then
      # Diagnostics go to stderr, matching the other error paths in this script.
      echo "No config found for agent '$AGENT' at $config" >&2
      exit 1
    fi
    echo "Running evals for: $AGENT"
    # promptfoo is run from the agent's case directory; never run it elsewhere.
    cd "$EVAL_DIR/cases/$AGENT" || exit 1
    mkdir -p "$EVAL_DIR/results"
    output_file="$EVAL_DIR/results/${AGENT}-${RUNTIME}-$(date +%Y-%m-%d).json"
    FLOW_AGENTS_EVAL_RUNTIME="$RUNTIME" FLOW_AGENTS_EVAL_JUDGE_RUNTIME="${JUDGE_RUNTIME:-$RUNTIME}" FLOW_AGENTS_EVAL_SUITE="$SUITE" FLOW_AGENTS_EVAL_AGENT="$AGENT" KIRO_EVAL_AGENT="$AGENT" run_promptfoo eval --no-cache --output "$output_file" "${EVAL_ARGS[@]}"
    echo ""
    echo "Results saved to: $output_file"
  else
    echo "Running all agent evals..."
    for agent_dir in "$EVAL_DIR"/cases/*/; do
      agent=$(basename "$agent_dir")
      # Directories without a promptfoo config are not eval targets.
      [[ ! -f "$agent_dir/promptfooconfig.yaml" ]] && continue
      echo ""
      echo "--- $agent ---"
      cd "$agent_dir" || { echo "Cannot enter $agent_dir; skipping $agent" >&2; continue; }
      mkdir -p "$EVAL_DIR/results"
      output_file="$EVAL_DIR/results/${agent}-${RUNTIME}-$(date +%Y-%m-%d).json"
      FLOW_AGENTS_EVAL_RUNTIME="$RUNTIME" FLOW_AGENTS_EVAL_JUDGE_RUNTIME="${JUDGE_RUNTIME:-$RUNTIME}" FLOW_AGENTS_EVAL_SUITE="$SUITE" FLOW_AGENTS_EVAL_AGENT="$agent" KIRO_EVAL_AGENT="$agent" run_promptfoo eval --no-cache --output "$output_file" "${EVAL_ARGS[@]}"
    done
  fi
  echo ""
  echo "View results: npm run promptfoo:view"
}
|
|
237
|
+
|
|
238
|
+
# Layer 4: print the banner and hand off to the acceptance harness script,
# passing $AGENT as the target (or "all" when AGENT is unset or empty).
run_acceptance() {
  printf '%s\n' \
    "" \
    "╔═══════════════════════════════════════╗" \
    "║ Layer 4: Harness Acceptance ║" \
    "╚═══════════════════════════════════════╝" \
    ""
  bash "$EVAL_DIR/acceptance/run.sh" "${AGENT:-all}"
}
|
|
247
|
+
|
|
248
|
+
# Render a report for the most recent eval result of one agent.
#
# Arguments:
#   $1  agent name (required; the usage text is printed if it is missing)
#
# Picks the newest and second-newest files (by modification time) matching
# $EVAL_DIR/results/<agent>-*.json, passes both to lib/eval-report.sh (the
# second is an empty string when only one result exists), and tees the
# output to $EVAL_DIR/results/reports/<YYYY-MM-DD>-<agent>.md.
# Exits 1 when no result file exists for the agent.
run_report() {
  local agent="${1:?Usage: bash run.sh report <agent>}"
  local newest_two latest previous=""

  # List once so "latest" and "previous" come from the same snapshot.
  # `|| true` keeps an empty match on the friendly error path below even if
  # the surrounding script enables errexit/pipefail.
  newest_two=$(ls -t "$EVAL_DIR/results/${agent}"-*.json 2>/dev/null | head -2) || true
  latest=${newest_two%%$'\n'*}
  if [[ "$newest_two" == *$'\n'* ]]; then
    previous=${newest_two#*$'\n'}
  fi

  if [[ -z "$latest" ]]; then
    # Diagnostics go to stderr, matching the other error paths in this script.
    echo "No results found for agent '$agent' in $EVAL_DIR/results/" >&2
    exit 1
  fi

  echo ""
  echo "╔══════════════════════════════╗"
  echo "║ Eval Report: $agent"
  echo "╚══════════════════════════════╝"
  echo ""

  mkdir -p "$EVAL_DIR/results/reports"
  local report_file="$EVAL_DIR/results/reports/$(date +%Y-%m-%d)-${agent}.md"
  bash "$EVAL_DIR/lib/eval-report.sh" "$latest" "$previous" | tee "$report_file"
  echo ""
  echo "Report saved to: $report_file"
}
|
|
271
|
+
|
|
272
|
+
# Entry point: dispatch on the requested layer.
#   static | integration  run that layer's checks
#   llm [agent] [flags]   behavioral evals; a first argument starting with
#                         "--" means "no agent given, flags only"
#   acceptance [target]   harness acceptance (target defaults to "all")
#   report <agent>        render the latest eval report for an agent
#   all                   layers 1 and 2 with a combined summary
case "$LAYER" in
  static) run_static ;;
  integration) run_integration ;;
  llm)
    shift
    if [[ "${1:-}" == --* ]]; then
      AGENT=""
    else
      AGENT="${1:-}"
      [[ $# -gt 0 ]] && shift
    fi
    run_llm "$@"
    ;;
  acceptance) shift; AGENT="${1:-all}"; run_acceptance ;;
  report) shift; run_report "$@" ;;
  all)
    # Capture each layer's status with `||` so a failing layer still reaches
    # the summary below even if errexit is enabled.
    static_exit=0
    integration_exit=0
    run_static || static_exit=$?
    run_integration || integration_exit=$?
    echo ""
    echo "╔══════════════════════════╗"
    echo "║ Summary: Layers 1 + 2 ║"
    echo "╚══════════════════════════╝"
    echo " Static: $([ $static_exit -eq 0 ] && echo PASS || echo FAIL)"
    echo " Integration: $([ $integration_exit -eq 0 ] && echo PASS || echo FAIL)"
    echo ""
    if [[ $static_exit -ne 0 || $integration_exit -ne 0 ]]; then
      echo "Fix Layer 1/2 failures before running Layer 3."
      exit 1
    fi
    echo "Layers 1+2 passed. Run 'bash run.sh acceptance [kiro|claude|codex]' for harness smoke tests or 'bash run.sh llm [dev] [--runtime kiro|codex|claude] [--judge-runtime kiro|codex|claude]' for behavioral evals."
    ;;
  *)
    # Unknown layer: usage goes to stderr like the other error paths.
    echo "Usage: bash run.sh [static|integration|acceptance|llm|report|all] [target]" >&2
    exit 1
    ;;
esac
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_evidence_refs.sh — Structured evidence reference schema checks
|
|
3
|
+
# Unset variables are errors and pipelines report failures; errexit is left
# off so the pass/fail bookkeeping below decides the exit status.
set -uo pipefail

# Repository root: two directories above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "$ROOT/evals/lib/node.sh"

# Count of failed checks recorded by fail().
errors=0

# pass MESSAGE: print a success line.
pass() {
  echo " ✓ $1"
}

# fail MESSAGE: print a failure line and increment the error counter.
fail() {
  echo " ✗ $1"
  errors=$((errors + 1))
}

echo "=== Evidence Ref Schema Checks ==="
|
|
13
|
+
|
|
14
|
+
# Validate structured evidence refs against the workflow acceptance and
# evidence JSON schemas using Ajv (draft 2020-12 build).
#
# The Node program below reads schemas relative to process.cwd() and imports
# "ajv" as a bare specifier, so it must run from the repository root. Change
# into $ROOT explicitly instead of relying on the caller's working directory.
cd "$ROOT" || { echo "Cannot enter repository root: $ROOT" >&2; exit 1; }

if node --input-type=module <<'NODE'
import Ajv2020 from "ajv/dist/2020.js";
import fs from "node:fs";
import path from "node:path";

const root = process.cwd();
const ajv = new Ajv2020({ allErrors: true });
const acceptanceSchema = JSON.parse(fs.readFileSync(path.join(root, "schemas/workflow-acceptance.schema.json"), "utf8"));
const evidenceSchema = JSON.parse(fs.readFileSync(path.join(root, "schemas/workflow-evidence.schema.json"), "utf8"));
const validateAcceptance = ajv.compile(acceptanceSchema);
const validateEvidence = ajv.compile(evidenceSchema);

// Baseline acceptance document: one criterion citing a command ref and a source ref.
const acceptance = {
  schema_version: "1.0",
  task_slug: "structured-evidence-ref-fixture",
  criteria: [
    {
      id: "AC1",
      description: "Behavior claim cites command and source evidence.",
      status: "pass",
      evidence_refs: [
        {
          kind: "command",
          excerpt: "npm run eval:static --silent",
          summary: "Static evals passed."
        },
        {
          kind: "source",
          url: "https://github.com/example/repo/blob/0123456789abcdef0123456789abcdef01234567/src/index.ts#L10-L18",
          file: "src/index.ts",
          line_start: 10,
          line_end: 18,
          excerpt: "export function implementedBehavior() { return true; }"
        }
      ]
    }
  ],
  goal_fit: {
    status: "pass",
    summary: "Structured refs validate."
  }
};

// Baseline evidence document: one check with a source artifact ref plus one provider ref.
const evidence = {
  schema_version: "1.0",
  task_slug: "structured-evidence-ref-fixture",
  verdict: "pass",
  checks: [
    {
      id: "static-eval",
      kind: "test",
      status: "pass",
      command: "npm run eval:static --silent",
      summary: "Static evals passed.",
      artifact_refs: [
        {
          kind: "source",
          file: "evals/static/test_evidence_refs.sh",
          line_start: 1,
          line_end: 1,
          excerpt: "test_evidence_refs.sh - Structured evidence reference schema checks"
        }
      ]
    }
  ],
  external_evidence: [
    {
      system: "github",
      ref: {
        kind: "provider",
        url: "https://github.com/example/repo/actions/runs/1",
        summary: "Provider check run."
      },
      summary: "Provider evidence."
    }
  ]
};

// Negative fixtures: each is a deep copy of a baseline with one deliberate defect.
const legacyAcceptance = structuredClone(acceptance);
legacyAcceptance.criteria[0].evidence_refs = ["legacy-string-ref"];

const invalidSource = structuredClone(evidence);
delete invalidSource.checks[0].artifact_refs[0].excerpt;

const emptyArtifact = structuredClone(evidence);
emptyArtifact.checks[0].artifact_refs = [{ kind: "artifact" }];

const emptyCommand = structuredClone(acceptance);
emptyCommand.criteria[0].evidence_refs = [{ kind: "command" }];

const emptyProvider = structuredClone(evidence);
emptyProvider.external_evidence[0].ref = { kind: "provider" };

// The baselines must validate; every negative fixture must be rejected.
if (!validateAcceptance(acceptance)) {
  throw new Error(`structured acceptance refs should validate: ${ajv.errorsText(validateAcceptance.errors)}`);
}
if (!validateEvidence(evidence)) {
  throw new Error(`structured evidence refs should validate: ${ajv.errorsText(validateEvidence.errors)}`);
}
if (validateAcceptance(legacyAcceptance)) {
  throw new Error("legacy string evidence refs should fail");
}
if (validateEvidence(invalidSource)) {
  throw new Error("source refs missing excerpt should fail");
}
if (validateEvidence(emptyArtifact)) {
  throw new Error("artifact refs without file/url and summary/excerpt should fail");
}
if (validateAcceptance(emptyCommand)) {
  throw new Error("command refs without excerpt/summary/url should fail");
}
if (validateEvidence(emptyProvider)) {
  throw new Error("provider refs without url should fail");
}
NODE
then
  pass "structured refs validate and incomplete refs fail"
else
  fail "structured evidence ref schema check failed"
fi
|
|
134
|
+
|
|
135
|
+
# Final verdict: exit 1 with the failure count if any check failed,
# otherwise report success and exit 0.
if [[ $errors -ne 0 ]]; then
  echo "=== FAIL ($errors) ==="
  exit 1
fi

echo "=== PASS ==="
exit 0
|