npm - @kontourai/flow-agents - Versions diffs - 1.4.0 → 2.0.1 - Mend

@kontourai/flow-agents 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184)

package/.github/CODEOWNERS +29 -0
package/.github/actions/trust-verify/action.yml +145 -0
package/.github/workflows/ci.yml +11 -4
package/.github/workflows/kit-gates-demo.yml +2 -2
package/.github/workflows/publish-npm.yml +10 -2
package/.github/workflows/release-please.yml +1 -1
package/.github/workflows/runtime-compat.yml +1 -1
package/.github/workflows/trust-reconcile.yml +113 -0
package/AGENTS.md +13 -0
package/CHANGELOG.md +103 -0
package/CONTRIBUTING.md +4 -4
package/README.md +1 -0
package/agents/tool-planner.json +1 -1
package/build/src/cli/init.js +242 -20
package/build/src/cli/validate-workflow-artifacts.js +19 -2
package/build/src/cli/verify.d.ts +1 -0
package/build/src/cli/verify.js +90 -0
package/build/src/cli/workflow-sidecar.d.ts +316 -8
package/build/src/cli/workflow-sidecar.js +1996 -91
package/build/src/cli.js +2 -3
package/build/src/lib/flow-resolver.d.ts +111 -0
package/build/src/lib/flow-resolver.js +308 -0
package/build/src/tools/build-universal-bundles.js +34 -22
package/build/src/tools/generate-context-map.js +3 -16
package/build/src/tools/validate-source-tree.d.ts +1 -1
package/build/src/tools/validate-source-tree.js +42 -162
package/context/contracts/artifact-contract.md +10 -0
package/context/contracts/delivery-contract.md +1 -0
package/context/contracts/review-contract.md +1 -0
package/context/contracts/verification-contract.md +2 -0
package/context/gate-awareness.md +39 -0
package/context/scripts/hooks/stop-goal-fit.js +632 -70
package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
package/docs/adr/0007-skill-audit.md +1 -1
package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
package/docs/adr/0011-mcp-posture.md +100 -0
package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
package/docs/adr/0013-context-lifecycle.md +151 -0
package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
package/docs/adr/0016-three-hard-boundary-model.md +71 -0
package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
package/docs/agent-system-guidebook.md +5 -12
package/docs/context-map.md +4 -10
package/docs/index.md +3 -2
package/docs/integrations/framework-adapter.md +19 -6
package/docs/integrations/index.md +2 -2
package/docs/north-star.md +4 -4
package/docs/operating-layers.md +3 -3
package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
package/docs/repository-structure.md +2 -2
package/docs/skills-map.md +1 -0
package/docs/spec/runtime-hook-surface.md +62 -9
package/docs/standards-register.md +3 -3
package/docs/survey-utterance-check.md +1 -1
package/docs/trust-anchor-adoption.md +197 -0
package/docs/verifiable-trust.md +95 -0
package/docs/veritas-integration.md +2 -2
package/docs/workflow-usage-guide.md +69 -0
package/evals/acceptance/DEMO-false-completion.md +144 -0
package/evals/acceptance/demo-cast.sh +92 -0
package/evals/acceptance/demo-false-completion.sh +72 -0
package/evals/acceptance/demo-real-evidence.sh +104 -0
package/evals/acceptance/demo.tape +29 -0
package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
package/evals/acceptance/prove-capture-teeth.sh +114 -0
package/evals/acceptance/prove-teeth.sh +105 -0
package/evals/ci/antigaming-suite.sh +55 -0
package/evals/ci/run-baseline.sh +2 -0
package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
package/evals/integration/test_builder_step_producers.sh +379 -0
package/evals/integration/test_bundle_install.sh +35 -71
package/evals/integration/test_bundle_lifecycle.sh +39 -2
package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
package/evals/integration/test_checkpoint_signing.sh +489 -0
package/evals/integration/test_claim_lookup.sh +352 -0
package/evals/integration/test_command_log_fork_classification.sh +134 -0
package/evals/integration/test_command_log_integrity.sh +275 -0
package/evals/integration/test_context_map.sh +0 -2
package/evals/integration/test_dual_emit_flow_step.sh +278 -0
package/evals/integration/test_enforcer_expects_driven.sh +281 -0
package/evals/integration/test_evidence_capture_hook.sh +185 -0
package/evals/integration/test_flow_kit_repository.sh +2 -0
package/evals/integration/test_flowdef_session_activation.sh +273 -0
package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
package/evals/integration/test_gate_bypass_chain.sh +448 -0
package/evals/integration/test_gate_lockdown.sh +1137 -0
package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
package/evals/integration/test_goal_fit_hook.sh +69 -4
package/evals/integration/test_goal_fit_rederive.sh +263 -0
package/evals/integration/test_install_merge.sh +1176 -0
package/evals/integration/test_kit_identity_trust.sh +393 -0
package/evals/integration/test_mint_attestation.sh +373 -0
package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
package/evals/integration/test_publish_delivery.sh +269 -0
package/evals/integration/test_reconcile_soundness.sh +528 -0
package/evals/integration/test_resolvefirststep_security.sh +208 -0
package/evals/integration/test_session_resume_roundtrip.sh +286 -0
package/evals/integration/test_trust_checkpoint.sh +325 -0
package/evals/integration/test_trust_reconcile.sh +293 -0
package/evals/integration/test_verify_cli.sh +208 -0
package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
package/evals/lib/node.sh +0 -6
package/evals/run.sh +47 -0
package/evals/static/test_workflow_skills.sh +6 -13
package/install.sh +0 -7
package/integrations/strands-ts/README.md +25 -15
package/integrations/veritas/flow-agents.adapter.json +1 -2
package/kits/builder/flows/build.flow.json +59 -12
package/kits/builder/kit.json +85 -15
package/kits/builder/skills/continue-work/SKILL.md +116 -0
package/kits/builder/skills/deliver/SKILL.md +36 -6
package/kits/builder/skills/design-probe/SKILL.md +28 -0
package/kits/builder/skills/execute-plan/SKILL.md +9 -1
package/kits/builder/skills/gate-review/SKILL.md +234 -0
package/kits/builder/skills/learning-review/SKILL.md +30 -0
package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
package/kits/builder/skills/plan-work/SKILL.md +13 -1
package/kits/builder/skills/pull-work/SKILL.md +19 -0
package/kits/knowledge/adapters/default-store/index.js +38 -0
package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
package/kits/knowledge/docs/store-contract.md +314 -0
package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
package/kits/knowledge/evals/entities/suite.test.js +40 -0
package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
package/kits/knowledge/evals/retirement/suite.test.js +145 -0
package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
package/kits/knowledge/kit.json +51 -1
package/package.json +6 -6
package/packaging/conformance/README.md +10 -2
package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
package/packaging/conformance/run-conformance.js +1 -1
package/scripts/README.md +2 -1
package/scripts/build-universal-bundles.js +0 -1
package/scripts/ci/mint-attestation.js +221 -0
package/scripts/ci/trust-reconcile.js +545 -0
package/scripts/hooks/config-protection.js +423 -1
package/scripts/hooks/evidence-capture.js +348 -0
package/scripts/hooks/lib/liveness-read.js +113 -0
package/scripts/hooks/run-hook.js +6 -1
package/scripts/hooks/stop-goal-fit.js +1524 -79
package/scripts/hooks/workflow-steering.js +135 -5
package/scripts/install-codex-home.sh +39 -0
package/scripts/install-merge.js +330 -0
package/scripts/repair-command-log.js +115 -0
package/src/cli/init.ts +218 -20
package/src/cli/validate-workflow-artifacts.ts +18 -2
package/src/cli/verify.ts +100 -0
package/src/cli/workflow-sidecar.ts +2127 -84
package/src/cli.ts +2 -3
package/src/lib/flow-resolver.ts +369 -0
package/src/tools/build-universal-bundles.ts +34 -21
package/src/tools/generate-context-map.ts +3 -17
package/src/tools/validate-source-tree.ts +44 -104
package/build/src/tools/filter-installed-packs.d.ts +0 -2
package/build/src/tools/filter-installed-packs.js +0 -135
package/packaging/packs.json +0 -49
package/scripts/filter-installed-packs.js +0 -2
package/src/tools/filter-installed-packs.ts +0 -132

package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json ADDED Viewed

@@ -0,0 +1,20 @@
+{
+  "schema_version": "1.0",
+  "id": "missing-extension-asset-kit",
+  "name": "Missing Extension Asset Kit",
+  "product_name": "Missing Extension Asset Kit",
+  "description": "A valid Flow Kit container with a Flow Agents extension asset pointing at a missing file.",
+  "flows": [
+    {
+      "id": "missing.extension.asset.review",
+      "path": "flows/review.flow.json",
+      "description": "Review a small change."
+    }
+  ],
+  "docs": [
+    {
+      "id": "missing.docs",
+      "path": "docs/MISSING.md"
+    }
+  ]
+}

package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json ADDED Viewed

@@ -0,0 +1,26 @@
+{
+  "id": "unknown.extension.review",
+  "version": "1.0",
+  "steps": [
+    { "id": "review", "next": "done" },
+    { "id": "done", "next": null }
+  ],
+  "gates": {
+    "review-gate": {
+      "step": "review",
+      "expects": [
+        {
+          "id": "review-evidence",
+          "kind": "trust.bundle",
+          "required": true,
+          "description": "Review evidence has been recorded.",
+          "bundle_claim": {
+            "claimType": "example.review.evidence",
+            "subjectType": "artifact",
+            "accepted_statuses": ["trusted", "accepted"]
+          }
+        }
+      ]
+    }
+  }
+}

package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json ADDED Viewed

@@ -0,0 +1,18 @@
+{
+  "schema_version": "1.0",
+  "id": "unknown-extension-kit",
+  "name": "Unknown Extension Kit",
+  "product_name": "Unknown Extension Kit",
+  "description": "A valid Flow Kit container with an arbitrary top-level extension field.",
+  "flows": [
+    {
+      "id": "unknown.extension.review",
+      "path": "flows/review.flow.json",
+      "description": "Review a small change."
+    }
+  ],
+  "third_party_extension": {
+    "provider": "example.vendor",
+    "path": "vendor/meta.json"
+  }
+}

package/evals/integration/test_builder_step_producers.sh ADDED Viewed

@@ -0,0 +1,379 @@
+#!/usr/bin/env bash
+# test_builder_step_producers.sh — Integration eval for ADR 0016 Abstraction A P-d Increment 2.
+#
+# Proves for each of the 6 producer-wired gate claims:
+#   - record-gate-claim at the correct active step produces the declared claim
+#     (correct claimType + subjectType, status=verified in the bundle).
+#   - A TAMPERED bundle (stored verified, evidence fail) at that step BLOCKS (exit 2)
+#     with the tamper warning naming the declared claimType.
+#
+# Claims covered:
+#   1. builder.pull-work.selected           (step: pull-work,    expectation: selected-work)
+#   2. builder.design-probe.pickup-readiness (step: design-probe, expectation: pickup-probe-readiness)
+#   3. builder.design-probe.decisions        (step: design-probe, expectation: probe-decisions-or-accepted-gaps)
+#   4. builder.pr-open.pull-request          (step: pr-open,      expectation: pull-request-opened)
+#   5. builder.learn.decisions               (step: learn,        expectation: decision-evidence)
+#   6. builder.learn.evidence                (step: learn,        expectation: learning-evidence)
+#
+# build.flow.json confirmation:
+#   - All 6 claims above are required:true.
+#   - policy-compliance remains required:false (advisory — no skill producer).
+#
+# Deterministic, no model spend, self-cleaning.
+# Usage: bash evals/integration/test_builder_step_producers.sh
+# errexit is deliberately not enabled here: check failures are tallied in
+# `errors` and reported by the summary at the end of the script.
+set -uo pipefail
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+source "$ROOT/evals/lib/node.sh"
+GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
+export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
+# Stop immediately if no scratch directory can be created; with an empty TMP
+# every fixture path below would resolve to "/<slug>".
+TMP="$(mktemp -d)" || { echo "mktemp -d failed; cannot create scratch directory" >&2; exit 1; }
+errors=0
+# Print a passing check line.
+_pass() { echo "  ✓ $1"; }
+# Print a failing check line and count it towards the final exit status.
+_fail() { echo "  ✗ $1"; errors=$((errors + 1)); }
+# Remove the scratch directory; `:?` refuses to run rm with an empty path.
+cleanup() { rm -rf -- "${TMP:?}"; }
+trap cleanup EXIT
+# ─── Helper: set active_step_id for a step.
+# For steps in the phase_map, use advance-state.
+# For design-probe (no phase mapping), use ensure-session --step-id.
+# ──────────────────────────────────────────────────────────────────
+set_active_step() {
+  local aroot="$1" slug="$2" step="$3"
+  case "$step" in
+    design-probe)
+      # design-probe has no lifecycle phase in the phase_map — set via ensure-session --step-id
+      flow_agents_node "workflow-sidecar" ensure-session \
+        --artifact-root "$aroot" \
+        --task-slug "$slug" \
+        --title "Producer test: $step" \
+        --summary "Test gate-claim producer at $step." \
+        --flow-id builder.build \
+        --step-id design-probe \
+        --timestamp "2026-06-26T00:00:30Z" >/dev/null 2>&1
+      ;;
+    pull-work)
+      flow_agents_node "workflow-sidecar" advance-state "$aroot/$slug" \
+        --status in_progress --phase pickup \
+        --summary "Testing at $step." --next-action "Record claim." \
+        --flow-definition builder.build \
+        --timestamp "2026-06-26T00:01:00Z" >/dev/null 2>&1
+      ;;
+    pr-open)
+      flow_agents_node "workflow-sidecar" advance-state "$aroot/$slug" \
+        --status in_progress --phase release \
+        --summary "Testing at $step." --next-action "Record claim." \
+        --flow-definition builder.build \
+        --timestamp "2026-06-26T00:01:00Z" >/dev/null 2>&1
+      ;;
+    learn)
+      flow_agents_node "workflow-sidecar" advance-state "$aroot/$slug" \
+        --status in_progress --phase learning \
+        --summary "Testing at $step." --next-action "Record claim." \
+        --flow-definition builder.build \
+        --timestamp "2026-06-26T00:01:00Z" >/dev/null 2>&1
+      ;;
+  esac
+}
+# ─── Helper: bootstrap a session for produce tests ───────────────────────────
+setup_session_for_produce() {
+  local aroot="$1" slug="$2" step="$3"
+  mkdir -p "$aroot"
+  flow_agents_node "workflow-sidecar" ensure-session \
+    --artifact-root "$aroot" \
+    --task-slug "$slug" \
+    --title "Producer test: $step" \
+    --summary "Test gate-claim producer at $step." \
+    --flow-id builder.build \
+    --timestamp "2026-06-26T00:00:00Z" >/dev/null 2>&1
+  flow_agents_node "workflow-sidecar" init-plan "$aroot/$slug/$slug--deliver.md" \
+    --source-request "Test" --summary "Testing" \
+    --timestamp "2026-06-26T00:00:00Z" >/dev/null 2>&1
+  set_active_step "$aroot" "$slug" "$step"
+}
+# ─── Helper: bootstrap a session + AGENTS.md for tamper tests ────────────────
+setup_tamper_session() {
+  local t_dir="$1" slug="$2" step="$3"
+  mkdir -p "$t_dir"
+  printf '# Repo\n' > "$t_dir/AGENTS.md"
+  mkdir -p "$t_dir/.flow-agents/$slug"
+  flow_agents_node "workflow-sidecar" ensure-session \
+    --artifact-root "$t_dir/.flow-agents" \
+    --task-slug "$slug" \
+    --title "Tamper test: $step" \
+    --summary "Testing tamper detection." \
+    --flow-id builder.build \
+    --timestamp "2026-06-26T00:00:00Z" >/dev/null 2>&1
+  flow_agents_node "workflow-sidecar" init-plan "$t_dir/.flow-agents/$slug/$slug--deliver.md" \
+    --source-request "Test" --summary "Testing" \
+    --timestamp "2026-06-26T00:00:00Z" >/dev/null 2>&1
+  set_active_step "$t_dir/.flow-agents" "$slug" "$step"
+}
+# ─── Test: produce a gate claim at a given step ───────────────────────────────
+# Arguments:
+#   $1 label                  human-readable name used in pass/fail lines
+#   $2 step                   flow step to activate before recording the claim
+#   $3 expectation            expectation id passed to record-gate-claim
+#   $4 expected_claim_type    claimType that must appear in trust.bundle
+#   $5 expected_subject_type  subjectType the matching claim must carry
+# Records a passing gate claim, then checks that trust.bundle holds a claim of
+# the expected type with the expected subjectType and status=verified.
+# Failures are tallied through _fail; the function itself does not exit.
+test_produce_claim() {
+  local label="$1" step="$2" expectation="$3" expected_claim_type="$4" expected_subject_type="$5"
+  echo ""
+  echo "=== PRODUCE: $label ==="
+  local slug
+  slug="$(echo "prod-$step-$expectation" | tr '/' '-' | tr '.' '-')"
+  local aroot="$TMP/$slug"
+  setup_session_for_produce "$aroot" "$slug" "$step"
+  if flow_agents_node "workflow-sidecar" record-gate-claim "$aroot/$slug" \
+    --status pass \
+    --summary "Test claim: $label" \
+    --expectation "$expectation" \
+    --timestamp "2026-06-26T00:02:00Z" >/dev/null 2>&1; then
+    _pass "$label: record-gate-claim exits 0 at $step step"
+  else
+    _fail "$label: record-gate-claim failed at $step step"
+    return
+  fi
+  # Values reach the JS snippet through the environment rather than being
+  # spliced into its source, so quotes or backslashes in a path cannot alter
+  # the program text. The snippet's diagnostics are captured and reported.
+  local check_out
+  if check_out="$(BUNDLE_PATH="$aroot/$slug/trust.bundle" \
+      EXPECTED_CLAIM_TYPE="$expected_claim_type" \
+      EXPECTED_SUBJECT_TYPE="$expected_subject_type" \
+      node -e '
+    const fs = require("fs");
+    const { BUNDLE_PATH, EXPECTED_CLAIM_TYPE, EXPECTED_SUBJECT_TYPE } = process.env;
+    const bundle = JSON.parse(fs.readFileSync(BUNDLE_PATH, "utf8"));
+    const target = (bundle.claims || []).find(c => c.claimType === EXPECTED_CLAIM_TYPE);
+    if (!target) {
+      console.error("no " + EXPECTED_CLAIM_TYPE + " claim found; claims:", (bundle.claims||[]).map(c=>c.claimType).join(", "));
+      process.exit(1);
+    }
+    if (target.subjectType !== EXPECTED_SUBJECT_TYPE) {
+      console.error("expected subjectType=" + EXPECTED_SUBJECT_TYPE + ", got", target.subjectType);
+      process.exit(1);
+    }
+    if (target.status !== "verified") {
+      console.error("expected status=verified, got", target.status);
+      process.exit(1);
+    }
+  ' 2>&1)"; then
+    _pass "$label: bundle contains $expected_claim_type with subjectType=$expected_subject_type, status=verified"
+  else
+    _fail "$label: bundle missing or incorrect $expected_claim_type claim: $check_out"
+  fi
+}
+# ─── Test: tampered bundle at given step BLOCKS ───────────────────────────────
+# Arguments:
+#   $1 label         human-readable name used in pass/fail lines
+#   $2 step          flow step to activate before running the gate
+#   $3 claim_type    claimType written into the forged bundle
+#   $4 subject_type  subjectType written into the forged bundle
+# Writes a trust.bundle whose claim is stored as verified while its only
+# evidence is failing and blocking, runs the stop hook in block mode, and
+# expects exit 2 plus warnings that name the claim type.
+# Failures are tallied through _fail; the function itself does not exit.
+test_tamper_blocks() {
+  local label="$1" step="$2" claim_type="$3" subject_type="$4"
+  echo ""
+  echo "=== TAMPER-BLOCKS: $label ==="
+  local slug
+  slug="$(echo "tamper-$step-$claim_type" | tr '.' '-' | tr '/' '-')"
+  local t_dir="$TMP/$slug"
+  setup_tamper_session "$t_dir" "$slug" "$step"
+  # Write a TAMPERED trust.bundle: stored verified, evidence passing=false
+  python3 - "$t_dir/.flow-agents/$slug/trust.bundle" "$claim_type" "$subject_type" << 'PY'
+import json, sys
+claim_type = sys.argv[2]
+subject_type = sys.argv[3]
+bundle = {
+    "schemaVersion": 3,
+    "source": "flow-agents/workflow-sidecar",
+    "claims": [{
+        "id": "c1",
+        "subjectId": "tamper/gate-claim-test",
+        "subjectType": subject_type,
+        "claimType": claim_type,
+        "fieldOrBehavior": "Gate claim test",
+        "value": "pass",
+        "impactLevel": "high",
+        "status": "verified",
+        "createdAt": "2026-06-26T00:00:00Z",
+        "updatedAt": "2026-06-26T00:00:00Z"
+    }],
+    "evidence": [{
+        "id": "ev1",
+        "claimId": "c1",
+        "evidenceType": "test_output",
+        "method": "validation",
+        "sourceRef": "command-log.jsonl",
+        "excerptOrSummary": "gate claim FAILED",
+        "observedAt": "2026-06-26T00:00:00Z",
+        "collectedBy": "harness",
+        "passing": False,
+        "blocking": True
+    }],
+    "policies": [],
+    "events": [{
+        "id": "evt1",
+        "claimId": "c1",
+        "status": "verified",
+        "actor": "agent",
+        "method": "workflow-check",
+        "evidenceIds": ["ev1"],
+        "createdAt": "2026-06-26T00:00:00Z"
+    }]
+}
+# Close the handle explicitly so the bundle is flushed before the hook reads it.
+with open(sys.argv[1], 'w') as fh:
+    json.dump(bundle, fh)
+PY
+  # Capture the hook's exit status with `||` instead of toggling errexit: the
+  # script runs without `set -e`, and switching it on here would make every
+  # later unguarded failure abort the run before the summary is printed.
+  local tamper_out tamper_exit=0
+  tamper_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
+      node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$t_dir\"}")" || tamper_exit=$?
+  if [ "$tamper_exit" -eq 2 ]; then
+    _pass "$label: tampered bundle blocks (exit 2)"
+  else
+    _fail "$label: tampered bundle did NOT block: exit=$tamper_exit"
+  fi
+  # Here-strings feed grep directly; with pipefail, `echo | grep -q` can report
+  # failure when grep exits early and echo is killed by SIGPIPE.
+  if grep -qE "stored status.*does not match recompute|possible tampered bundle" <<< "$tamper_out"; then
+    _pass "$label: tamper warning emits 'stored status does not match recompute'"
+  else
+    _fail "$label: tamper warning missing from output: $tamper_out"
+  fi
+  if grep -q "caught false-completion" <<< "$tamper_out"; then
+    _pass "$label: tamper warning emits 'caught false-completion'"
+  else
+    _fail "$label: tamper warning missing 'caught false-completion': $tamper_out"
+  fi
+  # -F: the claim type contains dots, which must match literally.
+  if grep -qF -- "$claim_type" <<< "$tamper_out"; then
+    _pass "$label: tamper warning names declared claimType $claim_type"
+  else
+    _fail "$label: tamper warning does not name $claim_type: $tamper_out"
+  fi
+}
+# ─── Test 0: build.flow.json required:true confirmation ──────────────────────
+echo ""
+echo "=== 0. build.flow.json: confirm required:true for produced gates ==="
+node -e "
+  const fs = require('fs');
+  const flow = JSON.parse(fs.readFileSync('$ROOT/kits/builder/flows/build.flow.json', 'utf8'));
+  const requiredTrue = [
+    'selected-work',
+    'pickup-probe-readiness',
+    'probe-decisions-or-accepted-gaps',
+    'pull-request-opened',
+    'decision-evidence',
+    'learning-evidence',
+  ];
+  const requiredFalse = ['policy-compliance'];
+  let ok = true;
+  for (const [gateName, gate] of Object.entries(flow.gates)) {
+    for (const exp of gate.expects || []) {
+      if (requiredTrue.includes(exp.id) && exp.required !== true) {
+        console.error('FAIL: ' + exp.id + ' in ' + gateName + ' should be required:true, got ' + exp.required);
+        ok = false;
+      }
+      if (requiredFalse.includes(exp.id) && exp.required !== false) {
+        console.error('FAIL: ' + exp.id + ' in ' + gateName + ' should remain required:false (advisory), got ' + exp.required);
+        ok = false;
+      }
+    }
+  }
+  if (!ok) process.exit(1);
+" 2>/dev/null \
+  && _pass "build.flow.json: 6 produced gates are required:true, policy-compliance is required:false" \
+  || _fail "build.flow.json: required flag mismatch"
+node -e "
+  const fs = require('fs');
+  const flow = JSON.parse(fs.readFileSync('$ROOT/kits/builder/flows/build.flow.json', 'utf8'));
+  const producedIds = [
+    'selected-work',
+    'pickup-probe-readiness',
+    'probe-decisions-or-accepted-gaps',
+    'pull-request-opened',
+    'decision-evidence',
+    'learning-evidence',
+  ];
+  let ok = true;
+  for (const [gateName, gate] of Object.entries(flow.gates)) {
+    for (const exp of gate.expects || []) {
+      if (producedIds.includes(exp.id) && exp.explore_hint) {
+        console.error('FAIL: ' + exp.id + ' in ' + gateName + ' still has explore_hint (remove when producer exists)');
+        ok = false;
+      }
+    }
+  }
+  if (!ok) process.exit(1);
+" 2>/dev/null \
+  && _pass "build.flow.json: no explore_hint on produced gate entries" \
+  || _fail "build.flow.json: produced gate entries still have explore_hint"
+# ─── Tests 1–6: produce + tamper-block for each of the 6 claims ──────────────
+# Claim 1: builder.pull-work.selected
+test_produce_claim \
+  "builder.pull-work.selected" \
+  "pull-work" "selected-work" \
+  "builder.pull-work.selected" "work-item"
+test_tamper_blocks \
+  "builder.pull-work.selected" \
+  "pull-work" "builder.pull-work.selected" "work-item"
+# Claim 2: builder.design-probe.pickup-readiness
+test_produce_claim \
+  "builder.design-probe.pickup-readiness" \
+  "design-probe" "pickup-probe-readiness" \
+  "builder.design-probe.pickup-readiness" "work-item"
+test_tamper_blocks \
+  "builder.design-probe.pickup-readiness" \
+  "design-probe" "builder.design-probe.pickup-readiness" "work-item"
+# Claim 3: builder.design-probe.decisions
+test_produce_claim \
+  "builder.design-probe.decisions" \
+  "design-probe" "probe-decisions-or-accepted-gaps" \
+  "builder.design-probe.decisions" "decision"
+test_tamper_blocks \
+  "builder.design-probe.decisions" \
+  "design-probe" "builder.design-probe.decisions" "decision"
+# Claim 4: builder.pr-open.pull-request
+test_produce_claim \
+  "builder.pr-open.pull-request" \
+  "pr-open" "pull-request-opened" \
+  "builder.pr-open.pull-request" "pull-request"
+test_tamper_blocks \
+  "builder.pr-open.pull-request" \
+  "pr-open" "builder.pr-open.pull-request" "pull-request"
+# Claim 5: builder.learn.decisions
+test_produce_claim \
+  "builder.learn.decisions" \
+  "learn" "decision-evidence" \
+  "builder.learn.decisions" "decision"
+test_tamper_blocks \
+  "builder.learn.decisions" \
+  "learn" "builder.learn.decisions" "decision"
+# Claim 6: builder.learn.evidence
+test_produce_claim \
+  "builder.learn.evidence" \
+  "learn" "learning-evidence" \
+  "builder.learn.evidence" "release"
+test_tamper_blocks \
+  "builder.learn.evidence" \
+  "learn" "builder.learn.evidence" "release"
+# ─── Summary ──────────────────────────────────────────────────────────────────
+# Exit 1 when any check was tallied as failed, otherwise exit 0.
+echo ""
+if [ "$errors" -ne 0 ]; then
+  echo "Builder step producer tests FAILED: $errors issue(s)."
+  exit 1
+fi
+echo "Builder step producer tests passed (6 claims: produce + tamper-block each)."
+exit 0

package/evals/integration/test_bundle_install.sh CHANGED Viewed

@@ -32,7 +32,7 @@ KIRO_DEST="$TMPDIR_EVAL/kiro-home"
 BASE_DEST="$TMPDIR_EVAL/base-workspace"
 CLAUDE_DEST="$TMPDIR_EVAL/claude-workspace"
 CODEX_DEST="$TMPDIR_EVAL/codex-workspace"
-CODEX_CORE_DEST="$TMPDIR_EVAL/codex-core-workspace"
+CODEX_FULL_DEST="$TMPDIR_EVAL/codex-full-workspace"
 CODEX_CONSOLE_DEST="$TMPDIR_EVAL/codex-console-workspace"
 CODEX_HOSTED_CONSOLE_DEST="$TMPDIR_EVAL/codex-hosted-console-workspace"
 CODEX_USER_HOSTED_CONSOLE_DEST="$TMPDIR_EVAL/codex-user-hosted-console-workspace"
@@ -42,7 +42,7 @@ BASE_INIT_DEST="$TMPDIR_EVAL/base-init-workspace"
 CODEX_INIT_DEST="$TMPDIR_EVAL/codex-init-workspace"
 OPENCODE_DEST="$TMPDIR_EVAL/opencode-workspace"
 OPENCODE_CONSOLE_DEST="$TMPDIR_EVAL/opencode-console-workspace"
-OPENCODE_CORE_DEST="$TMPDIR_EVAL/opencode-core-workspace"
+OPENCODE_FULL_DEST="$TMPDIR_EVAL/opencode-full-workspace"
 PI_DEST="$TMPDIR_EVAL/pi-workspace"
 CONSOLE_TOKEN_FILE="$TMPDIR_EVAL/console-token"
 printf 'test-token\n' > "$CONSOLE_TOKEN_FILE"
@@ -128,7 +128,7 @@ else
   _fail "opencode install with Console telemetry config failed"
 fi
-if node "$ROOT_DIR/build/src/cli.js" init --runtime opencode --dest "$OPENCODE_CORE_DEST" --yes >/dev/null; then
+if node "$ROOT_DIR/build/src/cli.js" init --runtime opencode --dest "$OPENCODE_FULL_DEST" --yes >/dev/null; then
   _pass "flow-agents init headless opencode install succeeded"
 else
   _fail "flow-agents init headless opencode install failed"
@@ -140,32 +140,17 @@ else
   _fail "pi install failed"
 fi
-USER_SKILLS_DIR="$CODEX_CORE_DEST/.codex/sk""ills/user-skill"
-mkdir -p "$CODEX_CORE_DEST/.codex/ag""ents" "$USER_SKILLS_DIR"
-printf 'name = "user-agent"\n' > "$CODEX_CORE_DEST/.codex/ag""ents/user-agent.toml"
+USER_SKILLS_DIR="$CODEX_FULL_DEST/.codex/sk""ills/user-skill"
+mkdir -p "$CODEX_FULL_DEST/.codex/ag""ents" "$USER_SKILLS_DIR"
+printf 'name = "user-agent"\n' > "$CODEX_FULL_DEST/.codex/ag""ents/user-agent.toml"
 printf '# user skill\n' > "$USER_SKILLS_DIR/SKILL.md"
-if (cd "$ROOT_DIR/dist/codex" && FLOW_AGENTS_PACKS=core bash install.sh "$CODEX_CORE_DEST" >/dev/null); then
-  _pass "Codex core-pack filtered install succeeded"
+# A fresh install ships the full standalone base (no pack filtering). Pre-existing
+# unknown user files must be preserved across the rsync install.
+if (cd "$ROOT_DIR/dist/codex" && bash install.sh "$CODEX_FULL_DEST" >/dev/null); then
+  _pass "Codex full install succeeded"
 else
-  _fail "Codex core-pack filtered install failed"
-fi
-FILTER_ATTACK_DEST="$TMPDIR_EVAL/filter-attack"
-mkdir -p "$FILTER_ATTACK_DEST/packaging" "$FILTER_ATTACK_DEST/skills"
-cat > "$FILTER_ATTACK_DEST/packaging/packs.json" <<'JSON'
-{
-  "schema_version": "1.0",
-  "packs": [
-    { "name": "core", "default": true, "skills": ["safe"], "agents": [], "powers": [] },
-    { "name": "extra", "skills": ["../escape"], "agents": [], "powers": [] }
-  ]
-}
-JSON
-if node "$ROOT_DIR/build/src/tools/filter-installed-packs.js" "$FILTER_ATTACK_DEST" --packs core >"$TMPDIR_EVAL/filter-attack.out" 2>"$TMPDIR_EVAL/filter-attack.err"; then
-  _fail "pack filter accepted unsafe metadata traversal"
-else
-  _pass "pack filter rejects unsafe metadata traversal before deletion"
+  _fail "Codex full install failed"
 fi
 echo ""
@@ -179,7 +164,7 @@ for dir in \
   "$CODEX_DEST/.codex/agents" \
   "$CODEX_DEST/.codex/skills" \
   "$CODEX_DEST/.flow-agents" \
-  "$CODEX_CORE_DEST/.flow-agents"; do
+  "$CODEX_FULL_DEST/.flow-agents"; do
   if [[ -d "$dir" ]]; then
     _pass "$dir exists"
   else
@@ -703,64 +688,43 @@ fi
 echo ""
-echo "--- Pack Filtering ---"
-CODEX_AGENTS_DIR="$CODEX_CORE_DEST/.codex/ag""ents"
-CORE_AGENT="$CODEX_AGENTS_DIR/tool-planner.toml"
-OPTIONAL_AGENT="$CODEX_AGENTS_DIR/dev.toml"
-if [[ -f "$CORE_AGENT" && ! -f "$OPTIONAL_AGENT" ]]; then
-  _pass "Codex core-pack install keeps core agents and prunes optional agents"
-else
-  _fail "Codex core-pack agent filtering failed"
-fi
-# Kit-owned skills (plan-work, deliver) are always present regardless of pack filter.
-# Pack filtering only prunes skills declared in packs.json (the tool-skills).
-# The development-pack tool-skill agentic-engineering should be pruned in a core-only install.
-if [[ -d "$CODEX_CORE_DEST/.codex/skills/plan-work" && -d "$CODEX_CORE_DEST/.codex/skills/deliver" && ! -d "$CODEX_CORE_DEST/.codex/skills/agentic-engineering" ]]; then
-  _pass "Codex core-pack install: kit-skills present, dev-only tool-skill pruned"
+echo "--- Full Standalone Base Install ---"
+# There is no pack layer: a fresh install ships the complete standalone base.
+# Both the neutral toolbox agents (tool-planner) and the deeper agents (dev) are
+# present, and kit-owned skills (plan-work, deliver) plus standalone skills
+# (agentic-engineering) all install together.
+# Codex excludes the dev orchestrator agent (manifest.codex.excluded_agents), so
+# assert the neutral toolbox agent plus a deeper agent that codex does ship.
+CODEX_AGENTS_DIR="$CODEX_FULL_DEST/.codex/ag""ents"
+if [[ -f "$CODEX_AGENTS_DIR/tool-planner.toml" && -f "$CODEX_AGENTS_DIR/tool-security-reviewer.toml" ]]; then
+  _pass "Codex full install ships the complete agent base"
 else
-  _fail "Codex core-pack skill filtering failed"
+  _fail "Codex full install is missing base agents"
 fi
-if [[ -f "$CODEX_CORE_DEST/.flow-agents/installed-packs.json" ]]; then
-  _pass "Codex core-pack install records selected packs"
+if [[ -d "$CODEX_FULL_DEST/.codex/skills/plan-work" && -d "$CODEX_FULL_DEST/.codex/skills/deliver" && -d "$CODEX_FULL_DEST/.codex/skills/agentic-engineering" ]]; then
+  _pass "Codex full install ships kit-skills and standalone skills together"
 else
-  _fail "Codex core-pack install did not record selected packs"
+  _fail "Codex full install is missing skills"
 fi
 if [[ -f "$CODEX_AGENTS_DIR/user-agent.toml" && -d "$USER_SKILLS_DIR" ]]; then
-  _pass "Codex core-pack install preserves unknown user files"
-else
-  _fail "Codex core-pack install removed unknown user files"
-fi
-# Pack filtering for opencode
-OPENCODE_AGENTS_DIR="$OPENCODE_CORE_DEST/.opencode/agents"
-if (cd "$ROOT_DIR/dist/opencode" && FLOW_AGENTS_PACKS=core bash install.sh "$OPENCODE_CORE_DEST" >/dev/null); then
-  _pass "opencode core-pack filtered install succeeded"
-else
-  _fail "opencode core-pack filtered install failed"
-fi
-if [[ -d "$OPENCODE_AGENTS_DIR/tool-planner.md" ]] || [[ -f "$OPENCODE_AGENTS_DIR/tool-planner.md" ]]; then
-  _pass "opencode core-pack install keeps core agents"
+  _pass "Codex full install preserves unknown user files"
 else
-  _fail "opencode core-pack agent filtering failed (tool-planner.md missing)"
+  _fail "Codex full install removed unknown user files"
 fi
-# Kit-owned skills (plan-work, deliver) are always present regardless of pack filter.
-# Pack filtering only prunes skills declared in packs.json (the tool-skills).
-# The development-pack tool-skill agentic-engineering should be pruned in a core-only install.
-if [[ -d "$OPENCODE_CORE_DEST/.opencode/skills/plan-work" && -d "$OPENCODE_CORE_DEST/.opencode/skills/deliver" && ! -d "$OPENCODE_CORE_DEST/.opencode/skills/agentic-engineering" ]]; then
-  _pass "opencode core-pack install: kit-skills present, dev-only tool-skill pruned"
+OPENCODE_AGENTS_DIR="$OPENCODE_FULL_DEST/.opencode/agents"
+if [[ -f "$OPENCODE_AGENTS_DIR/tool-planner.md" && -f "$OPENCODE_AGENTS_DIR/dev.md" ]]; then
+  _pass "opencode full install ships the complete agent base"
 else
-  _fail "opencode core-pack skill filtering failed"
+  _fail "opencode full install is missing base agents"
 fi
-if [[ -f "$OPENCODE_CORE_DEST/.flow-agents/installed-packs.json" ]]; then
-  _pass "opencode core-pack install records selected packs"
+if [[ -d "$OPENCODE_FULL_DEST/.opencode/skills/plan-work" && -d "$OPENCODE_FULL_DEST/.opencode/skills/deliver" && -d "$OPENCODE_FULL_DEST/.opencode/skills/agentic-engineering" ]]; then
+  _pass "opencode full install ships kit-skills and standalone skills together"
 else
-  _fail "opencode core-pack install did not record selected packs"
+  _fail "opencode full install is missing skills"
 fi
 echo ""