npm - @linimin/pi-letscook - Versions diffs - 0.1.30 → 0.1.31 - Mend

@linimin/pi-letscook 0.1.30 → 0.1.31

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Files changed (15)

package/CHANGELOG.md +16 -0
package/README.md +48 -1
package/agents/completion-auditor.md +17 -0
package/agents/completion-reviewer.md +17 -0
package/agents/completion-stop-judge.md +17 -0
package/extensions/completion/index.ts +749 -195
package/extensions/completion/role-reporting.js +356 -0
package/package.json +2 -1
package/scripts/context-proposal-test.sh +115 -6
package/scripts/refocus-test.sh +11 -0
package/scripts/release-check.sh +2 -0
package/scripts/rubric-contract-test.sh +249 -0
package/scripts/smoke-test.sh +154 -23
package/skills/completion-protocol/SKILL.md +39 -0
package/skills/completion-protocol/references/completion.md +71 -0

package/scripts/smoke-test.sh CHANGED Viewed

@@ -5,31 +5,122 @@ PKG_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 TMPDIR="$(mktemp -d)"
 trap 'rm -rf "$TMPDIR"' EXIT
-cd "$TMPDIR"
+ROOT="$TMPDIR/repo"
+KICKOFF_PROMPT="$TMPDIR/kickoff-prompt.txt"
+RESUME_PROMPT="$TMPDIR/resume-prompt.txt"
+mkdir -p "$ROOT"
+cd "$ROOT"
 git init -q
-pi -e "$PKG_ROOT" -p "/cook smoke-test mission" >/tmp/pi-completion-smoke.out 2>/tmp/pi-completion-smoke.err &
-PI_PID=$!
-for _ in $(seq 1 60); do
-  if [[ -f .agent/profile.json && -f .agent/state.json && -f .agent/plan.json && -f .agent/active-slice.json ]]; then
-    break
-  fi
-  sleep 1
+PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
+PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$KICKOFF_PROMPT" \
+pi -e "$PKG_ROOT" -p "/cook smoke-test mission" \
+  >"$TMPDIR/pi-completion-smoke-bootstrap.out" 2>"$TMPDIR/pi-completion-smoke-bootstrap.err"
+for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json; do
+  [[ -f "$file" ]] || { echo "missing canonical bootstrap file: $file" >&2; exit 1; }
 done
-if [[ ! -f .agent/profile.json || ! -f .agent/state.json || ! -f .agent/plan.json || ! -f .agent/active-slice.json ]]; then
-  echo "completion bootstrap did not materialize canonical files in time" >&2
-  cat /tmp/pi-completion-smoke.err >&2 || true
-  kill "$PI_PID" >/dev/null 2>&1 || true
-  wait "$PI_PID" >/dev/null 2>&1 || true
-  exit 1
-fi
-kill "$PI_PID" >/dev/null 2>&1 || true
-wait "$PI_PID" >/dev/null 2>&1 || true
 bash .agent/verify_completion_control_plane.sh >/dev/null
 bash .agent/verify_completion_stop.sh >/dev/null
-python3 - <<'PY2'
+python3 - "$KICKOFF_PROMPT" <<'PY'
+import json
+import sys
+from pathlib import Path
+expected_task_type = 'completion-workflow'
+expected_eval_profile = 'completion-rubric-v1'
+profile = json.loads(Path('.agent/profile.json').read_text())
+state = json.loads(Path('.agent/state.json').read_text())
+plan = json.loads(Path('.agent/plan.json').read_text())
+active = json.loads(Path('.agent/active-slice.json').read_text())
+kickoff = Path(sys.argv[1]).read_text()
+assert profile['task_type'] == expected_task_type, 'profile.json task_type mismatch after bootstrap'
+assert profile['evaluation_profile'] == expected_eval_profile, 'profile.json evaluation_profile mismatch after bootstrap'
+assert state['task_type'] == expected_task_type, 'state.json task_type mismatch after bootstrap'
+assert state['evaluation_profile'] == expected_eval_profile, 'state.json evaluation_profile mismatch after bootstrap'
+assert plan['task_type'] == expected_task_type, 'plan.json task_type mismatch after bootstrap'
+assert plan['evaluation_profile'] == expected_eval_profile, 'plan.json evaluation_profile mismatch after bootstrap'
+assert active['task_type'] == expected_task_type, 'active-slice.json task_type mismatch after bootstrap'
+assert active['evaluation_profile'] == expected_eval_profile, 'active-slice.json evaluation_profile mismatch after bootstrap'
+assert active['implementation_surfaces'] == [], 'active-slice.json should scaffold empty implementation_surfaces'
+assert active['verification_commands'] == [], 'active-slice.json should scaffold empty verification_commands'
+assert 'Canonical routing profile:' in kickoff, 'kickoff prompt should expose canonical routing profile'
+assert f'- task_type: {expected_task_type}' in kickoff, 'kickoff prompt missing canonical task_type'
+assert f'- evaluation_profile: {expected_eval_profile}' in kickoff, 'kickoff prompt missing canonical evaluation_profile'
+PY
+PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
+PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$RESUME_PROMPT" \
+pi -e "$PKG_ROOT" -p "/cook" \
+  >"$TMPDIR/pi-completion-smoke-resume.out" 2>"$TMPDIR/pi-completion-smoke-resume.err"
+python3 - "$RESUME_PROMPT" <<'PY'
+import sys
+from pathlib import Path
+expected_task_type = 'completion-workflow'
+expected_eval_profile = 'completion-rubric-v1'
+resume = Path(sys.argv[1]).read_text()
+assert 'Canonical routing profile:' in resume, 'resume prompt should expose canonical routing profile'
+assert f'- task_type: {expected_task_type}' in resume, 'resume prompt missing canonical task_type'
+assert f'- evaluation_profile: {expected_eval_profile}' in resume, 'resume prompt missing canonical evaluation_profile'
+PY
+python3 - <<'PY'
+import json
+from pathlib import Path
+path = Path('.agent/state.json')
+state = json.loads(path.read_text())
+state.pop('task_type', None)
+path.write_text(json.dumps(state, indent=2) + '\n')
+PY
+if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
+  echo "expected control-plane verification to fail when state.json omits task_type" >&2
+  exit 1
+fi
+python3 - <<'PY'
+import json
+from pathlib import Path
+profile = json.loads(Path('.agent/profile.json').read_text())
+state_path = Path('.agent/state.json')
+state = json.loads(state_path.read_text())
+state['task_type'] = profile['task_type']
+state_path.write_text(json.dumps(state, indent=2) + '\n')
+PY
+python3 - <<'PY'
+import json
+from pathlib import Path
+path = Path('.agent/active-slice.json')
+active = json.loads(path.read_text())
+active.pop('evaluation_profile', None)
+path.write_text(json.dumps(active, indent=2) + '\n')
+PY
+if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
+  echo "expected control-plane verification to fail when active-slice.json omits evaluation_profile" >&2
+  exit 1
+fi
+python3 - <<'PY'
+import json
+from pathlib import Path
+profile = json.loads(Path('.agent/profile.json').read_text())
+active_path = Path('.agent/active-slice.json')
+active = json.loads(active_path.read_text())
+active['evaluation_profile'] = profile['evaluation_profile']
+active_path.write_text(json.dumps(active, indent=2) + '\n')
+PY
+python3 - <<'PY'
 import json
 from pathlib import Path
 path = Path('.agent/active-slice.json')
@@ -41,8 +132,10 @@ active.update({
     'contract_ids': ['smoke-contract'],
     'acceptance_criteria': ['criterion'],
     'blocked_on': [],
-    'locked_notes': [],
+    'locked_notes': ['keep the change scoped to the selected active-slice contract'],
     'must_fix_findings': [],
+    'implementation_surfaces': ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh'],
+    'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
     'basis_commit': 'deadbeef',
     'remaining_contract_ids_before': ['smoke-contract'],
     'release_blocker_count_before': 1,
@@ -51,14 +144,14 @@ active.update({
 active.pop('priority', None)
 active.pop('why_now', None)
 path.write_text(json.dumps(active, indent=2) + '\n')
-PY2
+PY
 if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
   echo "expected control-plane verification to fail when selected active-slice omits priority/why_now" >&2
   exit 1
 fi
-python3 - <<'PY3'
+python3 - <<'PY'
 import json
 from pathlib import Path
 path = Path('.agent/active-slice.json')
@@ -66,9 +159,47 @@ active = json.loads(path.read_text())
 active['priority'] = 1
 active['why_now'] = 'smoke test exact handoff'
 path.write_text(json.dumps(active, indent=2) + '\n')
-PY3
+PY
+python3 - <<'PY'
+import json
+from pathlib import Path
+path = Path('.agent/active-slice.json')
+active = json.loads(path.read_text())
+active.pop('implementation_surfaces', None)
+active.pop('verification_commands', None)
+path.write_text(json.dumps(active, indent=2) + '\n')
+PY
+if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
+  echo "expected control-plane verification to fail when selected active-slice omits implementation_surfaces/verification_commands" >&2
+  exit 1
+fi
+python3 - <<'PY'
+import json
+from pathlib import Path
+path = Path('.agent/active-slice.json')
+active = json.loads(path.read_text())
+active['implementation_surfaces'] = ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh']
+active['verification_commands'] = ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test']
+path.write_text(json.dumps(active, indent=2) + '\n')
+PY
 bash .agent/verify_completion_control_plane.sh >/dev/null
 bash .agent/verify_completion_stop.sh >/dev/null
-echo "smoke test passed: $TMPDIR"
+python3 - "$PKG_ROOT" <<'PY'
+import sys
+from pathlib import Path
+text = Path(sys.argv[1], 'extensions/completion', 'index.ts').read_text()
+assert 'Active slice priority: ${activePriority}' in text, 'system reminder source should expose active-slice priority'
+assert 'Active slice why_now: ${activeWhyNow}' in text, 'system reminder source should expose active-slice why_now'
+assert 'Active implementation surfaces: ${implementationSurfaces.join(", ")}' in text, 'system reminder source should expose implementation_surfaces'
+assert 'Active verification commands: ${verificationCommands.join(" | ")}' in text, 'system reminder source should expose verification_commands'
+assert '`- implementation_surfaces: ${implementationSurfaces.join(" | ")}`' in text, 'resume capsule source should expose implementation_surfaces'
+assert '`- verification_commands: ${verificationCommands.join(" | ")}`' in text, 'resume capsule source should expose verification_commands'
+PY
+echo "smoke test passed: $ROOT"

package/skills/completion-protocol/SKILL.md CHANGED Viewed

@@ -148,6 +148,8 @@ The workflow driver must invoke `completion-regrounder` before continuing whenev
 - acceptance criteria for the selected or active slice are missing or unclear
 - the exact implementer handoff snapshot in `.agent/active-slice.json` is missing, stale, or contradictory
+The exact implementer handoff now includes implementation-scope surfaces and expected verification commands in addition to the locked slice goal, acceptance, notes, and before-slice counters.
 The workflow driver must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
 After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, and `.agent/active-slice.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
@@ -161,6 +163,43 @@ All completion reports must begin with:
 If a role-specific fixed format uses before/after wording, keep the same mission-anchor first line and then follow that role's exact format.
+## Structured Evaluation Rubric Foundation
+`completion-reviewer`, `completion-auditor`, and `completion-stop-judge` must emit rubric-backed evaluations using the same shared dimension names and verdict semantics.
+The shared rubric foundation now sits alongside canonical `task_type` and `evaluation_profile` signaling in the control plane. That signaling is routing metadata only; later slices may still add stricter profile-aware rubric-output enforcement.
+Required rubric section:
+- `Rubric:`
+- `- Contract coverage: pass|concern|fail - ...`
+- `- Correctness risk: pass|concern|fail - ...`
+- `- Verification evidence: pass|concern|fail - ...`
+- `- Docs/state parity: pass|concern|fail - ...`
+Use the dimension names and verdict words exactly as written above.
+Dimension meaning:
+- `Contract coverage` — whether the slice or current HEAD satisfies the locked acceptance criteria and role-specific workflow obligations.
+- `Correctness risk` — whether regressions, blocking defects, safety issues, or closure risks are still evident in current repo truth.
+- `Verification evidence` — whether tests, deterministic proof, and rerun verification are strong enough for the role's decision.
+- `Docs/state parity` — whether docs, config, runbooks, and canonical `.agent` state stay truthful to shipped behavior for the role's scope.
+Verdict semantics:
+- `pass` — no material issue remains for that dimension in the role's current decision.
+- `concern` — a real caveat or remaining gap exists, but it does not by itself force rejection or `NO-STOP`; explain the follow-up plainly.
+- `fail` — a blocking issue or contradictory truth exists and the role's final verdict must not be positive.
+Decision alignment rules:
+- Reviewer: any `fail` means `Acceptable as-is` must be `no`.
+- Auditor: use `concern` or `fail` to explain why the project is not yet done and whether canonical backlog/state remain truthful.
+- Stop judge: any `fail` means `Can the project stop now` must be `no`.
+Always include all four rubric lines, even when every dimension is `pass`.
 ## References
 Read these bundled references when you need the full protocol or scaffolding material:

package/skills/completion-protocol/references/completion.md CHANGED Viewed

@@ -36,6 +36,8 @@
   "project_name": "<repo-name>",
   "required_stop_judges": 3,
   "priority_policy_id": "completion-default",
+  "task_type": "completion-workflow",
+  "evaluation_profile": "completion-rubric-v1",
   "docs_surfaces": ["README.md", "docs/"]
 }
 ```
@@ -48,6 +50,8 @@ Required fields:
 - `schema_version`
 - `mission_anchor`
+- `task_type`
+- `evaluation_profile`
 - `current_phase`
 - `continuation_policy`
 - `continuation_reason`
@@ -111,6 +115,8 @@ Required fields:
 - `schema_version`
 - `mission_anchor`
+- `task_type`
+- `evaluation_profile`
 - `last_reground_at`
 - `plan_basis`
 - `candidate_slices`
@@ -143,6 +149,27 @@ Rules:
 `active-slice.json` carries one current slice cursor.
+Required base fields:
+- `schema_version`
+- `mission_anchor`
+- `task_type`
+- `evaluation_profile`
+- `status`
+- `slice_id`
+- `goal`
+- `contract_ids`
+- `acceptance_criteria`
+- `blocked_on`
+- `locked_notes`
+- `must_fix_findings`
+- `implementation_surfaces`
+- `verification_commands`
+- `basis_commit`
+- `remaining_contract_ids_before`
+- `release_blocker_count_before`
+- `high_value_gap_count_before`
 When `status` is `selected`, `in_progress`, `committed`, or `done`, `active-slice.json` must also carry the exact implementer handoff snapshot so `completion-implementer` can resume after compaction without asking the user to resend the original caller payload.
 Required exact handoff fields:
@@ -153,11 +180,18 @@ Required exact handoff fields:
 - `blocked_on`
 - `locked_notes`
 - `must_fix_findings`
+- `implementation_surfaces`
+- `verification_commands`
 - `basis_commit`
 - `remaining_contract_ids_before`
 - `release_blocker_count_before`
 - `high_value_gap_count_before`
+Field meaning:
+- `implementation_surfaces` — the repo files or surfaces this slice is expected to update or keep in parity, so implementers can resume on the right scope after compaction.
+- `verification_commands` — the focused and broader deterministic checks expected before the slice is committed.
 Allowed `status` values:
 - `idle`
@@ -199,6 +233,43 @@ Minimum record shape:
 Empty history files are legal.
+## Structured Evaluation Rubric Foundation
+`completion-reviewer`, `completion-auditor`, and `completion-stop-judge` must emit rubric-backed evaluations using the same shared dimension names and verdict semantics.
+The shared rubric foundation now sits alongside canonical `task_type` and `evaluation_profile` signaling in `.agent/profile.json`, `.agent/state.json`, `.agent/plan.json`, and `.agent/active-slice.json`. That signaling is routing metadata only; later slices may still add stricter profile-aware rubric-output enforcement.
+Required rubric section:
+- `Rubric:`
+- `- Contract coverage: pass|concern|fail - ...`
+- `- Correctness risk: pass|concern|fail - ...`
+- `- Verification evidence: pass|concern|fail - ...`
+- `- Docs/state parity: pass|concern|fail - ...`
+Use the dimension names and verdict words exactly as written above.
+Dimension meaning:
+- `Contract coverage` — whether the slice or current HEAD satisfies the locked acceptance criteria and role-specific workflow obligations.
+- `Correctness risk` — whether regressions, blocking defects, safety issues, or closure risks are still evident in current repo truth.
+- `Verification evidence` — whether tests, deterministic proof, and rerun verification are strong enough for the role's decision.
+- `Docs/state parity` — whether docs, config, runbooks, and canonical `.agent` state stay truthful to shipped behavior for the role's scope.
+Verdict semantics:
+- `pass` — no material issue remains for that dimension in the role's current decision.
+- `concern` — a real caveat or remaining gap exists, but it does not by itself force rejection or `NO-STOP`; explain the follow-up plainly.
+- `fail` — a blocking issue or contradictory truth exists and the role's final verdict must not be positive.
+Decision alignment rules:
+- Reviewer: any `fail` means `Acceptable as-is` must be `no`.
+- Auditor: use `concern` or `fail` to explain why the project is not yet done and whether canonical backlog/state remain truthful.
+- Stop judge: any `fail` means `Can the project stop now` must be `no`.
+Always include all four rubric lines, even when every dimension is `pass`.
 ## One-Slice Lifecycle
 1. Re-ground from current repo truth.