@linimin/pi-letscook 0.1.29 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,31 +5,122 @@ PKG_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
5
5
  TMPDIR="$(mktemp -d)"
6
6
  trap 'rm -rf "$TMPDIR"' EXIT
7
7
 
8
- cd "$TMPDIR"
8
+ ROOT="$TMPDIR/repo"
9
+ KICKOFF_PROMPT="$TMPDIR/kickoff-prompt.txt"
10
+ RESUME_PROMPT="$TMPDIR/resume-prompt.txt"
11
+
12
+ mkdir -p "$ROOT"
13
+ cd "$ROOT"
9
14
  git init -q
10
15
 
11
- pi -e "$PKG_ROOT" -p "/cook smoke-test mission" >/tmp/pi-completion-smoke.out 2>/tmp/pi-completion-smoke.err &
12
- PI_PID=$!
13
- for _ in $(seq 1 60); do
14
- if [[ -f .agent/profile.json && -f .agent/state.json && -f .agent/plan.json && -f .agent/active-slice.json ]]; then
15
- break
16
- fi
17
- sleep 1
16
+ PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
17
+ PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$KICKOFF_PROMPT" \
18
+ pi -e "$PKG_ROOT" -p "/cook smoke-test mission" \
19
+ >"$TMPDIR/pi-completion-smoke-bootstrap.out" 2>"$TMPDIR/pi-completion-smoke-bootstrap.err"
20
+
21
+ for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json; do
22
+ [[ -f "$file" ]] || { echo "missing canonical bootstrap file: $file" >&2; exit 1; }
18
23
  done
19
- if [[ ! -f .agent/profile.json || ! -f .agent/state.json || ! -f .agent/plan.json || ! -f .agent/active-slice.json ]]; then
20
- echo "completion bootstrap did not materialize canonical files in time" >&2
21
- cat /tmp/pi-completion-smoke.err >&2 || true
22
- kill "$PI_PID" >/dev/null 2>&1 || true
23
- wait "$PI_PID" >/dev/null 2>&1 || true
24
- exit 1
25
- fi
26
- kill "$PI_PID" >/dev/null 2>&1 || true
27
- wait "$PI_PID" >/dev/null 2>&1 || true
28
24
 
29
25
  bash .agent/verify_completion_control_plane.sh >/dev/null
30
26
  bash .agent/verify_completion_stop.sh >/dev/null
31
27
 
32
- python3 - <<'PY2'
28
+ python3 - "$KICKOFF_PROMPT" <<'PY'
29
+ import json
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ expected_task_type = 'completion-workflow'
34
+ expected_eval_profile = 'completion-rubric-v1'
35
+
36
+ profile = json.loads(Path('.agent/profile.json').read_text())
37
+ state = json.loads(Path('.agent/state.json').read_text())
38
+ plan = json.loads(Path('.agent/plan.json').read_text())
39
+ active = json.loads(Path('.agent/active-slice.json').read_text())
40
+ kickoff = Path(sys.argv[1]).read_text()
41
+
42
+ assert profile['task_type'] == expected_task_type, 'profile.json task_type mismatch after bootstrap'
43
+ assert profile['evaluation_profile'] == expected_eval_profile, 'profile.json evaluation_profile mismatch after bootstrap'
44
+ assert state['task_type'] == expected_task_type, 'state.json task_type mismatch after bootstrap'
45
+ assert state['evaluation_profile'] == expected_eval_profile, 'state.json evaluation_profile mismatch after bootstrap'
46
+ assert plan['task_type'] == expected_task_type, 'plan.json task_type mismatch after bootstrap'
47
+ assert plan['evaluation_profile'] == expected_eval_profile, 'plan.json evaluation_profile mismatch after bootstrap'
48
+ assert active['task_type'] == expected_task_type, 'active-slice.json task_type mismatch after bootstrap'
49
+ assert active['evaluation_profile'] == expected_eval_profile, 'active-slice.json evaluation_profile mismatch after bootstrap'
50
+ assert active['implementation_surfaces'] == [], 'active-slice.json should scaffold empty implementation_surfaces'
51
+ assert active['verification_commands'] == [], 'active-slice.json should scaffold empty verification_commands'
52
+ assert 'Canonical routing profile:' in kickoff, 'kickoff prompt should expose canonical routing profile'
53
+ assert f'- task_type: {expected_task_type}' in kickoff, 'kickoff prompt missing canonical task_type'
54
+ assert f'- evaluation_profile: {expected_eval_profile}' in kickoff, 'kickoff prompt missing canonical evaluation_profile'
55
+ PY
56
+
57
+ PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
58
+ PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$RESUME_PROMPT" \
59
+ pi -e "$PKG_ROOT" -p "/cook" \
60
+ >"$TMPDIR/pi-completion-smoke-resume.out" 2>"$TMPDIR/pi-completion-smoke-resume.err"
61
+
62
+ python3 - "$RESUME_PROMPT" <<'PY'
63
+ import sys
64
+ from pathlib import Path
65
+
66
+ expected_task_type = 'completion-workflow'
67
+ expected_eval_profile = 'completion-rubric-v1'
68
+ resume = Path(sys.argv[1]).read_text()
69
+
70
+ assert 'Canonical routing profile:' in resume, 'resume prompt should expose canonical routing profile'
71
+ assert f'- task_type: {expected_task_type}' in resume, 'resume prompt missing canonical task_type'
72
+ assert f'- evaluation_profile: {expected_eval_profile}' in resume, 'resume prompt missing canonical evaluation_profile'
73
+ PY
74
+
75
+ python3 - <<'PY'
76
+ import json
77
+ from pathlib import Path
78
+ path = Path('.agent/state.json')
79
+ state = json.loads(path.read_text())
80
+ state.pop('task_type', None)
81
+ path.write_text(json.dumps(state, indent=2) + '\n')
82
+ PY
83
+
84
+ if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
85
+ echo "expected control-plane verification to fail when state.json omits task_type" >&2
86
+ exit 1
87
+ fi
88
+
89
+ python3 - <<'PY'
90
+ import json
91
+ from pathlib import Path
92
+ profile = json.loads(Path('.agent/profile.json').read_text())
93
+ state_path = Path('.agent/state.json')
94
+ state = json.loads(state_path.read_text())
95
+ state['task_type'] = profile['task_type']
96
+ state_path.write_text(json.dumps(state, indent=2) + '\n')
97
+ PY
98
+
99
+ python3 - <<'PY'
100
+ import json
101
+ from pathlib import Path
102
+ path = Path('.agent/active-slice.json')
103
+ active = json.loads(path.read_text())
104
+ active.pop('evaluation_profile', None)
105
+ path.write_text(json.dumps(active, indent=2) + '\n')
106
+ PY
107
+
108
+ if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
109
+ echo "expected control-plane verification to fail when active-slice.json omits evaluation_profile" >&2
110
+ exit 1
111
+ fi
112
+
113
+ python3 - <<'PY'
114
+ import json
115
+ from pathlib import Path
116
+ profile = json.loads(Path('.agent/profile.json').read_text())
117
+ active_path = Path('.agent/active-slice.json')
118
+ active = json.loads(active_path.read_text())
119
+ active['evaluation_profile'] = profile['evaluation_profile']
120
+ active_path.write_text(json.dumps(active, indent=2) + '\n')
121
+ PY
122
+
123
+ python3 - <<'PY'
33
124
  import json
34
125
  from pathlib import Path
35
126
  path = Path('.agent/active-slice.json')
@@ -41,8 +132,10 @@ active.update({
41
132
  'contract_ids': ['smoke-contract'],
42
133
  'acceptance_criteria': ['criterion'],
43
134
  'blocked_on': [],
44
- 'locked_notes': [],
135
+ 'locked_notes': ['keep the change scoped to the selected active-slice contract'],
45
136
  'must_fix_findings': [],
137
+ 'implementation_surfaces': ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh'],
138
+ 'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
46
139
  'basis_commit': 'deadbeef',
47
140
  'remaining_contract_ids_before': ['smoke-contract'],
48
141
  'release_blocker_count_before': 1,
@@ -51,14 +144,14 @@ active.update({
51
144
  active.pop('priority', None)
52
145
  active.pop('why_now', None)
53
146
  path.write_text(json.dumps(active, indent=2) + '\n')
54
- PY2
147
+ PY
55
148
 
56
149
  if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
57
150
  echo "expected control-plane verification to fail when selected active-slice omits priority/why_now" >&2
58
151
  exit 1
59
152
  fi
60
153
 
61
- python3 - <<'PY3'
154
+ python3 - <<'PY'
62
155
  import json
63
156
  from pathlib import Path
64
157
  path = Path('.agent/active-slice.json')
@@ -66,9 +159,47 @@ active = json.loads(path.read_text())
66
159
  active['priority'] = 1
67
160
  active['why_now'] = 'smoke test exact handoff'
68
161
  path.write_text(json.dumps(active, indent=2) + '\n')
69
- PY3
162
+ PY
163
+
164
+ python3 - <<'PY'
165
+ import json
166
+ from pathlib import Path
167
+ path = Path('.agent/active-slice.json')
168
+ active = json.loads(path.read_text())
169
+ active.pop('implementation_surfaces', None)
170
+ active.pop('verification_commands', None)
171
+ path.write_text(json.dumps(active, indent=2) + '\n')
172
+ PY
173
+
174
+ if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
175
+ echo "expected control-plane verification to fail when selected active-slice omits implementation_surfaces/verification_commands" >&2
176
+ exit 1
177
+ fi
178
+
179
+ python3 - <<'PY'
180
+ import json
181
+ from pathlib import Path
182
+ path = Path('.agent/active-slice.json')
183
+ active = json.loads(path.read_text())
184
+ active['implementation_surfaces'] = ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh']
185
+ active['verification_commands'] = ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test']
186
+ path.write_text(json.dumps(active, indent=2) + '\n')
187
+ PY
70
188
 
71
189
  bash .agent/verify_completion_control_plane.sh >/dev/null
72
190
  bash .agent/verify_completion_stop.sh >/dev/null
73
191
 
74
- echo "smoke test passed: $TMPDIR"
192
+ python3 - "$PKG_ROOT" <<'PY'
193
+ import sys
194
+ from pathlib import Path
195
+
196
+ text = Path(sys.argv[1], 'extensions/completion', 'index.ts').read_text()
197
+ assert 'Active slice priority: ${activePriority}' in text, 'system reminder source should expose active-slice priority'
198
+ assert 'Active slice why_now: ${activeWhyNow}' in text, 'system reminder source should expose active-slice why_now'
199
+ assert 'Active implementation surfaces: ${implementationSurfaces.join(", ")}' in text, 'system reminder source should expose implementation_surfaces'
200
+ assert 'Active verification commands: ${verificationCommands.join(" | ")}' in text, 'system reminder source should expose verification_commands'
201
+ assert '`- implementation_surfaces: ${implementationSurfaces.join(" | ")}`' in text, 'resume capsule source should expose implementation_surfaces'
202
+ assert '`- verification_commands: ${verificationCommands.join(" | ")}`' in text, 'resume capsule source should expose verification_commands'
203
+ PY
204
+
205
+ echo "smoke test passed: $ROOT"
@@ -148,6 +148,8 @@ The workflow driver must invoke `completion-regrounder` before continuing whenev
148
148
  - acceptance criteria for the selected or active slice are missing or unclear
149
149
  - the exact implementer handoff snapshot in `.agent/active-slice.json` is missing, stale, or contradictory
150
150
 
151
+ The exact implementer handoff now includes implementation-scope surfaces and expected verification commands in addition to the locked slice goal, acceptance criteria, notes, and before-slice counters.
152
+
151
153
  The workflow driver must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
152
154
 
153
155
  After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, and `.agent/active-slice.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
@@ -161,6 +163,43 @@ All completion reports must begin with:
161
163
 
162
164
  If a role-specific fixed format uses before/after wording, keep the same mission-anchor first line and then follow that role's exact format.
163
165
 
166
+ ## Structured Evaluation Rubric Foundation
167
+
168
+ `completion-reviewer`, `completion-auditor`, and `completion-stop-judge` must emit rubric-backed evaluations using the same shared dimension names and verdict semantics.
169
+
170
+ The shared rubric foundation now sits alongside canonical `task_type` and `evaluation_profile` signaling in the control plane. That signaling is routing metadata only; later slices may still add stricter profile-aware rubric-output enforcement.
171
+
172
+ Required rubric section:
173
+
174
+ - `Rubric:`
175
+ - `- Contract coverage: pass|concern|fail - ...`
176
+ - `- Correctness risk: pass|concern|fail - ...`
177
+ - `- Verification evidence: pass|concern|fail - ...`
178
+ - `- Docs/state parity: pass|concern|fail - ...`
179
+
180
+ Use the dimension names and verdict words exactly as written above.
181
+
182
+ Dimension meaning:
183
+
184
+ - `Contract coverage` — whether the slice or current HEAD satisfies the locked acceptance criteria and role-specific workflow obligations.
185
+ - `Correctness risk` — whether regressions, blocking defects, safety issues, or closure risks are still evident in current repo truth.
186
+ - `Verification evidence` — whether tests, deterministic proof, and rerun verification are strong enough for the role's decision.
187
+ - `Docs/state parity` — whether docs, config, runbooks, and canonical `.agent` state stay truthful to shipped behavior for the role's scope.
188
+
189
+ Verdict semantics:
190
+
191
+ - `pass` — no material issue remains for that dimension in the role's current decision.
192
+ - `concern` — a real caveat or remaining gap exists, but it does not by itself force rejection or `NO-STOP`; explain the follow-up plainly.
193
+ - `fail` — a blocking issue or contradictory truth exists and the role's final verdict must not be positive.
194
+
195
+ Decision alignment rules:
196
+
197
+ - Reviewer: any `fail` means `Acceptable as-is` must be `no`.
198
+ - Auditor: use `concern` or `fail` to explain why the project is not yet done and whether canonical backlog/state remain truthful.
199
+ - Stop judge: any `fail` means `Can the project stop now` must be `no`.
200
+
201
+ Always include all four rubric lines, even when every dimension is `pass`.
202
+
164
203
  ## References
165
204
 
166
205
  Read these bundled references when you need the full protocol or scaffolding material:
@@ -36,6 +36,8 @@
36
36
  "project_name": "<repo-name>",
37
37
  "required_stop_judges": 3,
38
38
  "priority_policy_id": "completion-default",
39
+ "task_type": "completion-workflow",
40
+ "evaluation_profile": "completion-rubric-v1",
39
41
  "docs_surfaces": ["README.md", "docs/"]
40
42
  }
41
43
  ```
@@ -48,6 +50,8 @@ Required fields:
48
50
 
49
51
  - `schema_version`
50
52
  - `mission_anchor`
53
+ - `task_type`
54
+ - `evaluation_profile`
51
55
  - `current_phase`
52
56
  - `continuation_policy`
53
57
  - `continuation_reason`
@@ -111,6 +115,8 @@ Required fields:
111
115
 
112
116
  - `schema_version`
113
117
  - `mission_anchor`
118
+ - `task_type`
119
+ - `evaluation_profile`
114
120
  - `last_reground_at`
115
121
  - `plan_basis`
116
122
  - `candidate_slices`
@@ -143,6 +149,27 @@ Rules:
143
149
 
144
150
  `active-slice.json` carries one current slice cursor.
145
151
 
152
+ Required base fields:
153
+
154
+ - `schema_version`
155
+ - `mission_anchor`
156
+ - `task_type`
157
+ - `evaluation_profile`
158
+ - `status`
159
+ - `slice_id`
160
+ - `goal`
161
+ - `contract_ids`
162
+ - `acceptance_criteria`
163
+ - `blocked_on`
164
+ - `locked_notes`
165
+ - `must_fix_findings`
166
+ - `implementation_surfaces`
167
+ - `verification_commands`
168
+ - `basis_commit`
169
+ - `remaining_contract_ids_before`
170
+ - `release_blocker_count_before`
171
+ - `high_value_gap_count_before`
172
+
146
173
  When `status` is `selected`, `in_progress`, `committed`, or `done`, `active-slice.json` must also carry the exact implementer handoff snapshot so `completion-implementer` can resume after compaction without asking the user to resend the original caller payload.
147
174
 
148
175
  Required exact handoff fields:
@@ -153,11 +180,18 @@ Required exact handoff fields:
153
180
  - `blocked_on`
154
181
  - `locked_notes`
155
182
  - `must_fix_findings`
183
+ - `implementation_surfaces`
184
+ - `verification_commands`
156
185
  - `basis_commit`
157
186
  - `remaining_contract_ids_before`
158
187
  - `release_blocker_count_before`
159
188
  - `high_value_gap_count_before`
160
189
 
190
+ Field meaning:
191
+
192
+ - `implementation_surfaces` — the repo files or surfaces this slice is expected to update or keep in parity, so implementers can resume on the right scope after compaction.
193
+ - `verification_commands` — the focused and broader deterministic checks expected before the slice is committed.
194
+
161
195
  Allowed `status` values:
162
196
 
163
197
  - `idle`
@@ -199,6 +233,43 @@ Minimum record shape:
199
233
 
200
234
  Empty history files are legal.
201
235
 
236
+ ## Structured Evaluation Rubric Foundation
237
+
238
+ `completion-reviewer`, `completion-auditor`, and `completion-stop-judge` must emit rubric-backed evaluations using the same shared dimension names and verdict semantics.
239
+
240
+ The shared rubric foundation now sits alongside canonical `task_type` and `evaluation_profile` signaling in `.agent/profile.json`, `.agent/state.json`, `.agent/plan.json`, and `.agent/active-slice.json`. That signaling is routing metadata only; later slices may still add stricter profile-aware rubric-output enforcement.
241
+
242
+ Required rubric section:
243
+
244
+ - `Rubric:`
245
+ - `- Contract coverage: pass|concern|fail - ...`
246
+ - `- Correctness risk: pass|concern|fail - ...`
247
+ - `- Verification evidence: pass|concern|fail - ...`
248
+ - `- Docs/state parity: pass|concern|fail - ...`
249
+
250
+ Use the dimension names and verdict words exactly as written above.
251
+
252
+ Dimension meaning:
253
+
254
+ - `Contract coverage` — whether the slice or current HEAD satisfies the locked acceptance criteria and role-specific workflow obligations.
255
+ - `Correctness risk` — whether regressions, blocking defects, safety issues, or closure risks are still evident in current repo truth.
256
+ - `Verification evidence` — whether tests, deterministic proof, and rerun verification are strong enough for the role's decision.
257
+ - `Docs/state parity` — whether docs, config, runbooks, and canonical `.agent` state stay truthful to shipped behavior for the role's scope.
258
+
259
+ Verdict semantics:
260
+
261
+ - `pass` — no material issue remains for that dimension in the role's current decision.
262
+ - `concern` — a real caveat or remaining gap exists, but it does not by itself force rejection or `NO-STOP`; explain the follow-up plainly.
263
+ - `fail` — a blocking issue or contradictory truth exists and the role's final verdict must not be positive.
264
+
265
+ Decision alignment rules:
266
+
267
+ - Reviewer: any `fail` means `Acceptable as-is` must be `no`.
268
+ - Auditor: use `concern` or `fail` to explain why the project is not yet done and whether canonical backlog/state remain truthful.
269
+ - Stop judge: any `fail` means `Can the project stop now` must be `no`.
270
+
271
+ Always include all four rubric lines, even when every dimension is `pass`.
272
+
202
273
  ## One-Slice Lifecycle
203
274
 
204
275
  1. Re-ground from current repo truth.