@hallucination-studio/harness-engine 1.0.0-beta.8.87407 → 1.0.0-beta.9.bb2cd30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -6
- package/package.json +8 -2
- package/skills/harness-repo-bootstrap/SKILL.md +18 -7
- package/skills/harness-repo-bootstrap/evals/cases.json +8 -0
- package/skills/harness-repo-bootstrap/evals/run_evals.py +453 -2
- package/skills/harness-repo-bootstrap/references/evaluation-loop.md +2 -0
- package/skills/harness-repo-bootstrap/references/exec-plans.md +14 -4
- package/skills/harness-repo-bootstrap/references/workflow.md +6 -0
- package/skills/harness-repo-bootstrap/scripts/manage_harness.py +1016 -22
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
4
5
|
import subprocess
|
|
5
6
|
import sys
|
|
6
7
|
import tempfile
|
|
@@ -67,6 +68,10 @@ def test_empty_repo_init(tmp_root):
|
|
|
67
68
|
raise AssertionError("Analysis should report missing exec-plan state")
|
|
68
69
|
if not analysis["missing_sops"]:
|
|
69
70
|
raise AssertionError("Analysis should report missing SOPs")
|
|
71
|
+
nested_output = tmp_root / "nested" / "generated" / "analysis.json"
|
|
72
|
+
run_manager("analyze", "--repo", str(repo), "--output", str(nested_output))
|
|
73
|
+
if not nested_output.exists():
|
|
74
|
+
raise AssertionError("analyze --output should create missing parent directories")
|
|
70
75
|
|
|
71
76
|
run_manager("init", "--repo", str(repo), "--answers", str(answers))
|
|
72
77
|
for relative_path in [
|
|
@@ -74,12 +79,14 @@ def test_empty_repo_init(tmp_root):
|
|
|
74
79
|
"ARCHITECTURE.md",
|
|
75
80
|
"docs/PLANS.md",
|
|
76
81
|
"docs/QUALITY_SCORE.md",
|
|
82
|
+
"docs/exec-plans/workstreams.md",
|
|
77
83
|
"docs/exec-plans/active/_template.md",
|
|
78
84
|
"docs/exec-plans/completed/README.md",
|
|
79
85
|
"docs/sops/encode-unseen-knowledge.md",
|
|
80
86
|
]:
|
|
81
87
|
assert_exists(repo, relative_path)
|
|
82
88
|
assert_contains(repo, "AGENTS.md", "docs/exec-plans/active/")
|
|
89
|
+
assert_contains(repo, "AGENTS.md", "docs/exec-plans/workstreams.md")
|
|
83
90
|
assert_contains(repo, "AGENTS.md", "docs/sops/")
|
|
84
91
|
assert_contains(repo, "AGENTS.md", ".codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check")
|
|
85
92
|
|
|
@@ -185,6 +192,69 @@ def test_closed_loop_plan(tmp_root):
|
|
|
185
192
|
"--append",
|
|
186
193
|
)
|
|
187
194
|
assert_contains(repo, "docs/PRODUCT_SENSE.md", fact)
|
|
195
|
+
run_manager(
|
|
196
|
+
"plan-close",
|
|
197
|
+
"--repo",
|
|
198
|
+
str(repo),
|
|
199
|
+
"--plan",
|
|
200
|
+
relative_plan,
|
|
201
|
+
"--summary",
|
|
202
|
+
"done",
|
|
203
|
+
expect_success=False,
|
|
204
|
+
)
|
|
205
|
+
failing_score = run_manager(
|
|
206
|
+
"quality-score",
|
|
207
|
+
"--repo",
|
|
208
|
+
str(repo),
|
|
209
|
+
"--plan",
|
|
210
|
+
relative_plan,
|
|
211
|
+
"--product-correctness",
|
|
212
|
+
"9",
|
|
213
|
+
"--ux-operator-clarity",
|
|
214
|
+
"8",
|
|
215
|
+
"--architecture-maintainability",
|
|
216
|
+
"7",
|
|
217
|
+
"--reliability-observability",
|
|
218
|
+
"8",
|
|
219
|
+
"--security-data-handling",
|
|
220
|
+
"8",
|
|
221
|
+
"--architecture-note",
|
|
222
|
+
"Plan closure needs a deterministic quality gate before handoff",
|
|
223
|
+
expect_success=False,
|
|
224
|
+
)
|
|
225
|
+
if failing_score["status"] != "fail":
|
|
226
|
+
raise AssertionError("Low dimension score should fail the quality gate")
|
|
227
|
+
plan_text_after_fail = plan_path.read_text()
|
|
228
|
+
if "## Rework Required" not in plan_text_after_fail:
|
|
229
|
+
raise AssertionError("Failing quality score should keep a rework section")
|
|
230
|
+
if "Improve Architecture and maintainability" not in plan_text_after_fail:
|
|
231
|
+
raise AssertionError("Failing quality score should name the low dimension")
|
|
232
|
+
check_after_fail = run_manager("check", "--repo", str(repo), expect_success=False)
|
|
233
|
+
if check_after_fail["status"] != "fail":
|
|
234
|
+
raise AssertionError("Harness check should fail while an active plan has a failed quality gate")
|
|
235
|
+
passing_score = run_manager(
|
|
236
|
+
"quality-score",
|
|
237
|
+
"--repo",
|
|
238
|
+
str(repo),
|
|
239
|
+
"--plan",
|
|
240
|
+
relative_plan,
|
|
241
|
+
"--product-correctness",
|
|
242
|
+
"9",
|
|
243
|
+
"--ux-operator-clarity",
|
|
244
|
+
"8",
|
|
245
|
+
"--architecture-maintainability",
|
|
246
|
+
"8",
|
|
247
|
+
"--reliability-observability",
|
|
248
|
+
"8",
|
|
249
|
+
"--security-data-handling",
|
|
250
|
+
"8",
|
|
251
|
+
"--product-note",
|
|
252
|
+
"Requested behavior is complete",
|
|
253
|
+
"--architecture-note",
|
|
254
|
+
"Plan closure now has a deterministic quality gate",
|
|
255
|
+
)
|
|
256
|
+
if passing_score["status"] != "pass":
|
|
257
|
+
raise AssertionError("Scores at or above the minimum should pass")
|
|
188
258
|
close_result = run_manager(
|
|
189
259
|
"plan-close",
|
|
190
260
|
"--repo",
|
|
@@ -247,6 +317,8 @@ def test_closed_loop_plan(tmp_root):
|
|
|
247
317
|
handle.write(
|
|
248
318
|
"\nThe `main` package owns keyboard input and rendering, while `game` contains pure state transitions.\n"
|
|
249
319
|
)
|
|
320
|
+
evidence_file = tmp_root / "evidence.txt"
|
|
321
|
+
evidence_file.write_text("main package owns keyboard input and rendering\n")
|
|
250
322
|
run_manager(
|
|
251
323
|
"knowledge-mark-written",
|
|
252
324
|
"--repo",
|
|
@@ -255,8 +327,25 @@ def test_closed_loop_plan(tmp_root):
|
|
|
255
327
|
id_relative_plan,
|
|
256
328
|
"--id",
|
|
257
329
|
log_result["id"],
|
|
258
|
-
"--evidence",
|
|
259
|
-
|
|
330
|
+
"--evidence-file",
|
|
331
|
+
str(evidence_file),
|
|
332
|
+
)
|
|
333
|
+
run_manager(
|
|
334
|
+
"quality-score",
|
|
335
|
+
"--repo",
|
|
336
|
+
str(repo),
|
|
337
|
+
"--plan",
|
|
338
|
+
id_relative_plan,
|
|
339
|
+
"--product-correctness",
|
|
340
|
+
"8",
|
|
341
|
+
"--ux-operator-clarity",
|
|
342
|
+
"8",
|
|
343
|
+
"--architecture-maintainability",
|
|
344
|
+
"8",
|
|
345
|
+
"--reliability-observability",
|
|
346
|
+
"8",
|
|
347
|
+
"--security-data-handling",
|
|
348
|
+
"8",
|
|
260
349
|
)
|
|
261
350
|
plan_text = id_plan_path.read_text()
|
|
262
351
|
if id_fact in (repo / "ARCHITECTURE.md").read_text():
|
|
@@ -279,6 +368,21 @@ def create_formatted_plan(repo):
|
|
|
279
368
|
plan_path.write_text(
|
|
280
369
|
"""# Execution Plan: Formatted Plan
|
|
281
370
|
|
|
371
|
+
## Quality Gate
|
|
372
|
+
|
|
373
|
+
Status: pass
|
|
374
|
+
Minimum score: 8.0
|
|
375
|
+
Average score: 8.0
|
|
376
|
+
Last scored: 2026-06-11T00:00:00Z
|
|
377
|
+
|
|
378
|
+
| Dimension | Score | Notes |
|
|
379
|
+
| --- | ---: | --- |
|
|
380
|
+
| Product correctness | 8.0 | ok |
|
|
381
|
+
| UX and operator clarity | 8.0 | ok |
|
|
382
|
+
| Architecture and maintainability | 8.0 | ok |
|
|
383
|
+
| Reliability and observability | 8.0 | ok |
|
|
384
|
+
| Security and data handling | 8.0 | ok |
|
|
385
|
+
|
|
282
386
|
## Durable Knowledge To Capture
|
|
283
387
|
|
|
284
388
|
- [ ] `snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`. -> `ARCHITECTURE.md`
|
|
@@ -301,10 +405,357 @@ def test_preserve_unmanaged_docs(tmp_root):
|
|
|
301
405
|
assert_exists(repo, "docs/PLANS.md")
|
|
302
406
|
|
|
303
407
|
|
|
408
|
+
def test_phase_continuity_workstream(tmp_root):
    """Eval: a phased plan closes only once continuity and a ledger entry exist.

    Walks one plan through this sequence inside a fresh repo under ``tmp_root``:

    1. ``plan-close`` must be blocked (falsy result) while no phase continuity
       is declared, and ``check`` must report ``phase-mode-not-declared``.
    2. After ``phase-set`` points at the workstreams ledger, ``plan-close`` must
       still be blocked until ``workstream-upsert`` records the entry.
    3. A successful ``plan-close`` must rewrite the ledger to the completed
       plan path and drop the active one.
    4. Hand-corrupting the ledger back to the stale path must make ``check``
       report ``missing-workstream-plan-reference``.

    Raises:
        AssertionError: when any of the expectations above is not met.
    """
    repo = tmp_root / "phase-repo"
    repo.mkdir()
    answers = tmp_root / "phase-answers.json"
    write_answers(answers, project_name="phase-demo")

    repo_flag = ("--repo", str(repo))
    ledger_rel = "docs/exec-plans/workstreams.md"
    ledger_file = repo / ledger_rel
    next_action = "Create Phase 2 plan for command adapters"
    resume_notes = "Read completed Phase 1 plan and ARCHITECTURE.md before continuing"

    run_manager("init", *repo_flag, "--answers", str(answers))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "local-workbench-phase-1",
        "--goal", "Complete Local Workbench Phase 1",
    )
    plan_path = Path(started["plan"])
    relative_plan = str(plan_path.resolve().relative_to(repo.resolve()))
    plan_flag = ("--plan", relative_plan)

    # A passing quality gate first, so that the later close failures can only
    # be attributed to missing continuity / ledger data.
    run_manager(
        "quality-score",
        *repo_flag,
        *plan_flag,
        "--product-correctness", "8",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "8",
        "--security-data-handling", "8",
    )

    premature_close = ("plan-close", *repo_flag, *plan_flag, "--summary", "Phase 1 done")

    # Stage 1: no phase continuity declared yet.
    close_without_continuity = run_manager(*premature_close, expect_success=False)
    if close_without_continuity:
        raise AssertionError("plan-close should not produce JSON when phase continuity blocks closure")
    check_without_continuity = run_manager("check", *repo_flag, expect_success=False)
    issue_codes = set(issue["code"] for issue in check_without_continuity["issues"])
    if not ("phase-mode-not-declared" in issue_codes):
        raise AssertionError("check should flag phased plans that do not declare continuation")

    # Stage 2: continuity declared, but the ledger has no matching entry yet.
    run_manager(
        "phase-set",
        *repo_flag,
        *plan_flag,
        "--mode", "multi-phase",
        "--workstream", "local-workbench",
        "--current-phase", "1",
        "--next-phase", "2",
        "--continuation", "docs/exec-plans/workstreams.md#local-workbench",
        "--next-action", next_action,
        "--resume-notes", resume_notes,
    )
    close_without_workstream = run_manager(*premature_close, expect_success=False)
    if close_without_workstream:
        raise AssertionError("plan-close should not allow a workstreams continuation without a ledger entry")

    # Stage 3: record the ledger entry, then closing must succeed.
    run_manager(
        "workstream-upsert",
        *repo_flag,
        "--id", "local-workbench",
        "--status", "active",
        "--current-plan", relative_plan,
        "--next-action", next_action,
        "--goal", "Refactor local workbench into a maintainable terminal workflow",
        "--resume-notes", resume_notes,
    )
    for expected in ("local-workbench", next_action):
        assert_contains(repo, ledger_rel, expected)

    close_result = run_manager(
        "plan-close",
        *repo_flag,
        *plan_flag,
        "--summary", "Phase 1 done; Phase 2 recovery is recorded in workstreams.",
    )
    if close_result["status"] != "closed":
        raise AssertionError("Phased plan should close after continuity and workstream recovery are recorded")

    completed_relative_plan = f"docs/exec-plans/completed/{plan_path.name}"
    workstreams_text = ledger_file.read_text()
    if not (completed_relative_plan in workstreams_text):
        raise AssertionError("plan-close should update workstream ledger to the completed plan path")
    if relative_plan in workstreams_text:
        raise AssertionError("workstream ledger should not keep stale active plan references after plan-close")

    # Stage 4: point the ledger back at the (now moved) active plan path and
    # make sure the harness check notices the dangling reference.
    ledger_file.write_text(workstreams_text.replace(completed_relative_plan, relative_plan))
    broken_check = run_manager("check", *repo_flag, expect_success=False)
    broken_codes = set(issue["code"] for issue in broken_check["issues"])
    if not ("missing-workstream-plan-reference" in broken_codes):
        raise AssertionError("check should fail when workstream ledger points to a missing plan")
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def test_plan_path_canonicalization(tmp_root):
    """Eval: ``plan-close`` accepts absolute paths and still normalizes the ledger.

    The plan is scored using its absolute path, registered in the workstreams
    ledger by its repo-relative path, and then closed with ``--repo`` given as
    ``os.path.realpath(repo)`` and ``--plan`` given as the absolute plan path
    returned by ``plan-start``. After closing, the ledger must reference the
    completed plan path, must no longer contain the active relative path, and
    ``check`` must pass.

    Raises:
        AssertionError: when the plan does not close, the ledger is not
            rewritten to the completed path, or the final check does not pass.
    """
    repo = tmp_root / "canonical-repo"
    repo.mkdir()
    answers = tmp_root / "canonical-answers.json"
    write_answers(answers, project_name="canonical-demo")
    run_manager("init", "--repo", str(repo), "--answers", str(answers))

    plan_result = run_manager(
        "plan-start",
        "--repo",
        str(repo),
        "--slug",
        "canonical-close",
        "--goal",
        "Close a plan when repo and plan paths use different filesystem spellings",
    )
    plan_path = Path(plan_result["plan"])
    relative_plan = str(plan_path.resolve().relative_to(repo.resolve()))

    # Score via the absolute plan path (not the relative one) so the absolute
    # spelling is exercised before the close as well.
    run_manager(
        "quality-score",
        "--repo",
        str(repo),
        "--plan",
        str(plan_path),
        "--product-correctness",
        "8",
        "--ux-operator-clarity",
        "8",
        "--architecture-maintainability",
        "8",
        "--reliability-observability",
        "8",
        "--security-data-handling",
        "8",
    )
    run_manager(
        "workstream-upsert",
        "--repo",
        str(repo),
        "--id",
        "canonical-close",
        "--status",
        "active",
        "--current-plan",
        relative_plan,
        "--next-action",
        "Close after canonical path validation",
        "--goal",
        "Verify plan-close updates workstreams with normalized relative paths",
        "--resume-notes",
        "No special resume notes",
    )

    # The repo is passed in its symlink-resolved spelling while the plan keeps
    # the spelling returned by plan-start. When tmp_root sits behind a symlink
    # the two spellings differ; otherwise they coincide and this simply
    # exercises the absolute-path form.
    # The previous version followed this with a conditional that reassigned
    # both names to the values they already held; that no-op was removed
    # because it suggested a fallback spelling that never existed.
    repo_arg = os.path.realpath(repo)
    plan_arg = str(plan_path)

    close_result = run_manager(
        "plan-close",
        "--repo",
        repo_arg,
        "--plan",
        plan_arg,
        "--summary",
        "Closed with canonicalized plan path.",
    )
    if close_result["status"] != "closed":
        raise AssertionError("plan-close should accept absolute plan paths inside the repo")

    completed_relative_plan = "docs/exec-plans/completed/" + plan_path.name
    workstreams_text = (repo / "docs/exec-plans/workstreams.md").read_text()
    if completed_relative_plan not in workstreams_text:
        raise AssertionError("canonicalized plan-close should update last completed plan")
    if relative_plan in workstreams_text:
        raise AssertionError("canonicalized plan-close should remove stale current plan references")

    check_result = run_manager("check", "--repo", str(repo))
    if check_result["status"] != "pass":
        raise AssertionError("canonicalized plan-close should leave harness check passing")
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def test_defect_recovery_loop(tmp_root):
    """Eval: an open defect blocks scoring, checking and closing until resolved.

    Sequence exercised inside a fresh repo under ``tmp_root``:

    1. ``defect-log`` records a P1 defect; the plan must then contain the
       defects section, the defect id, ``Status: fail`` and a rework item.
    2. While the defect is open, ``quality-score`` must fail even with all
       dimensions at 10, ``check`` must report ``open-defect`` and
       ``plan-close`` must return a falsy result.
    3. ``defect-resolve`` must tick the defect checkbox and ask for a fresh
       quality score.
    4. A fresh passing score followed by ``plan-close`` must close the plan,
       and the completed plan must not show the default knowledge placeholder
       as checked.

    Raises:
        AssertionError: when any of the expectations above is not met.
    """
    repo = tmp_root / "defect-repo"
    repo.mkdir()
    answers = tmp_root / "defect-answers.json"
    write_answers(answers, project_name="defect-demo")

    repo_flag = ("--repo", str(repo))
    run_manager("init", *repo_flag, "--answers", str(answers))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "snake-tail-collision",
        "--goal", "Validate defect recovery when Snake tail-cell collision behavior fails",
    )
    plan_path = Path(started["plan"])
    relative_plan = str(plan_path.resolve().relative_to(repo.resolve()))
    plan_flag = ("--plan", relative_plan)

    def score_command(product, ux, architecture, reliability, security):
        # Build the full quality-score argument list for this plan.
        return (
            "quality-score",
            *repo_flag,
            *plan_flag,
            "--product-correctness", product,
            "--ux-operator-clarity", ux,
            "--architecture-maintainability", architecture,
            "--reliability-observability", reliability,
            "--security-data-handling", security,
        )

    # Step 1: log the defect and verify its footprint in the plan file.
    defect_summary = "Snake marks game over when the head moves into the current tail cell during a non-eating tick"
    defect_result = run_manager(
        "defect-log",
        *repo_flag,
        *plan_flag,
        "--severity", "P1",
        "--summary", defect_summary,
        "--evidence", "go test ./internal/game -run TestCanMoveIntoVacatedTailCell failed",
        expect_success=False,
    )
    defect_id = defect_result["id"]
    plan_text = plan_path.read_text()
    logged_expectations = (
        (("## Defects To Resolve", defect_id), "defect-log should record the open defect in the plan"),
        (("Status: fail",), "defect-log should force the quality gate to fail"),
        (("Resolve all open defects",), "defect-log should turn the bug into rework input"),
    )
    for markers, failure in logged_expectations:
        if not all(marker in plan_text for marker in markers):
            raise AssertionError(failure)

    # Step 2: perfect scores must not override an open defect.
    score_with_open_defect = run_manager(
        *score_command("10", "10", "10", "10", "10"),
        expect_success=False,
    )
    if not (
        score_with_open_defect["status"] == "fail"
        and defect_id in score_with_open_defect["open_defects"]
    ):
        raise AssertionError("quality-score should fail while any defect is open")

    check_with_open_defect = run_manager("check", *repo_flag, expect_success=False)
    issue_codes = set(issue["code"] for issue in check_with_open_defect["issues"])
    if not ("open-defect" in issue_codes):
        raise AssertionError("check should surface unresolved defects")

    close_with_open_defect = run_manager(
        "plan-close",
        *repo_flag,
        *plan_flag,
        "--summary", "Should not close with open defects",
        expect_success=False,
    )
    if close_with_open_defect:
        raise AssertionError("plan-close should not close while defects are open")

    # Step 3: resolve the defect and verify the plan reflects it.
    run_manager(
        "defect-resolve",
        *repo_flag,
        *plan_flag,
        "--id", defect_id,
        "--fix-evidence", "go test ./internal/game -run TestCanMoveIntoVacatedTailCell passed",
    )
    plan_text_after_resolve = plan_path.read_text()
    resolved_expectations = (
        (f"- [x] [bug:{defect_id}]", "defect-resolve should close the defect checkbox"),
        (
            "Defects resolved. Re-run validation and `quality-score` before closing.",
            "defect-resolve should require a fresh quality score",
        ),
    )
    for marker, failure in resolved_expectations:
        if not (marker in plan_text_after_resolve):
            raise AssertionError(failure)

    # Step 4: fresh score, then close.
    passing_score = run_manager(*score_command("9", "8", "8", "9", "10"))
    if passing_score["status"] != "pass":
        raise AssertionError("quality-score should pass after defects are resolved")

    close_result = run_manager(
        "plan-close",
        *repo_flag,
        *plan_flag,
        "--summary", "Closed after defect recovery and fresh quality score.",
    )
    if close_result["status"] != "closed":
        raise AssertionError("plan-close should close after defect recovery")

    completed_plan = repo / "docs" / "exec-plans" / "completed" / plan_path.name
    if "- [x] Add durable facts here as they emerge" in completed_plan.read_text():
        raise AssertionError("plan-close should not mark the default knowledge placeholder as completed")
|
|
750
|
+
|
|
751
|
+
|
|
304
752
|
# Registry of eval cases: each entry pairs a case name with the function that
# implements it.
# NOTE(review): presumably iterated by the runner elsewhere in this script,
# with each function receiving a temporary root directory — confirm.
EVALS = [
    ("empty-repo-init", test_empty_repo_init),
    ("frontend-analysis", test_frontend_analysis),
    ("closed-loop-plan", test_closed_loop_plan),
    ("phase-continuity-workstream", test_phase_continuity_workstream),
    ("plan-path-canonicalization", test_plan_path_canonicalization),
    ("defect-recovery-loop", test_defect_recovery_loop),
    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
]
|
|
310
761
|
|
|
@@ -12,6 +12,8 @@ Use this loop when changing the skill, templates, scripts, or policy references:
|
|
|
12
12
|
- first-time initialization of an empty repository
|
|
13
13
|
- frontend-aware repository analysis
|
|
14
14
|
- execution-plan and knowledge-capture closure
|
|
15
|
+
- quality gates that block closure and force rework when scores fail
|
|
16
|
+
- phase continuity and workstream recovery for resumable work
|
|
15
17
|
- preservation of unmanaged user-owned docs
|
|
16
18
|
- local harness checks that do not require user-project CI
|
|
17
19
|
|
|
@@ -11,6 +11,7 @@ Execution plans are required for multi-step work, risky changes, or tasks that n
|
|
|
11
11
|
|
|
12
12
|
## Location
|
|
13
13
|
|
|
14
|
+
- Workstream recovery ledger: `docs/exec-plans/workstreams.md`
|
|
14
15
|
- Active: `docs/exec-plans/active/`
|
|
15
16
|
- Completed: `docs/exec-plans/completed/`
|
|
16
17
|
|
|
@@ -21,19 +22,28 @@ Execution plans are required for multi-step work, risky changes, or tasks that n
|
|
|
21
22
|
- constraints
|
|
22
23
|
- steps
|
|
23
24
|
- validation
|
|
25
|
+
- quality gate
|
|
26
|
+
- defects to resolve
|
|
27
|
+
- rework required
|
|
28
|
+
- phase continuity
|
|
24
29
|
- durable knowledge to capture
|
|
25
30
|
- completion notes
|
|
26
31
|
|
|
27
32
|
## Operating Rule
|
|
28
33
|
|
|
29
|
-
Update the active plan during the work. When the work is done, move it to `completed`, and leave behind any durable facts in the right permanent docs.
|
|
34
|
+
Update the active plan during the work. When the work is done, score it, complete any required rework, record phase continuity for resumable work, move it to `completed`, and leave behind any durable facts in the right permanent docs.
|
|
30
35
|
|
|
31
36
|
## Closed Loop
|
|
32
37
|
|
|
33
38
|
Use the script, not ad hoc manual edits, for the lifecycle:
|
|
34
39
|
|
|
35
40
|
- `plan-start`: create a new active execution plan
|
|
36
|
-
- `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id
|
|
37
|
-
- `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; prefer `--id <knowledge-id> --evidence
|
|
38
|
-
- `
|
|
41
|
+
- `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id; use `--fact-file` for shell-sensitive facts
|
|
42
|
+
- `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; prefer `--id <knowledge-id> --evidence-file <file>` for shell-sensitive evidence, and use `--append` only to append the exact fact first
|
|
43
|
+
- `defect-log`: record a bug found by validation, evals, browser testing, or code review; this forces the quality gate to fail and makes the defect the next rework input
|
|
44
|
+
- `defect-resolve`: mark a logged defect fixed with validation or code evidence; re-run validation and `quality-score` before closing
|
|
45
|
+
- `quality-score`: write a scored quality gate into the plan; if it fails, the generated `## Rework Required` section becomes the next implementation input
|
|
46
|
+
- `phase-set`: declare whether phased or resumable work continues, pauses, stops, or completes
|
|
47
|
+
- `workstream-upsert`: update `docs/exec-plans/workstreams.md` so interrupted work can be recovered without chat history
|
|
48
|
+
- `plan-close`: refuse to close cleanly until the quality gate passes, phase continuity is recorded, and the listed knowledge items are marked as written to durable docs
|
|
39
49
|
- `check`: run a local handoff check without requiring target-repo CI
|
|
@@ -36,12 +36,18 @@ After the script runs, read the generated docs once and tighten weak generic phr
|
|
|
36
36
|
|
|
37
37
|
After the scaffold exists:
|
|
38
38
|
|
|
39
|
+
- read `docs/exec-plans/workstreams.md` before resuming interrupted or long-running work
|
|
39
40
|
- create an execution plan before multi-step work
|
|
40
41
|
- use `plan-start` instead of creating plan files manually when possible
|
|
41
42
|
- log durable facts during execution instead of waiting until the end
|
|
42
43
|
- follow the matching SOP for architecture, UI, observability, or knowledge capture work
|
|
43
44
|
- encode durable knowledge back into the repository before closing the task
|
|
44
45
|
- mark logged knowledge items as written after updating the permanent docs
|
|
46
|
+
- log every defect found by tests, evals, browser validation, or code review with `defect-log`
|
|
47
|
+
- resolve logged defects only after fixing the implementation and citing passing validation with `defect-resolve`
|
|
48
|
+
- run `quality-score` after implementation and validation
|
|
49
|
+
- if `quality-score` fails, implement the `## Rework Required` items and score again
|
|
50
|
+
- use `phase-set` and `workstream-upsert` when a plan belongs to phased or resumable work
|
|
45
51
|
- use `plan-close` to verify no durable knowledge is left stranded in the active plan
|
|
46
52
|
- run `.codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check --repo <target-repo>` before handoff
|
|
47
53
|
- do not add CI to the target repository unless the human explicitly asks for it
|