ultimate-pi 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/.agents/skills/harness-context/SKILL.md +13 -6
  2. package/.agents/skills/harness-debate-plan/SKILL.md +37 -20
  3. package/.agents/skills/harness-eval/SKILL.md +6 -21
  4. package/.agents/skills/harness-governor/SKILL.md +4 -3
  5. package/.agents/skills/harness-orchestration/SKILL.md +39 -51
  6. package/.agents/skills/harness-plan/SKILL.md +23 -12
  7. package/.agents/skills/harness-review/SKILL.md +52 -0
  8. package/.agents/skills/harness-sentrux-setup/SKILL.md +13 -1
  9. package/.agents/skills/harness-steer/SKILL.md +14 -0
  10. package/.pi/agents/harness/adversary.md +3 -10
  11. package/.pi/agents/harness/evaluator.md +3 -12
  12. package/.pi/agents/harness/executor.md +12 -14
  13. package/.pi/agents/harness/planning/decompose.md +7 -4
  14. package/.pi/agents/harness/planning/hypothesis-validator.md +2 -0
  15. package/.pi/agents/harness/planning/hypothesis.md +3 -1
  16. package/.pi/agents/harness/planning/plan-adversary.md +2 -0
  17. package/.pi/agents/harness/planning/plan-evaluator.md +2 -0
  18. package/.pi/agents/harness/planning/plan-synthesizer.md +25 -0
  19. package/.pi/agents/harness/planning/planning-context.md +48 -0
  20. package/.pi/agents/harness/planning/review-integrator.md +2 -0
  21. package/.pi/agents/harness/planning/scout-graphify.md +3 -1
  22. package/.pi/agents/harness/planning/scout-semantic.md +3 -1
  23. package/.pi/agents/harness/planning/scout-structure.md +3 -1
  24. package/.pi/agents/harness/planning/sprint-contract-auditor.md +2 -0
  25. package/.pi/agents/harness/sentrux-steward.md +51 -0
  26. package/.pi/extensions/00-posthog-network-bootstrap.ts +11 -0
  27. package/.pi/extensions/harness-live-widget.ts +27 -1
  28. package/.pi/extensions/harness-plan-approval.ts +62 -56
  29. package/.pi/extensions/harness-run-context.ts +541 -84
  30. package/.pi/extensions/harness-subagent-submit.ts +43 -10
  31. package/.pi/extensions/lib/harness-artifact-gate.ts +182 -0
  32. package/.pi/extensions/lib/harness-posthog.ts +9 -5
  33. package/.pi/extensions/lib/harness-spawn-topology.ts +188 -0
  34. package/.pi/extensions/lib/harness-subagent-auth.ts +1 -0
  35. package/.pi/extensions/lib/harness-subagent-policy.ts +23 -19
  36. package/.pi/extensions/lib/harness-subagent-precheck.ts +35 -9
  37. package/.pi/extensions/lib/harness-subagent-submit-pipeline.ts +66 -2
  38. package/.pi/extensions/lib/harness-subagent-submit-registry.ts +21 -3
  39. package/.pi/extensions/lib/harness-subagents-bridge.ts +7 -29
  40. package/.pi/extensions/lib/harness-subprocess-bootstrap.ts +73 -0
  41. package/.pi/extensions/lib/plan-approval/create-plan.ts +2 -3
  42. package/.pi/extensions/lib/plan-approval/resolve-disk.ts +102 -0
  43. package/.pi/extensions/lib/plan-approval/schema.ts +22 -8
  44. package/.pi/extensions/lib/plan-approval/types.ts +1 -1
  45. package/.pi/extensions/lib/plan-approval/validate.ts +2 -2
  46. package/.pi/extensions/lib/plan-approval-readiness.ts +241 -0
  47. package/.pi/extensions/lib/plan-debate-eligibility.ts +12 -5
  48. package/.pi/extensions/lib/plan-debate-gate.ts +22 -1
  49. package/.pi/extensions/lib/plan-debate-lanes.ts +32 -2
  50. package/.pi/extensions/lib/plan-review-gate.ts +8 -0
  51. package/.pi/extensions/lib/posthog-client.ts +76 -0
  52. package/.pi/extensions/policy-gate.ts +24 -19
  53. package/.pi/harness/agents.manifest.json +24 -16
  54. package/.pi/harness/corpus/cron.example +8 -0
  55. package/.pi/harness/corpus/graphify-kb-updater.config.json +159 -0
  56. package/.pi/harness/corpus/systemd/graphify-kb-updater.env.template +4 -0
  57. package/.pi/harness/corpus/systemd/graphify-kb-updater.service +17 -0
  58. package/.pi/harness/corpus/systemd/graphify-kb-updater.timer +11 -0
  59. package/.pi/harness/docs/adrs/0001-harness-constitution.md +2 -1
  60. package/.pi/harness/docs/adrs/0006-sentrux-dual-layer.md +7 -6
  61. package/.pi/harness/docs/adrs/0009-sentrux-rules-lifecycle.md +6 -1
  62. package/.pi/harness/docs/adrs/0031-harness-run-context.md +1 -1
  63. package/.pi/harness/docs/adrs/0032-harness-command-orchestration.md +7 -0
  64. package/.pi/harness/docs/adrs/0034-darwin-plan-research-pipeline.md +3 -3
  65. package/.pi/harness/docs/adrs/0036-implementation-research-and-selective-debate.md +8 -5
  66. package/.pi/harness/docs/adrs/0039-harness-post-run-review-gate.md +47 -0
  67. package/.pi/harness/docs/adrs/0040-practice-grounded-orchestration.md +40 -0
  68. package/.pi/harness/docs/adrs/0041-intelligent-planning-reconnaissance.md +39 -0
  69. package/.pi/harness/docs/adrs/0042-agent-native-orchestration.md +35 -0
  70. package/.pi/harness/docs/adrs/0043-path-first-harness-tools.md +38 -0
  71. package/.pi/harness/docs/adrs/0044-harness-steer-loop.md +36 -0
  72. package/.pi/harness/docs/adrs/README.md +10 -0
  73. package/.pi/harness/docs/graphify-kb-updater-runbook.md +157 -0
  74. package/.pi/harness/docs/practice-map.md +110 -0
  75. package/.pi/harness/env.harness.template +5 -3
  76. package/.pi/harness/evals/smoke/sentrux-stub.json +1 -1
  77. package/.pi/harness/evals/smoke/smoke-harness-plan.mjs +5 -2
  78. package/.pi/harness/specs/README.md +1 -1
  79. package/.pi/harness/specs/harness-run-context.schema.json +11 -0
  80. package/.pi/harness/specs/harness-spawn-context.schema.json +14 -0
  81. package/.pi/harness/specs/plan-execution-plan.schema.json +39 -1
  82. package/.pi/harness/specs/plan-packet.schema.json +4 -0
  83. package/.pi/harness/specs/plan-phase-status.schema.json +17 -0
  84. package/.pi/harness/specs/plan-phase-waiver.schema.json +25 -0
  85. package/.pi/harness/specs/plan-planning-context.schema.json +50 -0
  86. package/.pi/harness/specs/repair-brief.schema.json +45 -0
  87. package/.pi/harness/specs/review-outcome.schema.json +46 -0
  88. package/.pi/harness/specs/sentrux-manifest-proposal.schema.json +80 -0
  89. package/.pi/harness/specs/sentrux-signal.schema.json +43 -0
  90. package/.pi/harness/specs/steer-state.schema.json +20 -0
  91. package/.pi/lib/harness-context-mode-policy.ts +256 -0
  92. package/.pi/lib/harness-repair-brief.ts +145 -0
  93. package/.pi/lib/harness-run-context.ts +591 -32
  94. package/.pi/lib/harness-ui-state.ts +87 -9
  95. package/.pi/prompts/harness-auto.md +9 -9
  96. package/.pi/prompts/harness-critic.md +3 -30
  97. package/.pi/prompts/harness-eval.md +4 -37
  98. package/.pi/prompts/harness-plan.md +118 -54
  99. package/.pi/prompts/harness-review.md +150 -15
  100. package/.pi/prompts/harness-run.md +62 -10
  101. package/.pi/prompts/harness-sentrux-steward.md +55 -0
  102. package/.pi/prompts/harness-steer.md +30 -0
  103. package/.pi/scripts/graphify-kb-updater.mjs +358 -0
  104. package/.pi/scripts/harness-verify.mjs +22 -6
  105. package/.pi/scripts/harness-web-policy-guard.mjs +68 -0
  106. package/.pi/scripts/validate-plan-dag.mjs +3 -3
  107. package/AGENTS.md +1 -0
  108. package/CHANGELOG.md +11 -0
  109. package/package.json +5 -4
  110. package/.pi/prompts/git-sync.md +0 -124
@@ -0,0 +1,159 @@
1
+ {
2
+ "schema_version": "1.1.0",
3
+ "policy": "hybrid-allowlist-auto-promotion-with-conservative-staging",
4
+ "auto_promote_allowlist": true,
5
+ "source_taxonomy": {
6
+ "article": {
7
+ "category": "public_article_or_engineering_blog",
8
+ "risk_class": "low_to_medium",
9
+ "default_policy": "allowlist_auto_promote_when_approved"
10
+ },
11
+ "paper": {
12
+ "category": "research_paper_or_abstract_feed",
13
+ "risk_class": "medium",
14
+ "default_policy": "stage_until_rights_review"
15
+ },
16
+ "book": {
17
+ "category": "book_or_longform_local_file",
18
+ "risk_class": "high",
19
+ "default_policy": "manual_approval_required"
20
+ },
21
+ "transcript": {
22
+ "category": "youtube_or_audio_transcript",
23
+ "risk_class": "high",
24
+ "default_policy": "manual_approval_required"
25
+ },
26
+ "youtube": {
27
+ "category": "youtube_candidate_or_video_reference",
28
+ "risk_class": "high",
29
+ "default_policy": "stage_metadata_only_until_approved"
30
+ }
31
+ },
32
+ "competitor_taxonomy": {
33
+ "ai_coding_agents": {
34
+ "description": "Coding-agent products, CLIs, IDE agents, and model-native coding surfaces.",
35
+ "keywords": [
36
+ "claude code",
37
+ "cursor",
38
+ "codex",
39
+ "aider",
40
+ "copilot",
41
+ "windsurf",
42
+ "zed",
43
+ "replit",
44
+ "devin"
45
+ ]
46
+ },
47
+ "agentic_harnesses": {
48
+ "description": "Harnesses, orchestration frameworks, eval loops, task runners, and review gates.",
49
+ "keywords": [
50
+ "harness",
51
+ "orchestration",
52
+ "agent bus",
53
+ "eval",
54
+ "review gate",
55
+ "multi-agent",
56
+ "workflow"
57
+ ]
58
+ },
59
+ "context_engineering": {
60
+ "description": "Context retrieval, compaction, memory, skills, MCP, and codebase indexing.",
61
+ "keywords": [
62
+ "context engineering",
63
+ "mcp",
64
+ "memory",
65
+ "retrieval",
66
+ "compaction",
67
+ "skills",
68
+ "knowledge graph"
69
+ ]
70
+ }
71
+ },
72
+ "allowlist": [
73
+ {
74
+ "domain": "openai.com",
75
+ "approved": true,
76
+ "approved_by": "repo-policy",
77
+ "approved_at": "2026-05-23",
78
+ "allowed_source_classes": ["article"]
79
+ },
80
+ {
81
+ "domain": "anthropic.com",
82
+ "approved": true,
83
+ "approved_by": "repo-policy",
84
+ "approved_at": "2026-05-23",
85
+ "allowed_source_classes": ["article"]
86
+ },
87
+ {
88
+ "domain": "github.blog",
89
+ "approved": true,
90
+ "approved_by": "repo-policy",
91
+ "approved_at": "2026-05-23",
92
+ "allowed_source_classes": ["article"]
93
+ },
94
+ {
95
+ "domain": "martinfowler.com",
96
+ "approved": true,
97
+ "approved_by": "repo-policy",
98
+ "approved_at": "2026-05-23",
99
+ "allowed_source_classes": ["article"]
100
+ },
101
+ {
102
+ "domain": "addyosmani.com",
103
+ "approved": true,
104
+ "approved_by": "repo-policy",
105
+ "approved_at": "2026-05-23",
106
+ "allowed_source_classes": ["article"]
107
+ },
108
+ {
109
+ "domain": "arxiv.org",
110
+ "approved": false,
111
+ "approved_by": "manual-review-required",
112
+ "approved_at": "manual-review-required",
113
+ "allowed_source_classes": ["paper"]
114
+ }
115
+ ],
116
+ "article_queries": [
117
+ "agentic engineering harness engineering AI coding agents",
118
+ "AI coding harness evaluation orchestration context engineering"
119
+ ],
120
+ "paper_feeds": [
121
+ {
122
+ "title": "arXiv software engineering agents search feed",
123
+ "url": "https://arxiv.org/search/cs?query=agentic+software+engineering&searchtype=all",
124
+ "rights_access": {
125
+ "license": "source-specific",
126
+ "access": "public abstract/feed only; paper text requires review",
127
+ "approved_by": "manual-review-required",
128
+ "approved_at": "manual-review-required"
129
+ },
130
+ "provenance": {
131
+ "origin": "curated_search_feed",
132
+ "locator": "https://arxiv.org/search/cs?query=agentic+software+engineering&searchtype=all",
133
+ "notes": "Feed metadata only; paper body requires approval."
134
+ }
135
+ }
136
+ ],
137
+ "local_books": [
138
+ {
139
+ "path": "data/books",
140
+ "max_files": 75
141
+ }
142
+ ],
143
+ "local_transcripts": [
144
+ {
145
+ "path": "data/youtube-transcripts",
146
+ "max_files": 100
147
+ }
148
+ ],
149
+ "youtube_candidates": [
150
+ {
151
+ "title": "Review queue placeholder for agentic engineering YouTube talks",
152
+ "url": "https://www.youtube.com/results?search_query=agentic+engineering+harness+engineering",
153
+ "rights_access": null,
154
+ "approved": false,
155
+ "competitor_labels": ["agentic_harnesses"]
156
+ }
157
+ ],
158
+ "review_queue": []
159
+ }
@@ -0,0 +1,4 @@
1
+ # Copy to ~/.config/ultimate-pi/graphify-kb-updater.env and edit paths.
2
+ UP_ROOT=/home/USER/ai-projects/ultimate-pi
3
+ NODE_ENV=production
4
+ GRAPHIFY_KB_ARGS=--apply --refresh-graph --pilot-report --max-promotions 25
@@ -0,0 +1,17 @@
1
+ [Unit]
2
+ Description=Ultimate Pi Graphify knowledge-base updater
3
+ Documentation=file:%h/ai-projects/ultimate-pi/.pi/harness/docs/graphify-kb-updater-runbook.md
4
+ After=network-online.target
5
+ Wants=network-online.target
6
+
7
+ [Service]
8
+ Type=oneshot
9
+ EnvironmentFile=%h/.config/ultimate-pi/graphify-kb-updater.env
10
+ WorkingDirectory=%h/ai-projects/ultimate-pi
11
+ ExecStart=/usr/bin/flock -n %t/graphify-kb-updater.lock /usr/bin/timeout 45m /usr/bin/env node .pi/scripts/graphify-kb-updater.mjs $GRAPHIFY_KB_ARGS
12
+ StandardOutput=append:%h/.local/state/ultimate-pi/graphify-kb-updater.log
13
+ StandardError=append:%h/.local/state/ultimate-pi/graphify-kb-updater.err
14
+ TimeoutStartSec=50m
15
+ Nice=10
16
+ IOSchedulingClass=best-effort
17
+ IOSchedulingPriority=7
@@ -0,0 +1,11 @@
1
+ [Unit]
2
+ Description=Run Ultimate Pi Graphify knowledge-base updater daily on a bounded schedule
3
+
4
+ [Timer]
5
+ OnCalendar=*-*-* 08:30:00
6
+ RandomizedDelaySec=30m
7
+ Persistent=true
8
+ Unit=graphify-kb-updater.service
9
+
10
+ [Install]
11
+ WantedBy=timers.target
@@ -13,7 +13,8 @@ ultimate-pi needs a stable governance model for agentic runs: plan-before-mutate
13
13
  2. Phases are `plan → execute → evaluate → adversary → merge` with policy-gate as the source of truth.
14
14
  3. Local JSONL under `.pi/harness/runs/` is the **source of truth** for run history; PostHog is for team dashboards.
15
15
  4. Context for harness paths uses **context-mode only** — never lean-ctx in harness skills or extensions.
16
- 5. `@posthog/pi` remains the LLM analytics layer; harness domain events use `harness-telemetry.ts`.
16
+ 5. Context-mode execute tools (`ctx_execute`, `ctx_batch_execute`, `ctx_execute_file`) are subject to the same phase matrix as `bash`/`write` via policy-gate.
17
+ 6. `@posthog/pi` remains the LLM analytics layer; harness domain events use `harness-telemetry.ts`.
17
18
 
18
19
  ## Consequences
19
20
 
@@ -5,15 +5,16 @@
5
5
 
6
6
  ## Context
7
7
 
8
- Evaluator trust requires both programmatic gates (policy, budget, integrity) and external observation signals (Sentrux MCP).
8
+ Evaluator trust requires both programmatic gates (policy, budget, integrity) and **measured structural actuals** from the Sentrux CLI (Pi sessions use CLI only — no Sentrux MCP in harness).
9
9
 
10
10
  ## Decision
11
11
 
12
12
  1. **Rules file:** `.sentrux/rules.toml` synced from manifest — see [ADR 0009](0009-sentrux-rules-lifecycle.md).
13
- 2. **CLI gate:** `node "$UP_PKG/.pi/scripts/harness-verify.mjs"` fails if `HARNESS_SENTRUX_REQUIRED=true` and no `harness-sentrux-signal` stub/file exists for the run (placeholder until MCP wired). Resolve `$UP_PKG` via [.pi/scripts/README.md](../../../scripts/README.md).
14
- 3. **MCP layer (Q2+):** Evaluator sessions must record at least one Sentrux observation before `harness_eval_verdict` promotion when Sentrux is enabled.
15
- 4. Observations flow through `observation-bus.ts` as `HarnessObservation` envelopes.
16
- 5. PostHog event: `harness_sentrux_signal` with `signal_type` and `score` only no secrets.
13
+ 2. **Run observation:** `/harness-run` writes `artifacts/sentrux-signal.yaml` and appends session custom entry `harness-sentrux-signal` after `sentrux check` + `sentrux gate` (baseline from `sentrux gate --save` before execute).
14
+ 3. **Verify gate:** `harness-verify.mjs` with `HARNESS_SENTRUX_REQUIRED=true` prefers `$HARNESS_RUN_DIR/artifacts/sentrux-signal.yaml`; falls back to `.pi/harness/evals/smoke/sentrux-stub.json` only when no run signal exists (CI smoke / pre-run verify).
15
+ 4. **Evaluator:** `harness/evaluator` in `benchmark` mode reads `sentrux-signal.yaml` and `benchmark-log.yaml` — metrics are inputs, not executor optimization targets.
16
+ 5. Observations flow through `observation-bus.ts` as `HarnessObservation` envelopes when wired.
17
+ 6. PostHog event: `harness_sentrux_signal` with `signal_type` and `score` only — no secrets.
17
18
 
18
19
  ## Consequences
19
20
 
@@ -23,7 +24,7 @@ Evaluator trust requires both programmatic gates (policy, budget, integrity) and
23
24
 
24
25
  ### Negative
25
26
 
26
- - Full MCP integration remains follow-up when Sentrux server is available.
27
+ - Teams must run `/harness-run` (or write `sentrux-signal.yaml`) before promotion verify when stub fallback is insufficient.
27
28
 
28
29
  ## References
29
30
 
@@ -20,7 +20,10 @@ Sentrux enforces architecture via [`.sentrux/rules.toml`](https://sentrux.dev/do
20
20
  - On `agent_end` when harness phase is `plan` or `merge`
21
21
  - `node "$UP_PKG/.pi/scripts/harness-verify.mjs"` fails if manifest hash ≠ last sync (`--check`)
22
22
  7. **Custom rules:** TOML outside the managed block is preserved on sync.
23
- 8. **Skill:** `harness-sentrux-setup` documents bootstrap vs `--force`.
23
+ 8. **Skill:** `harness-sentrux-setup` documents bootstrap vs steward vs sync vs observation.
24
+ 9. **Intent evolution:** `harness/sentrux-steward` proposes JSON Merge Patches via `submit_sentrux_manifest_proposal` → `artifacts/sentrux-manifest-proposal.yaml`, with graphify-first evidence (`graphify-out/GRAPH_REPORT.md`, `graphify query` / `path` / `explain`). Chair applies manifest edits; never silent auto-merge.
25
+ 10. **Material changes:** `add_layer`, `add_boundary`, `split_layer` require `adr_required` + `ask_user` when `human_required`. `tune_constraint` may proceed with sentrux/graphify evidence only when chair agrees.
26
+ 11. **Observation vs intent:** `/harness-run` + `/harness-review` run CLI fitness functions; observation failures → replan/fix. Manifest changes → steward + ADR, not directory-tree guessing.
24
27
 
25
28
  ## Consequences
26
29
 
@@ -36,6 +39,8 @@ Sentrux enforces architecture via [`.sentrux/rules.toml`](https://sentrux.dev/do
36
39
  ## References
37
40
 
38
41
  - ADR 0006 (Sentrux dual layer)
42
+ - `.pi/agents/harness/sentrux-steward.md`, `.pi/prompts/harness-sentrux-steward.md`
43
+ - `.pi/harness/specs/sentrux-manifest-proposal.schema.json`, `sentrux-signal.schema.json`
39
44
  - `.pi/scripts/harness-sentrux-bootstrap.mjs`
40
45
  - `.pi/scripts/sentrux-rules-sync.mjs`
41
46
  - `.agents/skills/harness-sentrux-setup/SKILL.md`
@@ -17,7 +17,7 @@ Manual harness steps required copying `run_id` and `plan-packet.json` paths betw
17
17
  4. **Hook order:** `harness-run-context` `before_agent_start` allocates/reuses `run_id` before `trace-recorder` `agent_start`. Trace writes phase files `trace-<phase>.json` plus rollup `trace.json`.
18
18
  5. PostHog `harness_run_started` at most once per logical `run_id`.
19
19
  6. Short commands: `/harness-run`, `/harness-eval`, etc. without args; recovery via `/harness-run-status`, `/harness-use-run`.
20
- 7. After execute, handoff recommends **`/harness-eval`** in the same session; review commands spawn isolated subagents (see ADR 0032). `active-run.json` still supports cross-session recovery when Pi was closed mid-run.
20
+ 7. After execute, handoff recommends **`/harness-eval`** in the same session; review commands spawn isolated subagents (see ADR 0032). `active-run.json` still supports cross-session recovery when Pi was closed mid-run. On a **new Pi session**, if disk has a non-stale active run but this session has no `harness-run-context` entry yet, show a one-time resume message and live-widget hint pointing at **`/harness-use-run <run-id>`** (no silent auto-bind).
21
21
  8. `hasApprovedPlanSignal` uses user-visible prompt only; execute requires `plan_ready` from disk validation **and** recorded `ask_user` approval (or `harness-plan-approval` entry).
22
22
  9. **Plan-phase writes:** policy-gate allows `write`/`edit` only on canonical `.pi/harness/runs/<run_id>/plan-packet.json` after approval; all other paths stay blocked until execute phase.
23
23
  10. **Approval-before-persist:** agents present the full plan, call `ask_user` (Approve / Request changes / Cancel), then write the packet. `--quick` narrows planning only — it does not skip approval.
@@ -28,9 +28,16 @@ Harness slash prompts duplicated logic already defined in `harness/*` agents. Th
28
28
  - Orchestrator must parse subagent JSON reliably and pass complete spawn context.
29
29
  - Scope enforcement remains prompt-driven for executor until optional path allowlist.
30
30
 
31
+ ## Amendment (2026-05-23)
32
+
33
+ - **`/harness-review`** is the master **post-run** orchestrator (benchmark + verdict + adversary). See ADR 0039.
34
+ - **`/harness-eval`** and **`/harness-critic`** are thin deprecated aliases; do not implement separate pipelines.
35
+ - Post-run artifacts use **`submit_*`** + **`harness_artifact_ready`** per ADR 0037; parent does not parse subprocess JSON into `artifacts/eval-verdict.yaml`.
36
+
31
37
  ## References
32
38
 
33
39
  - `.pi/prompts/harness-*.md`
40
+ - ADR 0039 — post-run review gate
34
41
  - `.pi/agents/harness/*.md`
35
42
  - `vendor/pi-subagents/src/subagents.ts`, `.pi/extensions/lib/harness-subagents-bridge.ts`
36
43
  - `.pi/extensions/lib/harness-subagent-policy.ts`
@@ -9,13 +9,13 @@
9
9
 
10
10
  ## Decision
11
11
 
12
- 1. **Always-on research chain** after parallel scouts:
12
+ 1. **Always-on research chain** after planning context (ADR 0041; **sequential** — WBS before approach):
13
13
  - `harness/planning/decompose` — DeepMind-style problem decomposition (`PlanDecompositionBrief`)
14
- - `harness/planning/hypothesis` — DARWIN hypothesis generation (`PlanHypothesisBrief`)
14
+ - `harness/planning/hypothesis` — DARWIN hypothesis generation (`PlanHypothesisBrief`); spawned only after `artifacts/decomposition.yaml` exists
15
15
  2. **Parent maps hypothesis → PlanPacket** — `plan-packet.schema.json` unchanged; execution gating stable.
16
16
  3. **Review Gate (ADR 0035):** outcome-based debate with `hypothesis-validator` on R1 (blind — task + hypothesis only). Retired `hypothesis-eval` as a separate pre-approval agent.
17
17
  4. **`approve_plan` optional `research_brief`** — rendered in `plan-review.md`; not written to `plan-packet.json`.
18
- 5. **`--quick`** still skips semantic scout only; never skips decompose/hypothesis.
18
+ 5. **`--quick`** still skips semantic coverage in planning context only; never skips decompose/hypothesis.
19
19
 
20
20
  ## Consequences
21
21
 
@@ -13,14 +13,14 @@ ADR 0034–0035 established Darwin research and outcome-based Review Gate debate
13
13
 
14
14
  ## Decision
15
15
 
16
- 1. **Phase 3.5** — After decompose/hypothesis, parent spawns in parallel:
17
- - `harness/planning/implementation-researcher` `PlanImplementationResearchBrief` `artifacts/implementation-research.yaml`
18
- - `harness/planning/stack-researcher` `PlanStackBrief` `artifacts/stack.yaml`
16
+ 1. **Phase 3.5** — After decompose/hypothesis, parent produces (subprocess optional):
17
+ - `artifacts/implementation-research.yaml` (`PlanImplementationResearchBrief`) inline and/or `implementation-researcher`
18
+ - `artifacts/stack.yaml` (`PlanStackBrief`) inline and/or `stack-researcher`
19
19
  2. Research stays **outside** debate; debate agents cite artifacts, no web tools.
20
- 3. **Phase 4d** — `harness_plan_debate_eligibility` (pre-debate only) selects `full | standard | light` and `required_focuses`; persisted on messenger + bus at `harness_debate_open`.
20
+ 3. **Phase 4d** — `harness_plan_debate_eligibility` (pre-debate only) selects `full | standard | light | fast` and `required_focuses`; persisted on messenger + bus at `harness_debate_open`.
21
21
  4. **Light profile** — `spec` + `quality` only, `min_focus_rounds=2`, reduced global cap; gate uses stored `required_focuses` (not hardcoded four).
22
22
  5. **Sprint auditor** — shared `lanesForRound(roundIndex, focus)` spawns sprint lane when `focus === quality` OR `roundIndex >= 4`.
23
- 6. **`--quick`** still skips semantic scout only; never skips Phase 3.5 or debate.
23
+ 6. **`--quick`** still skips semantic coverage in planning context only; never skips Phase 3.5 artifacts (med/high risk) or debate.
24
24
 
25
25
  ## Profiles
26
26
 
@@ -29,6 +29,9 @@ ADR 0034–0035 established Darwin research and outcome-based Review Gate debate
29
29
  | full | high risk, material fork, open implementation questions, DAG manual patch, many tensions | all four | 4 |
30
30
  | standard | default (ambiguous → standard) | all four | 4 |
31
31
  | light | low risk, no fork, high-confidence implementation + clear stack primary | spec, quality | 2 |
32
+ | fast | med/low, clear stack, no open questions | spec, quality | 1 (consolidated `review_gate_mode`) |
33
+
34
+ See [practice-map.md](../practice-map.md) and [ADR 0040](0040-practice-grounded-orchestration.md).
32
35
 
33
36
  ## Consequences
34
37
 
@@ -0,0 +1,47 @@
1
+ # ADR 0039: Post-run review gate (`/harness-review`)
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ Post-run flow split across `/harness-eval`, a thin `/harness-review` (verdict-only), and `/harness-critic`. Cross-session resume left `owner_pi_session_id` on the plan session, blocking parent orchestration. Status routing used session handoff strings instead of canonical `artifacts/eval-verdict.yaml`. Prompts still instructed parent JSON parsing and `write` to eval artifacts (ADR 0037 violation).
9
+
10
+ ## Decision
11
+
12
+ 1. **`/harness-review`** is the **master post-run orchestrator** (plan-grade): deterministic gates → benchmark evaluator → policy verdict → adversary (parallel with verdict when precheck allows) → optional tie-breaker → **`artifacts/review-outcome.yaml`**. Always complete review before replan; blocked execute routes here, not `/harness-plan`. `--quick` skips adversary and tie-breaker. Steer attempts 2+ may use **lite** review (benchmark + verdict; skip adversary unless prior `block_merge`).
13
+ 2. **`/harness-eval`** and **`/harness-critic`** are **deprecated aliases** that forward to `/harness-review` in the same turn.
14
+ 3. **Ownership:** `/harness-use-run --claim` and auto-claim on post-run commands (unless `--readonly`) set `owner_pi_session_id` and `pi_session_id` to the current Pi session.
15
+ 4. **Disk truth:** `resolveCompletionStatuses` reads `artifacts/eval-verdict.yaml` and `artifacts/adversary-report.yaml` for `nextStepAfterOutcome` and widget next steps. Persisted `next_recommended_command` on `run-context.yaml` wins when set.
16
+ 5. **Artifacts:** Evaluator uses `submit_eval_verdict`; adversary uses `submit_adversary_report`. Parent gates with `harness_artifact_ready` only. Parent may write `artifacts/benchmark-log.yaml` via `write_harness_yaml`; parent must not write eval/adversary verdict YAML.
17
+ 6. **Rollback:** `submit_executor_handoff` mirrors `rollback_refs` to `artifacts/executor-rollback.yaml` (no `artifacts/*.json`).
18
+
19
+ ## Phases (orchestrator)
20
+
21
+ | Phase | Actor | Output |
22
+ |-------|--------|--------|
23
+ | 0 | Parent | Parse args; claim run; require execute complete |
24
+ | 1 | Parent | `harness-verify.mjs`; optional `benchmark-log.yaml` |
25
+ | 2 | `harness/evaluator` benchmark | `eval-verdict.yaml` |
26
+ | 2b | Parent | Record benchmark fail in review-outcome; continue to verdict unless harness-verify hard-stops |
27
+ | 3 | `harness/evaluator` verdict | `eval-verdict.yaml` (policy) |
28
+ | 4 | `harness/adversary` | `adversary-report.yaml` |
29
+ | 5 | `harness/tie-breaker` | conditional |
30
+
31
+ ## Consequences
32
+
33
+ ### Positive
34
+
35
+ - One command after `/harness-run`; same-session and cross-session resume with `--claim`.
36
+ - Widget and run context align with on-disk verdicts.
37
+
38
+ ### Negative
39
+
40
+ - Full post-run pipeline latency is largely sequential in one command — only the adversary may overlap with the verdict when precheck allows (acceptable vs broken multi-session flow).
41
+
42
+ ## References
43
+
44
+ - ADR 0032 (amended), ADR 0037
45
+ - `.pi/prompts/harness-review.md`
46
+ - `.pi/lib/harness-run-context.ts` (`claimRunOwnership`, `resolveCompletionStatuses`)
47
+ - `.agents/skills/harness-review/SKILL.md`
@@ -0,0 +1,40 @@
1
+ # ADR 0040: Practice-grounded orchestration and team topology
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ Harness commands (`/harness-plan`, `/harness-run`, `/harness-review`) already followed structured planning, generator–evaluator separation, and outcome-based debate (ADRs 0032–0039). The graphify corpus (PMBOK process groups, Team Topologies, Code Complete inspection, harness engineering, Lean spikes) was not surfaced in prompts—orchestrators could spawn redundant parallel thinkers (e.g. decompose ∥ hypothesis) and debate lanes without clear RACI.
9
+
10
+ ## Decision
11
+
12
+ 1. **Practice map** — [practice-map.md](../practice-map.md) is the source of truth: phase → practice → agent/script → spawn topology, debate RACI, anti-patterns.
13
+ 2. **Planning sequence** — After planning context (ADR 0041), **decompose then hypothesis** (sequential invariant). Hypothesis requires `artifacts/decomposition.yaml` (amends ADR 0034). For `low`/`med` risk, a single **plan-synthesizer** spawn may produce decomposition, hypothesis, and `execution_plan` in one pass, but those artifacts must still land on disk before blind validation (ADR 0042)—sequential **invariant**, not necessarily three parent spawn batches.
14
+ 3. **Reconnaissance dedup** — `decompose` must not run `graphify query` when `artifacts/planning-context.yaml` has `coverage.architecture.status: ok` (legacy: `scout-graphify.yaml` with `status: ok`).
15
+ 4. **Team topology rules** — Documented in practice-map and orchestration skills:
16
+ - Parallel only for independent merges (implementation ∥ stack; optional legacy scouts ≤3).
17
+ - Max 2 research lanes, 1 optional `planning-context` subagent, 1 executor, 1 debate agent per `subagent` batch.
18
+ - Debate: parent is chair; one agent per batch; Fagan-style roles (inspector, red team, DoD auditor, blind verifier, recorder).
19
+ 5. **Command prompts** — Name the proven practice per phase; link practice-map.
20
+ 6. **Profiles** — `fast` consolidated Review Gate documented alongside `light` threaded gate (ADR 0036 amended).
21
+
22
+ ## Consequences
23
+
24
+ ### Positive
25
+
26
+ - Every harness phase traceable to corpus-backed practice.
27
+ - Fewer detached hypotheses and duplicate graphify work (strengthened by ADR 0041 planning-context artifact).
28
+ - Clearer debate roster; smaller teams on low-risk plans via `fast`/`light`.
29
+
30
+ ### Negative
31
+
32
+ - Slightly longer plan phase wall-clock (sequential decompose → hypothesis).
33
+ - More documentation for agents to reference.
34
+
35
+ ## References
36
+
37
+ - [practice-map.md](../practice-map.md)
38
+ - ADR 0034, ADR 0036, ADR 0039
39
+ - `.pi/prompts/harness-plan.md`, `.pi/prompts/harness-run.md`, `.pi/prompts/harness-review.md`
40
+ - `graphify-out/GRAPH_REPORT.md` — Planning / Executing / Monitoring communities, Team Topologies, Harness Engineering
@@ -0,0 +1,39 @@
1
+ # ADR 0041: Intelligent planning reconnaissance (tools over tool-scouts)
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ ADR 0033 and 0040 mandated three parallel planning scouts (`scout-graphify`, `scout-structure`, `scout-semantic`), each bound to one tool family. That enforced coverage but constrained orchestrator intelligence: the parent always paid for three subprocesses even when one tool pass or a short graphify query sufficed.
9
+
10
+ The graphify corpus (Superpowers: *Rigid Where It Matters, Flexible Where It Doesn't*; context engineering: *Context > Model Intelligence*) supports hard gates on **artifacts and phase order**, not on **how many subprocesses** gather context.
11
+
12
+ ## Decision
13
+
14
+ 1. **Phase 1 default** — Parent compiles `artifacts/planning-context.yaml` using repo tools (`graphify`, `sg`, `ccc`, reads) per task need. No mandatory scout subprocess batch.
15
+ 2. **Artifact contract** — `plan-planning-context.schema.json` requires `coverage.architecture` and `coverage.structure` at `ok` or `partial`; `coverage.semantic` may be `skipped` when `--quick`.
16
+ 3. **Optional subprocess** — At most one `harness/planning/planning-context` subagent when isolation warrants; `submit_planning_context` writes the canonical artifact.
17
+ 4. **Legacy compat (one release)** — `scout-*.yaml` trio still satisfies approval readiness with a deprecation warning; `decompose` dedup reads `planning-context` first.
18
+ 5. **Phase 3.5** — Requires `implementation-research.yaml` and `stack.yaml` for med/high risk; subprocess researchers optional (parent may spike inline).
19
+ 6. **Spawn topology** — Remove default parallel scout batch rules; keep the decompose → hypothesis and debate sequential laws.
20
+
21
+ ## Consequences
22
+
23
+ ### Positive
24
+
25
+ - Orchestrator chooses tools and depth by task; fewer ceremonial subprocesses.
26
+ - Single shared artifact reduces merge friction and redundant graphify in decompose.
27
+ - Hard gates (DAG, debate, approval) unchanged.
28
+
29
+ ### Negative
30
+
31
+ - Parent context window bears more reconnaissance load unless `planning-context` subagent is used.
32
+ - Legacy scout agents remain on disk until removal after deprecation window.
33
+
34
+ ## References
35
+
36
+ - [practice-map.md](../practice-map.md)
37
+ - ADR 0033, ADR 0040
38
+ - `.pi/prompts/harness-plan.md`
39
+ - `plan-planning-context.schema.json`
@@ -0,0 +1,35 @@
1
+ # ADR 0042: Agent-native orchestration
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ Harness commands inherited human PM rituals: serial debate “meetings,” ticket-granularity WBS, and tool calls that re-embed full plan packets in model context. Agents optimize for context window, spawn cost, and verifiable artifacts—not calendar boundaries or social coordination (see [practice-map.md](../practice-map.md)).
9
+
10
+ ## Decision
11
+
12
+ 1. **Agent translation column** — practice-map documents human practice → agent equivalent (scheduler + gates, lake-first plans, path-first tools, steer loop).
13
+ 2. **Boiling lakes** — Fewer `work_items` with richer specs and `context_bundle_path`; `executor_strategy` on PlanPacket (`single_pass` | `per_lake` | `per_work_item`).
14
+ 3. **Plan-verify probes** — For `fast`/`standard` profiles, parallel inspector + adversary probes replace serial “one role per batch” debate where the gate supports `parallel_probes` (ADR 0036 extended).
15
+ 4. **Plan synthesizer** — For `low`/`med` risk, one `harness/planning/plan-synthesizer` pass may replace separate author spawn; **decomposition + hypothesis artifacts still required** on disk for blind validation (ADR 0040 invariant).
16
+ 5. **Path-first tools** — See ADR 0043; disk is source of truth for approval and submit pipelines.
17
+ 6. **Steer loop** — See ADR 0044; always complete post-run review; repair vs plan revise routing.
18
+
19
+ ## Consequences
20
+
21
+ ### Positive
22
+
23
+ - Lower plan/review wall-clock and token use.
24
+ - Plans sized for agent throughput, not sprint ticket count.
25
+
26
+ ### Negative
27
+
28
+ - More ADRs and schema fields for agents to learn.
29
+ - Migration period: optional fat tool args remain for one release.
30
+
31
+ ## References
32
+
33
+ - [practice-map.md](../practice-map.md)
34
+ - ADR 0040, 0041, 0043, 0044
35
+ - `.cursor/plans/agent-native_harness_workflows_1d353489.plan.md` (design source)
@@ -0,0 +1,38 @@
1
+ # ADR 0043: Path-first harness tool contracts
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ `approve_plan`, `create_plan`, and `submit_*` often pass full YAML/JSON documents in tool arguments when the same bytes already exist under `.pi/harness/runs/<run_id>/`. That duplicates tokens and trains models to carry large structs in chat history.
9
+
10
+ ## Decision
11
+
12
+ 1. **`approve_plan`** — Primary API: `approve_plan({ human_summary?: string })`. Extension loads `plan_packet` from `runCtx.plan_packet_path` and `research-brief.yaml` from the run dir. Optional `plan_packet` / `research_brief` arguments are deprecated and remain accepted for one release.
13
+ 2. **`create_plan`** — Primary API: `create_plan()` or `create_plan({ plan_packet_path?: string })`. Verifies approval marker and optional content hash from approve time.
14
+ 3. **`submit_*`** — Accept `source_path` under the active run; read, validate, promote to canonical path. `document` remains optional (deprecated).
15
+ 4. **`merge_harness_yaml`** — Parent merges patches from artifact paths without pasting bodies into tool args.
16
+ 5. **Tool results** — Return `{ path, sha256, status }` (and ids where relevant), not full documents.
17
+
18
+ ## Safety
19
+
20
+ - Draft/canonical packet must exist on disk before approve.
21
+ - Re-`approve_plan` required when `execution_plan` or `acceptance_checks` change after a `plan_gap` revise (hash gate).
22
+
23
+ ## Consequences
24
+
25
+ ### Positive
26
+
27
+ - Approval turns stay small in session history.
28
+ - Subagents write once to disk; submit is O(path) tokens.
29
+
30
+ ### Negative
31
+
32
+ - Agents must write drafts before approve/submit (explicit discipline).
33
+
34
+ ## References
35
+
36
+ - `.pi/extensions/harness-plan-approval.ts`
37
+ - `.pi/extensions/lib/harness-subagent-submit-pipeline.ts`
38
+ - ADR 0042, 0044
@@ -0,0 +1,36 @@
1
+ # ADR 0044: Harness steer loop (post-run repair)
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-23
5
+
6
+ ## Context
7
+
8
+ After `/harness-run`, failed benchmarks or blocked execution previously routed users to `/harness-plan "<new task>"` even when the approved plan was still valid—high friction and duplicate planning context.
9
+
10
+ ## Decision
11
+
12
+ 1. **Always review** — `/harness-run` ends with `next_command: /harness-review` (including `blocked` / partial work). Remove benchmark fail-fast skip of verdict/adversary (ADR 0039 amended).
13
+ 2. **Review artifacts** — Parent writes `artifacts/review-outcome.yaml` and `artifacts/repair-brief.yaml` (path pointers, not pasted bodies).
14
+ 3. **Remediation routing** — `review-outcome.remediation_class`: `implementation_gap` → `/harness-steer`; `plan_gap` → `/harness-plan` revise with `repair_brief_path`; `pass` → policy status. **Review outcome wins** over executor `scope_drift` when they disagree; tie → `plan_gap`.
15
+ 4. **`/harness-steer`** — Thin orchestrator: read briefs, set policy **phase `execute`**, spawn `harness/executor` with `mode: repair`, then `/harness-review` again.
16
+ 5. **Caps** — `HARNESS_STEER_MAX_ATTEMPTS` (default 3). **Tiered review:** full review on initial run + steer 1; steers 2+ use lite review (benchmark + verdict) unless there was a prior `block_merge` or the user forces a full review.
17
+ 6. **Sentrux** — Refresh baseline or compare new violations only after steer mutations (avoid false degraded on every attempt).
18
+ 7. **Evaluate-phase writes** — Orchestrator may write review/steer YAML under run `artifacts/` in `evaluate`/`adversary` phase (allowlisted files).
19
+
20
+ ## Consequences
21
+
22
+ ### Positive
23
+
24
+ - One `approve_plan`; many repair cycles without re-typing tasks.
25
+ - `harness-auto` can loop until pass or cap.
26
+
27
+ ### Negative
28
+
29
+ - Higher review cost on failed runs (mitigated by tiered review).
30
+
31
+ ## References
32
+
33
+ - `.pi/prompts/harness-steer.md`
34
+ - `.pi/harness/specs/review-outcome.schema.json`, `repair-brief.schema.json`
35
+ - `nextStepAfterOutcome` in `.pi/lib/harness-run-context.ts`
36
+ - ADR 0039 (amended), 0043
@@ -24,6 +24,16 @@ Team-shared ADRs for the ultimate-pi harness live under `.pi/harness/docs/adrs/`
24
24
  | [0036](0036-implementation-research-and-selective-debate.md) | Implementation research and selective debate | Accepted |
25
25
  | [0037](0037-subagent-submit-tools.md) | Subagent submit tools (subprocess extension) | Accepted |
26
26
  | [0038](0038-budget-telemetry-only.md) | Budget caps telemetry-only by default | Accepted |
27
+ | [0039](0039-harness-post-run-review-gate.md) | `/harness-review` master post-run gate | Accepted |
28
+ | [0040](0040-practice-grounded-orchestration.md) | Practice-grounded orchestration & team topology | Accepted |
29
+ | [0041](0041-intelligent-planning-reconnaissance.md) | Intelligent planning reconnaissance (tools over tool-scouts) | Accepted |
30
+ | [0042](0042-agent-native-orchestration.md) | Agent-native orchestration (lakes, plan-verify probes, synthesizer) | Accepted |
31
+ | [0043](0043-path-first-harness-tools.md) | Path-first harness tool contracts | Accepted |
32
+ | [0044](0044-harness-steer-loop.md) | Post-run steer loop (repair vs plan revise) | Accepted |
33
+
34
+ ## Practice map
35
+
36
+ Phase-to-practice mapping for slash commands: [practice-map.md](../practice-map.md).
27
37
 
28
38
  ## Template
29
39