@gajae-code/coding-agent 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/CHANGELOG.md +36 -0
  2. package/README.md +1 -1
  3. package/dist/types/async/job-manager.d.ts +26 -0
  4. package/dist/types/cli/args.d.ts +1 -0
  5. package/dist/types/cli/list-models.d.ts +6 -0
  6. package/dist/types/cli/setup-cli.d.ts +8 -1
  7. package/dist/types/commands/gc.d.ts +26 -0
  8. package/dist/types/commands/setup.d.ts +7 -0
  9. package/dist/types/config/file-lock-gc.d.ts +5 -0
  10. package/dist/types/config/file-lock.d.ts +29 -0
  11. package/dist/types/config/model-registry.d.ts +4 -0
  12. package/dist/types/config/models-config-schema.d.ts +5 -0
  13. package/dist/types/config/settings-schema.d.ts +62 -0
  14. package/dist/types/coordinator/contract.d.ts +1 -1
  15. package/dist/types/defaults/gjc/extensions/grok-build/index.d.ts +1 -0
  16. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/index.d.ts +1 -0
  17. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/models/catalog.d.ts +25 -0
  18. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/payload/sanitize.d.ts +27 -0
  19. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/provider/billing.d.ts +8 -0
  20. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/provider/register.d.ts +5 -0
  21. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/provider/stream.d.ts +10 -0
  22. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/provider/usage.d.ts +2 -0
  23. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/shared/base-url.d.ts +2 -0
  24. package/dist/types/defaults/gjc/extensions/grok-cli-vendor/src/shared/errors.d.ts +38 -0
  25. package/dist/types/defaults/gjc-grok-cli.d.ts +5 -0
  26. package/dist/types/extensibility/extensions/index.d.ts +1 -0
  27. package/dist/types/extensibility/extensions/prefix-command-bridge.d.ts +35 -0
  28. package/dist/types/gjc-runtime/deep-interview-recorder.d.ts +103 -0
  29. package/dist/types/gjc-runtime/deep-interview-runtime.d.ts +2 -0
  30. package/dist/types/gjc-runtime/deep-interview-state.d.ts +112 -0
  31. package/dist/types/gjc-runtime/gc-render.d.ts +6 -0
  32. package/dist/types/gjc-runtime/gc-runtime.d.ts +134 -0
  33. package/dist/types/gjc-runtime/ledger-event-renderer.d.ts +68 -0
  34. package/dist/types/gjc-runtime/state-writer.d.ts +64 -2
  35. package/dist/types/gjc-runtime/team-gc.d.ts +7 -0
  36. package/dist/types/gjc-runtime/team-runtime.d.ts +5 -0
  37. package/dist/types/gjc-runtime/tmux-common.d.ts +11 -0
  38. package/dist/types/gjc-runtime/tmux-gc.d.ts +7 -0
  39. package/dist/types/gjc-runtime/tmux-sessions.d.ts +13 -0
  40. package/dist/types/gjc-runtime/ultragoal-guard.d.ts +10 -0
  41. package/dist/types/gjc-runtime/ultragoal-runtime.d.ts +29 -0
  42. package/dist/types/harness-control-plane/gc-adapter.d.ts +3 -0
  43. package/dist/types/harness-control-plane/owner.d.ts +7 -0
  44. package/dist/types/harness-control-plane/storage.d.ts +20 -0
  45. package/dist/types/modes/components/hook-selector.d.ts +7 -1
  46. package/dist/types/modes/components/provider-onboarding-selector.d.ts +1 -1
  47. package/dist/types/modes/controllers/command-controller.d.ts +1 -0
  48. package/dist/types/modes/interactive-mode.d.ts +1 -1
  49. package/dist/types/modes/rpc/rpc-mode.d.ts +72 -2
  50. package/dist/types/modes/shared/agent-wire/deep-interview-gate.d.ts +13 -0
  51. package/dist/types/modes/shared/agent-wire/session-registry.d.ts +25 -0
  52. package/dist/types/modes/shared/agent-wire/unattended-action-policy.d.ts +2 -0
  53. package/dist/types/modes/shared/agent-wire/unattended-session.d.ts +10 -0
  54. package/dist/types/modes/theme/defaults/index.d.ts +302 -0
  55. package/dist/types/modes/theme/theme.d.ts +1 -0
  56. package/dist/types/modes/types.d.ts +1 -1
  57. package/dist/types/session/agent-session.d.ts +1 -1
  58. package/dist/types/session/blob-store.d.ts +39 -3
  59. package/dist/types/session/history-storage.d.ts +2 -2
  60. package/dist/types/session/session-manager.d.ts +10 -1
  61. package/dist/types/setup/credential-import.d.ts +79 -0
  62. package/dist/types/skill-state/workflow-hud.d.ts +14 -0
  63. package/dist/types/task/executor.d.ts +1 -0
  64. package/dist/types/task/render.d.ts +1 -1
  65. package/dist/types/tools/ask.d.ts +15 -1
  66. package/dist/types/tools/subagent-render.d.ts +7 -1
  67. package/dist/types/tools/subagent.d.ts +27 -0
  68. package/dist/types/tools/ultragoal-ask-guard.d.ts +5 -0
  69. package/dist/types/web/search/index.d.ts +4 -4
  70. package/dist/types/web/search/provider.d.ts +16 -20
  71. package/dist/types/web/search/providers/base.d.ts +2 -1
  72. package/dist/types/web/search/providers/openai-compatible.d.ts +9 -0
  73. package/dist/types/web/search/types.d.ts +14 -2
  74. package/package.json +7 -7
  75. package/scripts/build-binary.ts +7 -0
  76. package/src/async/job-manager.ts +52 -0
  77. package/src/cli/args.ts +5 -0
  78. package/src/cli/auth-broker-cli.ts +1 -0
  79. package/src/cli/fast-help.ts +2 -0
  80. package/src/cli/list-models.ts +13 -1
  81. package/src/cli/setup-cli.ts +138 -3
  82. package/src/cli.ts +1 -0
  83. package/src/commands/gc.ts +22 -0
  84. package/src/commands/harness.ts +7 -3
  85. package/src/commands/setup.ts +5 -1
  86. package/src/commands/ultragoal.ts +3 -1
  87. package/src/config/file-lock-gc.ts +193 -0
  88. package/src/config/file-lock.ts +66 -10
  89. package/src/config/model-profile-activation.ts +15 -3
  90. package/src/config/model-profiles.ts +39 -30
  91. package/src/config/model-registry.ts +21 -1
  92. package/src/config/models-config-schema.ts +1 -0
  93. package/src/config/settings-schema.ts +62 -0
  94. package/src/coordinator/contract.ts +1 -0
  95. package/src/coordinator-mcp/server.ts +459 -3
  96. package/src/defaults/gjc/agent.models.grok-cli.yml +36 -0
  97. package/src/defaults/gjc/extensions/grok-build/index.ts +1 -0
  98. package/src/defaults/gjc/extensions/grok-build/package.json +7 -0
  99. package/src/defaults/gjc/extensions/grok-cli-vendor/biome.json +39 -0
  100. package/src/defaults/gjc/extensions/grok-cli-vendor/package.json +8 -0
  101. package/src/defaults/gjc/extensions/grok-cli-vendor/src/index.ts +1 -0
  102. package/src/defaults/gjc/extensions/grok-cli-vendor/src/models/catalog.ts +155 -0
  103. package/src/defaults/gjc/extensions/grok-cli-vendor/src/payload/sanitize.ts +361 -0
  104. package/src/defaults/gjc/extensions/grok-cli-vendor/src/provider/billing.ts +57 -0
  105. package/src/defaults/gjc/extensions/grok-cli-vendor/src/provider/register.ts +99 -0
  106. package/src/defaults/gjc/extensions/grok-cli-vendor/src/provider/stream.ts +50 -0
  107. package/src/defaults/gjc/extensions/grok-cli-vendor/src/provider/usage.ts +56 -0
  108. package/src/defaults/gjc/extensions/grok-cli-vendor/src/shared/base-url.ts +36 -0
  109. package/src/defaults/gjc/extensions/grok-cli-vendor/src/shared/errors.ts +44 -0
  110. package/src/defaults/gjc/skills/deep-interview/SKILL.md +131 -113
  111. package/src/defaults/gjc/skills/deep-interview/lateral-review-panel.md +49 -0
  112. package/src/defaults/gjc/skills/ultragoal/SKILL.md +30 -8
  113. package/src/defaults/gjc-defaults.ts +7 -0
  114. package/src/defaults/gjc-grok-cli.ts +22 -0
  115. package/src/extensibility/extensions/index.ts +1 -0
  116. package/src/extensibility/extensions/prefix-command-bridge.ts +128 -0
  117. package/src/gjc-runtime/deep-interview-recorder.ts +457 -0
  118. package/src/gjc-runtime/deep-interview-runtime.ts +18 -26
  119. package/src/gjc-runtime/deep-interview-state.ts +324 -0
  120. package/src/gjc-runtime/gc-render.ts +70 -0
  121. package/src/gjc-runtime/gc-runtime.ts +403 -0
  122. package/src/gjc-runtime/launch-tmux.ts +3 -4
  123. package/src/gjc-runtime/ledger-event-renderer.ts +164 -0
  124. package/src/gjc-runtime/ralplan-runtime.ts +232 -19
  125. package/src/gjc-runtime/state-renderer.ts +12 -3
  126. package/src/gjc-runtime/state-runtime.ts +48 -30
  127. package/src/gjc-runtime/state-writer.ts +254 -7
  128. package/src/gjc-runtime/team-gc.ts +49 -0
  129. package/src/gjc-runtime/team-runtime.ts +179 -2
  130. package/src/gjc-runtime/tmux-common.ts +14 -0
  131. package/src/gjc-runtime/tmux-gc.ts +177 -0
  132. package/src/gjc-runtime/tmux-sessions.ts +49 -1
  133. package/src/gjc-runtime/ultragoal-guard.ts +155 -0
  134. package/src/gjc-runtime/ultragoal-runtime.ts +1239 -31
  135. package/src/gjc-runtime/workflow-manifest.generated.json +44 -0
  136. package/src/gjc-runtime/workflow-manifest.ts +12 -0
  137. package/src/harness-control-plane/gc-adapter.ts +184 -0
  138. package/src/harness-control-plane/owner.ts +14 -2
  139. package/src/harness-control-plane/rpc-adapter.ts +1 -1
  140. package/src/harness-control-plane/storage.ts +70 -0
  141. package/src/hooks/skill-state.ts +121 -2
  142. package/src/internal-urls/docs-index.generated.ts +22 -12
  143. package/src/lsp/defaults.json +1 -0
  144. package/src/main.ts +18 -3
  145. package/src/modes/acp/acp-agent.ts +4 -2
  146. package/src/modes/bridge/bridge-mode.ts +2 -1
  147. package/src/modes/components/history-search.ts +5 -2
  148. package/src/modes/components/hook-selector.ts +19 -0
  149. package/src/modes/components/model-selector.ts +51 -8
  150. package/src/modes/components/provider-onboarding-selector.ts +6 -1
  151. package/src/modes/components/status-line/segments.ts +1 -1
  152. package/src/modes/controllers/command-controller.ts +25 -6
  153. package/src/modes/controllers/extension-ui-controller.ts +3 -0
  154. package/src/modes/controllers/selector-controller.ts +81 -1
  155. package/src/modes/interactive-mode.ts +11 -1
  156. package/src/modes/rpc/rpc-mode.ts +266 -34
  157. package/src/modes/shared/agent-wire/command-dispatch.ts +281 -261
  158. package/src/modes/shared/agent-wire/deep-interview-gate.ts +30 -1
  159. package/src/modes/shared/agent-wire/host-tool-bridge.ts +3 -0
  160. package/src/modes/shared/agent-wire/session-registry.ts +109 -0
  161. package/src/modes/shared/agent-wire/unattended-action-policy.ts +24 -0
  162. package/src/modes/shared/agent-wire/unattended-run-controller.ts +23 -3
  163. package/src/modes/shared/agent-wire/unattended-session.ts +32 -2
  164. package/src/modes/theme/defaults/claude-code.json +100 -0
  165. package/src/modes/theme/defaults/codex.json +100 -0
  166. package/src/modes/theme/defaults/index.ts +6 -0
  167. package/src/modes/theme/defaults/opencode.json +102 -0
  168. package/src/modes/theme/theme.ts +2 -2
  169. package/src/modes/types.ts +1 -1
  170. package/src/prompts/agents/executor.md +5 -2
  171. package/src/sdk.ts +29 -4
  172. package/src/session/agent-session.ts +99 -19
  173. package/src/session/blob-store.ts +59 -3
  174. package/src/session/history-storage.ts +32 -11
  175. package/src/session/session-manager.ts +72 -20
  176. package/src/setup/credential-import.ts +429 -0
  177. package/src/setup/hermes/templates/operator-instructions.v1.md +7 -1
  178. package/src/skill-state/deep-interview-mutation-guard.ts +2 -1
  179. package/src/skill-state/workflow-hud.ts +106 -10
  180. package/src/slash-commands/builtin-registry.ts +3 -2
  181. package/src/task/executor.ts +16 -1
  182. package/src/task/render.ts +18 -7
  183. package/src/tools/ask.ts +59 -2
  184. package/src/tools/cron.ts +1 -1
  185. package/src/tools/job.ts +3 -2
  186. package/src/tools/monitor.ts +36 -1
  187. package/src/tools/subagent-render.ts +128 -29
  188. package/src/tools/subagent.ts +173 -9
  189. package/src/tools/ultragoal-ask-guard.ts +39 -0
  190. package/src/web/search/index.ts +25 -25
  191. package/src/web/search/provider.ts +178 -87
  192. package/src/web/search/providers/base.ts +2 -1
  193. package/src/web/search/providers/openai-compatible.ts +151 -0
  194. package/src/web/search/types.ts +47 -22
@@ -52,16 +52,19 @@ Inspired by the [Ouroboros project](https://github.com/Q00/ouroboros) which demo
52
52
  - Do not proceed to execution until ambiguity ≤ the resolved threshold for this run and the user explicitly approves a scoped execution path
53
53
  - Allow early exit with a clear warning if ambiguity is still high
54
54
  - Persist interview state for resume across session interruptions
55
- - Challenge agents activate at specific round thresholds to shift perspective
55
+ - A multi-persona lateral-review panel convenes at ambiguity-milestone transitions (and before synthesizing any agent-supplied answer) to expose blind spots from independent perspectives
56
+ - Refine free-text answers into a structured interpretation and confirm nothing is lost before scoring
57
+ - After 3 consecutive agent-resolved answers (accepted auto-research candidates or auto-answers), route the next question to the user (dialectic rhythm guard)
58
+ - Run an independent closure audit and a one-sentence goal restatement, each requiring explicit user confirmation, before crystallizing the spec
56
59
  </Execution_Policy>
57
60
 
58
61
  <Internal_Auto_Mode_Protocol>
59
- - `auto-research-greenfield.md` and `auto-answer-uncertain.md` are internal prompt fragments loaded on demand with bundle metadata `kind: "skill-fragment"`; they are not public skills, are never slash-command/discoverable, and must not be registered through any `skill://` route.
62
+ - `auto-research-greenfield.md`, `auto-answer-uncertain.md`, and `lateral-review-panel.md` are internal prompt fragments loaded on demand with bundle metadata `kind: "skill-fragment"`; they are not public skills, are never slash-command/discoverable, and must not be registered through any `skill://` route.
60
63
  - Load fragments only for the specific hook that needs them, with forked inherited context kept read-only and prompt-budgeted; summarize active interview context before spawning the architect if the payload is large.
61
64
  - Auto-mode architects are read-only: no code edits, no `.gjc/` mutation, no workflow chaining, no formatters, and no execution delegation.
62
65
  - Validate every fragment response before using it: required sections must be present, candidates/answer must match the requested shape, rationale must cite available context, confidence must be explicit, and insufficient-context fallbacks must be honored.
63
66
  - If architect spawn, fragment loading, or response validation fails, continue the normal manual interview path silently and record an internal audit note in state by incrementing `architect_failures`; do not expose tool noise to the user unless it changes the next user-facing question.
64
- - Track `auto_researched_rounds`, `auto_answered_rounds`, and `architect_failures` in state and final spec metadata.
67
+ - Track `auto_researched_rounds`, `auto_answered_rounds`, `lateral_reviews`, `auto_answer_streak`, `refined_rounds`, `architect_failures`, and `lateral_panel_failures` in state and final spec metadata.
65
68
  </Internal_Auto_Mode_Protocol>
66
69
 
67
70
 
@@ -132,6 +135,7 @@ Deep Interview threshold: <resolvedThresholdPercent> (source: <resolvedThreshold
132
135
  "initial_idea": "<prompt-safe initial-context summary or user input>",
133
136
  "initial_context_summary": "<summary if oversized, else null>",
134
137
  "rounds": [],
138
+ "established_facts": [],
135
139
  "current_ambiguity": 1.0,
136
140
  "threshold": <resolvedThreshold>,
137
141
  "threshold_source": "<resolvedThresholdSource>",
@@ -144,10 +148,16 @@ Deep Interview threshold: <resolvedThresholdPercent> (source: <resolvedThreshold
144
148
  "deferrals": [],
145
149
  "last_targeted_component_id": null
146
150
  },
147
- "challenge_modes_used": [],
148
151
  "ontology_snapshots": [],
149
152
  "auto_researched_rounds": [],
150
153
  "auto_answered_rounds": [],
154
+ "lateral_reviews": [],
155
+ "lateral_panel_failures": 0,
156
+ "auto_answer_streak": 0,
157
+ "refined_rounds": [],
158
+ "closure_overrides": [],
159
+ "restated_goal": null,
160
+ "ambiguity_milestone": "initial",
151
161
  "architect_failures": 0
152
162
  }
153
163
  }
@@ -238,7 +248,7 @@ Build the question generation prompt with:
238
248
  - The prompt-safe initial-context summary (if one was created), otherwise the user's original idea
239
249
  - Prior Q&A rounds trimmed or summarized to fit the prompt budget while preserving decisions, constraints, unresolved gaps, and ontology changes
240
250
  - Current clarity scores per dimension (which is weakest?)
241
- - Challenge agent mode (if activated -- see Phase 3)
251
+ - Lateral-review panel findings (if convened this round -- see Phase 3)
242
252
  - Brownfield codebase context (if applicable), summarized to cited paths/symbols/patterns instead of raw dumps
243
253
  - Locked topology from Round 0, including active components, deferred components, prior per-component scores, and `last_targeted_component_id`
244
254
 
@@ -252,7 +262,9 @@ If any prompt input is too large, summarize it first and then continue from the
252
262
  - Generate a question that specifically improves that component's weakest dimension
253
263
  - State, in one sentence before the question, why this component/dimension pair is now the bottleneck to reducing ambiguity
254
264
  - Questions should expose ASSUMPTIONS, not gather feature lists
265
+ - **Facts vs decisions:** answer factual questions (current stack, versions, existing patterns, external API limits) from explore/research and present them as cited confirmations; route every *decision* (goals, scope, tradeoffs, desired behavior for new work) to the user. When unsure whether a question is a fact or a decision, treat it as a decision and ask.
255
266
  - If the scope is still conceptually fuzzy (entities keep shifting, the user is naming symptoms, or the core noun is unstable), switch to an ontology-style question that asks what the thing fundamentally IS before returning to feature/detail questions
267
+ - **Dialectic rhythm guard:** increment `state.auto_answer_streak` when a round is resolved without direct user judgment (an accepted auto-research candidate or an auto-answer); reset it to 0 on any direct, refined, or cited-confirmation answer from the user. If the streak reaches 3, route the next question directly to the user even if it looks auto-answerable, then reset. The interview is with the human, not the codebase.
256
268
 
257
269
  **Question styles by dimension:**
258
270
  | Dimension | Question Style | Example |
@@ -281,18 +293,50 @@ Round {n} | Component: {target_component_name} | Targeting: {weakest_dimension}
281
293
 
282
294
  Options should include contextually relevant choices plus free-text, translated/localized according to `language.instruction` when present.
283
295
 
296
+ When calling `ask`, you SHOULD include optional structured metadata so the runtime can record the round without manual state writes: `deepInterview.round_id?`, `deepInterview.round`, `deepInterview.component`, `deepInterview.dimension`, and `deepInterview.ambiguity`. Keep this metadata aligned with the visible Round/Component/Targeting/Ambiguity line; if metadata cannot be supplied, the legacy formatted question text remains the fallback.
297
+
284
298
  ### Step 2b′: Auto-Answer Opted-Out Questions
285
299
 
286
300
  After the `ask` tool resolves and before ambiguity scoring, if the user opts out of answering the current question or explicitly asks the agent to decide, load `auto-answer-uncertain.md` as an internal `kind: "skill-fragment"` prompt for a fork-context architect. Pass the opted-out question, prompt-safe transcript summary, locked topology, current scores/gaps, and any auto-research candidates used for the round. The architect must return exactly one decisive answer with rationale, confidence, and explicit uncertainty. Validate the response shape before using it; if valid, record it as the tentative answer for scoring, append the round number to `auto_answered_rounds`, and mark the transcript answer as architect-assisted.
287
301
 
288
302
  Auto-answer has a clarity cap: unless the architect confidence is `high` and uncertainty is negligible, no dimension score improved solely by the auto-answer may exceed `0.85`. If the auto-answer would make ambiguity cross the resolved threshold, ask the user for threshold-crossing confirmation before Phase 4: present the tentative assumption and require explicit confirmation, revision, or continued questioning. On architect failure or invalid response, continue with the user's opt-out as an unresolved gap, increment `architect_failures`, and do not block the interview.
289
303
 
304
+ ### Step 2b″: Refine Free-Text Answers
305
+
306
+ When the user's answer is free-text that carries reasoning, constraints, or scope decisions, do not forward it to scoring as a lossy one-line label. First structure it into a compact interpretation using the canonical sections — **Decision**, **Reasoning**, **Constraints (user-stated)**, **Out of scope (user-stated)**, and **Codebase context (verified)** (omit empty sections) — then confirm with exactly one `ask` that nothing is lost or misrepresented. Apply `language.instruction` when present.
307
+
308
+ Offer options such as **Send as-is**, **Add a constraint**, **Mark something out of scope**, **Add context**, and **Rewrite**, plus free-text. If the user picks anything other than "Send as-is", collect the exact missing text with one follow-up `ask` (never infer it from the option label), fold it into the structured interpretation, and re-confirm. Do not advance to scoring while the user is still saying something is missing.
309
+
310
+ Skip Refine for short answers with no attached reasoning (e.g. "Yes" / "No" / a single proper noun), for pre-built option picks where the structure is already explicit, for auto-confirmed code/brownfield facts, and for architect auto-answers (already structured by Step 2b′). A refined answer counts as direct user judgment: record the round in `refined_rounds` and reset `auto_answer_streak` to 0. Feed the confirmed structured interpretation — not the raw free text — into Step 2c scoring and established-facts maintenance.
311
+
290
312
  ### Step 2c: Score Ambiguity
291
313
 
292
314
  After receiving the user's answer, score clarity across all dimensions.
293
315
 
294
316
  If the round used an auto-answer, include the architect answer, rationale, confidence, and uncertainty in the scoring prompt. Apply the Step 2b′ clarity cap mechanically before calculating ambiguity, and treat any low-confidence or insufficient-context auto-answer as an unresolved gap rather than user-confirmed truth.
295
317
 
318
+ Before scoring, compare the new answer against `state.established_facts`. Treat established facts as durable confirmed decisions with source-round evidence; do not score an answer in isolation from facts that the interview has already stabilized.
319
+
320
+ Ambiguity is BIDIRECTIONAL and NON-MONOTONIC. A later answer can increase ambiguity when it invalidates, weakens, or expands prior understanding; convergence is not assumed to be a one-way decrease.
321
+
322
+ Ambiguity-raising triggers:
323
+ - **A direct contradiction**: the answer contradicts an established fact.
324
+ - **B internal inconsistency**: two requirements that cannot co-hold are now present.
325
+ - **C low-quality/evasive**: the answer avoids, hand-waves, or fails to resolve the targeted gap.
326
+ - **D scope expansion**: the answer adds a component, entity, constraint, deliverable, or integration not already covered or explicitly deferred.
327
+
328
+ Use **mechanism A** (score lowering — a scoring mechanism, not trigger A above) for every ambiguity rise, whichever trigger fired: a trigger LOWERS the affected component/dimension clarity score, and the existing weighted formula raises ambiguity. There is **no separate penalty term**; ambiguity remains bounded by the same greenfield/brownfield formula.
329
+
330
+ The rise is SILENT: no modal, no forced-resolution step, and no dedicated conflict UI. Surface it through the normal per-round report and by targeting the next question at the affected component/dimension.
331
+
332
+ Structured scorer output is required. Include `triggers`, `trigger_status`, `affected_component`, `affected_dimension`, `prior_dimension_score`, `new_dimension_score`, `prior_ambiguity`, `new_ambiguity`, `evidence`, `contradicted_established_fact` when relevant, and `disputed_unresolved_rationale` when applicable.
333
+
334
+ Established-facts maintenance: promote stable confirmed decisions into `state.established_facts` with source/evidence; when a new answer contradicts an established fact, mark the fact disputed and preserve the contradicted fact instead of deleting it.
335
+
336
+ TRANSITION VALIDATION: if a trigger is present, the affected dimension must not improve and overall ambiguity must rise vs the prior scored round, unless the trigger is explicitly marked disputed or unresolved with rationale.
337
+
338
+ Convergence Pacing deferral: do not add a min-round floor, score-drop cap, confidence dampening, or other explicit pacing brake. Bidirectional scoring is the pacing mechanism.
339
+
296
340
  **Scoring prompt** (use opus model, temperature 0.1 for consistency):
297
341
 
298
342
  ```
@@ -306,6 +350,9 @@ Transcript or prompt-safe transcript summary:
306
350
  Locked topology:
307
351
  {state.topology.components and state.topology.deferrals}
308
352
 
353
+ Established facts:
354
+ {state.established_facts}
355
+
309
356
  Score each active component on each dimension, then provide the overall dimension scores as the minimum or coverage-weighted weakest score across active components. Deferred components are excluded from ambiguity math but must remain listed in topology and the final spec.
310
357
 
311
358
  Score each dimension:
@@ -324,6 +371,7 @@ Also identify:
324
371
  - weakest_dimension: the single lowest-confidence dimension for that component this round
325
372
  - weakest_dimension_rationale: one sentence explaining why this component/dimension pair is the highest-leverage target for the next question
326
373
  - component_scores: object keyed by component id, with per-dimension scores and gaps
374
+ - structured_scorer_output: object containing triggers, trigger_status, affected_component, affected_dimension, prior_dimension_score, new_dimension_score, prior_ambiguity, new_ambiguity, evidence, contradicted_established_fact when relevant, and disputed_unresolved_rationale when applicable
327
375
 
328
376
  5. Ontology Extraction: Identify all key entities (nouns) discussed in the transcript.
329
377
 
@@ -373,11 +421,12 @@ Round {n} complete.
373
421
  | Constraints | {s} | {w} | {s*w} | {gap or "Clear"} |
374
422
  | Success Criteria | {s} | {w} | {s*w} | {gap or "Clear"} |
375
423
  | Context (brownfield) | {s} | {w} | {s*w} | {gap or "Clear"} |
376
- | **Ambiguity** | | | **{score}%** | |
424
+ | **Ambiguity** | | | **{prior_score}% -> {score}% {up|down|flat}** | {if up: trigger name such as "A direct contradiction"} |
377
425
 
378
426
  **Topology:** Targeted {target_component_name} | Active: {active_component_count} | Deferred: {deferred_component_count} | Next rotation after: {last_targeted_component_id}
379
427
 
380
428
  **Ontology:** {entity_count} entities | Stability: {stability_ratio} | New: {new} | Changed: {changed} | Stable: {stable}
429
+ **Milestone:** {prior_milestone} → {current_milestone}{milestone_transition ? " — lateral panel convened" : ""}
381
430
 
382
431
  **Next target:** {target_component_name} / {weakest_dimension} — {weakest_dimension_rationale}
383
432
 
@@ -389,7 +438,8 @@ Apply `language.instruction` when present before showing this progress report so
389
438
 
390
439
  ### Step 2e: Update State
391
440
 
392
- Update interview state with the new round, global scores, per-component `topology.components[].clarity_scores`, `topology.components[].weakest_dimension`, ontology snapshot, `topology.last_targeted_component_id`, `auto_researched_rounds`, `auto_answered_rounds`, and `architect_failures` via `gjc state write`; never patch `.gjc/state` directly unless an explicit force override is active.
441
+ Update state in two phases. The `ask` answer is first recorded by the runtime as an `answered` shell. Scoring then enriches the same round record to `scored` with global scores, per-component `topology.components[].clarity_scores`, `topology.components[].weakest_dimension`, trigger metadata, established-facts changes, ontology snapshot, `topology.last_targeted_component_id`, `auto_researched_rounds`, `auto_answered_rounds`, and `architect_failures`. When `deepInterview` ask metadata is present, no manual per-round `gjc state write` is required for the answer shell; only scoring enrichment/state maintenance remains. When metadata is absent, use the legacy `gjc state write` path to persist the new round. In either case, never patch `.gjc/state` directly unless an explicit force override is active.
442
+ Also recompute and persist `ambiguity_milestone` each round (detect band transitions for the Phase 3 panel), and persist `auto_answer_streak`, `refined_rounds`, `lateral_reviews`, and `lateral_panel_failures` alongside the existing fields.
393
443
 
394
444
  ### Step 2f: Check Soft Limits
395
445
 
@@ -397,28 +447,43 @@ Update interview state with the new round, global scores, per-component `topolog
397
447
  - **Round 10**: Show soft warning: "We're at 10 rounds. Current ambiguity: {score}%. Continue or proceed with current clarity?"
398
448
  - **Round 20**: Hard cap: "Maximum interview rounds reached. Proceeding with current clarity level ({score}%)."
399
449
 
400
- ## Phase 3: Challenge Agents
450
+ ## Phase 3: Lateral Review Panel (milestone-triggered)
451
+
452
+ The interview convenes a short multi-persona panel at **ambiguity-milestone transitions** instead of at fixed round numbers. Define milestone bands from the round's ambiguity score:
453
+
454
+ | Band | Ambiguity |
455
+ |------|-----------|
456
+ | `initial` | > 0.60 |
457
+ | `progress` | 0.60 ≥ a > 0.30 |
458
+ | `refined` | 0.30 ≥ a > threshold |
459
+ | `ready` | ≤ threshold |
401
460
 
402
- At specific round thresholds, shift the questioning perspective:
461
+ A transition occurs whenever the band changes versus the prior scored round — in either direction, since bidirectional scoring can move the band back up. On a transition, and also before synthesizing any agent-supplied answer (auto-research candidates, an auto-answer, or a code/brownfield auto-confirm that carries real interpretation), convene the panel before generating or asking the next question.
403
462
 
404
- ### Round 4+: Contrarian Mode
405
- Inject into the question generation prompt:
406
- > You are now in CONTRARIAN mode. Your next question should challenge the user's core assumption. Ask "What if the opposite were true?" or "What if this constraint doesn't actually exist?" The goal is to test whether the user's framing is correct or just habitual.
463
+ **Personas (run in parallel, independent context):** dispatch `researcher`, `contrarian`, and `simplifier` as parallel fork-context subagents through the `lateral-review-panel.md` fragment, each with its own copy of the prompt-safe context so no persona anchors on another's framing. Add the `architect` persona when the round changed system shape — scope expansion, a new component or integration (trigger D), or any change to ownership or architecture. Each persona is a read-only architect: no edits, no `.gjc/` mutation, no execution.
407
464
 
408
- ### Round 6+: Simplifier Mode
409
- Inject into the question generation prompt:
410
- > You are now in SIMPLIFIER mode. Your next question should probe whether complexity can be removed. Ask "What's the simplest version that would still be valuable?" or "Which of these constraints are actually necessary vs. assumed?" The goal is to find the minimal viable specification.
465
+ **Folding findings:** validate each persona response, then fold only concrete, user-safe findings into the next single user-facing question — as 2-3 ranked answer options or one recommended draft. The panel never adds a second question, never mutates requirements on its own, and never marks the interview complete. The one-question-per-round rule stays intact.
411
466
 
412
- ### Round 8+: Ontologist Mode (if ambiguity still > 0.3)
413
- Inject into the question generation prompt:
414
- > You are now in ONTOLOGIST mode. The ambiguity is still high after 8 rounds, suggesting we may be addressing symptoms rather than the core problem. The tracked entities so far are: {current_entities_summary from latest ontology snapshot}. Ask "What IS this, really?" or "Looking at these entities, which one is the CORE concept and which are just supporting?" The goal is to find the essence by examining the ontology.
467
+ **Persona lenses:**
468
+ - `researcher` — surfaces external facts, prior art, and unknowns the interview depends on.
469
+ - `contrarian` — challenges the core assumption: "What if the opposite were true? Is this constraint real or habitual?"
470
+ - `simplifier` — probes whether complexity can be removed: "What is the simplest version that is still valuable?"
471
+ - `architect` — checks system shape, ownership, and integration impact when scope changed.
415
472
 
416
- Challenge modes are used ONCE each, then return to normal Socratic questioning. Track which modes have been used in state.
473
+ **Ontology escalation:** if ambiguity stalls (same score ±0.05 for 3 rounds) or stays > 0.30 after 8 rounds, instruct the panel (especially `contrarian` + `architect`) to ask "What IS this, really?" and identify the core entity versus supporting views from the latest ontology snapshot before returning to feature questions.
474
+
475
+ **Bookkeeping:** record each convened panel in `state.lateral_reviews` (round, milestone transition or pre-answer trigger, personas dispatched, findings folded). On panel spawn or validation failure, fall back silently to the normal generated question and increment `lateral_panel_failures`; do not expose tool noise unless it changes the next user-facing question. The panel is a prompt-budgeted assist layer — summarize oversized context before dispatch.
417
476
 
418
477
  ## Phase 4: Crystallize Spec
419
478
 
420
479
  When ambiguity ≤ threshold (or hard cap / early exit):
421
480
 
481
+ **Before generating the spec, two gates must pass, in order:**
482
+
483
+ **4a. Closure / Acceptance Guard.** Even when ambiguity ≤ threshold, do not treat the math as completion. Run an independent readiness audit from the full main-session perspective (including explore findings, established facts, and triggers the scorer may not have fully weighed). Confirm every active topology component has goal/constraint/criteria coverage, no unresolved or disputed trigger remains on a path that matters, and no low-confidence auto-answer is standing in for user-confirmed truth above the clarity cap. If a material gap exists, explicitly announce the override to the user — "The math says ready, but I am not accepting it yet because {gap}" — and ask the single highest-impact follow-up, returning to Phase 2. Record any override in `state.closure_overrides`.
484
+
485
+ **4b. Restate gate.** Once closure passes, collapse the agreed answers into ONE-sentence goal that covers every active component, and confirm it with a single `ask`: "If someone read only this line, would they reach the same outcome you have in mind?" Offer **Yes, crystallize**, **Adjust wording**, and **Missing scope**, plus free-text, applying `language.instruction` when present. On "Adjust wording" / "Missing scope", collect the exact correction with one follow-up `ask`, route it back through Step 2c scoring and established-facts maintenance (a correction can change ambiguity), then re-run closure and ask the Restate gate again. Cap at two loops; if alignment is not reached, return to Phase 2 with a targeted question instead of forcing a goal line. Persist the confirmed line as `state.restated_goal`.
486
+
422
487
  1. **Generate the specification** using opus model with the prompt-safe transcript. If the full interview transcript or initial context is too large, include the summary plus all concrete decisions, acceptance criteria, unresolved gaps, and ontology snapshots; never overflow the prompt with raw oversized context.
423
488
  - Apply `language.instruction` when present so user-facing prose in the spec preserves the session language; keep code identifiers, file paths, commands, JSON/settings keys, and quoted source text unchanged.
424
489
  2. **Write the final spec through the workflow CLI**: persist the artifact at `.gjc/specs/deep-interview-{slug}.md`
@@ -445,6 +510,11 @@ Spec structure:
445
510
  - Auto-Researched Rounds: {auto_researched_rounds}
446
511
  - Auto-Answered Rounds: {auto_answered_rounds}
447
512
  - Architect Failures: {architect_failures}
513
+ - Lateral Reviews: {lateral_reviews count with milestones}
514
+ - Lateral Panel Failures: {lateral_panel_failures}
515
+ - Refined Rounds: {refined_rounds}
516
+ - Closure Overrides: {closure_overrides count, or none}
517
+ - Restated Goal: {restated_goal}
448
518
 
449
519
  ## Clarity Breakdown
450
520
  | Dimension | Score | Weight | Weighted |
@@ -463,6 +533,15 @@ Spec structure:
463
533
  |-----------|--------|-------------|--------------------------|
464
534
  | {component.name} | {active|deferred} | {component.description} | {covered acceptance criteria or deferral reason} |
465
535
 
536
+ ## Established Facts
537
+ {List stable confirmed decisions promoted into `state.established_facts`, including source round, evidence, and disputed status when any fact was contradicted.}
538
+
539
+ ## Trigger Metadata
540
+ {Summarize per-round trigger metadata: trigger label/status, affected component/dimension, prior -> new ambiguity direction, evidence, contradicted established fact when relevant, and disputed/unresolved rationale when applicable.}
541
+
542
+ ## Lateral Review Panel
543
+ {Summarize convened panels: round, milestone transition or pre-answer trigger, personas dispatched, and the concrete findings folded into questions. Note any lateral_panel_failures.}
544
+
466
545
  ## Goal
467
546
  {crystal-clear goal statement derived from interview, covering every active topology component}
468
547
 
@@ -481,6 +560,9 @@ Spec structure:
481
560
  - [ ] {testable criterion 3}
482
561
  - ...
483
562
 
563
+ ## Deferrals
564
+ {List user-confirmed topology deferrals and scoring/pacing deferrals, including Convergence Pacing when applicable: no min-round floor, score-drop cap, or dampening; bidirectional scoring is the pacing mechanism.}
565
+
484
566
  ## Assumptions Exposed & Resolved
485
567
  | Assumption | Challenge | Resolution |
486
568
  |------------|-----------|------------|
@@ -573,7 +655,7 @@ Stage 1: Deep Interview Stage 2: ralplan consensus Stage 3: Separ
573
655
  ┌─────────────────────┐ ┌───────────────────────────┐ ┌──────────────────────┐
574
656
  │ Socratic Q&A │ │ Planner creates plan │ │ User chooses if/how │
575
657
  │ Ambiguity scoring │───>│ Architect reviews │───>│ execution proceeds │
576
- Challenge agents │ │ Critic validates │ │ via ultragoal (default) │
658
+ Lateral panel │ │ Critic validates │ │ via ultragoal (default) │
577
659
  │ Spec crystallization│ │ Loop until consensus │ │ no auto-handoff │
578
660
  │ Gate: ≤<resolvedThresholdPercent> ambiguity│ │ ADR + RALPLAN-DR summary │ │ │
579
661
  └─────────────────────┘ └───────────────────────────┘ └──────────────────────┘
@@ -601,8 +683,9 @@ Skipping any stage is possible but reduces quality assurance:
601
683
  - Use `gjc state write` / `gjc state read` for interview state persistence; the initial and subsequent deep-interview state payloads must include `threshold_source` alongside `threshold`; do not edit `.gjc/state` directly without force override.
602
684
  - Use the GJC workflow CLI to save the final spec at `.gjc/specs/deep-interview-{slug}.md` exactly; do not use `write`, `edit`, or `ast_edit` directly on `.gjc/` paths without force override.
603
685
  - Use public GJC workflow entrypoints to bridge to ralplan, ultragoal, or team only after explicit execution approval — never implement directly. Implementation handoff defaults to ultragoal; reserve team for when tmux-based interactive worker parallelization is genuinely required.
604
- - Challenge agent modes are prompt injections, not separate agent spawns
605
- - Use internal fragment auto-modes only at their documented hooks: `auto-research-greenfield.md` between Step 2a and 2b for greenfield `research: true` questions, and `auto-answer-uncertain.md` as Step 2b′ after `ask` resolves and before scoring.
686
+ - The lateral-review panel spawns read-only persona subagents (Task tool) in parallel with independent context; it is an assist layer, never an executor and never the completion authority
687
+ - Apply the Refine gate (Step 2b″), the Dialectic Rhythm Guard (Step 2a), and the Closure + Restate gates (Phase 4) through the `ask` tool, preserving `language.instruction` for each
688
+ - Use internal fragment auto-modes only at their documented hooks: `auto-research-greenfield.md` between Step 2a and 2b for greenfield `research: true` questions, `auto-answer-uncertain.md` as Step 2b′ after `ask` resolves and before scoring, and `lateral-review-panel.md` for the Phase 3 panel personas at ambiguity-milestone transitions and before synthesizing agent-supplied answers.
606
689
  - Fragment auto-modes are loaded on demand as `kind: "skill-fragment"`; they are not public workflow skills, not slash-command/discoverable, and not `skill://` registrations.
607
690
  </Tool_Usage>
608
691
 
@@ -633,15 +716,15 @@ Why good: Explored first, cited the repo evidence that triggered the question, t
633
716
  </Good>
634
717
 
635
718
  <Good>
636
- Contrarian mode activation:
719
+ Lateral panel — contrarian persona:
637
720
  ```
638
- Round 5 | Contrarian Mode | Ambiguity: 42%
721
+ Round 5 | Targeting: Constraints | Lateral panel: initial→progress (contrarian) | Ambiguity: 42%
639
722
 
640
723
  You've said this needs to support 10,000 concurrent users. What if it only
641
724
  needed to handle 100? Would the architecture change fundamentally, or is
642
725
  the 10K number an assumption rather than a measured requirement?
643
726
  ```
644
- Why good: Challenges a specific assumption (scale requirement) that could dramatically simplify the solution.
727
+ Why good: The lateral panel's contrarian persona challenges a specific assumption (scale requirement) that could dramatically simplify the solution.
645
728
  </Good>
646
729
 
647
730
  <Good>
@@ -659,26 +742,16 @@ Why good: Respects user's desire to stop but transparently shows the risk.
659
742
  </Good>
660
743
 
661
744
  <Good>
662
- Ontology convergence tracking:
663
- ```
664
- Round 3 entities: User, Task, Project (stability: N/A → 67%)
665
- Round 4 entities: User, Task, Project, Tag (stability: 75% — 3 stable, 1 new)
666
- Round 5 entities: User, Task, Project, Tag (stability: 100% — all 4 stable)
667
-
668
- "Ontology has converged — the same 4 entities appeared in 2 consecutive rounds
669
- with no changes. The domain model is stable."
670
- ```
671
- Why good: Shows entity tracking across rounds with visible convergence. Stability ratio increases as the domain model solidifies, giving mathematical evidence that the interview is converging on a stable understanding.
672
- </Good>
673
-
674
- <Good>
675
- Ontology-style question for scope-fuzzy tasks:
745
+ Ontology stabilization — ask, then watch it converge:
676
746
  ```
677
747
  Round 6 | Targeting: Goal Clarity | Why now: the core entity is still unstable across rounds, so feature questions would compound ambiguity | Ambiguity: 38%
678
748
 
679
- "Across the last rounds you've described this as a workflow, an inbox, and a planner. Which one is the core thing this product IS, and which ones are supporting metaphors or views?"
749
+ "Across the last rounds you've described this as a workflow, an inbox, and a planner. Which one is the core thing this product IS, and which are supporting views?"
750
+
751
+ → Round 7 entities: User, Task, Project, Tag (stability: 75% — 3 stable, 1 new)
752
+ → Round 8 entities: User, Task, Project, Tag (stability: 100% — all 4 stable across 2 rounds)
680
753
  ```
681
- Why good: Uses ontology-style questioning to stabilize the core noun before drilling into features, which is the right move when the scope is fuzzy rather than merely incomplete.
754
+ Why good: An ontology-style question stabilizes the core noun before drilling into features; the stability ratio then climbing to 100% across consecutive rounds is the mathematical signal that the domain model has converged.
682
755
  </Good>
683
756
 
684
757
  <Bad>
@@ -690,14 +763,6 @@ Also, what's the deployment target?"
690
763
  Why bad: Four questions at once — causes shallow answers and makes scoring inaccurate.
691
764
  </Bad>
692
765
 
693
- <Bad>
694
- Asking about codebase facts:
695
- ```
696
- "What database does your project use?"
697
- ```
698
- Why bad: Should have spawned explore agent to find this. Never ask the user what the code already tells you.
699
- </Bad>
700
-
701
766
  <Bad>
702
767
  Proceeding despite high ambiguity:
703
768
  ```
@@ -718,29 +783,18 @@ Why bad: 45% ambiguity means nearly half the requirements are unclear. The mathe
718
783
  </Escalation_And_Stop_Conditions>
719
784
 
720
785
  <Final_Checklist>
721
- - [ ] Phase 0 completed before Phase 1: settings files were read, threshold was resolved, and the first user-visible line was `Deep Interview threshold: <resolvedThresholdPercent> (source: <resolvedThresholdSource>)`
722
- - [ ] State includes both `threshold` and `threshold_source`, and the final spec metadata records both values
723
- - [ ] Existing `language` state object was preserved, and `language.instruction` was applied to announcements, topology confirmation, option labels, interview questions, progress reports, and spec prose when present
724
- - [ ] Interview completed (ambiguity threshold OR user chose early exit)
725
- - [ ] Oversized initial context/history was summarized before scoring, question generation, spec generation, or execution handoff
726
- - [ ] Ambiguity score displayed after every round
727
- - [ ] Every round explicitly names the weakest dimension and why it is the next target
728
- - [ ] Challenge agents activated at correct thresholds (round 4, 6, 8)
729
- - [ ] Spec file persisted to `.gjc/specs/deep-interview-{slug}.md` exactly through the GJC workflow CLI; ephemeral artifacts/state used `gjc state write` or workflow CLI writes, with no direct `.gjc/` edits unless force override was explicitly active
730
- - [ ] Spec includes: topology, goal, constraints, acceptance criteria, clarity breakdown, transcript
731
- - [ ] Execution bridge presented via the `ask` tool
732
- - [ ] Selected execution mode invoked via public GJC workflow entrypoint only after explicit execution approval (never direct implementation)
733
- - [ ] If 3-stage pipeline selected: `/skill:ralplan` invoked with the spec as context, then stopped with the consensus plan marked `pending approval` until the user explicitly approves execution
734
- - [ ] State cleaned up after approved workflow handoff
735
- - [ ] Brownfield confirmation questions cite repo evidence (file/path/pattern) before asking the user to decide
736
- - [ ] Scope-fuzzy tasks can trigger ontology-style questioning to stabilize the core entity before feature elaboration
737
- - [ ] Round 0 topology gate completed before ambiguity scoring and persisted `topology.confirmed_at`
738
- - [ ] Per-round ambiguity report includes Topology target/coverage and Ontology row with entity count and stability ratio
739
- - [ ] Multi-component interviews rotate targeting across active components when N > 1
740
- - [ ] Spec includes Topology section with confirmed active components and user-confirmed deferrals
741
- - [ ] Spec includes Ontology (Key Entities) table and Ontology Convergence section
742
- - [ ] Internal auto-mode fragments, when used, were loaded only on demand as non-public `kind: "skill-fragment"` prompts; responses were validated, failures incremented `architect_failures`, and final metadata includes `auto_researched_rounds`, `auto_answered_rounds`, and `architect_failures`
743
- - [ ] Auto-answer threshold crossing, if any, received explicit user confirmation before spec crystallization
786
+ - [ ] Phase 0 ran before anything: threshold resolved and first line emitted as `Deep Interview threshold: <resolvedThresholdPercent> (source: <resolvedThresholdSource>)`; state and spec metadata record both `threshold` and `threshold_source`
787
+ - [ ] `language.instruction` preserved across announcements, questions, options, progress reports, and spec prose when present
788
+ - [ ] Oversized initial context/history summarized before scoring, question generation, spec generation, or handoff
789
+ - [ ] Round 0 topology gate completed before scoring; `topology.confirmed_at` persisted
790
+ - [ ] Ambiguity scored and displayed every round, naming the weakest component/dimension target (rotating across active components when N > 1)
791
+ - [ ] Lateral panel convened at milestone transitions (and before synthesizing agent-supplied answers) with parallel read-only personas
792
+ - [ ] Free-text answers passed the Refine gate; dialectic rhythm guard forced a user question after 3 agent-resolved answers; any auto-answer threshold crossing explicitly confirmed
793
+ - [ ] Closure / Acceptance Guard and the one-sentence Restate gate both passed before crystallization
794
+ - [ ] Interview reached ambiguity threshold OR an explicit early exit with warning
795
+ - [ ] Spec persisted to `.gjc/specs/deep-interview-{slug}.md` exactly via the GJC CLI (no direct `.gjc/` edits without force override), covering every active topology component plus goal/constraints/acceptance criteria/clarity/ontology/transcript
796
+ - [ ] Spec metadata includes the auto/lateral counters (`auto_researched_rounds`, `auto_answered_rounds`, `lateral_reviews`, `refined_rounds`, `architect_failures`, `lateral_panel_failures`)
797
+ - [ ] Execution bridge presented via `ask`; execution invoked only after explicit approval through a public workflow entrypoint (never direct implementation); state cleaned up after handoff
744
798
  </Final_Checklist>
745
799
 
746
800
  <Advanced>
@@ -783,30 +837,7 @@ If the user chooses interview, team routing invokes `/skill:deep-interview`. Whe
783
837
 
784
838
  ## Approval-Gated Pipeline: deep-interview → ralplan → pending approval
785
839
 
786
- The recommended refinement path chains clarity and feasibility gates, then stops for explicit execution approval:
787
-
788
- ```
789
- /skill:deep-interview "vague idea"
790
- → Socratic Q&A until ambiguity ≤ <resolvedThresholdPercent>
791
- → Spec written to .gjc/specs/deep-interview-{slug}.md
792
- → User explicitly selects "Refine with ralplan consensus"
793
- → /skill:ralplan (spec as input)
794
- → Planner creates implementation plan from spec
795
- → Architect reviews for architectural soundness
796
- → Critic validates quality and testability
797
- → Loop until consensus (max 5 iterations)
798
- → Consensus plan written to .gjc/plans/
799
- → Stop with the consensus plan marked pending approval
800
- → Only a separate explicit execution approval may invoke execution (ultragoal by default; team only when tmux-based interactive worker parallelization is required)
801
- ```
802
-
803
- **The ralplan skill receives the spec as context through `/skill:ralplan`** because ralplan is already the GJC Planner → Architect → Critic consensus workflow. The consensus plan includes:
804
- - RALPLAN-DR summary (Principles, Decision Drivers, Options)
805
- - ADR (Decision, Drivers, Alternatives, Why chosen, Consequences)
806
- - Testable acceptance criteria (inherited from deep-interview spec)
807
- - Implementation steps with file references
808
-
809
- **Execution is a separate approval-gated step.** The deep-interview and ralplan skills must not auto-invoke team or ultragoal merely because a spec or plan exists.
840
+ See the Phase 5b "Approval-Gated Refinement Path" diagram for the full flow. In short: interview → spec at `.gjc/specs/deep-interview-{slug}.md` → user selects "Refine with ralplan consensus" → `/skill:ralplan` (Planner/Architect/Critic consensus, plan written to `.gjc/plans/`) → stop at `pending approval`. Execution is always a separate approval-gated step; deep-interview and ralplan never auto-invoke ultragoal or team just because a spec or plan exists.
810
841
 
811
842
  ## Integration with Ralplan Gate
812
843
 
@@ -818,24 +849,11 @@ Vague prompt → ralplan gate → deep-interview (if extremely vague) → ralpla
818
849
 
819
850
  ## Brownfield vs Greenfield Weights
820
851
 
821
- | Dimension | Greenfield | Brownfield |
822
- |-----------|-----------|------------|
823
- | Goal Clarity | 40% | 35% |
824
- | Constraint Clarity | 30% | 25% |
825
- | Success Criteria | 30% | 25% |
826
- | Context Clarity | N/A | 15% |
827
-
828
- Brownfield adds Context Clarity because modifying existing code safely requires understanding the system being changed.
829
-
830
- ## Challenge Agent Modes
852
+ See "Calculate ambiguity" in Step 2c for the weighted formulas. Brownfield adds a 15% Context Clarity dimension (Goal/Constraint/Criteria become 35/25/25) because safely modifying existing code requires understanding the system being changed.
831
853
 
832
- | Mode | Activates | Purpose | Prompt Injection |
833
- |------|-----------|---------|-----------------|
834
- | Contrarian | Round 4+ | Challenge assumptions | "What if the opposite were true?" |
835
- | Simplifier | Round 6+ | Remove complexity | "What's the simplest version?" |
836
- | Ontologist | Round 8+ (if ambiguity > 0.3) | Find essence | "What IS this, really?" |
854
+ ## Lateral Review Panel
837
855
 
838
- Each mode is used exactly once, then normal Socratic questioning resumes. Modes are tracked in state to prevent repetition.
856
+ See Phase 3 for the full persona set (researcher/contrarian/simplifier, plus architect on scope change), the milestone bands, and the parallel independent-context dispatch.
839
857
 
840
858
  ## Ambiguity Score Interpretation
841
859
 
@@ -845,7 +863,7 @@ Each mode is used exactly once, then normal Socratic questioning resumes. Modes
845
863
  | At or below the resolved threshold | Clear enough | Proceed |
846
864
  | Above the resolved threshold with minor gaps | Some gaps | Continue interviewing |
847
865
  | Moderate ambiguity | Significant gaps | Focus on weakest dimensions |
848
- | High ambiguity | Very unclear | May need reframing (Ontologist) |
866
+ | High ambiguity | Very unclear | May need reframing (panel ontology escalation) |
849
867
  | Extreme ambiguity | Almost nothing known | Early stages, keep going |
850
868
  </Advanced>
851
869
 
@@ -0,0 +1,49 @@
1
+ # Deep Interview Lateral Review Panel
2
+
3
+ You are one persona on a read-only architect panel assisting the deep-interview workflow at an ambiguity-milestone transition (or before the workflow synthesizes an agent-supplied answer). You run in parallel with the other personas, each in independent context, so your perspective must be your own — do not assume or anchor on what another persona would say.
4
+
5
+ Your assigned persona is provided in the prompt as `persona` (one of `researcher`, `contrarian`, `simplifier`, `architect`).
6
+
7
+ Inherited context is read-only background. Do not edit code, write files, mutate `.gjc/` state, run formatters, invoke workflow handoffs, or implement anything. Use only inherited context, the prompt-safe initial idea, locked topology, current scores/gaps, established facts, prior decisions, and read-only repo/context inspection if available.
8
+
9
+ Keep the response compact enough to fold back into a single Socratic question.
10
+
11
+ ## Persona lens
12
+
13
+ - `researcher` — surface external facts, prior art, version/compatibility constraints, and unknowns the interview genuinely depends on. Prefer verifiable specifics over speculation.
14
+ - `contrarian` — challenge the core assumption. Ask whether the framing or a stated constraint is real or merely habitual, and name what breaks if the opposite were true.
15
+ - `simplifier` — probe whether complexity can be removed. Name the simplest version that is still valuable and which constraints are necessary versus assumed.
16
+ - `architect` — assess system shape, ownership, and integration impact when scope or architecture changed. Name the highest-risk structural decision still unsettled.
17
+
18
+ ## Task
19
+
20
+ From your assigned persona's lens only, identify the single highest-leverage blind spot or unsettled decision the next question should address, and propose how to resolve it. Stay within the locked topology and confirmed constraints.
21
+
22
+ ## Response Shape
23
+
24
+ Respond with only this JSON object:
25
+
26
+ ```json
27
+ {
28
+ "status": "answered",
29
+ "persona": "researcher|contrarian|simplifier|architect",
30
+ "finding": "One concrete, user-safe blind spot or decision this persona surfaces.",
31
+ "rationale": [
32
+ "Context, repo fact, or confirmed constraint supporting the finding."
33
+ ],
34
+ "suggested_options": [
35
+ "A concise answer option or recommended draft the next single question can offer."
36
+ ],
37
+ "confidence": "high|medium|low"
38
+ }
39
+ ```
40
+
41
+ Rules:
42
+ - `finding` must be non-empty, specific, and must not contradict confirmed user constraints.
43
+ - `rationale` must contain 1-3 bullets citing inherited context, confirmed constraints, or repo facts available in the prompt.
44
+ - `suggested_options` must contain 1-3 entries usable as answer options or a recommended draft for the single next user-facing question.
45
+ - `confidence` must be `high`, `medium`, or `low`.
46
+
47
+ ## Fallback
48
+
49
+ If inherited context is insufficient for a defensible persona finding, do not fabricate one. Return `confidence` `low`, set `finding` to the most important missing piece of context from this persona's lens, and leave `suggested_options` as the single safest clarification to ask the user.
@@ -191,10 +191,10 @@ An ultragoal story cannot be checkpointed `complete` until the active agent has
191
191
  - code-side: maintainability, tests, integration points, and unsafe shortcuts.
192
192
  5. Delegate an `executor` QA/red-team lane to build and run the e2e/red-teaming QA suite appropriate for the story. This lane must try to break the change, not just confirm the happy path. It must start from the approved plan/spec/acceptance criteria, then user-facing contracts, and only then implementation code as supporting evidence. Plan/code mismatches are blockers, not items to paper over with implementation intent.
193
193
  6. The executor QA/red-team lane must prove evidence by the real surface under test:
194
- - GUI/web surfaces require browser automation plus a screenshot or image verdict.
195
- - CLI surfaces require logs or terminal transcripts from real invocation.
196
- - API/package surfaces require external consumer or black-box tests through the public interface.
197
- - Algorithm/math surfaces require boundary, property, adversarial, and failure-mode cases.
194
+ - GUI/web surfaces require a valid automation transcript plus a non-uniform screenshot. Bare `inlineEvidence` text or typed receipts never prove live GUI/web execution.
195
+ - CLI surfaces require runtime argv replay: `replaySafe: true`, an allowlisted argv `command`, and replayed normalized stdout matching `recordedStdout`; unsafe commands require audited `replayExempt` metadata plus a structurally valid fallback artifact.
196
+ - Native/desktop/tui surfaces require a structurally valid screenshot, PTY capture with terminal control codes, or app-automation transcript.
197
+ - API/package/algorithm/math surfaces require a real artifact file or typed receipt. Bare `inlineEvidence` text alone is not sufficient for any surface.
198
198
  7. The executor QA/red-team lane must report a matrix using `executorQa.contractCoverage`, `executorQa.surfaceEvidence`, `executorQa.adversarialCases`, and `executorQa.artifactRefs`. Not-applicable rows are allowed only in `contractCoverage` and `surfaceEvidence`; each `status: "not_applicable"` row requires `contractRef` plus `reason`. `adversarialCases` rows cannot be not-applicable.
199
199
  8. Run a final code review pass and fold it into the strict quality gate. Clean means `architectReview.architectureStatus`, `architectReview.productStatus`, and `architectReview.codeStatus` are all `"CLEAR"`, `architectReview.recommendation` is `"APPROVE"`, executor QA statuses are `"passed"`, iteration is `"passed"` with `fullRerun: true`, every evidence field is non-empty, every required matrix row is present, and every blockers array is empty. `COMMENT`, `WATCH`, `REQUEST CHANGES`, `BLOCK`, missing evidence, missing or shallow matrix rows, plan/code mismatches, or non-empty blockers are non-clean.
200
200
  9. If any lane finds an issue, do **not** checkpoint `complete` and do **not** call `goal({"op":"complete"})`. Record durable blocker work instead:
@@ -204,6 +204,8 @@ An ultragoal story cannot be checkpointed `complete` until the active agent has
204
204
  10. Complete or steer through the blocker story, then rerun the full blocking verification loop. Repeat until all verifier lanes are clean.
205
205
  11. Only after the loop is clean, checkpoint the story as complete with a structured quality gate and a fresh active `goal({"op":"get"})` snapshot. The checkpoint creates a receipt; `goals.json.status` alone is not proof. In aggregate mode, the final aggregate receipt must exist before `goal({"op":"complete"})` is allowed.
206
206
 
207
+ While an Ultragoal run is active, the `ask` tool is blocked for all agents. Record unresolved review decisions as durable blockers with `gjc ultragoal record-review-blockers` instead of prompting interactively.
208
+
207
209
  The native `checkpoint --status complete` command rejects missing or shallow gates. `--quality-gate-json` must include:
208
210
 
209
211
  ```json
@@ -229,13 +231,19 @@ The native `checkpoint --status complete` command rejects missing or shallow gat
229
231
  "id": "browser-run",
230
232
  "kind": "browser-automation",
231
233
  "path": "artifacts/browser-run.json",
232
- "description": "browser automation transcript invoking the approved user-facing flow"
234
+ "description": "valid automation transcript with actions, monotonic timestamps, and selectors"
233
235
  },
234
236
  {
235
237
  "id": "gui-screenshot",
236
238
  "kind": "screenshot",
237
239
  "path": "artifacts/gui-screenshot.png",
238
- "description": "screenshot or image-verdict evidence for the GUI/web result"
240
+ "description": "non-uniform screenshot evidence for the GUI/web result"
241
+ },
242
+ {
243
+ "id": "cli-replay",
244
+ "kind": "command-replay",
245
+ "path": "artifacts/cli-replay.json",
246
+ "description": "artifact file containing argv-only CLI replay JSON: schemaVersion 1, kind cli-replay, replaySafe true, allowlisted command, recordedStdout"
239
247
  },
240
248
  {
241
249
  "id": "adversarial-report",
@@ -265,15 +273,23 @@ The native `checkpoint --status complete` command rejects missing or shallow gat
265
273
  {
266
274
  "id": "surface-gui",
267
275
  "contractRef": "user-facing surface or public interface under test",
268
- "surface": "gui|web|cli|api|package|algorithm|math",
276
+ "surface": "gui|web|cli|api|package|algorithm|math|native|desktop|tui",
269
277
  "invocation": "real browser action, CLI command, API/package consumer call, or algorithm/property check",
270
278
  "verdict": "passed",
271
279
  "artifactRefs": ["browser-run", "gui-screenshot"]
272
280
  },
281
+ {
282
+ "id": "surface-cli",
283
+ "contractRef": "CLI or command-line interface under test",
284
+ "surface": "cli",
285
+ "invocation": "argv replay executed by the Ultragoal runtime",
286
+ "verdict": "passed",
287
+ "artifactRefs": ["cli-replay"]
288
+ },
273
289
  {
274
290
  "id": "surface-out-of-scope",
275
291
  "contractRef": "surface intentionally outside this story",
276
- "surface": "gui|web|cli|api|package|algorithm|math",
292
+ "surface": "gui|web|cli|api|package|algorithm|math|native|desktop|tui",
277
293
  "status": "not_applicable",
278
294
  "reason": "why this surface does not apply to the current story"
279
295
  }
@@ -300,6 +316,12 @@ The native `checkpoint --status complete` command rejects missing or shallow gat
300
316
  }
301
317
  ```
302
318
 
319
+ For CLI replay artifacts, the JSON at `path` must be an object like `{"schemaVersion":1,"kind":"cli-replay","replaySafe":true,"command":["bun","-e","console.log(\"ultragoal-cli-ok\")"],"recordedStdout":"ultragoal-cli-ok\n"}`. Use `replayExempt` only for audited unsafe/non-deterministic invocations, with a substantive reason, approver, and same-surface fallback artifacts.
320
+
321
+ ## Review mode
322
+
323
+ `gjc ultragoal review` runs the same hardened gate against an already implemented PR, branch, or worktree. Use `--pr <number>` for a PR or `--branch <ref>` for a branch diff; omit both to review the current worktree. Pass `--spec <path>` when a real contract exists. `--mode review-only` emits the verdict/findings without creating fix work; `--mode review-start` records review blockers for follow-up. Review mode validates the same `executorQa` shape and live-surface artifacts as `checkpoint --status complete`. A thin or derived-only contract can never produce a clean pass: the verdict is capped at `inconclusive: weak-contract` until a supplied spec or equivalent strong acceptance criteria are available.
324
+
303
325
  Receipts are freshness-scoped:
304
326
  - Per-goal receipts remain fresh for their target goal unless that goal, its blocker metadata, or its supersession metadata changes.
305
327
  - Normal later `goal_started` or clean receipt-backed `goal_checkpointed` events for other goals do not stale older per-goal receipts.