@jterrats/open-orchestra 1.0.10 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/CHANGELOG.md +33 -0
  2. package/dist/automation-evidence.d.ts +1 -1
  3. package/dist/automation-evidence.js +133 -11
  4. package/dist/automation-evidence.js.map +1 -1
  5. package/dist/command-manifest.js +4 -4
  6. package/dist/command-manifest.js.map +1 -1
  7. package/dist/phase-playbooks.js +2 -0
  8. package/dist/phase-playbooks.js.map +1 -1
  9. package/dist/qa-coverage-evidence.d.ts +3 -0
  10. package/dist/qa-coverage-evidence.js +92 -0
  11. package/dist/qa-coverage-evidence.js.map +1 -0
  12. package/dist/qa-coverage-guidance.d.ts +6 -0
  13. package/dist/qa-coverage-guidance.js +141 -0
  14. package/dist/qa-coverage-guidance.js.map +1 -0
  15. package/dist/qa-coverage-rules.d.ts +7 -0
  16. package/dist/qa-coverage-rules.js +127 -0
  17. package/dist/qa-coverage-rules.js.map +1 -0
  18. package/dist/qa-coverage-types.d.ts +47 -0
  19. package/dist/qa-coverage-types.js +2 -0
  20. package/dist/qa-coverage-types.js.map +1 -0
  21. package/dist/qa-coverage.d.ts +2 -20
  22. package/dist/qa-coverage.js +42 -132
  23. package/dist/qa-coverage.js.map +1 -1
  24. package/dist/recoverable-failure-lessons.d.ts +2 -0
  25. package/dist/recoverable-failure-lessons.js +55 -0
  26. package/dist/recoverable-failure-lessons.js.map +1 -0
  27. package/dist/release-readiness.js +3 -1
  28. package/dist/release-readiness.js.map +1 -1
  29. package/dist/roles/qa-ux-roles.js +5 -0
  30. package/dist/roles/qa-ux-roles.js.map +1 -1
  31. package/dist/runtime-adapters.js +1 -1
  32. package/dist/runtime-adapters.js.map +1 -1
  33. package/dist/runtime-completion-validation.d.ts +16 -0
  34. package/dist/runtime-completion-validation.js +206 -0
  35. package/dist/runtime-completion-validation.js.map +1 -0
  36. package/dist/runtime-execution.js +3 -0
  37. package/dist/runtime-execution.js.map +1 -1
  38. package/dist/runtime-lifecycle-watch.js +43 -37
  39. package/dist/runtime-lifecycle-watch.js.map +1 -1
  40. package/dist/runtime-parent-action-dispatch.d.ts +2 -1
  41. package/dist/runtime-parent-action-dispatch.js +94 -12
  42. package/dist/runtime-parent-action-dispatch.js.map +1 -1
  43. package/dist/runtime-spawn-bridge.js +6 -0
  44. package/dist/runtime-spawn-bridge.js.map +1 -1
  45. package/dist/skills-catalog.js +2 -0
  46. package/dist/skills-catalog.js.map +1 -1
  47. package/dist/task-graph-commands.js +21 -14
  48. package/dist/task-graph-commands.js.map +1 -1
  49. package/dist/types/runtime.d.ts +23 -0
  50. package/dist/types/tasks.d.ts +3 -0
  51. package/dist/types.d.ts +1 -1
  52. package/dist/types.js.map +1 -1
  53. package/dist/workflow-evidence-service.js +2 -0
  54. package/dist/workflow-evidence-service.js.map +1 -1
  55. package/dist/workflow-gates.js +6 -0
  56. package/dist/workflow-gates.js.map +1 -1
  57. package/dist/workflow-run-commands.js +104 -12
  58. package/dist/workflow-run-commands.js.map +1 -1
  59. package/dist/workflow-task-service.js +30 -1
  60. package/dist/workflow-task-service.js.map +1 -1
  61. package/docs/audio-video-transcription-skill.md +441 -45
  62. package/docs/autonomous-workflow.md +38 -0
  63. package/docs/backlog/web-code-editor-lsp-spike.md +289 -0
  64. package/docs/claude-adapter-qa-matrix.md +31 -19
  65. package/docs/context-vault.md +80 -0
  66. package/docs/e2e-test-batteries.md +3 -3
  67. package/docs/runtime-adapters.md +28 -18
  68. package/docs/site-manifest.json +1 -0
  69. package/docs/traceability-flow.md +14 -4
  70. package/package.json +2 -2
@@ -0,0 +1,289 @@
1
+ # Spike: Orchestra-Aware Web Code Editor With LSP
2
+
3
+ Backlog Item ID: EPIC-ORCHESTRA-AWARE-CODE-EDITOR
4
+ GitHub Issue: GH-447
5
+ Lead role: Architect
6
+ Supporting roles: UX/UI, Developer, Security, QA
7
+ Status: proposed
8
+
9
+ ## Goal
10
+
11
+ Define the MVP architecture for an embedded web code editor that lets users
12
+ inspect workspace-local files, use standard language intelligence, and perform
13
+ governed edits without losing Orchestra task traceability, policy gates, QA
14
+ evidence, or workspace isolation.
15
+
16
+ This spike is architecture and planning only. It does not implement source
17
+ changes.
18
+
19
+ ## Product Position
20
+
21
+ The editor should not compete with VS Code as a full IDE. Its differentiator is
22
+ workflow-aware editing: active task, backlog item, acceptance criteria, role
23
+ contracts, context-pack suggestions, visible policy gates, task-scoped diffs,
24
+ file timeline, rollback guidance, and quality warnings such as god-file risk,
25
+ missing tests, stale generated docs, and release gate blockers.
26
+
27
+ ## Editor Choice
28
+
29
+ Use CodeMirror 6 for the MVP, with Monaco kept as a later option if parity
30
+ with VS Code's native language features becomes more valuable than bundle
31
+ control and UI composition.
32
+
33
+ CodeMirror 6 benefits: smaller, modular, easier to compose inside the existing
34
+ React/Vite console, more controllable for mobile and constrained panels, and
35
+ well suited to incremental viewer, diagnostics, diff preview, and governed-save
36
+ slices. Costs: LSP support is less turnkey than Monaco and users expecting VS
37
+ Code behavior may notice missing commands.
38
+
39
+ Monaco benefits: closer to VS Code expectations with mature TypeScript and
40
+ JavaScript language behavior, diagnostics, hover, completion, symbols, and
41
+ go-to-definition. Costs: heavier runtime, worker/CSP complexity, harder mobile
42
+ composition, and a stronger pull toward an IDE-like footprint.
43
+
44
+ Adopt CodeMirror 6 for the first implementation slices. Revisit Monaco after
45
+ the read-only viewer, diff integration, and governed write contract are proven.
46
+ The LSP bridge contract should be editor-neutral so Monaco can be introduced
47
+ later without changing policy, workspace, or evidence boundaries.
48
+
49
+ ## Architecture Boundary
50
+
51
+ ```mermaid
52
+ flowchart LR
53
+ ui["Web console editor shell"]
54
+ context["Orchestra context sidebar"]
55
+ api["Editor API contract"]
56
+ policy["Policy gate service"]
57
+ workspace["Workspace file service"]
58
+ git["Git diff and timeline service"]
59
+ lsp["LSP bridge supervisor"]
60
+ server["Language server process"]
61
+ evidence["Evidence and review records"]
62
+
63
+ ui --> api
64
+ context --> api
65
+ api --> policy
66
+ api --> workspace
67
+ api --> git
68
+ api --> lsp
69
+ lsp --> server
70
+ policy --> evidence
71
+ workspace --> evidence
72
+ git --> evidence
73
+ ```
74
+
75
+ The web console must remain a client of stable JSON contracts. The browser does
76
+ not read arbitrary files directly, spawn processes, or decide write policy.
77
+
78
+ ## LSP Bridge Contract
79
+
80
+ The LSP bridge is a local or SaaS-side service boundary that mediates language
81
+ server lifecycle, document sync, diagnostics, and language requests.
82
+
83
+ Required responsibilities:
84
+
85
+ - accept only workspace-relative paths that resolve inside the active workspace;
86
+ - require task id and editor session id for every opened document;
87
+ - start language servers through array-based process APIs, never shell
88
+ interpolation;
89
+ - keep one supervised language-server pool per workspace, language, and tenant
90
+ boundary;
91
+ - enforce startup timeout, idle shutdown, max memory, max files, max document
92
+ size, and bounded request concurrency;
93
+ - redact or suppress diagnostics that expose internal absolute paths or secret
94
+ values;
95
+ - degrade to syntax-only editing when a language server is unavailable;
96
+ - emit lifecycle and diagnostic summaries that can be attached as evidence.
97
+
98
+ Initial target: TypeScript and JavaScript through the project-local TypeScript
99
+ server or a pinned LSP-compatible wrapper. Extension targets are Python
100
+ through pyright, Java through jdtls, .NET through OmniSharp or Roslyn LSP, Apex
101
+ through a Salesforce language server if licensing and setup allow it, and custom
102
+ domain servers through future extension manifests.
103
+
104
+ LSP messages must never be trusted as policy input. They can inform diagnostics
105
+ and navigation, but write permission remains owned by the policy gate service.
106
+
107
+ ## Workspace Isolation
108
+
109
+ All file operations must be scoped to an active workspace root resolved by the
110
+ server.
111
+
112
+ Local mode:
113
+
114
+ - bind local services to `127.0.0.1`;
115
+ - reject absolute paths, traversal, symlinks that escape the workspace, and
116
+ generated or secret-sensitive paths unless explicitly allowlisted;
117
+ - use existing task and evidence state under `.agent-workflow/` as the durable
118
+ traceability source;
119
+ - keep language servers local to the workspace and terminate them when the
120
+ workspace session closes.
121
+
122
+ SaaS mode:
123
+
124
+ - isolate tenants by account, workspace, runtime sandbox, storage prefix, and
125
+ language-server process boundary;
126
+ - enforce data residency before source text or diagnostics cross regions;
127
+ - avoid shared language-server processes across tenants;
128
+ - store only policy-approved evidence summaries unless the user explicitly
129
+ attaches file snippets or diffs;
130
+ - require audit records for open, preview, save request, policy decision, write,
131
+ rollback, and evidence attachment events.
132
+
133
+ ## Governed Edit Contract
134
+
135
+ MVP starts read-only. Edit mode is a deliberate transition with visible task and
136
+ policy state.
137
+
138
+ Open file request fields: `taskId`, `workspaceId`, `relativePath`, `mode`
139
+ (`readOnly` or `editIntent`), and optional `selection`, `contextPackId`, and
140
+ `role`.
141
+
142
+ Save request fields: `taskId`, `workspaceId`, `relativePath`, `baseRevision`,
143
+ `proposedContentHash`, `patch`, `userIntent`, and `policyAcknowledgements`.
144
+
145
+ Save response fields: `decision` (`allowed`, `blocked`, or `needsReview`),
146
+ `diffSummary`, `policyFindings`, `evidenceId`, `rollbackHint`, and
147
+ `nextActions`.
148
+
149
+ Policy checks before write:
150
+
151
+ - task exists, has backlog item, and is active or explicitly selected;
152
+ - path is inside owned write scope for the task;
153
+ - file is not locked by another role or workflow;
154
+ - file is not a release bump file unless the task permits release ownership;
155
+ - generated files require source-of-truth confirmation;
156
+ - security-sensitive paths require Security review;
157
+ - save is based on current file revision or requires conflict resolution;
158
+ - diff preview is acknowledged before write.
159
+
160
+ Every write must create or update a task-scoped diff record and evidence hook.
161
+ The UI should offer rollback guidance based on Git state, but it should not
162
+ perform destructive Git operations without a separate approval flow.
163
+
164
+ ## UX Flow
165
+
166
+ Primary user: a human operator reviewing or making a governed task-scoped edit
167
+ from the local or SaaS web console.
168
+
169
+ MVP flow: user selects an active task; the console shows acceptance criteria,
170
+ owned paths, gates, related files, and evidence gaps; the user opens a related
171
+ file in read-only mode; the editor shows syntax highlighting, diagnostics when
172
+ available, and a task context sidebar; the user requests edit mode; the console
173
+ shows write policy status, locks, and required reviewers; the user edits,
174
+ previews a diff, submits a save request, receives an allowed, blocked, or
175
+ needs-review decision, and gets an evidence link plus next steps.
176
+
177
+ Responsive behavior:
178
+
179
+ - mobile defaults to task context, file list, and read-only code view with
180
+ collapsible diagnostics;
181
+ - tablet uses stacked editor and context panels;
182
+ - desktop uses three regions: file/navigation rail, editor/diff, and Orchestra
183
+ context sidebar;
184
+ - no critical action should depend on horizontal scrolling;
185
+ - keyboard users can open files, search, inspect diagnostics, preview diff,
186
+ request edit mode, and submit or cancel saves.
187
+
188
+ Required states:
189
+
190
+ - loading file and loading diagnostics;
191
+ - empty related files;
192
+ - language server unavailable with syntax-only fallback;
193
+ - policy blocked with clear next action;
194
+ - conflict detected with reload or compare options;
195
+ - save succeeded with evidence link;
196
+ - save failed without exposing stack traces or internal paths.
197
+
198
+ ## Security Constraints
199
+
200
+ Security review is mandatory before implementation because this feature touches
201
+ file paths, process execution, workspace source code, secrets, network calls,
202
+ and future multi-tenant boundaries.
203
+
204
+ Non-negotiables:
205
+
206
+ - no shell interpolation for language server startup or Git operations;
207
+ - validate and normalize every path server-side;
208
+ - never expose stack traces, host paths, environment variables, or raw process
209
+ errors to the browser;
210
+ - scan proposed diffs for configured secret patterns before write;
211
+ - treat workspace files and LSP responses as untrusted input;
212
+ - sanitize markdown, diagnostics, and hover content before rendering;
213
+ - apply a content security policy before enabling Monaco workers or remote
214
+ extension assets;
215
+ - fail closed when policy, path validation, revision checks, secret scan, or
216
+ audit write fails;
217
+ - keep local ports bound to `127.0.0.1` by default;
218
+ - require explicit tenant and data residency controls before SaaS rollout.
219
+
220
+ ## QA Evidence Strategy
221
+
222
+ The spike output is validated by review evidence. Implementation stories need
223
+ observable acceptance evidence.
224
+
225
+ Recommended automated coverage:
226
+
227
+ - unit tests for path normalization, task write-scope policy, save contract,
228
+ revision conflict handling, and LSP lifecycle state transitions;
229
+ - contract tests for editor API request and response shapes;
230
+ - integration tests with fake language-server processes for diagnostics,
231
+ timeout, crash, unavailable server, and idle shutdown;
232
+ - Playwright tests for read-only viewer, responsive layout, keyboard flow,
233
+ policy-blocked save, conflict recovery, and successful evidence link;
234
+ - security tests for traversal, symlink escape, secret diff rejection, unsafe
235
+ diagnostic rendering, and blocked process arguments.
236
+
237
+ Evidence required per implementation story:
238
+
239
+ - exact commands and pass/fail results;
240
+ - AC-to-evidence matrix;
241
+ - screenshots or traces for desktop and mobile editor flows;
242
+ - sample diff/evidence artifact;
243
+ - security review result for file/process/network changes;
244
+ - documented deferred language support when a language server is unavailable.
245
+
246
+ ## Implementation Slices
247
+
248
+ 1. Read-only code viewer and file open contract: CodeMirror 6 surface, syntax
249
+ highlighting, loading/empty/error states, path validation, task association,
250
+ and Playwright smoke.
251
+ 2. Task context sidebar: active task, backlog item, acceptance criteria, roles,
252
+ owned paths, context-pack references, evidence gaps, and gate warnings.
253
+ 3. Git diff and task timeline integration: revision metadata, diff preview,
254
+ task-scoped file timeline, conflict detection, and evidence hook.
255
+ 4. TypeScript/JavaScript LSP bridge MVP: supervised lifecycle, diagnostics,
256
+ hover, completion, go-to-definition, fallback states, and fake LSP tests.
257
+ 5. Governed edit mode: edit intent, patch submission, policy checks, diff
258
+ acknowledgement, save response, blocked UX, and needs-review UX.
259
+ 6. Security hardening: traversal and symlink protection, secret scanning,
260
+ process allowlist, diagnostic sanitization, CSP review, and local binding
261
+ checks.
262
+ 7. SaaS isolation design: tenant sandboxing, process isolation, storage and data
263
+ residency policy, audit logs, quota controls, and abuse controls.
264
+ 8. Language extension framework: server capability manifest, health status,
265
+ per-language setup guidance, and extension points.
266
+
267
+ ## Open Risks
268
+
269
+ - LSP servers execute project-aware code paths and can be expensive or unsafe if
270
+ not tightly supervised.
271
+ - Monaco may be required later if CodeMirror extension quality does not meet
272
+ user expectations for TypeScript-heavy projects.
273
+ - SaaS editing has materially higher tenant isolation, data residency, and audit
274
+ requirements than local mode.
275
+ - Secret detection can produce false negatives and should not be the only
276
+ control before writing.
277
+ - Generated-file editing can violate source-of-truth contracts unless policy
278
+ gates are strict.
279
+
280
+ ## Recommended Next Stories
281
+
282
+ - GH-447-A: Build read-only CodeMirror file viewer with task-scoped open
283
+ contract.
284
+ - GH-447-B: Add Orchestra task context sidebar for editor sessions.
285
+ - GH-447-C: Add Git diff preview and task-scoped file timeline.
286
+ - GH-447-D: Implement TypeScript/JavaScript LSP bridge MVP with fake LSP tests.
287
+ - GH-447-E: Implement governed edit mode with policy-gated saves.
288
+ - GH-447-F: Add editor security hardening and negative test matrix.
289
+ - GH-447-G: Define SaaS tenant isolation and data residency ADR.
@@ -10,9 +10,10 @@ claim real Claude Code native execution or Anthropic/provider API execution.
10
10
  | --- | --- | --- | --- |
11
11
  | #432 / `GH-432-CLAUDE-ADAPTER-CONTRACT` | Claude action eligibility, skip reasons, alias policy, non-regression docs | QA handoff, release handoff, `npm run build`, `node --test test/runtime-adapters.test.js` with 51 passing tests, `git diff --check` | Pass |
12
12
  | #433 / `GH-433-CLAUDE-DISPATCH-BRIDGE` | Dispatch bridge boundary, spawned/active lifecycle recording, idempotency, fallback guidance | QA handoff, release handoff, `npm run build`, `node --test test/runtime-adapters.test.js` with 54 passing tests, `git diff --check` | Pass |
13
- | #434 / `GH-434-CLAUDE-COMPLETION-RECONCILIATION` | Strict completion validation by task, phase, role, runtime, session, and expected artifact | Issue exists and remains open; no local QA handoff found for this slice | Pending |
14
- | #435 / `GH-435-CLAUDE-GATE-PRESERVATION` | Safe workflow resume and human gate preservation regression coverage | Issue exists and remains open; no local QA handoff found for this slice | Deferred |
15
- | #436 / `GH-436-CLAUDE-DOCS-QA-EVIDENCE` | Documentation, QA matrix, release evidence, support-level framing | This document, `docs/runtime-adapters.md`, GH-436 QA handoff | Pending |
13
+ | #434 / `GH-434-CLAUDE-COMPLETION-RECONCILIATION` | Strict completion validation by task, phase, role, runtime, session, and expected artifact | `runtime watch` validation tests in `test/runtime-adapters.test.js`; `npm run build`; `node --test test/runtime-adapters.test.js` with 58 passing tests | Pass |
14
+ | #435 / `GH-435-CLAUDE-GATE-PRESERVATION` | Safe workflow resume and human gate preservation regression coverage | `npm run build`; `node --test test/autonomous-workflow-cli.test.js`; `node --test test/runtime-adapters.test.js` with 59 passing tests | Pass |
15
+ | #436 / `GH-436-CLAUDE-DOCS-QA-EVIDENCE` | Documentation, QA matrix, release evidence, support-level framing | This document, `docs/runtime-adapters.md`, GH-436 QA/release handoffs, and follow-up #434/#435 evidence updates | Pass |
16
+ | #439 / `GH-439-CLAUDE-NATIVE-CALLBACK-BRIDGE` | Native callback bridge contract, fallback truthfulness, lifecycle validation | Local contract tests in `test/runtime-adapters.test.js`; `docs/runtime-adapters.md`; this matrix | In progress |
16
17
 
17
18
  ## Acceptance Criteria Matrix
18
19
 
@@ -28,21 +29,26 @@ claim real Claude Code native execution or Anthropic/provider API execution.
28
29
  | #433 | Repeated dispatch is idempotent and never creates duplicate lifecycle events for the same session. | CLI unit | GH-433 QA handoff | QA reports repeated dispatch keeps one spawned and one active event. | Pass |
29
30
  | #433 | Unavailable or unsupported native tool paths return explicit fallback guidance and manual lifecycle commands. | CLI unit/code review | GH-433 QA handoff | QA reports skipped result includes prompt artifact, expected result artifact, and manual spawned command. | Pass |
30
31
  | #433 | Tests cover successful dispatch, unavailable tool fallback, repeated dispatch idempotency, runtime mismatch, and guardrail rejection. | Automated tests | GH-433 QA handoff; `node --test test/runtime-adapters.test.js` with 54 passing tests | Required scenarios mapped to deterministic tests. | Pass |
31
- | #434 | Completion validation checks task id, phase, role, runtime, session id, and expected result artifact path. | Planned unit/watch tests | GitHub issue #434 | No local implementation or QA evidence reviewed in this slice. | Pending |
32
- | #434 | Wrong-task, wrong-role, wrong-runtime, wrong-session, missing, duplicate, and unsafe-path artifacts are rejected or skipped with explicit reasons. | Planned unit/watch tests | GitHub issue #434 | No local implementation or QA evidence reviewed in this slice. | Pending |
33
- | #434 | `runtime watch` records completed exactly once for a valid spawned or active Claude session. | Planned watch tests | GitHub issue #434 | No local implementation or QA evidence reviewed in this slice. | Pending |
34
- | #434 | Native immediate completion results follow the same validation rules when supported. | Planned contract tests | GitHub issue #434 | Native immediate completion is not claimed as supported by current evidence. | Pending |
35
- | #434 | Tests cover artifact validation, duplicate completion prevention, timeout/stale behavior, and safe path handling. | Planned automated tests | GitHub issue #434 | No local implementation or QA evidence reviewed in this slice. | Pending |
36
- | #435 | Verified completion resumes the paused run to the next safe phase when no human gate is pending. | Planned workflow tests | GitHub issue #435 | No local implementation or QA evidence reviewed in this slice. | Deferred |
37
- | #435 | `po-to-architect`, `qa-to-release`, and configured human gates remain paused until explicit approval. | Planned workflow tests/manual review | GitHub issue #435 | Dedicated regression evidence is still required before release claim. | Deferred |
38
- | #435 | Auto-dispatch never records gate approval or skips a gate. | Planned workflow tests | GitHub issue #435 | Dedicated regression evidence is still required before release claim. | Deferred |
39
- | #435 | Tests cover `gates=none`, `gates=phase`, `gates=all`, multi-phase dispatch until idle, manual fallback recovery, and GH-421 spawn-state messaging. | Planned CLI/workflow tests | GitHub issue #435 | No local implementation or QA evidence reviewed in this slice. | Deferred |
40
- | #435 | Existing Codex, Cursor, generic, VS Code, Windsurf, and OpenCode behavior is unchanged or covered by regression tests. | Planned regression tests | GitHub issue #435 | Broad cross-runtime regression evidence is still required. | Deferred |
32
+ | #434 | Completion validation checks task id, phase, role, runtime, session id, and expected result artifact path. | Watch/contract tests | `test/runtime-adapters.test.js`; `src/runtime-completion-validation.ts` | `runtime watch` validates completion against structured spawn/session metadata before recording completed. | Pass |
33
+ | #434 | Wrong-task, wrong-role, wrong-runtime, wrong-session, missing, duplicate, and unsafe-path artifacts are rejected or skipped with explicit reasons. | Watch/negative tests | `runtime watch rejects Claude completion metadata mismatches`; existing unsafe/missing/duplicate watch coverage | Mismatches are skipped with explicit reasons; unsafe paths do not create completed lifecycle events. | Pass |
34
+ | #434 | `runtime watch` records completed exactly once for a valid spawned or active Claude session. | Watch test | `runtime watch completes a valid Claude session once` | Two watch passes produce one completed lifecycle event for the Claude session. | Pass |
35
+ | #434 | Native immediate completion results follow the same validation rules when supported. | Watch/contract test | `runtime watch validates Claude native immediate completion results`; `src/runtime-completion-validation.ts` | Immediate `completionResult` payloads resolve through the same expected-artifact and artifact-metadata validator. | Pass |
36
+ | #434 | Tests cover artifact validation, duplicate completion prevention, timeout/stale behavior, and safe path handling. | Automated tests | `node --test test/runtime-adapters.test.js` with 58 passing tests | Focused runtime adapter suite covers artifact metadata validation, duplicate ignored reasons, immediate completion payloads, and existing timeout/stale/unsafe-path behavior. | Pass |
37
+ | #435 | Verified completion resumes the paused run to the next safe phase when no human gate is pending. | Runtime lifecycle tests | `runtime watch completes spawned sessions once and auto-resumes workflow`; `runtime lifecycle completion can opt out of auto-resume` | Runtime completion resumes safe non-gated work and keeps opt-out behavior. | Pass |
38
+ | #435 | `po-to-architect`, `qa-to-release`, and configured human gates remain paused until explicit approval. | Workflow CLI tests | `workflow resume holds human gates until explicit approval` | Resume now holds unapproved gates and advances only after `workflow gate-approve`. | Pass |
39
+ | #435 | Auto-dispatch never records gate approval or skips a gate. | Runtime lifecycle tests | `runtime lifecycle auto-resume does not approve human gates` | Runtime lifecycle auto-resume leaves paused work unapproved and emits no `AUTONOMOUS_GATE_APPROVED` event. | Pass |
40
+ | #435 | Tests cover `gates=none`, `gates=phase`, `gates=all`, multi-phase dispatch until idle, manual fallback recovery, and GH-421 spawn-state messaging. | CLI/runtime regression tests | `test/autonomous-workflow-cli.test.js`; `test/runtime-adapters.test.js` | Existing runtime suite covers multi-pass dispatch, queued/pending messaging, manual recovery guidance, and gate modes; #435 adds stricter unapproved-gate hold coverage. | Pass |
41
+ | #435 | Existing Codex, Cursor, generic, VS Code, Windsurf, and OpenCode behavior is unchanged or covered by regression tests. | Runtime regression tests | `test/runtime-adapters.test.js` | Runtime adapter catalog and cross-runtime parent action tests still pass. | Pass |
41
42
  | #436 | Runtime adapter docs document Claude dispatch support, alias policy, fallback behavior, manual recovery commands, guardrails, and gate preservation. | Documentation review | `docs/runtime-adapters.md` | Updated in this slice. | Pass |
42
43
  | #436 | QA matrix maps each GH-422 child story acceptance criterion to unit, workflow, CLI, or manual evidence. | Documentation | This file | Matrix records Pass/Pending/Deferred by criterion and evidence type. | Pass |
43
- | #436 | Release evidence includes exact commands, pass/fail results, unsupported CI/manual verification notes, and unresolved risks. | QA handoff/evidence | GH-436 handoff under `.agent-workflow/handoffs/` | Handoff records commands and recommended validations. | Pending |
44
+ | #436 | Release evidence includes exact commands, pass/fail results, unsupported CI/manual verification notes, and unresolved risks. | QA handoff/evidence | GH-436 handoff under `.agent-workflow/handoffs/`; #434/#435 QA evidence and release-readiness checks | Handoffs and evidence record exact commands, pass/fail results, known unsupported Claude callback/provider claims, and residual real-transport risk. | Pass |
44
45
  | #436 | Documentation does not claim native Claude execution beyond tested behavior. | Documentation review | `docs/runtime-adapters.md`; this file | Docs frame support as parent-runtime contract plus manual/runtime-owned launch. | Pass |
45
- | #436 | Product/release review records go/no-go based on evidence and known limitations. | Review artifact | Pending release review for GH-436 | Needs release/product review after documentation QA. | Pending |
46
+ | #436 | Product/release review records go/no-go based on evidence and known limitations. | Review artifact | `.agent-workflow/reviews/GH-436-CLAUDE-DOCS-QA-EVIDENCE-release_manager-review.md`; #435 release-readiness gate | Release review records go with known limitations; #435 follow-up gate preservation now passes. | Pass |
47
+ | #439 | Claude parent-runtime adapter can hand off only when a supported Claude parent runtime and native callback capability are explicitly verified. | Unit/contract | `runtime parent-actions dispatches eligible Claude requests with stable lifecycle`; `runtime parent-actions returns truthful Claude native fallback outside Claude parent runtime` | Local tests simulate the verified callback contract and verify unsupported parent runtimes skip without lifecycle writes. | In progress |
48
+ | #439 | Adapter captures a native child identifier or verified callback result and records spawned and active without manual lifecycle commands. | Unit/contract | `test/runtime-adapters.test.js` Claude native dispatch test | Simulated verified bridge records one spawned event and one active heartbeat with the supplied native child id. | In progress |
49
+ | #439 | Completion reconciliation validates task id, phase, role, runtime, session id, and expected artifact before completion. | Watch/contract tests | Existing #434 watch tests in `test/runtime-adapters.test.js` | Completion still flows through the expected handoff metadata and safe path validators. | Pass |
50
+ | #439 | Unsupported environments return fallback guidance without claiming native execution. | Unit/contract/docs | `runtime parent-actions returns truthful Claude native fallback outside Claude parent runtime`; `docs/runtime-adapters.md` | Codex/non-Claude context returns skipped fallback guidance and records no spawned lifecycle event. | In progress |
51
+ | #439 | Documentation separates tested local contract, real Claude runtime proof, and unsupported CI/non-Claude contexts. | Documentation | `docs/runtime-adapters.md`; this matrix | Docs label local simulation and unsupported contexts; real Claude runtime proof remains manual QA. | In progress |
46
52
 
47
53
  ## Unsupported Or Deferred Claims
48
54
 
@@ -50,7 +56,13 @@ claim real Claude Code native execution or Anthropic/provider API execution.
50
56
  Agent/Subagent tools from CI or from a non-Claude parent runtime.
51
57
  - No evidence proves direct Anthropic or provider API execution for runtime
52
58
  delegation; runtime-native artifacts keep `directProviderApiAllowed=false`.
53
- - #434 completion reconciliation hardening remains pending.
54
- - #435 workflow resume and human gate preservation regression coverage remains
55
- deferred until its implementation and QA pass.
56
-
59
+ - Native immediate Claude completion is covered when represented as a
60
+ `completionResult` payload; real callback transport remains future adapter
61
+ work.
62
+ - #435 workflow resume and human gate preservation regression coverage now
63
+ passes locally; real Claude Code callback transport remains outside this
64
+ local contract matrix.
65
+ - #439 adds a truthful native callback bridge contract: non-Claude and
66
+ callback-unavailable environments skip without lifecycle writes. Real Claude
67
+ Code Agent/Subagent proof still requires manual QA from a Claude parent
68
+ runtime that exposes the native callback capability.
@@ -0,0 +1,80 @@
1
+ # Context Vault
2
+
3
+ Context vault is the planned workspace catalog for source materials that inform
4
+ agent work but should not be pasted directly into every prompt. It covers
5
+ documents, statements of work, PDFs, diagrams, images, audio/video recordings,
6
+ transcripts, and client reference artifacts.
7
+
8
+ The vault is related to context indexing and transcription, but it has a
9
+ different job: it records provenance, sensitivity, retention, ownership, and
10
+ safe consumption rules for project inputs.
11
+
12
+ ## Goals
13
+
14
+ - Register source artifacts with stable metadata and checksums.
15
+ - Classify sensitivity before an artifact is used by an agent.
16
+ - Convert large or binary inputs into bounded summaries, excerpts, transcripts,
17
+ or context packs.
18
+ - Preserve provenance so BA, PO, Architect, QA, and Release can cite which
19
+ artifact informed a requirement, risk, decision, or validation result.
20
+ - Keep raw sensitive inputs out of runtime prompts unless policy explicitly
21
+ allows a bounded excerpt.
22
+
23
+ ## Artifact Metadata
24
+
25
+ Every vault artifact should record:
26
+
27
+ - artifact id and workspace or tenant id;
28
+ - original source, file name, media type, size, checksum, and registered time;
29
+ - owner role or user;
30
+ - sensitivity classification and retention policy;
31
+ - ingestion status, conversion status, redaction status, and error state;
32
+ - derived outputs such as Markdown conversion, transcript, summary, embeddings,
33
+ or context-pack references.
34
+
35
+ ## Consumption Model
36
+
37
+ Agents should not load raw vault artifacts by default. They should request a
38
+ bounded context pack or artifact summary scoped to the active task, role, phase,
39
+ and token budget.
40
+
41
+ The pack should include:
42
+
43
+ - artifact references and provenance links;
44
+ - selected excerpts with inclusion reasons;
45
+ - redaction decisions and omitted-sensitive-data notes;
46
+ - budget summary and truncation indicators;
47
+ - stale or failed-ingestion warnings.
48
+
49
+ ## Security And Privacy
50
+
51
+ Vault ingestion must be tenant-aware and fail closed when classification or
52
+ redaction is uncertain. Secrets, credentials, health data, payment data, and
53
+ other regulated content require explicit policy handling before they can be
54
+ summarized or exposed to a runtime.
55
+
56
+ External conversion or transcription providers are opt-in. Local conversion and
57
+ local transcription should be preferred when a workspace or tenant requires
58
+ offline handling.
59
+
60
+ ## API Shape
61
+
62
+ The implementation should define APIs before storage details:
63
+
64
+ - `vault artifact add` or API equivalent for registering files and metadata;
65
+ - `vault artifact list` with pagination and filters for status, type, and
66
+ sensitivity;
67
+ - `vault artifact show` for metadata and derived-output status;
68
+ - `vault ingest` for conversion, transcription, and redaction jobs;
69
+ - `context pack build` integration that can cite vault artifacts without
70
+ reading raw files broadly.
71
+
72
+ The web console can later expose this as a catalog view, but it should consume
73
+ the same domain services as the CLI/API.
74
+
75
+ ## Related Work
76
+
77
+ - GitHub issue `#449`: context vault epic.
78
+ - GitHub issue `#367`: audio and video transcription evidence skill.
79
+ - GitHub issues `#423` through `#427`: context index, search, bounded context
80
+ packs, redaction, and runtime prompt integration.
@@ -29,7 +29,7 @@ entry points a user or CI runner actually executes.
29
29
  | Browser console | Web console task, cost, provider, delegation, recovery, evidence, workflow, accessibility, artifacts | `npm run test:e2e` | visible state, API persistence, evidence attachment, lifecycle transitions, responsive/keyboard behavior | Playwright report, screenshots/traces on failure |
30
30
  | Public site | Documentation/site navigation, docs catalog, architecture viewer, mobile fit | `npm run test:e2e` | navigation order, local docs catalog search, no raw GitHub redirect for docs, mobile content fit | Playwright report |
31
31
  | Runtime manual queue | Manual runtime delegation in a `/tmp` workspace | `npm run test:e2e:runtime` | two active sessions, third manual `spawn-request` materializes `queued`, artifact includes lifecycle commands, `runtime sessions` lists queued session | stdout/stderr, JSON output, artifact content |
32
- | Init refresh environments | Simulated Codex, Claude, Cursor, generic workspaces | `node --test e2e/init-refresh-environments.test.js` | missing runtime guidance files regenerate on `init --force`, user content is preserved, managed blocks are updated only inside managed ranges | filesystem diff assertions |
32
+ | Init refresh environments | Simulated generic, Codex, Claude, Cursor, VS Code/GitHub Copilot, and Windsurf workspaces | `npm run test:e2e:init` | missing runtime guidance files regenerate on `init --force`, user content is preserved, managed blocks/frontmatter are updated only inside managed ranges, target-specific metadata excludes wrong-target content, generated-artifact evidence maps back to acceptance criteria | filesystem diff assertions, QA coverage JSON |
33
33
  | Workflow lifecycle CLI | CLI workflow run, gate, resume, QA failback, release readiness | `node --test e2e/workflow-lifecycle-cli.test.js` | task phases create handoffs, blocked QA routes back, routine gate resumes immediately, release readiness maps acceptance to evidence | JSON output, events, handoffs |
34
34
 
35
35
  ## P1 High-Risk Regression Batteries
@@ -76,8 +76,8 @@ the packaging/install path is wrong.
76
76
 
77
77
  1. Keep `e2e/runtime-manual-queue.test.js` release-blocking as runtime
78
78
  delegation evolves.
79
- 2. Add `e2e/init-refresh-environments.test.js` for Codex, Claude, Cursor, and
80
- generic project simulations.
79
+ 2. Keep `e2e/init-onboarding.test.js` covering Codex, Claude, Cursor,
80
+ VS Code/GitHub Copilot, Windsurf, and generic project simulations.
81
81
  3. Add `e2e/workflow-lifecycle-cli.test.js` for workflow run, gate, failback,
82
82
  resume, and release readiness.
83
83
  4. Add `e2e/runtime-multi-squad.test.js` for async background squad behavior.
@@ -204,7 +204,7 @@ have two supported paths:
204
204
  `runtime parent-actions --task <id> --dispatch --until-idle --runtime <runtime-id>`.
205
205
  The dispatcher repeatedly inspects pending parent actions, dispatches only
206
206
  safe actions for the active runtime, records spawned and active lifecycle
207
- events with stable runtime child ids or deterministic fallback labels, applies
207
+ events with stable runtime child ids or verified callback correlation ids, applies
208
208
  `runtime watch` completions when expected handoff artifacts appear, resumes
209
209
  paused workflow runs, and continues across later phases until idle or timeout.
210
210
 
@@ -219,11 +219,12 @@ access. This keeps the boundary explicit: Orchestra emits auditable actions and
219
219
  lifecycle commands; the active parent runtime executes native tools such as
220
220
  Codex `spawn_agent`, and the dispatcher only consumes actions that are safe for
221
221
  the runtime declared on the command line. For Claude, the tested dispatch
222
- contract accepts `claude-agent-request` with `tool=claude-code-agent`, records
223
- `spawned` and `active` lifecycle states with a deterministic
224
- `claude-code-agent:<session>` label when no native child id is available, and
225
- remains idempotent across repeated dispatch attempts. Orchestra does not call
226
- Claude Code, Anthropic APIs, or another provider API.
222
+ contract accepts `claude-agent-request` with `tool=claude-code-agent`, but it
223
+ records `spawned` and `active` only when the active parent runtime is Claude and
224
+ the native callback capability is explicitly verified. Unsupported Codex, CI,
225
+ non-Claude, or callback-unavailable contexts return fallback guidance and do not
226
+ claim native execution. Orchestra does not call Claude Code, Anthropic APIs, or
227
+ another provider API.
227
228
 
228
229
  Runtime lifecycle watching is adapter-driven. Each inspected session reports a
229
230
  `watcher` object with adapter id, detection mode, support level, fallback
@@ -242,10 +243,12 @@ not proof that Orchestra can invoke Claude Code or Anthropic APIs by itself.
242
243
  The tested local behavior covers:
243
244
 
244
245
  - Dispatch support: eligible `claude-agent-request` actions for `claude-cli`
245
- with `tool=claude-code-agent` can be consumed by
246
+ with `tool=claude-code-agent` can be inspected by
246
247
  `runtime parent-actions --dispatch --runtime claude-cli`. The dispatch path
247
- records `spawned` and `active` lifecycle state with a stable child identifier
248
- or deterministic `claude-code-agent:<session>` fallback label.
248
+ records `spawned` and `active` lifecycle state only when the bridge verifies a
249
+ Claude parent runtime and callback capability. In local contract tests this is
250
+ simulated with explicit environment markers; in unsupported environments the
251
+ action is skipped with manual fallback guidance.
249
252
  - Alias policy: `claude-code-agent` is the only auto-dispatchable Claude tool
250
253
  name in the tested contract. `Task` is a legacy/manual alias and is skipped
251
254
  as `tool-mismatch`; accepting it in auto-dispatch requires new tests and
@@ -254,19 +257,26 @@ The tested local behavior covers:
254
257
  terminal, mismatched, or unavailable actions return structured eligibility
255
258
  metadata, fallback guidance, prompt artifact, expected result artifact, and
256
259
  manual lifecycle commands. Fallback never runs the phase in the parent agent
257
- silently and never switches to direct provider APIs.
260
+ silently, never records native Claude lifecycle events, and never switches to
261
+ direct provider APIs.
258
262
  - Guardrails: dispatch is bounded by runtime guardrails, runtime filters,
259
263
  session status, safety state, action kind, tool name, and stale-session
260
264
  checks. It preserves `directProviderApiAllowed=false` for runtime-native
261
265
  delegation artifacts.
262
- - Completion reconciliation: current tested support relies on explicit
263
- lifecycle events and bounded expected-artifact inspection. GH-434 tracks
264
- stricter validation of task id, phase, role, runtime, session id, and safe
265
- expected artifact path before a Claude session is marked complete.
266
- - Gate preservation: auto-dispatch must not approve or skip human gates. GH-435
267
- tracks the dedicated regression suite for safe workflow resume across
268
- `gates=none`, `gates=phase`, `gates=all`, multi-phase dispatch, and manual
269
- fallback recovery.
266
+ - Completion reconciliation: `runtime watch` validates the expected completion
267
+ metadata before marking a Claude session complete. The validation checks task
268
+ id, phase, role, runtime, session id, and the safe expected handoff path, and
269
+ it also requires the final handoff artifact to repeat those metadata fields.
270
+ It skips mismatches, missing artifact metadata, unsafe paths, and duplicate
271
+ completions with explicit reasons instead of treating any handoff file as
272
+ completion proof. Native immediate `completionResult` payloads use the same
273
+ validation path when present.
274
+ - Gate preservation: auto-dispatch must not approve or skip human gates.
275
+ `workflow run --resume` now holds unapproved gates until
276
+ `workflow gate-approve` records explicit approval, and runtime lifecycle
277
+ auto-resume records no gate approval events. The regression suite covers safe
278
+ non-gated resume, unapproved gate hold behavior, opt-out, queued/pending
279
+ messaging, and multi-pass parent action dispatch.
270
280
 
271
281
  Manual recovery for a skipped or unavailable Claude action:
272
282
 
@@ -117,6 +117,7 @@
117
117
  { "title": "Sonar quality gates", "source": "docs/sonar-quality-gates.md", "heading": "Sonar Quality Gates" },
118
118
  { "title": "Sonar architecture model", "source": "docs/sonar-architecture-model.md", "heading": "Sonar Architecture Model" },
119
119
  { "title": "Runtime adapters", "source": "docs/runtime-adapters.md", "heading": "Runtime Adapters" },
120
+ { "title": "Context vault", "source": "docs/context-vault.md", "heading": "Context Vault" },
120
121
  { "title": "Site content workflow", "source": "docs/site-content-workflow.md", "heading": "Public Site Content Workflow" }
121
122
  ]
122
123
  },
@@ -54,10 +54,20 @@ orchestra review --task STORY-1 --role qa --result approve --findings "..." --re
54
54
  ```
55
55
 
56
56
  Developer-to-QA handoff should include touched files, commands, known gaps, and
57
- recommended Playwright, CLI, shell, or API coverage. `qa coverage` maps each
58
- acceptance criterion to `covered`, `planned`, `skipped`, or `gap` using task
59
- paths, project scripts, and existing evidence; release readiness surfaces
60
- unresolved QA automation gaps before promotion.
57
+ recommended Playwright, CLI, shell, API, integration, workflow, mobile, desktop,
58
+ data, or generated-artifact coverage. `qa coverage` maps each acceptance
59
+ criterion to `covered`, `weak`, `missing`, `deferred`, or `blocked`
60
+ using task paths, project scripts, and existing evidence; release readiness and
61
+ the `qa-release` gate surface unresolved QA automation gaps before promotion.
62
+
63
+ Generated artifacts are a first-class QA surface. When rules, skills, runtime
64
+ guidance, Markdown files, MDC files, or managed bootstrap blocks change,
65
+ evidence must assert generated paths, managed metadata, target-specific content,
66
+ refresh/drift behavior, user-content preservation, and absence of wrong-target
67
+ content. CLI evidence must assert exit code, stdout, stderr, generated
68
+ files/events, and final state. Integration evidence must include receiver-side
69
+ sandbox/mock/contract/webhook/event/log validation or an explicit deferred owner
70
+ and rationale.
61
71
 
62
72
  Evidence summaries should name the acceptance criterion they cover or say
63
73
  "covers all acceptance criteria" when a single artifact proves the full story.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jterrats/open-orchestra",
3
- "version": "1.0.10",
3
+ "version": "1.0.12",
4
4
  "type": "module",
5
5
  "workspaces": [
6
6
  "extensions/vscode-open-orchestra",
@@ -16,7 +16,7 @@
16
16
  "test": "npm run build && node --test test/**/*.js extensions/**/*.test.cjs",
17
17
  "test:coverage": "npm run build && c8 --reporter=lcov --reports-dir coverage --exclude \"test/**\" --exclude \"e2e/**\" --exclude \"extensions/**/test/**\" --exclude \"dist/assets/**\" --exclude \"dist/web-console/**\" node --test test/**/*.js extensions/**/*.test.cjs",
18
18
  "test:e2e": "npm run build && npm run site:build && playwright test",
19
- "test:e2e:init": "node --test e2e/init-onboarding.test.js",
19
+ "test:e2e:init": "node --test e2e/init-onboarding.test.js e2e/runtime-instruction-flow.test.js",
20
20
  "test:e2e:runtime": "node --test e2e/runtime-manual-queue.test.js",
21
21
  "test:e2e:runtime:ollama": "npm run build && node --test e2e/runtime-ollama-provider.test.js",
22
22
  "lint": "eslint . && prettier --check \"{bin,e2e,scripts,test,src}/**/*.js\" \"{site,web-console}/src/**/*.{css,js,jsx}\" \"{site,web-console}/*.{html,js,json}\" \"extensions/**/*.{cjs,json,md}\" \"src/**/*.ts\" \"*.{js,json}\"",