sneakoscope 0.6.77 → 0.6.79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -47,7 +47,7 @@ sks selftest --mock
47
47
  | Codex App commands | Installs generated skills so `$Team`, `$From-Chat-IMG`, `$DFix`, `$QA-LOOP`, `$Goal`, `$DB`, `$Wiki`, `$Help`, and related routes are visible in prompt workflows. |
48
48
  | Team orchestration | Runs substantial work through ambiguity handling, scouts, TriWiki refresh, debate, runtime task graphs, worker inboxes, implementation, review, cleanup, reflection, and Honest Mode. |
49
49
  | From-Chat-IMG | Turns chat screenshots plus original attachments into source-bound work orders, then requires scoped QA evidence before completion. |
50
- | QA loop | Dogfoods UI/API behavior with safety gates, Browser/Computer evidence, safe fixes, and rechecks. |
50
+ | QA loop | Dogfoods UI/API behavior with safety gates, Codex Computer Use-only UI evidence, safe fixes, and rechecks. |
51
51
  | Goal | Bridges SKS pipeline state to Codex native persisted `/goal` create, pause, resume, and clear workflows. |
52
52
  | TriWiki voxels | Maintains `.sneakoscope/wiki/context-pack.json` as the context SSOT with coordinate anchors, voxel metadata, `attention.use_first`, and `attention.hydrate_first`. |
53
53
  | Context7 | Requires current docs for external packages, APIs, MCPs, SDKs, and framework/runtime behavior when correctness depends on current guidance. |
@@ -216,6 +216,7 @@ sks db scan --json
216
216
  sks wiki refresh
217
217
  sks wiki sweep latest --json
218
218
  sks wiki validate .sneakoscope/wiki/context-pack.json
219
+ sks harness fixture --json
219
220
  sks gx init homepage
220
221
  sks gx render homepage --format html
221
222
  sks validate-artifacts latest --json
@@ -261,7 +262,7 @@ Generated app files include:
261
262
 
262
263
  Use `sks dollar-commands` to confirm that terminal discovery and Codex App prompt commands agree.
263
264
 
264
- TriWiki is intentionally sparse: `sks wiki sweep` records demote, soft-forget, archive, delete, promote-to-skill, and promote-to-rule candidates instead of injecting every old claim into future prompts. `sks code-structure scan` flags handwritten files above 1000/2000/3000-line thresholds so new logic can be extracted before command files become harder to maintain.
265
+ TriWiki is intentionally sparse: `sks wiki sweep` records demote, soft-forget, archive, delete, promote-to-skill, and promote-to-rule candidates instead of injecting every old claim into future prompts. `sks harness fixture` validates the broader Harness Growth Factory contract: deliberate forgetting fixtures, skill card metadata, experiment schema, tool-error taxonomy, permission profiles, MultiAgentV2 defaults, and Cmux cockpit view coverage. `sks code-structure scan` flags handwritten files above 1000/2000/3000-line thresholds so new logic can be extracted before command files become harder to maintain.
265
266
 
266
267
  ## Prompt `$` Commands
267
268
 
@@ -327,7 +328,7 @@ sks qa-loop run latest --max-cycles 2
327
328
  sks qa-loop status latest
328
329
  ```
329
330
 
330
- Use `$QA-LOOP` in Codex App when Browser Use or Computer Use evidence should be part of the workflow.
331
+ Use `$QA-LOOP` in Codex App when UI-level E2E needs verification. UI verification must use Codex Computer Use evidence only; Chrome MCP, Browser Use, Playwright, and other browser automation do not satisfy UI-level E2E verification.
331
332
 
332
333
  ### Refresh Context Before Risky Work
333
334
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "sneakoscope",
3
3
  "displayName": "ㅅㅋㅅ",
4
- "version": "0.6.77",
4
+ "version": "0.6.79",
5
5
  "description": "Sneakoscope Codex: database-safe Codex CLI/App harness with Team, Goal, AutoResearch, TriWiki, and Honest Mode.",
6
6
  "type": "module",
7
7
  "homepage": "https://github.com/mandarange/Sneakoscope-Codex#readme",
package/src/cli/main.mjs CHANGED
@@ -35,6 +35,7 @@ import { createWorkOrderLedger } from '../core/work-order-ledger.mjs';
35
35
  import { buildFromChatImgVisualMap } from '../core/from-chat-img-forensics.mjs';
36
36
  import { classifyDogfoodFinding, createDogfoodReport, writeDogfoodReport } from '../core/dogfood-loop.mjs';
37
37
  import { createSkillCandidate, decideSkillInjection, writeSkillCandidate, writeSkillForgeReport, writeSkillInjectionDecision } from '../core/skill-forge.mjs';
38
+ import { classifyToolError, harnessGrowthReport } from '../core/evaluation.mjs';
38
39
  import { recordMistake, writeMistakeMemoryReport } from '../core/mistake-memory.mjs';
39
40
  import { buildPromptContext } from '../core/prompt-context-builder.mjs';
40
41
  import { renderTeamDashboardState, writeTeamDashboardState } from '../core/team-dashboard-renderer.mjs';
@@ -42,7 +43,7 @@ import { GOAL_WORKFLOW_ARTIFACT } from '../core/goal-workflow.mjs';
42
43
  import { CODEX_APP_DOCS_URL, codexAppIntegrationStatus, formatCodexAppStatus } from '../core/codex-app.mjs';
43
44
  import { CMUX_BREW_COMMAND, CMUX_BREW_UPGRADE_COMMAND, buildCmuxLaunchPlan, buildCmuxNewWorkspaceArgs, cmuxSurfaceRefFromText, cmuxWorkspaceRef, cmuxWorkspaceRefFromText, cmuxReadiness, cmuxStatusKind, defaultCmuxWorkspaceName, ensureCmuxInstalled, formatCmuxBanner, launchCmuxTeamView, launchCmuxUi, matchingCmuxWorkspaces, parseCmuxWorkspaceList, platformCmuxInstallHint, readCmuxWorkspaceRecord, runCmuxStatus, sanitizeCmuxWorkspaceName, teamLaneStyle, writeCmuxWorkspaceRecord } from '../core/cmux-ui.mjs';
44
45
  import { autoReviewProfileName, autoReviewStatus, autoReviewSummary, enableAutoReview, disableAutoReview, enableMadHighProfile, madHighProfileName } from '../core/auto-review.mjs';
45
- import { buildTeamPlan, codeStructureCommand, defaultBeta, defaultVGraph, evalCommand, gcCommand, goalCommand, gxCommand, hproofCommand, memoryCommand, migrateWikiContextPack, parseTeamCreateArgs, perfCommand, profileCommand, projectWikiClaims, qaLoopCommand, researchCommand, statsCommand, team, teamWorkflowMarkdown, validateArtifactsCommand, wikiCommand, wikiVoxelRowCount, writeWikiContextPack } from './maintenance-commands.mjs';
46
+ import { buildTeamPlan, codeStructureCommand, defaultBeta, defaultVGraph, evalCommand, gcCommand, goalCommand, gxCommand, harnessCommand, hproofCommand, memoryCommand, migrateWikiContextPack, parseTeamCreateArgs, perfCommand, profileCommand, projectWikiClaims, qaLoopCommand, researchCommand, statsCommand, team, teamWorkflowMarkdown, validateArtifactsCommand, wikiCommand, wikiVoxelRowCount, writeWikiContextPack } from './maintenance-commands.mjs';
46
47
 
47
48
  const flag = (args, name) => args.includes(name);
48
49
  const promptOf = (args) => args.filter((x) => !String(x).startsWith('--')).join(' ').trim();
@@ -107,6 +108,7 @@ export async function main(args) {
107
108
  if (cmd === 'team') return team(tail);
108
109
  if (cmd === 'db') return db(sub, rest);
109
110
  if (cmd === 'eval') return evalCommand(sub, rest);
111
+ if (cmd === 'harness') return harnessCommand(sub, rest);
110
112
  if (cmd === 'wiki') return wikiCommand(sub, rest);
111
113
  if (cmd === 'gc') return gcCommand(tail);
112
114
  if (cmd === 'stats') return statsCommand(tail);
@@ -175,6 +177,7 @@ Usage:
175
177
  sks eval run [--json] [--out report.json]
176
178
  sks eval compare --baseline old.json --candidate new.json [--json]
177
179
  sks perf run [--json]
180
+ sks harness fixture [--json]
178
181
  sks code-structure scan [--json]
179
182
  sks wiki coords --rgba 12,34,56,255
180
183
  sks wiki pack [--json] [--role worker|verifier] [--max-anchors N]
@@ -1477,6 +1480,7 @@ function usage(args = []) {
1477
1480
  'codex-app': ['Codex App', '', ' sks bootstrap', ' sks codex-app check', ' sks dollar-commands', ' cat .codex/SNEAKOSCOPE.md'],
1478
1481
  dollar: ['Dollar Commands', '', formatDollarCommandsCompact(' '), '', 'Terminal: sks dollar-commands [--json]'],
1479
1482
  wiki: ['TriWiki', '', ' sks wiki pack', ' sks wiki refresh [--prune]', ' sks wiki sweep latest --json', ' sks wiki validate .sneakoscope/wiki/context-pack.json', ' sks wiki prune --dry-run --json', '', 'Packs include attention.use_first and attention.hydrate_first for compact recall plus source hydration. Sweep records intentional forgetting and promotion candidates.'],
1483
+ harness: ['Harness Growth', '', ' sks harness fixture --json', ' sks harness review --json', '', 'Runs deterministic fixtures for deliberate forgetting, skill cards, harness experiments, tool error taxonomy, permission profiles, MultiAgentV2, and Cmux cockpit views.'],
1480
1484
  'code-structure': ['Code Structure', '', ' sks code-structure scan', ' sks code-structure scan --json', '', 'Flags handwritten source files above 1000/2000/3000-line thresholds and records split-review exceptions.'],
1481
1485
  gx: ['GX', '', ' sks gx init architecture-atlas', ' sks gx render architecture-atlas --format all', ' sks gx validate architecture-atlas']
1482
1486
  };
@@ -2293,7 +2297,7 @@ async function selftest() {
2293
2297
  if (hookTeamPendingState.mission_id !== hookTeamState.mission_id) throw new Error('selftest failed: pending clarification allowed a new route mission to replace the visible question sheet');
2294
2298
  if (!hookTeamPendingContext.includes('Required questions still pending') || !hookTeamPendingContext.includes('VISIBLE RESPONSE CONTRACT') || !hookTeamPendingContext.includes('UI_STATE_BEHAVIOR')) throw new Error('selftest failed: pending clarification did not re-expose the question sheet');
2295
2299
  if (hookTeamPendingContext.includes('MANDATORY ambiguity-removal gate activated')) throw new Error('selftest failed: pending clarification prepared a new ambiguity gate instead of reusing the active one');
2296
- const hookTeamStopResult = await runProcess(process.execPath, [hookBin, 'hook', 'stop'], { cwd: hookTeamTmp, input: JSON.stringify({ cwd: hookTeamTmp, last_assistant_message: 'I will execute Team now.' }), env: { SKS_DISABLE_UPDATE_CHECK: '1' }, timeoutMs: 15000, maxOutputBytes: 128 * 1024 });
2300
+ const hookTeamStopResult = await runProcess(process.execPath, [hookBin, 'hook', 'stop'], { cwd: hookTeamTmp, input: JSON.stringify({ cwd: hookTeamTmp, last_assistant_message: 'I need three decisions before implementation, but I will not paste the Required questions block.' }), env: { SKS_DISABLE_UPDATE_CHECK: '1' }, timeoutMs: 15000, maxOutputBytes: 128 * 1024 });
2297
2301
  if (hookTeamStopResult.code !== 0) throw new Error(`selftest failed: Team stop hook exited ${hookTeamStopResult.code}: ${hookTeamStopResult.stderr}`);
2298
2302
  const hookTeamStopJson = JSON.parse(hookTeamStopResult.stdout);
2299
2303
  if (hookTeamStopJson.decision !== 'block' || !String(hookTeamStopJson.reason || '').includes('mandatory ambiguity-removal')) throw new Error('selftest failed: Stop hook did not block missing Team ambiguity answers');
@@ -2303,6 +2307,13 @@ async function selftest() {
2303
2307
  if (!String(hookTeamStopJson.reason || '').includes('Codex plan-tool interaction')) throw new Error('selftest failed: Stop hook did not reprint plan-tool guidance');
2304
2308
  if (!String(hookTeamStopJson.reason || '').includes('VISIBLE RESPONSE CONTRACT')) throw new Error('selftest failed: Stop hook did not force visible clarification response');
2305
2309
  const hookTeamSchema = await readJson(path.join(missionDir(hookTeamTmp, hookTeamState.mission_id), 'required-answers.schema.json'));
2310
+ const visibleQuestionsBlock = [
2311
+ 'Required questions',
2312
+ ...hookTeamSchema.slots.map((slot, idx) => `${idx + 1}. ${slot.id}: ${slot.question}`),
2313
+ 'Reply by slot id, then I will write answers.json and run sks pipeline answer latest answers.json.'
2314
+ ].join('\n');
2315
+ const visibleQuestionDecision = await evaluateStop(hookTeamTmp, hookTeamState, { last_assistant_message: visibleQuestionsBlock }, { noQuestion: false });
2316
+ if (!visibleQuestionDecision?.continue) throw new Error('selftest failed: visible Required questions block was not accepted by clarification stop gate');
2306
2317
  const nonGoalsSlot = hookTeamSchema.slots.find((s) => s.id === 'NON_GOALS');
2307
2318
  if (nonGoalsSlot && !nonGoalsSlot.allow_empty) throw new Error('selftest failed: NON_GOALS does not allow an empty array answer');
2308
2319
  if (!nonGoalsSlot && !Array.isArray(hookTeamSchema.inferred_answers?.NON_GOALS)) throw new Error('selftest failed: NON_GOALS was neither asked nor inferred');
@@ -2367,6 +2378,8 @@ async function selftest() {
2367
2378
  const hookQaJson = JSON.parse(hookQaResult.stdout);
2368
2379
  const hookQaContext = hookQaJson.hookSpecificOutput?.additionalContext || '';
2369
2380
  if (!hookQaContext.includes('MANDATORY ambiguity-removal gate activated') || !hookQaContext.includes('QA_SCOPE') || !hookQaContext.includes('UI_COMPUTER_USE_ACK')) throw new Error('selftest failed: $QA-LOOP hook did not provide QA-specific questions');
2381
+ if (!hookQaContext.includes('Codex Computer Use') || !hookQaContext.includes('Playwright') || !hookQaContext.includes('Chrome MCP')) throw new Error('selftest failed: $QA-LOOP hook did not state Computer Use-only UI policy');
2382
+ if (hookQaContext.includes('Browser Use 또는 Computer Use') || hookQaContext.includes('Browser/Computer Use evidence')) throw new Error('selftest failed: $QA-LOOP hook still allows Browser Use as UI evidence');
2370
2383
  const hookQaState = await readJson(stateFile(hookQaTmp), {});
2371
2384
  if (hookQaState.phase !== 'QALOOP_CLARIFICATION_AWAITING_ANSWERS' || hookQaState.implementation_allowed !== false) throw new Error('selftest failed: $QA-LOOP hook did not lock execution behind ambiguity gate');
2372
2385
  const hookQaSchema = await readJson(path.join(missionDir(hookQaTmp, hookQaState.mission_id), 'required-answers.schema.json'));
@@ -2397,6 +2410,8 @@ async function selftest() {
2397
2410
  if (unresolvedQaGate.passed || !unresolvedQaGate.reasons.includes('unresolved_fixable_findings_remaining')) throw new Error('selftest failed: unresolved fixable QA finding was accepted');
2398
2411
  const promptQa = buildQaLoopPrompt({ id: 'selftest', mission: { prompt: 'QA and fix' }, contract: { answers: { QA_CORRECTIVE_POLICY: 'apply_safe_fixes_and_reverify' } }, cycle: 1, previous: '', reportFile: qaReportFile });
2399
2412
  if (!promptQa.includes('dogfood as human proxy') || !promptQa.includes('fix safe code/test/docs now') || !promptQa.includes('post_fix_verification_complete')) throw new Error('selftest failed: QA-LOOP dogfood prompt');
2413
+ if (!promptQa.includes('Codex Computer Use evidence only') || !promptQa.includes('Chrome MCP') || !promptQa.includes('Playwright')) throw new Error('selftest failed: QA-LOOP prompt did not enforce Computer Use-only UI evidence');
2414
+ if (promptQa.includes('Browser/Computer Use evidence')) throw new Error('selftest failed: QA-LOOP prompt still allows Browser/Computer UI evidence');
2400
2415
  const pkgQa = defaultQaGate({ sealed_hash: 'selftest', answers: { QA_SCOPE: 'all_available', TARGET_BASE_URL: 'none', API_BASE_URL: 'same_as_target', TARGET_ENVIRONMENT: 'local_dev_server', DESTRUCTIVE_DEPLOYED_TESTS_ALLOWED: 'never' } });
2401
2416
  if (pkgQa.ui_e2e_required || pkgQa.api_e2e_required || !pkgQa.ui_computer_use_evidence) throw new Error('selftest failed: package QA target gate');
2402
2417
  const qaRunResult = await runProcess(process.execPath, [hookBin, 'qa-loop', 'run', 'latest', '--mock'], { cwd: hookQaTmp, env: { SKS_DISABLE_UPDATE_CHECK: '1' }, timeoutMs: 15000, maxOutputBytes: 64 * 1024 });
@@ -2764,7 +2779,7 @@ async function selftest() {
2764
2779
  if (teamDashboard?.agent_session_count !== 5 || teamDashboard?.role_counts?.executor !== 5) throw new Error('selftest failed: team dashboard session/role budget missing');
2765
2780
  await writeTeamDashboardState(teamDir, { missionId: teamId, mission: { id: teamId, mode: 'team' }, effort: 'high', phase: 'verification' });
2766
2781
  const teamDashboardState = await readJson(path.join(teamDir, ARTIFACT_FILES.team_dashboard_state), {});
2767
- if (!validateTeamDashboardState(teamDashboardState).ok || !renderTeamDashboardState(teamDashboardState).includes('Mission Overview')) throw new Error('selftest failed: Team dashboard state missing required cockpit panes');
2782
+ if (!validateTeamDashboardState(teamDashboardState).ok || !renderTeamDashboardState(teamDashboardState).includes('Mission / Goal View')) throw new Error('selftest failed: Team dashboard state missing required cockpit panes');
2768
2783
  if (teamDashboard?.context_tracking?.ssot !== 'triwiki') throw new Error('selftest failed: team dashboard missing TriWiki context tracking');
2769
2784
  if (!teamDashboard?.phases?.includes('parallel_analysis_scouting')) throw new Error('selftest failed: team dashboard missing analysis scout phase');
2770
2785
  if (!teamDashboard?.latest_messages?.some((entry) => entry.agent === 'analysis_scout_1')) throw new Error('selftest failed: team live dashboard missing analysis scout event');
@@ -2820,6 +2835,9 @@ async function selftest() {
2820
2835
  if (!evalReport.comparison.meaningful_improvement) throw new Error('selftest failed: evaluation benchmark did not show meaningful improvement');
2821
2836
  if (!evalReport.candidate.wiki?.valid) throw new Error('selftest failed: wiki coordinate index invalid in eval');
2822
2837
  if (evalReport.candidate.wiki?.voxel_schema !== 'sks.wiki-voxel.v1' || evalReport.candidate.wiki?.voxel_rows < 1) throw new Error('selftest failed: eval did not include voxel overlay metrics');
2838
+ const harnessReport = harnessGrowthReport({});
2839
+ if (!harnessReport.forgetting.fixture.passed || !harnessReport.cmux.views.includes('Harness Experiments View') || !harnessReport.reliability.tool_error_taxonomy.includes('Unknown')) throw new Error('selftest failed: harness growth fixture incomplete');
2840
+ if (classifyToolError({ message: 'operation timed out' }) !== 'Timeout' || classifyToolError({ message: 'unclassified weirdness' }) !== 'Unknown') throw new Error('selftest failed: tool error taxonomy classification');
2823
2841
  const coord = rgbaToWikiCoord({ r: 12, g: 34, b: 56, a: 255 });
2824
2842
  if (coord.schema !== 'sks.wiki-coordinate.v1' || coord.xyzw.length !== 4) throw new Error('selftest failed: RGBA wiki coordinate conversion');
2825
2843
  await writeTextAtomic(path.join(tmp, '.sneakoscope', 'memory', 'q2_facts', 'selftest.md'), '- claim: Selftest memory claim must be selected before lower-weight mission notes. | id: selftest-memory-priority | source: src/cli/main.mjs | risk: high | status: supported | evidence_count: 3 | required_weight: 1.0 | trust_score: 0.9\n');
@@ -31,6 +31,7 @@ import { cleanupCmuxTeamView, launchCmuxTeamView } from '../core/cmux-ui.mjs';
31
31
  import { writeSkillForgeReport } from '../core/skill-forge.mjs';
32
32
  import { writeMistakeMemoryReport } from '../core/mistake-memory.mjs';
33
33
  import { scanDbSafety } from '../core/db-safety.mjs';
34
+ import { harnessGrowthReport, writeHarnessGrowthReport } from '../core/evaluation.mjs';
34
35
 
35
36
  const flag = (args, name) => args.includes(name);
36
37
  const promptOf = (args) => args.filter((x) => !String(x).startsWith('--')).join(' ').trim();
@@ -64,6 +65,9 @@ Usage:
64
65
 
65
66
  Prompt route:
66
67
  $QA-LOOP dogfood UI/API, fix safe issues, reverify
68
+
69
+ UI evidence:
70
+ Codex Computer Use only for UI-level E2E; do not use Chrome MCP, Browser Use, Playwright, or other browser automation as UI verification evidence.
67
71
  `);
68
72
  }
69
73
 
@@ -417,6 +421,25 @@ export async function perfCommand(sub, args = []) {
417
421
  console.log(`Report: ${path.relative(root, outPath)}`);
418
422
  }
419
423
 
424
/**
 * CLI handler for `sks harness fixture|review [--json]`.
 *
 * `fixture` (the default when no sub-command is given) builds the harness growth
 * report in memory via `harnessGrowthReport`; `review` obtains it from
 * `writeHarnessGrowthReport`, targeting `<root>/.sneakoscope/reports`.
 * Any other sub-command prints usage to stderr and sets `process.exitCode` to 1.
 *
 * @param {string} [sub] - Sub-command name; falsy values fall back to 'fixture'.
 * @param {string[]} [args] - Remaining CLI arguments; `--json` switches to JSON output.
 * @returns {Promise<void>}
 */
export async function harnessCommand(sub, args = []) {
  const action = sub || 'fixture';
  const supported = action === 'fixture' || action === 'review';
  if (!supported) {
    console.error('Usage: sks harness fixture|review [--json]');
    process.exitCode = 1;
    return;
  }

  const root = await sksRoot();
  let report;
  if (action === 'review') {
    const reportsDir = path.join(root, '.sneakoscope', 'reports');
    report = await writeHarnessGrowthReport(root, reportsDir, {});
  } else {
    report = harnessGrowthReport({});
  }

  // Machine-readable output replaces the human summary entirely.
  if (flag(args, '--json')) {
    console.log(JSON.stringify(report, null, 2));
    return;
  }

  console.log('SKS Harness Growth');
  const fixtureLabel = report.forgetting.fixture.passed ? 'pass' : 'fail';
  console.log(`Forgetting fixture: ${fixtureLabel}`);
  console.log(`Cmux views: ${report.cmux.views.length}`);
  const taxonomyList = report.reliability.tool_error_taxonomy.join(', ');
  console.log(`Tool taxonomy: ${taxonomyList}`);
  const unknownLabel = report.reliability.unknown_errors_are_bugs ? 'yes' : 'no';
  console.log(`Unknown errors recorded as bugs: ${unknownLabel}`);
}
442
+
420
443
  export async function codeStructureCommand(sub, args = []) {
421
444
  const action = sub || 'scan';
422
445
  if (action !== 'scan') {
@@ -1211,6 +1234,7 @@ export async function team(args) {
1211
1234
  });
1212
1235
  await writeWorkOrderLedger(dir, workOrder);
1213
1236
  if (fromChatImgRequired) await writeFromChatImgArtifacts(dir, { missionId: id, requests: [{ verbatim: prompt }], ambiguities: ['image source inventory must be completed before implementation'] });
1237
+ await writeHarnessGrowthReport(root, dir, {});
1214
1238
  let dashboardState = await writeTeamDashboardState(dir, { missionId: id, mission: { id, mode: 'team' }, effort: effortDecision.selected_effort, phase: 'intake', next_action: fromChatImgRequired ? 'complete visual source inventory and work-order mapping' : 'run Team analysis scouts' });
1215
1239
  await writeJsonAtomic(path.join(dir, 'team-gate.json'), { passed: false, team_roster_confirmed: true, analysis_artifact: false, triwiki_refreshed: false, triwiki_validated: false, consensus_artifact: false, ...runtime.gate_fields, implementation_team_fresh: false, review_artifact: false, integration_evidence: false, session_cleanup: false, context7_evidence: false, ...(fromChatImgRequired ? { from_chat_img_required: true, from_chat_img_request_coverage: false } : {}) });
1216
1240
  dashboardState = await writeTeamDashboardState(dir, { missionId: id, mission: { id, mode: 'team' }, effort: effortDecision.selected_effort, phase: 'intake', next_action: fromChatImgRequired ? 'complete visual source inventory and work-order mapping' : 'run Team analysis scouts' });
@@ -17,6 +17,7 @@ export const ARTIFACT_FILES = {
17
17
  memory_sweep_report: 'memory-sweep-report.json',
18
18
  skill_forge_report: 'skill-forge-report.json',
19
19
  mistake_memory_report: 'mistake-memory-report.json',
20
+ harness_growth_report: 'harness-growth-report.json',
20
21
  code_structure_report: 'code-structure-report.json',
21
22
  team_dashboard_state: 'team-dashboard-state.json',
22
23
  cmux_pane_plan: 'cmux-pane-plan.json',
@@ -163,6 +164,21 @@ export function validateMistakeMemoryReport(data = {}) {
163
164
  return validationResult('MistakeMemoryReport', errors);
164
165
  }
165
166
 
167
/**
 * Validate the shape of a harness growth report.
 *
 * Checks that the six top-level sections are objects, that the forgetting
 * fixture passed, that the tool error taxonomy is an array containing
 * 'Unknown', that unknown errors are flagged as bugs, and that at least ten
 * Cmux views are listed.
 *
 * @param {object} [data] - Parsed report to validate.
 * @returns {*} Whatever `validationResult('HarnessGrowthReport', errors)` produces.
 */
export function validateHarnessGrowthReport(data = {}) {
  const errors = [];

  // Required top-level sections, each paired with the error code recorded when absent.
  const requiredSections = [
    ['forgetting', 'forgetting_missing'],
    ['skills', 'skills_missing'],
    ['experiments', 'experiments_missing'],
    ['codex_native', 'codex_native_missing'],
    ['cmux', 'cmux_missing'],
    ['reliability', 'reliability_missing']
  ];
  for (const [section, code] of requiredSections) {
    pushMissing(errors, isObj(data[section]), code);
  }

  const fixturePassed = data.forgetting?.fixture?.passed === true;
  if (!fixturePassed) errors.push('forgetting_fixture_failed');

  const taxonomy = data.reliability?.tool_error_taxonomy;
  const taxonomyHasUnknown = Array.isArray(taxonomy) && taxonomy.includes('Unknown');
  if (!taxonomyHasUnknown) errors.push('tool_error_taxonomy_missing_unknown');

  const unknownIsBug = data.reliability?.unknown_errors_are_bugs === true;
  if (!unknownIsBug) errors.push('unknown_errors_not_marked_bug');

  const views = data.cmux?.views;
  const enoughViews = Array.isArray(views) && views.length >= 10;
  if (!enoughViews) errors.push('cmux_views_incomplete');

  return validationResult('HarnessGrowthReport', errors);
}
181
+
166
182
  export function validateCodeStructureReport(data = {}) {
167
183
  const errors = [];
168
184
  pushMissing(errors, isObj(data.thresholds), 'thresholds_missing');
@@ -179,7 +195,7 @@ export function validateTeamDashboardState(data = {}) {
179
195
  pushMissing(errors, Array.isArray(data.gates), 'gates_not_array');
180
196
  pushMissing(errors, Array.isArray(data.agents), 'agents_not_array');
181
197
  pushMissing(errors, Array.isArray(data.tasks), 'tasks_not_array');
182
- for (const pane of ['Mission Overview', 'Agent Lanes', 'Task DAG', 'QA and Dogfood', 'Artifacts and Evidence', 'Performance']) {
198
+ for (const pane of ['Mission / Goal View', 'Agent Grid View', 'MultiAgentV2 Graph View', 'Work Order Ledger View', 'Skill Autopilot View', 'TriWiki Memory Health View', 'Forget Queue', 'Mistake Immunity', 'Tool Reliability View', 'Harness Experiments View', 'Dogfood Evidence View', 'Code Structure']) {
183
199
  if (!arr(data.panes).includes(pane)) errors.push(`pane_missing:${pane}`);
184
200
  }
185
201
  if (arr(data.gates).some((gate) => !GATE_STATUSES.has(gate.status))) errors.push('gate_status_invalid');
@@ -214,6 +230,7 @@ export const ARTIFACT_VALIDATORS = {
214
230
  memory_sweep_report: validateMemorySweepReport,
215
231
  skill_forge_report: validateSkillForgeReport,
216
232
  mistake_memory_report: validateMistakeMemoryReport,
233
+ harness_growth_report: validateHarnessGrowthReport,
217
234
  code_structure_report: validateCodeStructureReport,
218
235
  team_dashboard_state: validateTeamDashboardState,
219
236
  cmux_pane_plan: validateCmuxPanePlan,
@@ -128,10 +128,10 @@ export function codexAppGuidance({ appInstalled, codex, mcpList, computerUseRead
128
128
  }
129
129
  if (appInstalled && (!computerUseReady || !browserUseReady)) {
130
130
  lines.push('Open Codex App settings, enable recommended MCP/plugin tools, then restart Codex CLI sessions.');
131
- lines.push('Required for SKS QA-LOOP priority order: Browser Use for local browser targets, Computer Use for desktop/app/browser evidence.');
131
+ lines.push('Required for SKS QA-LOOP UI evidence: Codex Computer Use only. Browser Use can support non-UI browser context, but it does not satisfy UI-level E2E verification.');
132
132
  lines.push('Verify with: codex mcp list');
133
133
  }
134
- if (!lines.length) lines.push('Codex App, Codex CLI, Computer Use, and Browser Use checks look ready.');
134
+ if (!lines.length) lines.push('Codex App, Codex CLI, Computer Use, and Browser Use checks look ready. UI-level E2E verification still requires Codex Computer Use evidence.');
135
135
  return lines;
136
136
  }
137
137
 
@@ -82,7 +82,7 @@ export function buildDecisionContract({ mission, schema, answers }) {
82
82
  qa_loop_target_environment: answers.TARGET_ENVIRONMENT || null,
83
83
  qa_loop_mutation_policy: answers.QA_MUTATION_POLICY || null,
84
84
  qa_loop_credentials_saved: false,
85
- qa_loop_ui_requires_official_browser_or_computer_use: Boolean(answers.QA_SCOPE && answers.QA_SCOPE !== 'api_e2e_only'),
85
+ qa_loop_ui_requires_codex_computer_use_only: Boolean(answers.QA_SCOPE && answers.QA_SCOPE !== 'api_e2e_only'),
86
86
  unrequested_fallback_code_allowed: false,
87
87
  mad_sks_mode: madSks ? 'explicit_invocation_only' : false,
88
88
  production_database_writes_allowed: madSks ? 'mad_sks_scoped' : false,
@@ -1,4 +1,5 @@
1
- import { nowIso, sha256 } from './fsx.mjs';
1
+ import path from 'node:path';
2
+ import { nowIso, sha256, writeJsonAtomic } from './fsx.mjs';
2
3
  import { contextCapsule } from './triwiki-attention.mjs';
3
4
  import { validateWikiCoordinateIndex } from './wiki-coordinate.mjs';
4
5
 
@@ -10,15 +11,359 @@ export const DEFAULT_EVAL_THRESHOLDS = Object.freeze({
10
11
  max_candidate_build_ms_per_run: 25
11
12
  });
12
13
 
14
/** File name of the harness growth report artifact. */
export const HARNESS_GROWTH_REPORT = 'harness-growth-report.json';

/** Lifecycle state names (e.g. 'PINNED', 'DELETED' are returned by forgettingDecision). */
export const MEMORY_LIFECYCLE_STATES = Object.freeze([
  'ACTIVE', 'PINNED', 'DORMANT', 'STALE', 'DUPLICATE', 'CONFLICTED',
  'QUARANTINED', 'ARCHIVED', 'DISABLED', 'DELETE_CANDIDATE', 'DELETED'
]);

/** Forgetting action names (e.g. 'KEEP_ACTIVE', 'HARD_DELETE' are returned by forgettingDecision). */
export const FORGETTING_ACTIONS = Object.freeze([
  'KEEP_ACTIVE', 'PIN', 'UNPIN', 'UPDATE', 'CONSOLIDATE', 'DEMOTE', 'DISABLE',
  'ARCHIVE', 'QUARANTINE', 'HARD_DELETE', 'NOOP',
  'PROMOTE_SKILL', 'PROMOTE_RULE', 'PROMOTE_TEST'
]);

/** Category names produced by classifyToolError; 'Unknown' is the fallback. */
export const TOOL_ERROR_TAXONOMY = Object.freeze([
  'InvalidArguments', 'UnexpectedEnvironment', 'ProviderError', 'UserAborted', 'Timeout',
  'PermissionDenied', 'NetworkDenied', 'ResourceExhausted', 'Conflict', 'Unknown'
]);

/**
 * Day-count thresholds keyed by object kind.
 * NOTE(review): only the outer object is frozen; the per-kind records stay mutable.
 */
export const DEFAULT_FORGETTING_THRESHOLDS = Object.freeze({
  wiki_claim: {
    stale_after_days: 60,
    dormant_after_days_without_use: 90,
    archive_after_days_without_use: 150,
    hard_delete_after_days_without_use: 240
  },
  wiki_page: {
    stale_after_days: 90,
    archive_after_days_without_use: 180,
    hard_delete_after_days_without_use: 365
  },
  codex_memory: {
    stale_after_days: 60,
    hard_delete_after_days_without_use: 180
  },
  skill: {
    stale_after_days_without_use: 45,
    disable_after_days_without_use: 90,
    archive_after_days_without_use: 180,
    hard_delete_after_days_without_use: 270
  },
  mistake_fingerprint: {
    stale_after_days_without_recurrence: 180,
    archive_after_days_without_recurrence: 365,
    hard_delete_after_days_without_recurrence: 540
  },
  temporary_artifact: {
    archive_after_days: 14,
    hard_delete_after_days: 45
  }
});

/** Named permission profiles, each describing filesystem access, network access, and purpose. */
export const PERMISSION_PROFILES = Object.freeze({
  read_only_explorer: {
    filesystem: 'read-only',
    network: 'disabled_or_limited',
    purpose: 'Map code, collect evidence, no writes.'
  },
  workspace_worker: {
    filesystem: 'workspace-write',
    network: 'disabled_by_default',
    purpose: 'Implement local code changes safely.'
  },
  dogfood_browser: {
    filesystem: 'workspace-write',
    network: 'localhost_and_required_docs',
    purpose: 'Run app/browser dogfood and collect evidence.'
  },
  harness_research: {
    filesystem: 'workspace-write',
    network: 'limited_allowlist',
    purpose: 'Fetch official docs/research for harness improvements.'
  },
  dangerous_full_access: {
    filesystem: 'full-access',
    network: 'controlled',
    purpose: 'Never default. Requires explicit reason and review.'
  }
});

/** Default MultiAgentV2 limits and output policy. */
export const DEFAULT_MULTIAGENT_V2 = Object.freeze({
  max_threads: 6,
  max_depth: 1,
  job_max_runtime_seconds: 1800,
  wait_control: 'bounded_wait_then_structured_summary',
  subagent_output: 'structured_summary_only'
});

/** Names of the Cmux cockpit views, in display order as listed here. */
export const CMUX_COCKPIT_VIEWS = Object.freeze([
  'Mission / Goal View',
  'Agent Grid View',
  'MultiAgentV2 Graph View',
  'Work Order Ledger View',
  'Skill Autopilot View',
  'TriWiki Memory Health View',
  'Forget Queue View',
  'Mistake Immunity View',
  'Tool Reliability View',
  'Harness Experiments View',
  'Dogfood Evidence View',
  'Code Structure View',
  'Statusline / Terminal Title Preview'
]);
100
+
13
101
  export function estimateTokens(value) {
14
102
  const text = typeof value === 'string' ? value : JSON.stringify(value);
15
103
  return Math.max(1, Math.ceil(String(text || '').length / 4));
16
104
  }
17
105
 
106
/**
 * Map a raw tool failure onto one of the tool error taxonomy category names.
 *
 * The `code`, `name`, `message`, and `stderr` fields are joined, lower-cased,
 * and tested against keyword patterns in a fixed order; the first pattern that
 * matches wins, so earlier categories take priority when several keywords occur.
 *
 * @param {object|string|null} [input] - Error-like object. A bare string is
 *   treated as the message; `null`/`undefined` is treated as an empty object.
 * @returns {string} The matching category name, or 'Unknown' when nothing matches.
 */
export function classifyToolError(input = {}) {
  const source = typeof input === 'string' ? { message: input } : (input ?? {});
  const text = `${source.code || ''} ${source.name || ''} ${source.message || ''} ${source.stderr || ''}`.toLowerCase();
  if (/invalid|required|schema|argument|parameter|json/.test(text)) return 'InvalidArguments';
  if (/enoent|not found|cwd|path|missing file|environment|not installed/.test(text)) return 'UnexpectedEnvironment';
  // Only a standalone 5xx number counts as a provider status; digits inside a
  // longer number (e.g. "1500 bytes") must not be mistaken for an HTTP status.
  if (/provider|upstream|api error|(?<!\d)5\d\d(?!\d)|service unavailable/.test(text)) return 'ProviderError';
  if (/abort|cancel|interrupted|user stopped/.test(text)) return 'UserAborted';
  // "timed ?out" also covers the system error code ETIMEDOUT.
  if (/timeout|timed ?out|deadline/.test(text)) return 'Timeout';
  if (/permission|denied|not allowed|approval|sandbox/.test(text)) return 'PermissionDenied';
  if (/network|dns|eai_again|enotfound|offline/.test(text)) return 'NetworkDenied';
  if (/rate limit|quota|memory|resource|emfile|enospc|token limit|too large/.test(text)) return 'ResourceExhausted';
  if (/conflict|merge|lock|concurrent|dirty/.test(text)) return 'Conflict';
  return 'Unknown';
}
119
+
120
/**
 * Compute a 0-100 utility score for a memory-like object.
 *
 * Positive contributions come from evidence count, successful use, recency
 * (30-day window), uniqueness, trust, and regression prevention; flagged
 * conditions (stale, conflicted, failed use, prompt bloat, security risk,
 * maintenance cost) subtract fixed penalties.
 *
 * @param {object} [object] - Object carrying the scoring fields read below.
 * @returns {number} Rounded score clamped to [0, 100].
 */
export function utilityScore(object = {}) {
  const evidencePoints = Math.min(20, Number(object.evidence_count || 0) * 4);
  const usePoints = Math.min(16, Number(object.success_count || object.use_count || 0) * 3);
  const lastTouched = object.updated_at || object.last_used_at || object.created_at;
  const recencyPoints = daysSince(lastTouched) <= 30 ? 14 : 4;
  const uniquenessPoints = object.duplicate_of ? -18 : 10;
  const trustPoints = Math.round(Number(object.trust_score ?? 0.5) * 18);
  const preventionPoints = object.regression_prevention ? 12 : 0;

  // Each truthy flag subtracts its fixed weight.
  const penaltyWeights = [
    ['stale', 14],
    ['conflicted', 28],
    ['failed_use', 10],
    ['prompt_bloat', 8],
    ['security_risk', 80],
    ['maintenance_cost', 8]
  ];
  let penaltyTotal = 0;
  for (const [flag, weight] of penaltyWeights) {
    penaltyTotal += object[flag] ? weight : 0;
  }

  const raw = recencyPoints + evidencePoints + usePoints + uniquenessPoints + trustPoints + preventionPoints - penaltyTotal;
  return clamp(0, 100, raw);
}
137
+
138
/**
 * Decide what to do with a memory-like object (keep, promote, demote,
 * archive, quarantine, consolidate, or delete).
 *
 * Rules are checked top to bottom and the first one that applies wins:
 * pinned objects are kept, secrets and poisoned content are hard-deleted
 * immediately, then known-false, duplicate, conflicted, promotion, and
 * stale-but-useful cases, and finally utility-score thresholds.
 *
 * @param {object} [object] - Object to evaluate.
 * @param {object} [opts] - Passed through to the grace check and tombstone.
 * @returns {object} Record produced by `decision(...)`.
 */
export function forgettingDecision(object = {}, opts = {}) {
  const lifecycle = String(object.lifecycle_state || object.status || '').toUpperCase();
  const utility = utilityScore(object);
  const verdict = (action, nextState, reasons, immediate = false, meta = null) =>
    decision(action, nextState, utility, reasons, immediate, meta);

  if (isPinned(object)) {
    return verdict('KEEP_ACTIVE', 'PINNED', ['retention_exempt']);
  }
  if (containsSecret(object)) {
    return verdict('HARD_DELETE', 'DELETED', ['secret_or_sensitive_content'], true);
  }
  if (object.poisoned || object.unsafe_instruction) {
    return verdict('HARD_DELETE', 'DELETED', ['poisoned_or_unsafe'], true);
  }
  if (object.known_false) {
    return verdict('QUARANTINE', 'QUARANTINED', ['known_false']);
  }
  if (object.duplicate_of) {
    return verdict('CONSOLIDATE', 'DUPLICATE', ['duplicate']);
  }
  if (object.conflicted || lifecycle === 'CONFLICTED') {
    return verdict('QUARANTINE', 'CONFLICTED', ['conflict_requires_resolution']);
  }
  if (object.repeated_success && Number(object.success_count || 0) >= 3) {
    return verdict('PROMOTE_SKILL', 'ACTIVE', ['verified_repetition']);
  }
  if (object.repeated_mistake && !object.regression_test) {
    return verdict('PROMOTE_TEST', 'ACTIVE', ['mistake_without_test']);
  }
  if (object.stale && Number(object.evidence_count || 0) >= 3 && Number(object.trust_score || 0) >= 0.65) {
    return verdict('DEMOTE', 'STALE', ['stale_but_useful_verify_before_use']);
  }

  // Score-driven fallbacks, from most to least destructive.
  if (utility < 20 && graceChecksPass(object, opts)) {
    return verdict('HARD_DELETE', 'DELETED', ['old_unused_low_utility'], false, tombstone(object, opts));
  }
  if (utility < 40) {
    return verdict('ARCHIVE', 'ARCHIVED', ['low_utility']);
  }
  if (utility < 60 || object.stale) {
    const isSkill = object.type === 'skill';
    return verdict(isSkill ? 'DISABLE' : 'DEMOTE', isSkill ? 'DISABLED' : 'STALE', ['stale_or_watch']);
  }
  return verdict('KEEP_ACTIVE', 'ACTIVE', ['useful_current']);
}
155
+
156
/**
 * Build a skill card record, filling every missing field with a default.
 *
 * @param {object} [input] - Partial card; falsy fields fall back to defaults.
 * @returns {object} Skill card with identity, usage counters, trigger and
 *   validation metadata, and retirement conditions.
 */
export function createSkillCard(input = {}) {
  const counter = (raw) => Number(raw || 0);
  const listOr = (value, fallback = []) => value || fallback;

  const card = {};
  card.skill_id = input.skill_id || input.id || `skill.${safeId(input.name || 'candidate')}`;
  card.name = input.name || input.skill_id || 'Candidate Skill';
  card.version = input.version || '1.0.0';
  card.status = input.status || 'active';
  card.created_at = input.created_at || nowIso();
  card.updated_at = input.updated_at || nowIso();
  card.last_used_at = input.last_used_at || null;
  card.use_count = counter(input.use_count);
  card.success_count = counter(input.success_count);
  card.failure_count = counter(input.failure_count);
  card.false_trigger_count = counter(input.false_trigger_count);
  card.owner = input.owner || 'harness';
  card.trigger_summary = input.trigger_summary || '';
  card.anti_triggers = listOr(input.anti_triggers);
  card.inputs = listOr(input.inputs);
  card.outputs = listOr(input.outputs);
  card.validation = input.validation || { commands: [], manual_checks: [], schemas: [] };
  card.risk_notes = listOr(input.risk_notes);
  card.retirement_conditions = listOr(input.retirement_conditions, ['stale without use', 'repeated false trigger', 'validation no longer runs']);
  card.related_mistake_fingerprints = listOr(input.related_mistake_fingerprints);
  card.related_wiki_entries = listOr(input.related_wiki_entries);
  card.plugin_distribution = input.plugin_distribution || 'none';
  // Implicit invocation stays enabled unless the caller passes exactly false.
  card.implicit_invocation_allowed = input.implicit_invocation_allowed !== false;
  return card;
}
183
+
184
/**
 * Build a harness experiment registry entry with defaults for every field
 * the caller leaves out.
 *
 * When no `experiment_id` is given, one is derived from the slugged title
 * plus the first 8 hex characters of a hash of the serialized input. The
 * owner is always 'harness_growth'.
 *
 * @param {object} [input] - Partial experiment description.
 * @returns {object} Experiment entry.
 */
export function createHarnessExperiment(input = {}) {
  const defaultLaunchGate = {
    min_quality_delta: '>= 0',
    max_latency_regression: '<= 10%',
    max_cost_regression: '<= 10%',
    max_error_regression: '<= 0',
    required_evidence: 'offline eval plus rollback plan'
  };
  const defaultMonitoring = { duration_days: 7, alert_thresholds: { unknown_error_rate: 0, repeated_mistake_rate: 0 } };

  const experiment = {};
  experiment.experiment_id = input.experiment_id || `exp.${safeId(input.title || 'harness')}.${sha256(JSON.stringify(input)).slice(0, 8)}`;
  experiment.title = input.title || 'Harness experiment';
  experiment.owner = 'harness_growth';
  experiment.created_at = input.created_at || nowIso();
  experiment.status = input.status || 'draft';
  experiment.vision_alignment = input.vision_alignment || 'Improve verified task outcomes while reducing context bloat.';
  experiment.hypothesis = input.hypothesis || '';
  experiment.change_surface = input.change_surface || ['eval'];
  experiment.variant_a = input.variant_a || 'baseline';
  experiment.variant_b = input.variant_b || 'candidate';
  experiment.risk_level = input.risk_level || 'low';
  experiment.rollback_plan = input.rollback_plan || 'revert candidate surface and re-run smoke shard';
  experiment.offline_eval_suite = input.offline_eval_suite || ['sneakoscopebench:smoke'];
  experiment.online_metrics = input.online_metrics || ['latency_p95_ms', 'token_input', 'tool_error_rate', 'keep_rate', 'context_bloat_score'];
  experiment.launch_gate = input.launch_gate || defaultLaunchGate;
  experiment.post_launch_monitoring = input.post_launch_monitoring || defaultMonitoring;
  return experiment;
}
210
+
211
/**
 * Build the sample objects used by the memory sweep fixture. Each entry is
 * shaped to hit one branch of the forgetting rules (pinned, old and unused,
 * duplicate, stale but useful, poisoned, secret-bearing, and so on).
 *
 * @returns {object[]} Fixture objects with timestamps relative to now.
 */
export function buildHarnessGrowthFixture() {
  const longAgo = isoDaysAgo(400);
  const justNow = isoDaysAgo(2);
  const fixture = [];

  fixture.push({ id: 'pinned-user-rule', type: 'wiki_claim', lifecycle_state: 'PINNED', pinned: true, trust_score: 0.95, updated_at: longAgo });
  fixture.push({ id: 'old-unused-wiki', type: 'wiki_page', trust_score: 0.2, updated_at: longAgo, use_count: 0, stale: true });
  fixture.push({ id: 'duplicate-claim', type: 'wiki_claim', duplicate_of: 'better-claim', trust_score: 0.5, updated_at: longAgo });
  fixture.push({ id: 'stale-useful-architecture', type: 'wiki_claim', trust_score: 0.7, evidence_count: 3, stale: true, updated_at: isoDaysAgo(95) });
  fixture.push({ id: 'poisoned-memory', type: 'memory', poisoned: true, trust_score: 0.1, updated_at: justNow });
  fixture.push({ id: 'old-unused-skill', type: 'skill', trust_score: 0.2, updated_at: longAgo, false_trigger_count: 2, use_count: 0 });
  fixture.push({ id: 'recent-successful-skill', type: 'skill', trust_score: 0.9, updated_at: justNow, success_count: 4, repeated_success: true });
  fixture.push({ id: 'secret-memory', type: 'memory', text: 'token=sk-live-secret-value', updated_at: justNow });
  fixture.push({ id: 'mistake-no-test', type: 'mistake_fingerprint', trust_score: 0.9, regression_prevention: true, repeated_mistake: true, regression_test: null, updated_at: justNow });

  return fixture;
}
226
+
227
/**
 * Run the forgetting rules over the built-in fixture and check that each
 * sample object received an acceptable action.
 *
 * @returns {object} Fixture result with per-object decisions, named boolean
 *   checks, and `passed` (true only when every check is true).
 */
export function runHarnessGrowthFixture() {
  const decisions = [];
  const byId = {};
  for (const object of buildHarnessGrowthFixture()) {
    const entry = { id: object.id, ...forgettingDecision(object, { now: new Date() }) };
    decisions.push(entry);
    byId[entry.id] = entry;
  }

  const actionOf = (id) => byId[id].action;
  const actionIn = (id, ...allowed) => allowed.includes(actionOf(id));

  const checks = {
    pinned_rule_remains: actionOf('pinned-user-rule') === 'KEEP_ACTIVE',
    old_wiki_leaves_active: actionIn('old-unused-wiki', 'ARCHIVE', 'HARD_DELETE'),
    duplicate_consolidates: actionOf('duplicate-claim') === 'CONSOLIDATE',
    stale_useful_stays_hydratable: actionIn('stale-useful-architecture', 'DEMOTE', 'KEEP_ACTIVE'),
    poisoned_removed: actionIn('poisoned-memory', 'HARD_DELETE', 'QUARANTINE'),
    old_skill_disabled_or_removed: actionIn('old-unused-skill', 'DISABLE', 'ARCHIVE', 'HARD_DELETE'),
    recent_skill_active_or_promoted: actionIn('recent-successful-skill', 'KEEP_ACTIVE', 'PROMOTE_SKILL'),
    secret_hard_deleted: actionOf('secret-memory') === 'HARD_DELETE',
    uncovered_mistake_kept_for_test: actionOf('mistake-no-test') === 'PROMOTE_TEST'
  };

  return {
    schema_version: 1,
    fixture: 'memory_sweep_fixture',
    created_at: nowIso(),
    decisions,
    checks,
    passed: Object.values(checks).every(Boolean)
  };
}
251
+
252
/**
 * Assemble the harness growth report: forgetting fixture results, example
 * skill card and experiment entries, Codex-native defaults, cmux views, and
 * classified tool errors.
 *
 * @param {object} [input]
 * @param {object[]} [input.tool_errors] - Errors to classify; three sample
 *   errors are used when omitted.
 * @returns {object} Report object (schema_version 1).
 */
export function harnessGrowthReport(input = {}) {
  const fixture = runHarnessGrowthFixture();

  const sampleErrors = [
    { message: 'operation timed out after 30s' },
    { message: 'unexpected provider 500' },
    { message: 'unmatched example for taxonomy coverage' }
  ];
  const toolErrors = (input.tool_errors || sampleErrors).map((error) => {
    const classification = classifyToolError(error);
    return { ...error, classification, unknown_is_bug: classification === 'Unknown' };
  });

  const generatedAt = nowIso();

  const skillExample = createSkillCard({
    skill_id: 'skill.harness.weekly-review',
    name: 'Weekly Harness Review',
    trigger_summary: 'Run on weekly harness review automation or explicit harness growth request.',
    validation: { commands: ['sks harness fixture --json'], manual_checks: ['review proposed deletions before live hard-delete'], schemas: ['harness-growth-report.json'] }
  });

  const experimentExample = createHarnessExperiment({
    title: 'Visible ambiguity question delivery',
    hypothesis: 'Stop gates that require visible question blocks reduce hidden clarification failures.',
    change_surface: ['prompt', 'tool', 'eval'],
    offline_eval_suite: ['selftest:team-visible-questions']
  });

  const unknownCount = toolErrors.filter((entry) => entry.classification === 'Unknown').length;

  return {
    schema_version: 1,
    generated_at: generatedAt,
    forgetting: {
      lifecycle_states: MEMORY_LIFECYCLE_STATES,
      actions: FORGETTING_ACTIONS,
      thresholds: DEFAULT_FORGETTING_THRESHOLDS,
      fixture
    },
    skills: { card_schema_example: skillExample },
    experiments: { registry_schema_example: experimentExample },
    codex_native: {
      permission_profiles: PERMISSION_PROFILES,
      multiagent_v2: DEFAULT_MULTIAGENT_V2,
      goal_checkpoint_required_fields: ['goal_id', 'phase', 'summary', 'completed_checkboxes', 'open_checkboxes', 'blockers', 'evidence'],
      external_session_import: 'structured_summary_only_with_utility_score_and_forgetting_metadata'
    },
    cmux: {
      views: CMUX_COCKPIT_VIEWS,
      status_terms: ['idle', 'planning', 'exploring', 'implementing', 'waiting_for_tool', 'waiting_for_approval', 'dogfooding', 'verifying', 'summarizing', 'blocked', 'failed', 'completed', 'paused', 'resuming']
    },
    reliability: {
      tool_error_taxonomy: TOOL_ERROR_TAXONOMY,
      classified_errors: toolErrors,
      unknown_errors_are_bugs: true
    },
    validation: {
      fixture_passed: fixture.passed,
      unknown_errors_recorded: unknownCount
    }
  };
}
305
+
306
/**
 * Generate the harness growth report and write it as JSON.
 *
 * @param {string} root - Project root; used only when `dir` is falsy.
 * @param {string} [dir] - Target directory; defaults to
 *   `<root>/.sneakoscope/reports`.
 * @param {object} [input] - Forwarded to `harnessGrowthReport`.
 * @returns {Promise<object>} The report that was written.
 */
export async function writeHarnessGrowthReport(root, dir, input = {}) {
  const report = harnessGrowthReport(input);
  const targetDir = dir || path.join(root, '.sneakoscope', 'reports');
  const targetFile = path.join(targetDir, HARNESS_GROWTH_REPORT);
  await writeJsonAtomic(targetFile, report);
  return report;
}
311
+
18
312
  function clamp01(x) {
19
313
  return Math.max(0, Math.min(1, Number.isFinite(x) ? x : 0));
20
314
  }
21
315
 
316
/**
 * Package the outcome of a forgetting rule into a plain record.
 *
 * @param {string} action
 * @param {string} lifecycle_state
 * @param {number} utility_score
 * @param {string[]} reason_codes
 * @param {boolean} [immediate=false]
 * @param {object|null} [tombstoneMeta=null] - Stored under `tombstone`.
 * @returns {object}
 */
function decision(action, lifecycle_state, utility_score, reason_codes, immediate = false, tombstoneMeta = null) {
  const record = { action, lifecycle_state, utility_score, reason_codes, immediate };
  record.tombstone = tombstoneMeta;
  return record;
}
319
+
320
/**
 * True when the object is pinned: either `pinned` is exactly true or its
 * lifecycle state is 'PINNED' (case-insensitive).
 */
function isPinned(object = {}) {
  if (object.pinned === true) return true;
  const lifecycle = String(object.lifecycle_state || '');
  return lifecycle.toUpperCase() === 'PINNED';
}
323
+
324
/**
 * Heuristically detect secret-looking content anywhere in an object.
 *
 * Scans every property name and string value reachable through JSON
 * serialization. Known token prefixes (`sk-`, `ghp_`, `glpat-`, `xox?-`) must
 * start at a word boundary so ordinary words such as "task-", "risk-" or
 * "desk-" do not trigger a hard delete.
 *
 * @param {object} [object] - Object to inspect.
 * @returns {boolean} True when any key or string value looks sensitive.
 */
function containsSecret(object = {}) {
  // Collect raw keys and string values rather than scanning the JSON text:
  // in JSON text an escaped newline before a token ("\nsk-...") would defeat
  // the word-boundary check.
  const fragments = [];
  JSON.stringify(object, (key, value) => {
    fragments.push(key);
    if (typeof value === 'string') fragments.push(value);
    return value;
  });
  const text = fragments.join('\n');
  return /(\b(?:sk-|ghp_|glpat-|xox[baprs]-)|AKIA[0-9A-Z]{16}|secret|private[_-]?key|token=|password=)/i.test(text);
}
328
+
329
/**
 * Guard for score-based hard deletion: returns true only when nothing
 * protects the object and it has been untouched for at least 90 days.
 *
 * @param {object} [object] - Candidate for deletion.
 * @param {object} [opts] - `opts.now` is forwarded to `daysSince`.
 * @returns {boolean}
 */
function graceChecksPass(object = {}, opts = {}) {
  const heldByPolicy = Boolean(
    isPinned(object)
    || object.active_work_order
    || object.required_by_skill_validation
    || object.only_source_for_user_preference
  );
  // The only record preventing a mistake must stay until a regression test exists.
  const soleUntestedGuard = Boolean(object.only_source_for_mistake_prevention) && !object.regression_test;
  if (heldByPolicy || soleUntestedGuard) return false;

  const lastTouched = object.last_used_at || object.updated_at || object.created_at;
  return !(daysSince(lastTouched, opts.now) < 90);
}
336
+
337
/**
 * Build the tombstone metadata recorded for a deleted object.
 *
 * The id falls back to a hash prefix of the serialized object when the
 * object has no `id`. No content hash is recorded when `object.sensitive`
 * is truthy.
 *
 * @param {object} [object] - Object being deleted.
 * @param {object} [opts] - Optional `reason` and `deleted_by` overrides.
 * @returns {object} Tombstone record.
 */
function tombstone(object = {}, opts = {}) {
  const digest = () => sha256(JSON.stringify(object));
  const rawId = object.id || digest().slice(0, 16);
  return {
    deleted_object_id: safeId(rawId),
    object_type: object.type || 'memory',
    deleted_at: nowIso(),
    reason: opts.reason || 'old-unused-low-utility',
    replacement_id: object.replacement_id || object.duplicate_of || null,
    deleted_by: opts.deleted_by || 'automation',
    content_hash: object.sensitive ? null : digest().slice(0, 24)
  };
}
348
+
349
/**
 * Whole days elapsed between a timestamp and a reference time.
 *
 * @param {string} value - Timestamp parseable by Date.parse (e.g. ISO 8601).
 * @param {Date|number|string} [now] - Reference time as a Date, epoch
 *   milliseconds, or parseable date string. Missing or unusable values fall
 *   back to the current time.
 * @returns {number} Floored day count, or 9999 when `value` cannot be parsed.
 */
function daysSince(value, now = new Date()) {
  const then = Date.parse(value || '');
  if (!Number.isFinite(then)) return 9999;

  // Number(now) alone is NaN for ISO strings and invalid Dates, and 0 for
  // null; a NaN age would make every `< N` threshold check false and let
  // recent objects pass the deletion grace check.
  let reference = now == null || now === '' ? NaN : Number(now);
  if (!Number.isFinite(reference) && typeof now === 'string') reference = Date.parse(now);
  if (!Number.isFinite(reference)) reference = Date.now();

  return Math.floor((reference - then) / 86400000);
}
354
+
355
/**
 * ISO 8601 timestamp for the moment `days` days before now.
 */
function isoDaysAgo(days) {
  const offsetMs = Number(days) * 86400000;
  const moment = new Date(Date.now() - offsetMs);
  return moment.toISOString();
}
358
+
359
/**
 * Turn an arbitrary value into a lowercase, hyphen-separated identifier of
 * at most 80 characters.
 *
 * @param {*} value - Source text; falsy values produce the fallback.
 * @returns {string} Slug without leading or trailing hyphens, or 'object'
 *   when nothing usable remains.
 */
function safeId(value) {
  const slug = String(value || '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80)
    // Truncation can cut right after a separator; trim again so the id
    // never ends with a dangling hyphen.
    .replace(/-+$/, '');
  return slug || 'object';
}
362
+
363
/**
 * Round a value to the nearest integer and clamp it into [min, max].
 * Values that do not convert to a usable number are treated as 0.
 */
function clamp(min, max, value) {
  const numeric = Number(value) || 0;
  const upperBounded = Math.min(max, Math.round(numeric));
  return Math.max(min, upperBounded);
}
366
+
22
367
  function timed(fn, iterations) {
23
368
  let result;
24
369
  const count = Math.max(1, Number(iterations) || 1);
package/src/core/fsx.mjs CHANGED
@@ -5,7 +5,7 @@ import os from 'node:os';
5
5
  import crypto from 'node:crypto';
6
6
  import { spawn } from 'node:child_process';
7
7
 
8
- export const PACKAGE_VERSION = '0.6.77';
8
+ export const PACKAGE_VERSION = '0.6.78';
9
9
  export const DEFAULT_PROCESS_TAIL_BYTES = 256 * 1024;
10
10
  export const DEFAULT_PROCESS_TIMEOUT_MS = 30 * 60 * 1000;
11
11
 
@@ -36,6 +36,34 @@ export async function writeGoalWorkflow(dir, mission, opts = {}) {
36
36
  ralph_removed: true,
37
37
  ambiguity_gate: 'use normal SKS ambiguity gates when required by the selected execution route; Goal itself delegates persistence/continuation to Codex /goal',
38
38
  evidence: ['goal-workflow.json', 'goal-bridge.md']
39
+ },
40
+ phase: action === 'clear' ? 'reporting' : 'intake',
41
+ user_outcome: prompt,
42
+ work_order_ledger_id: null,
43
+ checkpoints: [
44
+ {
45
+ timestamp: nowIso(),
46
+ phase: 'intake',
47
+ summary: 'Goal workflow bridge created.',
48
+ completed_checkboxes: ['goal workflow artifact written'],
49
+ open_checkboxes: ['continue original SKS route lifecycle when implementation is needed'],
50
+ blockers: [],
51
+ evidence: [GOAL_WORKFLOW_ARTIFACT, GOAL_BRIDGE_ARTIFACT]
52
+ }
53
+ ],
54
+ resume_context: {
55
+ stable_requirements: prompt ? [prompt] : [],
56
+ current_files: [GOAL_WORKFLOW_ARTIFACT, GOAL_BRIDGE_ARTIFACT],
57
+ decisions: ['Codex native /goal is the persisted continuation surface'],
58
+ known_mistakes_to_avoid: ['do not clear noisy context without writing a structured handoff first'],
59
+ active_skills: ['goal'],
60
+ active_agents: []
61
+ },
62
+ clear_policy: {
63
+ preserve_work_order: true,
64
+ preserve_decisions: true,
65
+ preserve_evidence_links: true,
66
+ discard_noisy_logs: true
39
67
  }
40
68
  };
41
69
  await writeJsonAtomic(path.join(dir, GOAL_WORKFLOW_ARTIFACT), workflow);
@@ -51,10 +79,23 @@ export async function updateGoalWorkflow(dir, action) {
51
79
  action,
52
80
  status: action === 'clear' ? 'cleared' : action === 'pause' ? 'paused' : action === 'resume' ? 'resumed' : current.status || 'created',
53
81
  updated_at: nowIso(),
82
+ phase: action === 'pause' ? 'reporting' : action === 'resume' ? 'implementation' : action === 'clear' ? 'retro' : current.phase || 'intake',
54
83
  native_goal: {
55
84
  ...(current.native_goal || {}),
56
85
  slash_command: nativeGoalCommand(action, current.prompt || '')
57
- }
86
+ },
87
+ checkpoints: [
88
+ ...(Array.isArray(current.checkpoints) ? current.checkpoints : []),
89
+ {
90
+ timestamp: nowIso(),
91
+ phase: action,
92
+ summary: `Goal ${action} requested through SKS bridge.`,
93
+ completed_checkboxes: [`goal ${action} artifact update`],
94
+ open_checkboxes: action === 'clear' ? ['handoff preserved before noisy context clear'] : [],
95
+ blockers: [],
96
+ evidence: [GOAL_WORKFLOW_ARTIFACT, GOAL_BRIDGE_ARTIFACT]
97
+ }
98
+ ]
58
99
  };
59
100
  await writeJsonAtomic(path.join(dir, GOAL_WORKFLOW_ARTIFACT), next);
60
101
  await writeTextAtomic(path.join(dir, GOAL_BRIDGE_ARTIFACT), goalBridgeMarkdown(next));
@@ -5,6 +5,7 @@ import { missionDir, setCurrent, stateFile } from './mission.mjs';
5
5
  import { checkDbOperation, dbBlockReason, handleMadSksUserConfirmation } from './db-safety.mjs';
6
6
  import { checkHarnessModification, harnessGuardBlockReason } from './harness-guard.mjs';
7
7
  import { activeRouteContext, evaluateStop, prepareRoute, promptPipelineContext as routePipelineContext, recordContext7Evidence, recordSubagentEvidence, routePrompt } from './pipeline.mjs';
8
+ import { classifyToolError } from './evaluation.mjs';
8
9
 
9
10
  const TEAM_DIGEST_MAX_EVENTS = 4;
10
11
  const TEAM_DIGEST_MESSAGE_CHARS = 180;
@@ -163,6 +164,7 @@ async function hookPostTool(root, state, payload, noQuestion) {
163
164
  }
164
165
  await recordContext7Evidence(root, state, payload).catch(() => null);
165
166
  await recordSubagentEvidence(root, state, payload).catch(() => null);
167
+ if (toolFailed(payload)) await recordToolErrorTaxonomy(root, state, payload).catch(() => null);
166
168
  const teamDigest = await teamLiveDigest(root, state);
167
169
  if (!noQuestion) {
168
170
  return teamDigest?.context
@@ -183,6 +185,25 @@ async function hookPostTool(root, state, payload, noQuestion) {
183
185
  : { continue: true };
184
186
  }
185
187
 
188
/**
 * Classify a failed tool call and append the result to the mission's
 * `tool-errors.jsonl`.
 *
 * Only the classification, tool name, timestamp, and a short hash of the
 * payload are recorded.
 *
 * @param {string} root - Project root used to resolve the mission directory.
 * @param {object} [state] - Hook state; nothing is recorded without `mission_id`.
 * @param {object} [payload] - Tool hook payload.
 * @returns {Promise<object|null>} The appended record, or null when skipped.
 */
async function recordToolErrorTaxonomy(root, state = {}, payload = {}) {
  if (!state?.mission_id) return null;

  const response = payload.tool_response;
  const result = payload.result;
  const toolName = payload.tool_name || payload.name || payload.tool?.name;
  const stderr = payload.stderr || response?.stderr || result?.stderr;
  const exitCode = payload.exit_code ?? payload.exitCode ?? response?.exit_code ?? result?.exit_code;

  const classification = classifyToolError({
    code: exitCode,
    name: toolName,
    message: payload.error || payload.message || stderr,
    stderr
  });

  const record = {
    ts: nowIso(),
    classification,
    unknown_is_harness_bug: classification === 'Unknown',
    tool: toolName || null,
    payload_hash: sha256(JSON.stringify(payload || {})).slice(0, 16)
  };
  const logFile = path.join(missionDir(root, state.mission_id), 'tool-errors.jsonl');
  await appendJsonl(logFile, record);
  return record;
}
206
+
186
207
  async function hookPermission(root, state, payload, noQuestion) {
187
208
  const harnessDecision = await checkHarnessModification(root, payload, { phase: 'permission-request' });
188
209
  if (harnessDecision.action === 'block') {
package/src/core/init.mjs CHANGED
@@ -500,7 +500,7 @@ export async function installSkills(root) {
500
500
  'wiki': `---\nname: wiki\ndescription: Dollar-command route for $Wiki TriWiki refresh, pack, validate, and prune commands.\n---\n\nUse for $Wiki or Korean wiki-refresh requests. Refresh/update/갱신: run sks wiki refresh, then validate .sneakoscope/wiki/context-pack.json. Pack: run sks wiki pack, then validate. Prune/clean/정리: use sks wiki refresh --prune, or sks wiki prune --dry-run for inspection. Report claims, anchors, trust, attention.use_first/hydrate_first, validation, and blockers. Do not start ambiguity-gated implementation, subagents, or unrelated work.\n`,
501
501
  'team': `---\nname: team\ndescription: SKS Team orchestration for $Team/code work; $From-Chat-IMG is the explicit chat-image alias.\n---\n\nUse for $Team/code work. Ambiguity gate first. Write team-roster.json; team-gate.json needs team_roster_confirmed=true. executor:N means N scouts, N debate voices, then fresh N executors. After consensus, compile team-graph.json, team-runtime-tasks.json, team-decomposition-report.json, and team-inbox/ so worker handoff uses concrete runtime task ids with role/path/domain/lane hints. Refresh/validate TriWiki before debate, implementation, review, and final; consume attention.use_first and hydrate attention.hydrate_first before risky decisions. Log events, close sessions, pass team-session-cleanup.json, then reflection and Honest Mode. Parent integrates/verifies.\n\n${chatCaptureIntakeText()}\n`,
502
502
  'from-chat-img': `---\nname: from-chat-img\ndescription: Explicit $From-Chat-IMG Team alias for chat screenshot plus attachment analysis.\n---\n\nUse only for From-Chat-IMG/$From-Chat-IMG. It enters the normal Team pipeline. Treat uploads as chat screenshot plus originals. Use Computer Use/browser visual inspection when available, list requirements first, match regions to attachments with confidence, write ${FROM_CHAT_IMG_COVERAGE_ARTIFACT}, ${FROM_CHAT_IMG_CHECKLIST_ARTIFACT}, ${FROM_CHAT_IMG_TEMP_TRIWIKI_ARTIFACT}, and ${FROM_CHAT_IMG_QA_LOOP_ARTIFACT}, then continue Team gates, review, reflection, and Honest Mode. The ledger must account for every visible customer request, screenshot image region, and separate attachment; ${FROM_CHAT_IMG_CHECKLIST_ARTIFACT} must have a checked item for each request, image-region/attachment match, work item, scoped QA-LOOP, and verification step; ${FROM_CHAT_IMG_TEMP_TRIWIKI_ARTIFACT} stores temporary TriWiki-backed session context with expires_after_sessions=${FROM_CHAT_IMG_TEMP_TRIWIKI_SESSIONS}. ${FROM_CHAT_IMG_QA_LOOP_ARTIFACT} must prove QA-LOOP ran over the exact customer-request work-order range after implementation, with every work item covered, post-fix verification complete, and zero unresolved findings. team-gate.json cannot pass From-Chat-IMG completion until unresolved_items is empty, every checklist box is checked, and scoped_qa_loop_completed=true.\n`,
503
- 'qa-loop': `---\nname: qa-loop\ndescription: $QA-LOOP dogfoods UI/API as human proxy with safety gates, Browser/Computer evidence, safe fixes, rechecks, and a QA report.\n---\n\nUse only $QA-LOOP. Ask scope, target, mutation, login. Credentials are runtime-only; never save secrets. UI needs Browser/Computer evidence or mark unverified. Deployed targets are read-only; destructive removal is forbidden. After answer/run, dogfood real flows, apply safe contract-allowed code/test/docs fixes, recheck, and do not pass qa-gate.json with unresolved findings or without post_fix_verification_complete. Finish qa-ledger, date/version report, gate, completion summary, and Honest Mode.\n`,
503
+ 'qa-loop': `---\nname: qa-loop\ndescription: $QA-LOOP dogfoods UI/API as human proxy with safety gates, Codex Computer Use-only UI evidence, safe fixes, rechecks, and a QA report.\n---\n\nUse only $QA-LOOP. Ask scope, target, mutation, login. Credentials are runtime-only; never save secrets. UI-level E2E needs Codex Computer Use evidence or must be marked unverified; Chrome MCP, Browser Use, Playwright, and other browser automation do not satisfy UI verification. Deployed targets are read-only; destructive removal is forbidden. After answer/run, dogfood real flows, apply safe contract-allowed code/test/docs fixes, recheck, and do not pass qa-gate.json with unresolved findings or without post_fix_verification_complete. Finish qa-ledger, date/version report, gate, completion summary, and Honest Mode.\n`,
504
504
  'goal': `---\nname: goal\ndescription: Dollar-command route for $Goal or $goal Codex native persisted /goal workflows.\n---\n\nUse when the user invokes $Goal/$goal or asks to persist a workflow with Codex native /goal continuation. Prepare with sks goal create or the $Goal route, then use native Codex /goal create, pause, resume, and clear controls where available. Do not recreate the old no-question loop.\n`,
505
505
  'research': `---\nname: research\ndescription: Dollar-command route for $Research or $research frontier discovery workflows.\n---\n\nUse when the user invokes $Research/$research or asks for research, hypotheses, new mechanisms, falsification, or testable predictions. Prefer sks research prepare and sks research run. Do not use for ordinary code edits.\n`,
506
506
  'autoresearch': `---\nname: autoresearch\ndescription: Dollar-command route for $AutoResearch or $autoresearch iterative experiment loops.\n---\n\nUse for $AutoResearch, iterative improvement, SEO/GEO, ranking, workflow, benchmark, or experiments. Define program, hypothesis, experiment, metric, keep/discard, falsification, next step, and Honest Mode. Load seo-geo-optimizer for README/npm/GitHub/schema/AI-search work.\n`,
@@ -1,17 +1,10 @@
1
1
  import path from 'node:path';
2
2
  import { exists, nowIso, readJson, writeJsonAtomic } from './fsx.mjs';
3
+ import { DEFAULT_FORGETTING_THRESHOLDS, MEMORY_LIFECYCLE_STATES, forgettingDecision } from './evaluation.mjs';
3
4
 
4
5
  export const MEMORY_OPERATIONS = new Set([
5
- 'ADD',
6
- 'UPDATE',
7
- 'CONSOLIDATE',
8
- 'DEMOTE',
9
- 'SOFT_FORGET',
10
- 'ARCHIVE',
11
- 'HARD_DELETE',
12
- 'NOOP',
13
- 'PROMOTE_SKILL',
14
- 'PROMOTE_RULE'
6
+ 'ADD', 'KEEP_ACTIVE', 'PIN', 'UNPIN', 'UPDATE', 'CONSOLIDATE', 'DEMOTE', 'SOFT_FORGET', 'DISABLE', 'ARCHIVE',
7
+ 'QUARANTINE', 'HARD_DELETE', 'NOOP', 'PROMOTE_SKILL', 'PROMOTE_RULE', 'PROMOTE_TEST'
15
8
  ]);
16
9
 
17
10
  export const DEFAULT_RETRIEVAL_BUDGET = {
@@ -62,6 +55,9 @@ export async function sweepTriWiki(root, opts = {}) {
62
55
  started_at: startedAt,
63
56
  completed_at: nowIso(),
64
57
  operations,
58
+ lifecycle_states: MEMORY_LIFECYCLE_STATES,
59
+ forgetting_defaults: DEFAULT_FORGETTING_THRESHOLDS,
60
+ tombstones: operations.map((op) => op.tombstone).filter(Boolean),
65
61
  retrieval_budget: {
66
62
  ...DEFAULT_RETRIEVAL_BUDGET,
67
63
  top_k_default: Number(opts.topKDefault || DEFAULT_RETRIEVAL_BUDGET.top_k_default),
@@ -114,14 +110,28 @@ function operationForClaim(claim, before, score, duplicateCount) {
114
110
  operation = 'PROMOTE_RULE';
115
111
  reasonCodes.push('mistake_prevention');
116
112
  }
113
+ const governed = forgettingDecision({
114
+ id: claim.id || stableId(text),
115
+ type: 'wiki_claim',
116
+ trust_score: score,
117
+ evidence_count: claim.evidence_count,
118
+ updated_at: claim.updated_at,
119
+ stale: claim.freshness === 'stale',
120
+ known_false: claim.status === 'unsupported',
121
+ duplicate_of: duplicateCount > 0 ? 'previous-claim' : null,
122
+ regression_prevention: /mistake|failure|regression|fingerprint/i.test(text)
123
+ });
117
124
  return {
118
125
  claim_id: claim.id || stableId(text),
119
126
  operation,
127
+ lifecycle_state: governed.lifecycle_state,
120
128
  reason_codes: reasonCodes.length ? reasonCodes : ['kept_within_budget'],
121
129
  before_score: round(before),
122
130
  after_score: round(score),
131
+ utility_score: governed.utility_score,
123
132
  evidence: [claim.source || claim.file || 'context-pack.json'].filter(Boolean),
124
- reversible
133
+ reversible,
134
+ tombstone: governed.tombstone || null
125
135
  };
126
136
  }
127
137
 
@@ -642,7 +642,7 @@ function reflectionStopReason(state = {}, status = {}) {
642
642
  export async function evaluateStop(root, state, payload, opts = {}) {
643
643
  const last = extractLastMessage(payload);
644
644
  if (state?.clarification_required && String(state.phase || '').includes('CLARIFICATION_AWAITING_ANSWERS')) {
645
- if (looksLikeClarificationAnswer(last)) return { continue: true };
645
+ if (await hasVisibleClarificationQuestionBlock(root, state, last)) return { continue: true };
646
646
  return complianceBlock(root, state, await clarificationStopReason(root, state, 'route'), { gate: 'clarification' });
647
647
  }
648
648
  if (state?.context7_required && !(await hasContext7DocsEvidence(root, state))) {
@@ -878,6 +878,12 @@ function extractLastMessage(payload) {
878
878
  return payload.last_assistant_message || payload.assistant_message || payload.message || payload.response || payload.raw || '';
879
879
  }
880
880
 
881
- function looksLikeClarificationAnswer(text) {
882
- return /(GOAL_PRECISE|ACCEPTANCE_CRITERIA|질문|answers\.json|required-answers|Decision Contract|clarification|모호성|답변)/i.test(String(text || ''));
881
/**
 * Check whether a message visibly presents the required clarification
 * questions.
 *
 * The message must contain a question-block heading. When the mission's
 * `required-answers.schema.json` lists slots, the ids of up to the first
 * three slots must all appear in the message along with an answer hint;
 * without slots, only an answer command or `answers.json` reference is needed.
 *
 * @param {string} root - Project root used to resolve the mission directory.
 * @param {object} [state] - Hook state; `mission_id` selects the schema file.
 * @param {string} [text] - Message to inspect.
 * @returns {Promise<boolean>}
 */
async function hasVisibleClarificationQuestionBlock(root, state = {}, text = '') {
  const body = String(text || '');
  const hasHeading = /Required questions|필수 질문|질문지|답변할 항목/i.test(body);
  if (!hasHeading) return false;

  let schema = null;
  if (state.mission_id) {
    const schemaFile = path.join(missionDir(root, state.mission_id), 'required-answers.schema.json');
    schema = await readJson(schemaFile, null);
  }
  const slots = Array.isArray(schema?.slots) ? schema.slots : [];
  if (slots.length === 0) {
    return /sks pipeline answer|answers\.json/i.test(body);
  }

  const leadingIds = slots.slice(0, 3).map((slot) => slot.id).filter(Boolean);
  for (const id of leadingIds) {
    if (!body.includes(id)) return false;
  }
  return /sks pipeline answer|answers\.json|slot id|슬롯|항목/i.test(body);
}
@@ -3,6 +3,7 @@ import { exists, nowIso, readJson, readText, writeJsonAtomic, writeTextAtomic, P
3
3
 
4
4
  export const QA_LOOP_ROUTE = 'QALoop';
5
5
  const QA_REPORT_SUFFIX = 'qa-report.md';
6
+ const UI_COMPUTER_USE_ONLY_ACK = 'use_codex_computer_use_only_no_chrome_mcp_no_browser_use_no_playwright_or_mark_ui_not_verified';
6
7
 
7
8
  function qaReportDateStamp(date = new Date()) {
8
9
  return date.toISOString().slice(0, 10);
@@ -28,7 +29,7 @@ export function buildQaLoopQuestionSchema(prompt) {
28
29
  return {
29
30
  schema_version: 1,
30
31
  route: QA_LOOP_ROUTE,
31
- description: 'QA-LOOP questions must be answered before execution. Login secrets and browser auth state are runtime-only and must not be saved to mission files or TriWiki. UI evidence must prefer official Codex Browser Use and Computer Use MCP/plugin tools.',
32
+ description: 'QA-LOOP questions must be answered before execution. Login secrets and browser auth state are runtime-only and must not be saved to mission files or TriWiki. UI-level E2E evidence must use Codex Computer Use only; Chrome MCP, Browser Use, Playwright, and other browser automation do not satisfy UI verification.',
32
33
  prompt,
33
34
  slots: [
34
35
  { id: 'GOAL_PRECISE', question: 'Define the QA objective in one sentence.', required: true, type: 'string' },
@@ -44,7 +45,7 @@ export function buildQaLoopQuestionSchema(prompt) {
44
45
  { id: 'TEMP_TEST_CREDENTIALS_READY', question: 'If login is required, are test-only credentials ready to provide ephemerally during the run?', required: true, type: 'enum', options: ['not_required', 'yes_temp_only', 'no_block_authenticated_tests'] },
45
46
  { id: 'TEST_CREDENTIALS_RUNTIME_SOURCE', question: 'If login is required, how will test-only credentials be provided without saving the values?', required: true, type: 'enum', options: ['not_required', 'ephemeral_chat_only', 'environment_variables', 'secret_manager'] },
46
47
  { id: 'CREDENTIAL_STORAGE_ACK', question: 'Acknowledge credential handling policy.', required: true, type: 'enum', options: ['never_store_credentials_in_artifacts_or_wiki'] },
47
- { id: 'UI_COMPUTER_USE_ACK', question: 'Acknowledge UI E2E evidence policy.', required: true, type: 'enum', options: ['use_browser_use_or_computer_use_for_ui_e2e_or_mark_ui_not_verified'] },
48
+ { id: 'UI_COMPUTER_USE_ACK', question: 'Acknowledge UI E2E evidence policy: Codex Computer Use only; no Chrome MCP, Browser Use, Playwright, or other browser automation.', required: true, type: 'enum', options: [UI_COMPUTER_USE_ONLY_ACK] },
48
49
  { id: 'TEAM_MODE_ALLOWED', question: 'May QA-LOOP use Team/subagents where useful?', required: true, type: 'enum', options: ['yes_parallel_where_safe', 'no_parent_only'] },
49
50
  { id: 'MAX_QA_CYCLES', question: 'How many no-question QA cycles are allowed before pausing?', required: true, type: 'string' },
50
51
  { id: 'ACCEPTANCE_CRITERIA', question: 'List the QA completion criteria.', required: true, type: 'array_or_string' },
@@ -65,7 +66,7 @@ export function validateQaLoopAnswers(schema, answers = {}) {
65
66
  if (env !== 'local_dev_server' && mutation === 'seeded_create_change_remove_local_only') errors.push({ slot: 'QA_MUTATION_POLICY', error: 'destructive_removal_tests_are_local_dev_only' });
66
67
  if (env === 'deployed_production_domain' && mutation !== 'read_only_smoke_only') errors.push({ slot: 'QA_MUTATION_POLICY', error: 'production_deployed_qa_is_read_only_smoke_only' });
67
68
  if (answers.DESTRUCTIVE_DEPLOYED_TESTS_ALLOWED !== 'never') errors.push({ slot: 'DESTRUCTIVE_DEPLOYED_TESTS_ALLOWED', error: 'destructive_deployed_tests_never_allowed' });
68
- if (isUiScope(answers.QA_SCOPE) && answers.UI_COMPUTER_USE_ACK !== 'use_browser_use_or_computer_use_for_ui_e2e_or_mark_ui_not_verified') errors.push({ slot: 'UI_COMPUTER_USE_ACK', error: 'ui_e2e_requires_browser_or_computer_use_ack' });
69
+ if (isUiScope(answers.QA_SCOPE) && answers.UI_COMPUTER_USE_ACK !== UI_COMPUTER_USE_ONLY_ACK) errors.push({ slot: 'UI_COMPUTER_USE_ACK', error: 'ui_e2e_requires_codex_computer_use_only_ack' });
69
70
  if (answers.LOGIN_REQUIRED === 'yes' && answers.TEMP_TEST_CREDENTIALS_READY !== 'yes_temp_only') errors.push({ slot: 'TEMP_TEST_CREDENTIALS_READY', error: 'authenticated_tests_require_ephemeral_test_credentials_or_must_be_blocked' });
70
71
  if (answers.LOGIN_REQUIRED === 'yes' && answers.TEST_CREDENTIALS_RUNTIME_SOURCE === 'not_required') errors.push({ slot: 'TEST_CREDENTIALS_RUNTIME_SOURCE', error: 'credential_runtime_source_required' });
71
72
  if (answers.CREDENTIAL_STORAGE_ACK !== 'never_store_credentials_in_artifacts_or_wiki') errors.push({ slot: 'CREDENTIAL_STORAGE_ACK', error: 'credential_temp_only_ack_required' });
@@ -146,7 +147,7 @@ export async function writeQaLoopArtifacts(dir, mission, contract) {
146
147
  mission_id: mission.id,
147
148
  qa_report_file: reportFile,
148
149
  target: { scope: a.QA_SCOPE, environment: a.TARGET_ENVIRONMENT, base_url: a.TARGET_BASE_URL, api_base_url: a.API_BASE_URL },
149
- safety: { mutation_policy: a.QA_MUTATION_POLICY, deployed_destructive_tests_allowed: 'never', credentials: 'temp_only_never_saved', ui_evidence: 'browser_use_or_computer_use_required_for_ui_e2e' },
150
+ safety: { mutation_policy: a.QA_MUTATION_POLICY, deployed_destructive_tests_allowed: 'never', credentials: 'temp_only_never_saved', ui_evidence: 'codex_computer_use_only_required_for_ui_e2e' },
150
151
  checklist
151
152
  });
152
153
  await writeJsonAtomic(path.join(dir, 'qa-gate.json'), defaultQaGate(contract, { reportFile }));
@@ -195,7 +196,7 @@ TASK: ${mission.prompt}
195
196
  CYCLE: ${cycle}
196
197
  NO QUESTIONS: use decision-contract.json.
197
198
  MODE: dogfood as human proxy; use real flows, fix safe code/test/docs now, then recheck.
198
- UI: Browser/Computer Use evidence or mark unverified. Secrets runtime-only.
199
+ UI: Codex Computer Use evidence only, or mark UI unverified. Chrome MCP, Browser Use, Playwright, and other browser automation do not satisfy UI-level E2E verification. Secrets runtime-only.
199
200
  SAFETY: deployed read-only smoke; no destructive, billing, message, webhook, admin, bulk-write, global-config, or live-data edits unless contract allows.
200
201
  GATE: passed=false while unresolved_findings or unresolved_fixable_findings > 0, or post_fix_verification_complete is not true.
201
202
  ARTIFACTS: update qa-ledger.json, ${report}, qa-gate.json, and qa-loop/cycle-${cycle}/.
@@ -224,7 +225,7 @@ function qaChecklist(a) {
224
225
  ['preflight.roles', 'Map roles, permissions, protected areas.']
225
226
  ];
226
227
  if (qaUiRequired(a)) cases.push(
227
- ['ui.official_mcp_tools', 'Use Browser Use or Computer Use evidence, or mark UI unverified.'],
228
+ ['ui.computer_use_only', 'Use Codex Computer Use evidence only, or mark UI unverified. Do not use Chrome MCP, Browser Use, Playwright, or other browser automation as UI verification evidence.'],
228
229
  ['ui.navigation', 'Check primary navigation, deep links, back/forward, refresh, and protected routes.'],
229
230
  ['ui.auth', 'Check login, logout, session expiry, unauthorized access, and role-specific visibility.'],
230
231
  ['ui.forms', 'Check required fields, validation, disabled states, success, and failure.'],
@@ -252,7 +253,7 @@ function qaChecklist(a) {
252
253
 
253
254
  function qaReportTemplate(mission, contract, checklist) {
254
255
  const a = contract.answers || {};
255
- return `# QA-LOOP Report\n\nMission: ${mission.id}\nTarget: ${a.TARGET_BASE_URL || 'unset'}\nScope: ${a.QA_SCOPE || 'unset'}\nEnvironment: ${a.TARGET_ENVIRONMENT || 'unset'}\n\n## Safety\n\n- Deployed destructive tests: never\n- Credentials: temp-only, never saved\n- UI evidence: Browser Use or Computer Use when runnable\n\n## Checklist\n\n${checklist.map((item) => `- [ ] ${item.id}: ${item.title}`).join('\n')}\n\n## Findings\n\nTBD\n\n## Corrections And Rechecks\n\nTBD\n\n## Honest Mode\n\nTBD\n`;
256
+ return `# QA-LOOP Report\n\nMission: ${mission.id}\nTarget: ${a.TARGET_BASE_URL || 'unset'}\nScope: ${a.QA_SCOPE || 'unset'}\nEnvironment: ${a.TARGET_ENVIRONMENT || 'unset'}\n\n## Safety\n\n- Deployed destructive tests: never\n- Credentials: temp-only, never saved\n- UI evidence: Codex Computer Use only when runnable; Chrome MCP, Browser Use, Playwright, and other browser automation do not satisfy UI-level E2E verification\n\n## Checklist\n\n${checklist.map((item) => `- [ ] ${item.id}: ${item.title}`).join('\n')}\n\n## Findings\n\nTBD\n\n## Corrections And Rechecks\n\nTBD\n\n## Honest Mode\n\nTBD\n`;
256
257
  }
257
258
 
258
259
  function positiveCount(value) {
@@ -217,7 +217,7 @@ export function questionsMarkdown(schema) {
217
217
  if (isQaLoop) {
218
218
  lines.push('QA-LOOP는 이 질문들에 모두 답변하고 Decision Contract가 봉인된 뒤에만 실행됩니다.');
219
219
  lines.push('로그인이 필요하면 테스트 전용 계정 정보만 임시 런타임 입력으로 제공해야 하며, answers.json/리포트/로그/wiki에는 절대 저장하지 않습니다.');
220
- lines.push('UI E2E는 Browser Use 또는 Computer Use 증거가 없으면 검증 완료로 주장할 수 없습니다.');
220
+ lines.push('UI 수준 E2E는 Codex Computer Use 증거가 없으면 검증 완료로 주장할 수 없습니다. Chrome MCP, Browser Use, Playwright, 기타 브라우저 자동화는 UI 검증 증거로 인정하지 않습니다.');
221
221
  lines.push('개발 서버가 아닌 배포/스테이징 도메인에서는 삭제성 테스트를 절대 실행하지 않습니다.');
222
222
  } else {
223
223
  lines.push('이 질문들에 모두 답변하고 Decision Contract가 봉인된 뒤에만 실행됩니다.');
@@ -7,7 +7,7 @@ export const FROM_CHAT_IMG_CHECKLIST_ARTIFACT = 'from-chat-img-checklist.md';
7
7
  export const FROM_CHAT_IMG_TEMP_TRIWIKI_ARTIFACT = 'from-chat-img-temp-triwiki.json';
8
8
  export const FROM_CHAT_IMG_QA_LOOP_ARTIFACT = 'from-chat-img-qa-loop.json';
9
9
  export const FROM_CHAT_IMG_TEMP_TRIWIKI_SESSIONS = 5;
10
- export const USAGE_TOPICS = 'install|setup|bootstrap|root|deps|cmux|auto-review|team|qa-loop|goal|research|db|codex-app|dfix|design|imagegen|dollar|context7|pipeline|reasoning|guard|conflicts|versioning|eval|hproof|gx|wiki|code-structure';
10
+ export const USAGE_TOPICS = 'install|setup|bootstrap|root|deps|cmux|auto-review|team|qa-loop|goal|research|db|codex-app|dfix|design|imagegen|dollar|context7|pipeline|reasoning|guard|conflicts|versioning|eval|harness|hproof|gx|wiki|code-structure';
11
11
 
12
12
  export const RECOMMENDED_MCP_SERVERS = [
13
13
  {
@@ -193,7 +193,7 @@ export const ROUTES = [
193
193
  command: '$QA-LOOP',
194
194
  mode: 'QALOOP',
195
195
  route: 'QA loop',
196
- description: 'Dogfood UI/API as human proxy with safety gates, Browser/Computer evidence, safe fixes, rechecks, Honest Mode.',
196
+ description: 'Dogfood UI/API as human proxy with safety gates, Codex Computer Use-only UI evidence, safe fixes, rechecks, Honest Mode.',
197
197
  requiredSkills: ['qa-loop', 'pipeline-runner', REFLECTION_SKILL_NAME, 'honest-mode'],
198
198
  lifecycle: ['qa_questions_answered', 'contract_sealed', 'qa_checklist', 'qa_loop_cycles', 'safe_remediation', 'focused_reverification', 'qa_report_md', 'qa_gate', 'post_route_reflection', 'honest_mode'],
199
199
  context7Policy: 'optional',
@@ -346,7 +346,7 @@ export const COMMAND_CATALOG = [
346
346
  { name: 'auto-review', usage: 'sks auto-review status|enable|start [--high] | sks --Auto-review --high', description: 'Enable Codex automatic approval review and launch SKS cmux with the auto-review profile.' },
347
347
  { name: 'dollar-commands', usage: 'sks dollar-commands [--json]', description: 'List Codex App $ commands such as $DFix and $Team.' },
348
348
  { name: 'dfix', usage: 'sks dfix', description: 'Explain $DFix ultralight design/content fix mode.' },
349
- { name: 'qa-loop', usage: 'sks qa-loop prepare|answer|run|status ...', description: 'Dogfood UI/API as human proxy with safety gates, safe fixes, rechecks, Browser/Computer evidence, report.' },
349
+ { name: 'qa-loop', usage: 'sks qa-loop prepare|answer|run|status ...', description: 'Dogfood UI/API as human proxy with safety gates, safe fixes, rechecks, Codex Computer Use-only UI evidence, report.' },
350
350
  { name: 'context7', usage: 'sks context7 check|setup|tools|resolve|docs|evidence ...', description: 'Check, configure, and call the local Context7 MCP requirement.' },
351
351
  { name: 'pipeline', usage: 'sks pipeline status|resume|answer ...', description: 'Inspect the active skill-first route, pass mandatory ambiguity gates, and inspect completion gates.' },
352
352
  { name: 'guard', usage: 'sks guard check [--json]', description: 'Check SKS harness self-protection lock, fingerprints, and source-repo exception state.' },
@@ -362,6 +362,7 @@ export const COMMAND_CATALOG = [
362
362
  { name: 'research', usage: 'sks research prepare|run|status ...', description: 'Run frontier-style research missions with novelty and falsification gates.' },
363
363
  { name: 'db', usage: 'sks db policy|scan|mcp-config|classify|check ...', description: 'Inspect and enforce database/Supabase safety policy.' },
364
364
  { name: 'eval', usage: 'sks eval run|compare|thresholds ...', description: 'Run deterministic context-quality and performance evidence checks.' },
365
+ { name: 'harness', usage: 'sks harness fixture|review [--json]', description: 'Run Harness Growth Factory fixtures for forgetting, skills, experiments, tool taxonomy, permissions, MultiAgentV2, and Cmux views.' },
365
366
  { name: 'perf', usage: 'sks perf run [--json] [--iterations N]', description: 'Measure structured GPT-5.5/SKS performance budgets such as CLI startup and package size.' },
366
367
  { name: 'code-structure', usage: 'sks code-structure scan [--json]', description: 'Scan handwritten source files for 1000/2000/3000-line structure gates and split-review exceptions.' },
367
368
  { name: 'validate-artifacts', usage: 'sks validate-artifacts [mission-id|latest] [--json]', description: 'Validate schema-backed mission artifacts for work orders, effort decisions, visual maps, dogfood reports, skills, mistake memory, Team dashboard state, and Honest Mode.' },
@@ -1,6 +1,7 @@
1
1
  import path from 'node:path';
2
2
  import { nowIso, writeJsonAtomic } from './fsx.mjs';
3
3
  import { ARTIFACT_FILES, validateSkillCandidate, validateSkillInjectionDecision } from './artifact-schemas.mjs';
4
+ import { createSkillCard } from './evaluation.mjs';
4
5
 
5
6
  export function createSkillCandidate(opts = {}) {
6
7
  const successfulRuns = Number(opts.evidence?.successful_runs || opts.successful_runs || 0);
@@ -77,6 +78,19 @@ export function createSkillForgeReport(opts = {}) {
77
78
  mission_id: opts.mission_id || null,
78
79
  created_at: nowIso(),
79
80
  candidates,
81
+ skill_cards: candidates.map((candidate) => createSkillCard({
82
+ skill_id: candidate.id,
83
+ name: candidate.id,
84
+ version: `1.0.${Number(candidate.version || 1) - 1}`,
85
+ status: candidate.promotion_ready ? 'active' : 'dormant',
86
+ use_count: Number(candidate.evidence?.successful_runs || 0) + Number(candidate.evidence?.failed_runs || 0),
87
+ success_count: Number(candidate.evidence?.successful_runs || 0),
88
+ failure_count: Number(candidate.evidence?.failed_runs || 0),
89
+ trigger_summary: (candidate.triggers || []).join(', '),
90
+ anti_triggers: candidate.contraindications || [],
91
+ validation: { commands: candidate.evidence?.tests || [], manual_checks: [], schemas: ['skill-card'] },
92
+ implicit_invocation_allowed: candidate.promotion_ready
93
+ })),
80
94
  injection,
81
95
  retirements: (opts.skills || []).filter((skill) => skill.stale || skill.conflicting || Number(skill.failed_runs || skill.evidence?.failed_runs || 0) >= 2).map((skill) => ({
82
96
  id: skill.id,
@@ -88,7 +102,8 @@ export function createSkillForgeReport(opts = {}) {
88
102
  })),
89
103
  validation: {
90
104
  top_k_respected: injection.injected.length <= injection.top_k,
91
- full_skill_loaded_only_after_selection: true
105
+ full_skill_loaded_only_after_selection: true,
106
+ stale_or_false_triggered_skills_retired: true
92
107
  }
93
108
  };
94
109
  }
@@ -3,17 +3,21 @@ import { nowIso, readJson, writeJsonAtomic } from './fsx.mjs';
3
3
  import { ARTIFACT_FILES, validateTeamDashboardState } from './artifact-schemas.mjs';
4
4
 
5
5
  export const TEAM_DASHBOARD_PANES = [
6
- 'Mission Overview',
7
- 'Agent Lanes',
8
- 'Task DAG',
9
- 'QA and Dogfood',
10
- 'Artifacts and Evidence',
11
- 'Performance',
12
- 'Memory Attention',
6
+ 'Mission / Goal View',
7
+ 'Agent Grid View',
8
+ 'MultiAgentV2 Graph View',
9
+ 'Work Order Ledger View',
10
+ 'Skill Autopilot View',
11
+ 'TriWiki Memory Health View',
13
12
  'Forget Queue',
14
- 'Skill Autopilot',
15
13
  'Mistake Immunity',
14
+ 'Tool Reliability View',
15
+ 'Harness Experiments View',
16
+ 'Dogfood Evidence View',
16
17
  'Code Structure',
18
+ 'Statusline / Terminal Title Preview',
19
+ 'Artifacts and Evidence',
20
+ 'Performance',
17
21
  'From-Chat-IMG Visual Map'
18
22
  ];
19
23
 
@@ -400,6 +400,10 @@ export async function renderTeamWatch(dir, opts = {}) {
400
400
  '- Neighbor cmux panes follow individual `sks team lane ... --agent <name>` views.',
401
401
  '- Use `sks team event ...` to mirror scout, debate, executor, review, and verification status into the live panes.',
402
402
  '',
403
+ '## Cockpit Views',
404
+ '- Mission / Goal | Agents | MultiAgentV2 | Work Orders | Skills | Memory Health | Forget Queue',
405
+ '- Mistake Immunity | Tool Reliability | Harness Experiments | Dogfood Evidence | Code Structure | Statusline/Title',
406
+ '',
403
407
  '## Visible Agent Lanes',
404
408
  ...(visibleAgents.length
405
409
  ? visibleAgents.map(([name, status]) => `- ${name}: ${status.status || 'pending'} | ${status.phase || 'unknown'} | last_seen:${status.last_seen || 'never'}`)