gsd-pi 2.63.0-dev.351157b → 2.63.0-dev.d04bbc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/dist/cli.js +4 -0
  2. package/dist/headless-query.js +11 -1
  3. package/dist/resources/extensions/gsd/auto/detect-stuck.js +27 -0
  4. package/dist/resources/extensions/gsd/auto/phases.js +34 -0
  5. package/dist/resources/extensions/gsd/auto/session.js +4 -0
  6. package/dist/resources/extensions/gsd/auto-model-selection.js +32 -0
  7. package/dist/resources/extensions/gsd/auto-post-unit.js +79 -0
  8. package/dist/resources/extensions/gsd/auto-timers.js +2 -1
  9. package/dist/resources/extensions/gsd/bootstrap/db-tools.js +87 -28
  10. package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +23 -0
  11. package/dist/resources/extensions/gsd/bootstrap/system-context.js +30 -2
  12. package/dist/resources/extensions/gsd/preferences-types.js +1 -0
  13. package/dist/resources/extensions/gsd/prompt-loader.js +7 -0
  14. package/dist/resources/extensions/gsd/prompts/system.md +3 -7
  15. package/dist/resources/extensions/gsd/safety/content-validator.js +73 -0
  16. package/dist/resources/extensions/gsd/safety/destructive-guard.js +34 -0
  17. package/dist/resources/extensions/gsd/safety/evidence-collector.js +109 -0
  18. package/dist/resources/extensions/gsd/safety/evidence-cross-ref.js +83 -0
  19. package/dist/resources/extensions/gsd/safety/file-change-validator.js +71 -0
  20. package/dist/resources/extensions/gsd/safety/git-checkpoint.js +91 -0
  21. package/dist/resources/extensions/gsd/safety/safety-harness.js +64 -0
  22. package/dist/resources/extensions/ollama/index.js +22 -10
  23. package/dist/resources/extensions/ollama/ollama-chat-provider.js +1 -1
  24. package/dist/update-cmd.js +4 -2
  25. package/dist/web/standalone/.next/BUILD_ID +1 -1
  26. package/dist/web/standalone/.next/app-path-routes-manifest.json +18 -18
  27. package/dist/web/standalone/.next/build-manifest.json +2 -2
  28. package/dist/web/standalone/.next/prerender-manifest.json +3 -3
  29. package/dist/web/standalone/.next/server/app/_global-error.html +2 -2
  30. package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
  31. package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
  32. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
  33. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
  34. package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
  35. package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
  36. package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  37. package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
  38. package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
  39. package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
  40. package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
  41. package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
  42. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
  43. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
  44. package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
  45. package/dist/web/standalone/.next/server/app/index.html +1 -1
  46. package/dist/web/standalone/.next/server/app/index.rsc +1 -1
  47. package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
  48. package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
  49. package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
  50. package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
  51. package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
  52. package/dist/web/standalone/.next/server/app-paths-manifest.json +18 -18
  53. package/dist/web/standalone/.next/server/pages/404.html +1 -1
  54. package/dist/web/standalone/.next/server/pages/500.html +2 -2
  55. package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
  56. package/dist/welcome-screen.js +1 -1
  57. package/package.json +1 -1
  58. package/packages/pi-coding-agent/dist/core/extensions/provider-registration.test.d.ts +2 -0
  59. package/packages/pi-coding-agent/dist/core/extensions/provider-registration.test.d.ts.map +1 -0
  60. package/packages/pi-coding-agent/dist/core/extensions/provider-registration.test.js +46 -0
  61. package/packages/pi-coding-agent/dist/core/extensions/provider-registration.test.js.map +1 -0
  62. package/packages/pi-coding-agent/dist/core/model-registry.d.ts.map +1 -1
  63. package/packages/pi-coding-agent/dist/core/model-registry.js +11 -0
  64. package/packages/pi-coding-agent/dist/core/model-registry.js.map +1 -1
  65. package/packages/pi-coding-agent/dist/core/sdk.d.ts.map +1 -1
  66. package/packages/pi-coding-agent/dist/core/sdk.js +2 -3
  67. package/packages/pi-coding-agent/dist/core/sdk.js.map +1 -1
  68. package/packages/pi-coding-agent/src/core/extensions/provider-registration.test.ts +81 -0
  69. package/packages/pi-coding-agent/src/core/model-registry.ts +12 -0
  70. package/packages/pi-coding-agent/src/core/sdk.ts +2 -3
  71. package/src/resources/extensions/gsd/auto/detect-stuck.ts +27 -0
  72. package/src/resources/extensions/gsd/auto/phases.ts +39 -0
  73. package/src/resources/extensions/gsd/auto/session.ts +5 -0
  74. package/src/resources/extensions/gsd/auto-model-selection.ts +36 -0
  75. package/src/resources/extensions/gsd/auto-post-unit.ts +88 -0
  76. package/src/resources/extensions/gsd/auto-timers.ts +2 -1
  77. package/src/resources/extensions/gsd/bootstrap/db-tools.ts +86 -28
  78. package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +27 -0
  79. package/src/resources/extensions/gsd/bootstrap/system-context.ts +31 -2
  80. package/src/resources/extensions/gsd/preferences-types.ts +13 -0
  81. package/src/resources/extensions/gsd/prompt-loader.ts +8 -0
  82. package/src/resources/extensions/gsd/prompts/system.md +3 -7
  83. package/src/resources/extensions/gsd/safety/content-validator.ts +98 -0
  84. package/src/resources/extensions/gsd/safety/destructive-guard.ts +49 -0
  85. package/src/resources/extensions/gsd/safety/evidence-collector.ts +151 -0
  86. package/src/resources/extensions/gsd/safety/evidence-cross-ref.ts +120 -0
  87. package/src/resources/extensions/gsd/safety/file-change-validator.ts +108 -0
  88. package/src/resources/extensions/gsd/safety/git-checkpoint.ts +106 -0
  89. package/src/resources/extensions/gsd/safety/safety-harness.ts +105 -0
  90. package/src/resources/extensions/gsd/tests/complete-slice-string-coercion.test.ts +211 -0
  91. package/src/resources/extensions/gsd/tests/flat-rate-routing-guard.test.ts +50 -0
  92. package/src/resources/extensions/gsd/tests/git-checkpoint.test.ts +94 -0
  93. package/src/resources/extensions/gsd/tests/stuck-detection-coverage.test.ts +42 -0
  94. package/src/resources/extensions/gsd/workflow-logger.ts +2 -1
  95. package/src/resources/extensions/ollama/index.ts +20 -11
  96. package/src/resources/extensions/ollama/ollama-auth-mode.test.ts +20 -0
  97. package/src/resources/extensions/ollama/ollama-chat-provider.ts +1 -1
  98. package/src/resources/extensions/ollama/tests/ollama-chat-provider-stream.test.ts +82 -0
  99. /package/dist/web/standalone/.next/static/{QmuF-eAbuU_2MQ03t38qr → vIq9fmvRUaFOpguoX5j4W}/_buildManifest.js +0 -0
  100. /package/dist/web/standalone/.next/static/{QmuF-eAbuU_2MQ03t38qr → vIq9fmvRUaFOpguoX5j4W}/_ssgManifest.js +0 -0
package/dist/cli.js CHANGED
@@ -247,6 +247,10 @@ if (cliFlags.messages[0] === 'sessions') {
247
247
  // `gsd headless` — run auto-mode without TUI
248
248
  if (cliFlags.messages[0] === 'headless') {
249
249
  await ensureRtkBootstrap();
250
+ // Sync bundled resources before headless runs (#3471). Without this,
251
+ // headless-query loads from src/resources/ while auto/interactive load
252
+ // from ~/.gsd/agent/extensions/ — different extension copies diverge.
253
+ initResources(agentDir);
250
254
  const { runHeadless, parseHeadlessArgs } = await import('./headless.js');
251
255
  await runHeadless(parseHeadlessArgs(process.argv));
252
256
  process.exit(0);
@@ -15,9 +15,19 @@
15
15
  */
16
16
  import { createJiti } from '@mariozechner/jiti';
17
17
  import { fileURLToPath } from 'node:url';
18
+ import { join } from 'node:path';
19
+ import { homedir } from 'node:os';
18
20
  import { resolveBundledSourceResource } from './bundled-resource-path.js';
19
21
  const jiti = createJiti(fileURLToPath(import.meta.url), { interopDefault: true, debug: false });
20
- const gsdExtensionPath = (...segments) => resolveBundledSourceResource(import.meta.url, 'extensions', 'gsd', ...segments);
22
+ // Resolve extensions from the synced agent directory so headless-query
23
+ // loads the same extension copy as interactive/auto modes (#3471).
24
+ // Falls back to bundled source for source-tree dev workflows.
25
+ const agentExtensionsDir = join(process.env.GSD_AGENT_DIR || join(homedir(), '.gsd', 'agent'), 'extensions', 'gsd');
26
+ const { existsSync } = await import('node:fs');
27
+ const useAgentDir = existsSync(join(agentExtensionsDir, 'state.ts'));
28
+ const gsdExtensionPath = (...segments) => useAgentDir
29
+ ? join(agentExtensionsDir, ...segments)
30
+ : resolveBundledSourceResource(import.meta.url, 'extensions', 'gsd', ...segments);
21
31
  async function loadExtensionModules() {
22
32
  const stateModule = await jiti.import(gsdExtensionPath('state.ts'), {});
23
33
  const dispatchModule = await jiti.import(gsdExtensionPath('auto-dispatch.ts'), {});
@@ -3,6 +3,12 @@
3
3
  *
4
4
  * Leaf node in the import DAG.
5
5
  */
6
+ /**
7
+ * Pattern matching ENOENT errors with a file path.
8
+ * Matches: "ENOENT: no such file or directory, access '/path/to/file'"
9
+ * and similar Node.js filesystem error messages.
10
+ */
11
+ const ENOENT_PATH_RE = /ENOENT[^']*'([^']+)'/;
6
12
  /**
7
13
  * Analyze a sliding window of recent unit dispatches for stuck patterns.
8
14
  * Returns a signal with reason if stuck, null otherwise.
@@ -10,6 +16,8 @@
10
16
  * Rule 1: Same error string twice in a row → stuck immediately.
11
17
  * Rule 2: Same unit key 3+ consecutive times → stuck (preserves prior behavior).
12
18
  * Rule 3: Oscillation A→B→A→B in last 4 entries → stuck.
19
+ * Rule 4: Same ENOENT path in any 2 entries within the window → stuck (#3575).
20
+ * Missing files don't self-heal between retries — retrying wastes budget.
13
21
  */
14
22
  export function detectStuck(window) {
15
23
  if (window.length < 2)
@@ -45,5 +53,24 @@ export function detectStuck(window) {
45
53
  };
46
54
  }
47
55
  }
56
+ // Rule 4: Same ENOENT path seen twice in window (#3575)
57
+ // Missing files don't appear between retries — stop immediately.
58
+ const enoentPaths = new Map();
59
+ for (const entry of window) {
60
+ if (!entry.error)
61
+ continue;
62
+ const match = ENOENT_PATH_RE.exec(entry.error);
63
+ if (!match)
64
+ continue;
65
+ const filePath = match[1];
66
+ const count = (enoentPaths.get(filePath) ?? 0) + 1;
67
+ if (count >= 2) {
68
+ return {
69
+ stuck: true,
70
+ reason: `Missing file referenced twice: ${filePath} (ENOENT)`,
71
+ };
72
+ }
73
+ enoentPaths.set(filePath, count);
74
+ }
48
75
  return null;
49
76
  }
@@ -24,6 +24,9 @@ import { withTimeout, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js";
24
24
  import { getEligibleSlices } from "../slice-parallel-eligibility.js";
25
25
  import { startSliceParallel } from "../slice-parallel-orchestrator.js";
26
26
  import { isDbAvailable, getMilestoneSlices } from "../gsd-db.js";
27
+ import { resetEvidence } from "../safety/evidence-collector.js";
28
+ import { createCheckpoint, cleanupCheckpoint, rollbackToCheckpoint } from "../safety/git-checkpoint.js";
29
+ import { resolveSafetyHarnessConfig } from "../safety/safety-harness.js";
27
30
  // ─── generateMilestoneReport ──────────────────────────────────────────────────
28
31
  /**
29
32
  * Resolve the base path for milestone reports.
@@ -777,6 +780,18 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
777
780
  ctx.ui.setStatus("gsd-auto", "auto");
778
781
  if (mid)
779
782
  deps.updateSliceProgressCache(s.basePath, mid, state.activeSlice?.id);
783
+ // ── Safety harness: reset evidence + create checkpoint ──
784
+ const safetyConfig = resolveSafetyHarnessConfig(prefs?.safety_harness);
785
+ if (safetyConfig.enabled && safetyConfig.evidence_collection) {
786
+ resetEvidence();
787
+ }
788
+ // Only checkpoint code-executing units (not lifecycle/planning units)
789
+ if (safetyConfig.enabled && safetyConfig.checkpoints && unitType === "execute-task") {
790
+ s.checkpointSha = createCheckpoint(s.basePath, unitId);
791
+ if (s.checkpointSha) {
792
+ debugLog("runUnitPhase", { phase: "checkpoint-created", unitId, sha: s.checkpointSha.slice(0, 8) });
793
+ }
794
+ }
780
795
  // Prompt injection
781
796
  let finalPrompt = prompt;
782
797
  if (s.pendingVerificationRetry) {
@@ -999,6 +1014,25 @@ export async function runUnitPhase(ic, iterData, loopState, sidecarItem) {
999
1014
  }
1000
1015
  }
1001
1016
  deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
1017
+ // ── Safety harness: checkpoint cleanup or rollback ──
1018
+ if (s.checkpointSha) {
1019
+ if (unitResult.status === "error" && safetyConfig.auto_rollback) {
1020
+ const rolled = rollbackToCheckpoint(s.basePath, unitId, s.checkpointSha);
1021
+ if (rolled) {
1022
+ ctx.ui.notify(`Rolled back to pre-unit checkpoint for ${unitId}`, "info");
1023
+ debugLog("runUnitPhase", { phase: "checkpoint-rollback", unitId });
1024
+ }
1025
+ }
1026
+ else if (unitResult.status === "error") {
1027
+ ctx.ui.notify(`Unit ${unitId} failed. Pre-unit checkpoint available at ${s.checkpointSha.slice(0, 8)}`, "warning");
1028
+ }
1029
+ else {
1030
+ // Success — clean up checkpoint ref
1031
+ cleanupCheckpoint(s.basePath, unitId);
1032
+ debugLog("runUnitPhase", { phase: "checkpoint-cleaned", unitId });
1033
+ }
1034
+ s.checkpointSha = null;
1035
+ }
1002
1036
  return { action: "next", data: { unitStartedAt: s.currentUnit?.startedAt } };
1003
1037
  }
1004
1038
  // ─── runFinalize ──────────────────────────────────────────────────────────────
@@ -82,6 +82,9 @@ export class AutoSession {
82
82
  lastPromptCharCount;
83
83
  lastBaselineCharCount;
84
84
  pendingQuickTasks = [];
85
+ // ── Safety harness ───────────────────────────────────────────────────────
86
+ /** SHA of the pre-unit git checkpoint ref. Cleared on success or rollback. */
87
+ checkpointSha = null;
85
88
  // ── Signal handler ───────────────────────────────────────────────────────
86
89
  sigtermHandler = null;
87
90
  // ── Loop promise state ──────────────────────────────────────────────────
@@ -159,6 +162,7 @@ export class AutoSession {
159
162
  this.lastToolInvocationError = null;
160
163
  this.isolationDegraded = false;
161
164
  this.milestoneMergedInPhases = false;
165
+ this.checkpointSha = null;
162
166
  // Signal handler
163
167
  this.sigtermHandler = null;
164
168
  // Loop promise state lives in auto-loop.ts module scope
@@ -15,6 +15,9 @@ export function resolvePreferredModelConfig(unitType, autoModeStartModel) {
15
15
  const routingConfig = resolveDynamicRoutingConfig();
16
16
  if (!routingConfig.enabled || !routingConfig.tier_models)
17
17
  return undefined;
18
+ // Don't synthesize a routing config for flat-rate providers (#3453).
19
+ if (autoModeStartModel && isFlatRateProvider(autoModeStartModel.provider))
20
+ return undefined;
18
21
  const ceilingModel = routingConfig.tier_models.heavy
19
22
  ?? (autoModeStartModel ? `${autoModeStartModel.provider}/${autoModeStartModel.id}` : undefined);
20
23
  if (!ceilingModel)
@@ -41,6 +44,25 @@ export async function selectAndApplyModel(ctx, pi, unitType, unitId, basePath, p
41
44
  const routingConfig = resolveDynamicRoutingConfig();
42
45
  let effectiveModelConfig = modelConfig;
43
46
  let routingTierLabel = "";
47
+ // Disable routing for flat-rate providers like GitHub Copilot (#3453).
48
+ // All models cost the same per request, so downgrading to a cheaper
49
+ // model provides no cost benefit — it only degrades quality.
50
+ // Fail-closed: if primary model can't be resolved, fall back to
51
+ // provider-level signals rather than allowing unwanted downgrades.
52
+ if (routingConfig.enabled) {
53
+ const primaryModel = resolveModelId(modelConfig.primary, availableModels, ctx.model?.provider);
54
+ if (primaryModel) {
55
+ if (isFlatRateProvider(primaryModel.provider)) {
56
+ routingConfig.enabled = false;
57
+ }
58
+ }
59
+ else if ((autoModeStartModel && isFlatRateProvider(autoModeStartModel.provider))
60
+ || (ctx.model?.provider && isFlatRateProvider(ctx.model.provider))) {
61
+ // Primary model unresolvable but provider signals indicate flat-rate —
62
+ // disable routing to prevent quality degradation.
63
+ routingConfig.enabled = false;
64
+ }
65
+ }
44
66
  if (routingConfig.enabled) {
45
67
  let budgetPct;
46
68
  if (routingConfig.budget_pressure !== false) {
@@ -244,3 +266,13 @@ export function resolveModelId(modelId, availableModels, currentProvider) {
244
266
  // Fall back to first non-extension candidate, or any candidate
245
267
  return candidates.find(m => !EXTENSION_PROVIDERS.has(m.provider)) ?? candidates[0];
246
268
  }
269
+ /**
270
+ * Flat-rate providers charge the same per request regardless of model.
271
+ * Dynamic routing provides no cost benefit — it only degrades quality (#3453).
272
+ * Uses case-insensitive matching with alias support to prevent fail-open on
273
+ * provider naming variations (e.g. "copilot" vs "github-copilot").
274
+ */
275
+ const FLAT_RATE_PROVIDERS = new Set(["github-copilot", "copilot"]);
276
+ export function isFlatRateProvider(provider) {
277
+ return FLAT_RATE_PROVIDERS.has(provider.toLowerCase());
278
+ }
@@ -29,6 +29,13 @@ import { checkPostUnitHooks, isRetryPending, consumeRetryTrigger, persistHookSta
29
29
  import { hasPendingCaptures, loadPendingCaptures, revertExecutorResolvedCaptures } from "./captures.js";
30
30
  import { debugLog } from "./debug-logger.js";
31
31
  import { runSafely } from "./auto-utils.js";
32
+ import { getEvidence } from "./safety/evidence-collector.js";
33
+ import { validateFileChanges } from "./safety/file-change-validator.js";
34
+ // crossReferenceEvidence available for future use when verification_evidence is stored in DB
35
+ // import { crossReferenceEvidence, type ClaimedEvidence } from "./safety/evidence-cross-ref.js";
36
+ import { validateContent } from "./safety/content-validator.js";
37
+ import { resolveSafetyHarnessConfig } from "./safety/safety-harness.js";
38
+ import { resolveExpectedArtifactPath as resolveArtifactForContent } from "./auto-artifact-paths.js";
32
39
  /** Maximum verification retry attempts before escalating to blocker placeholder (#2653). */
33
40
  const MAX_VERIFICATION_RETRIES = 3;
34
41
  /** Enqueue a sidecar item (hook, triage, or quick-task) for the main loop to
@@ -341,6 +348,78 @@ export async function postUnitPreVerification(pctx, opts) {
341
348
  catch (e) {
342
349
  debugLog("postUnit", { phase: "rogue-detection", error: String(e) });
343
350
  }
351
+ // ── Safety harness: post-unit validation ──
352
+ try {
353
+ const { loadEffectiveGSDPreferences } = await import("./preferences.js");
354
+ const prefs = loadEffectiveGSDPreferences()?.preferences;
355
+ const safetyConfig = resolveSafetyHarnessConfig(prefs?.safety_harness);
356
+ if (safetyConfig.enabled) {
357
+ const { milestone: sMid, slice: sSid, task: sTid } = parseUnitId(s.currentUnit.id);
358
+ // File change validation (execute-task only, after auto-commit)
359
+ if (safetyConfig.file_change_validation && s.currentUnit.type === "execute-task" && sMid && sSid && sTid && isDbAvailable()) {
360
+ try {
361
+ const taskRow = getTask(sMid, sSid, sTid);
362
+ if (taskRow) {
363
+ const expectedOutput = taskRow.expected_output ?? [];
364
+ const plannedFiles = taskRow.files ?? [];
365
+ const audit = validateFileChanges(s.basePath, expectedOutput, plannedFiles);
366
+ if (audit && audit.violations.length > 0) {
367
+ const warnings = audit.violations.filter(v => v.severity === "warning");
368
+ for (const v of warnings) {
369
+ logWarning("safety", `file-change: ${v.file} — ${v.reason}`);
370
+ }
371
+ if (warnings.length > 0) {
372
+ ctx.ui.notify(`Safety: ${warnings.length} unexpected file change(s) outside task plan`, "warning");
373
+ }
374
+ }
375
+ }
376
+ }
377
+ catch (e) {
378
+ debugLog("postUnit", { phase: "safety-file-change", error: String(e) });
379
+ }
380
+ }
381
+ // Evidence cross-reference (execute-task only)
382
+ // Verification evidence is passed via the complete-task tool call and
383
+ // stored in the SUMMARY.md on disk — not available as structured data
384
+ // in the DB. The evidence collector tracks actual bash tool calls, so
385
+ // we can still detect units that claimed success but ran no commands.
386
+ if (safetyConfig.evidence_cross_reference && s.currentUnit.type === "execute-task") {
387
+ try {
388
+ const actual = getEvidence();
389
+ const bashCalls = actual.filter(e => e.kind === "bash");
390
+ // If the task is marked complete but zero bash commands were run,
391
+ // it's suspicious — the LLM may have fabricated results.
392
+ if (sMid && sSid && sTid && isDbAvailable()) {
393
+ const taskRow = getTask(sMid, sSid, sTid);
394
+ if (taskRow?.status === "complete" && taskRow.verify && bashCalls.length === 0) {
395
+ logWarning("safety", "task marked complete with verification commands but no bash calls were executed");
396
+ ctx.ui.notify(`Safety: task ${sTid} has verification commands but no bash calls were recorded`, "warning");
397
+ }
398
+ }
399
+ }
400
+ catch (e) {
401
+ debugLog("postUnit", { phase: "safety-evidence-xref", error: String(e) });
402
+ }
403
+ }
404
+ // Content validation (plan-slice, plan-milestone)
405
+ if (safetyConfig.content_validation) {
406
+ try {
407
+ const artifactPath = resolveArtifactForContent(s.currentUnit.type, s.currentUnit.id, s.basePath);
408
+ const contentViolations = validateContent(s.currentUnit.type, artifactPath);
409
+ for (const v of contentViolations) {
410
+ logWarning("safety", `content: ${v.reason}`);
411
+ ctx.ui.notify(`Content validation: ${v.reason}`, "warning");
412
+ }
413
+ }
414
+ catch (e) {
415
+ debugLog("postUnit", { phase: "safety-content-validation", error: String(e) });
416
+ }
417
+ }
418
+ }
419
+ }
420
+ catch (e) {
421
+ debugLog("postUnit", { phase: "safety-harness", error: String(e) });
422
+ }
344
423
  // Artifact verification
345
424
  let triggerArtifactVerified = false;
346
425
  if (!s.currentUnit.type.startsWith("hook/")) {
@@ -77,8 +77,9 @@ export function startUnitSupervision(sctx) {
77
77
  }
78
78
  }
79
79
  const estimateMinutes = taskEstimate ? parseEstimateMinutes(taskEstimate) : null;
80
+ const MAX_TIMEOUT_SCALE = 6; // Cap at 6x (60min task). Prevents 2h+ tasks from creating 120min+ timeout windows.
80
81
  const timeoutScale = estimateMinutes && estimateMinutes > 0
81
- ? Math.max(1, estimateMinutes / 10) // 10min task = 1x, 30min = 3x, 2h = 12x
82
+ ? Math.min(MAX_TIMEOUT_SCALE, Math.max(1, estimateMinutes / 10))
82
83
  : 1;
83
84
  const softTimeoutMs = (supervisor.soft_timeout_minutes ?? 0) * 60 * 1000 * timeoutScale;
84
85
  const idleTimeoutMs = (supervisor.idle_timeout_minutes ?? 0) * 60 * 1000; // idle not scaled — idle is idle
@@ -678,8 +678,11 @@ export function registerDbTools(pi) {
678
678
  };
679
679
  }
680
680
  try {
681
+ // Coerce string items to objects for verificationEvidence (#3541).
682
+ const coerced = { ...params };
683
+ coerced.verificationEvidence = (params.verificationEvidence ?? []).map((v) => typeof v === "string" ? { command: v, exitCode: -1, verdict: "unknown (coerced from string)", durationMs: 0 } : v);
681
684
  const { handleCompleteTask } = await import("../tools/complete-task.js");
682
- const result = await handleCompleteTask(params, process.cwd());
685
+ const result = await handleCompleteTask(coerced, process.cwd());
683
686
  if ("error" in result) {
684
687
  return {
685
688
  content: [{ type: "text", text: `Error completing task: ${result.error}` }],
@@ -733,12 +736,15 @@ export function registerDbTools(pi) {
733
736
  keyFiles: Type.Optional(Type.Array(Type.String(), { description: "List of key files created or modified" })),
734
737
  keyDecisions: Type.Optional(Type.Array(Type.String(), { description: "List of key decisions made during this task" })),
735
738
  blockerDiscovered: Type.Optional(Type.Boolean({ description: "Whether a plan-invalidating blocker was discovered" })),
736
- verificationEvidence: Type.Optional(Type.Array(Type.Object({
737
- command: Type.String({ description: "Verification command that was run" }),
738
- exitCode: Type.Number({ description: "Exit code of the command" }),
739
- verdict: Type.String({ description: "Pass/fail verdict (e.g. '✅ pass', '❌ fail')" }),
740
- durationMs: Type.Number({ description: "Duration of the command in milliseconds" }),
741
- }), { description: "Array of verification evidence entries" })),
739
+ verificationEvidence: Type.Optional(Type.Array(Type.Union([
740
+ Type.Object({
741
+ command: Type.String({ description: "Verification command that was run" }),
742
+ exitCode: Type.Number({ description: "Exit code of the command" }),
743
+ verdict: Type.String({ description: "Pass/fail verdict (e.g. '✅ pass', '❌ fail')" }),
744
+ durationMs: Type.Number({ description: "Duration of the command in milliseconds" }),
745
+ }),
746
+ Type.String({ description: "Fallback: verification summary string" }),
747
+ ]), { description: "Array of verification evidence entries" })),
742
748
  }),
743
749
  execute: taskCompleteExecute,
744
750
  };
@@ -754,8 +760,46 @@ export function registerDbTools(pi) {
754
760
  };
755
761
  }
756
762
  try {
763
+ // Coerce string items to objects for fields where LLMs sometimes pass
764
+ // plain strings instead of the expected { key, value } shape (#3541).
765
+ // Parses "key — value" or "key - value" format when possible.
766
+ const splitPair = (s) => {
767
+ const m = s.match(/^(.+?)\s*(?:—|-)\s+(.+)$/);
768
+ return m ? [m[1].trim(), m[2].trim()] : [s.trim(), ""];
769
+ };
770
+ const coerced = { ...params };
771
+ coerced.filesModified = (params.filesModified ?? []).map((f) => {
772
+ if (typeof f !== "string")
773
+ return f;
774
+ const [path, description] = splitPair(f);
775
+ return { path, description };
776
+ });
777
+ coerced.requires = (params.requires ?? []).map((r) => {
778
+ if (typeof r !== "string")
779
+ return r;
780
+ const [slice, provides] = splitPair(r);
781
+ return { slice, provides };
782
+ });
783
+ coerced.requirementsAdvanced = (params.requirementsAdvanced ?? []).map((r) => {
784
+ if (typeof r !== "string")
785
+ return r;
786
+ const [id, how] = splitPair(r);
787
+ return { id, how };
788
+ });
789
+ coerced.requirementsValidated = (params.requirementsValidated ?? []).map((r) => {
790
+ if (typeof r !== "string")
791
+ return r;
792
+ const [id, proof] = splitPair(r);
793
+ return { id, proof };
794
+ });
795
+ coerced.requirementsInvalidated = (params.requirementsInvalidated ?? []).map((r) => {
796
+ if (typeof r !== "string")
797
+ return r;
798
+ const [id, what] = splitPair(r);
799
+ return { id, what };
800
+ });
757
801
  const { handleCompleteSlice } = await import("../tools/complete-slice.js");
758
- const result = await handleCompleteSlice(params, process.cwd());
802
+ const result = await handleCompleteSlice(coerced, process.cwd());
759
803
  if ("error" in result) {
760
804
  return {
761
805
  content: [{ type: "text", text: `Error completing slice: ${result.error}` }],
@@ -815,26 +859,41 @@ export function registerDbTools(pi) {
815
859
  requirementsSurfaced: Type.Optional(Type.Array(Type.String(), { description: "New requirements surfaced" })),
816
860
  drillDownPaths: Type.Optional(Type.Array(Type.String(), { description: "Paths to task summaries for drill-down" })),
817
861
  affects: Type.Optional(Type.Array(Type.String(), { description: "Downstream slices affected" })),
818
- requirementsAdvanced: Type.Optional(Type.Array(Type.Object({
819
- id: Type.String({ description: "Requirement ID" }),
820
- how: Type.String({ description: "How it was advanced" }),
821
- }), { description: "Requirements advanced by this slice" })),
822
- requirementsValidated: Type.Optional(Type.Array(Type.Object({
823
- id: Type.String({ description: "Requirement ID" }),
824
- proof: Type.String({ description: "What proof validates it" }),
825
- }), { description: "Requirements validated by this slice" })),
826
- requirementsInvalidated: Type.Optional(Type.Array(Type.Object({
827
- id: Type.String({ description: "Requirement ID" }),
828
- what: Type.String({ description: "What changed" }),
829
- }), { description: "Requirements invalidated or re-scoped" })),
830
- filesModified: Type.Optional(Type.Array(Type.Object({
831
- path: Type.String({ description: "File path" }),
832
- description: Type.String({ description: "What changed" }),
833
- }), { description: "Files modified with descriptions" })),
834
- requires: Type.Optional(Type.Array(Type.Object({
835
- slice: Type.String({ description: "Dependency slice ID" }),
836
- provides: Type.String({ description: "What was consumed from it" }),
837
- }), { description: "Upstream slice dependencies consumed" })),
862
+ requirementsAdvanced: Type.Optional(Type.Array(Type.Union([
863
+ Type.Object({
864
+ id: Type.String({ description: "Requirement ID" }),
865
+ how: Type.String({ description: "How it was advanced" }),
866
+ }),
867
+ Type.String({ description: "Fallback: 'ID — how' string" }),
868
+ ]), { description: "Requirements advanced by this slice" })),
869
+ requirementsValidated: Type.Optional(Type.Array(Type.Union([
870
+ Type.Object({
871
+ id: Type.String({ description: "Requirement ID" }),
872
+ proof: Type.String({ description: "What proof validates it" }),
873
+ }),
874
+ Type.String({ description: "Fallback: 'ID — proof' string" }),
875
+ ]), { description: "Requirements validated by this slice" })),
876
+ requirementsInvalidated: Type.Optional(Type.Array(Type.Union([
877
+ Type.Object({
878
+ id: Type.String({ description: "Requirement ID" }),
879
+ what: Type.String({ description: "What changed" }),
880
+ }),
881
+ Type.String({ description: "Fallback: 'ID what' string" }),
882
+ ]), { description: "Requirements invalidated or re-scoped" })),
883
+ filesModified: Type.Optional(Type.Array(Type.Union([
884
+ Type.Object({
885
+ path: Type.String({ description: "File path" }),
886
+ description: Type.String({ description: "What changed" }),
887
+ }),
888
+ Type.String({ description: "Fallback: file path string" }),
889
+ ]), { description: "Files modified with descriptions" })),
890
+ requires: Type.Optional(Type.Array(Type.Union([
891
+ Type.Object({
892
+ slice: Type.String({ description: "Dependency slice ID" }),
893
+ provides: Type.String({ description: "What was consumed from it" }),
894
+ }),
895
+ Type.String({ description: "Fallback: slice ID string" }),
896
+ ]), { description: "Upstream slice dependencies consumed" })),
838
897
  }),
839
898
  execute: sliceCompleteExecute,
840
899
  };
@@ -15,6 +15,9 @@ import { isParallelActive, shutdownParallel } from "../parallel-orchestrator.js"
15
15
  import { checkToolCallLoop, resetToolCallLoopGuard } from "./tool-call-loop-guard.js";
16
16
  import { saveActivityLog } from "../activity-log.js";
17
17
  import { resetAskUserQuestionsCache } from "../../ask-user-questions.js";
18
+ import { recordToolCall as safetyRecordToolCall, recordToolResult as safetyRecordToolResult } from "../safety/evidence-collector.js";
19
+ import { classifyCommand } from "../safety/destructive-guard.js";
20
+ import { logWarning as safetyLogWarning } from "../workflow-logger.js";
18
21
  // Skip the welcome screen on the very first session_start — cli.ts already
19
22
  // printed it before the TUI launched. Only re-print on /clear (subsequent sessions).
20
23
  let isFirstSession = true;
@@ -187,6 +190,22 @@ export function registerHooks(pi) {
187
190
  if (result.block)
188
191
  return result;
189
192
  });
193
// ── Safety harness: evidence collection + destructive command warnings ──
// Observes every tool call while auto mode is active. The handler returns
// nothing on every path, so it only records and warns — it never blocks.
pi.on("tool_call", async (event, ctx) => {
    // Outside auto mode the harness stays silent.
    if (!isAutoActive()) {
        return;
    }
    safetyRecordToolCall(event.toolName, event.input);
    // Only bash invocations go through destructive-command classification.
    if (!isToolCallEventType("bash", event)) {
        return;
    }
    const verdict = classifyCommand(event.input.command);
    if (!verdict.destructive) {
        return;
    }
    const labelList = verdict.labels.join(", ");
    // Log only the first 200 characters of the command.
    const loggedCommand = String(event.input.command).slice(0, 200);
    safetyLogWarning("safety", `destructive command: ${labelList}`, {
        command: loggedCommand,
    });
    ctx.ui.notify(`Destructive command detected: ${labelList}`, "warning");
});
190
209
  pi.on("tool_result", async (event) => {
191
210
  if (event.toolName !== "ask_user_questions")
192
211
  return;
@@ -251,6 +270,10 @@ export function registerHooks(pi) {
251
270
  : (typeof event.result?.content?.[0]?.text === "string" ? event.result.content[0].text : String(event.result));
252
271
  recordToolInvocationError(event.toolName, errorText);
253
272
  }
273
+ // Safety harness: record tool execution results for evidence cross-referencing
274
+ if (isAutoActive()) {
275
+ safetyRecordToolResult(event.toolCallId, event.toolName, event.result, event.isError);
276
+ }
254
277
  });
255
278
  pi.on("model_select", async (_event, ctx) => {
256
279
  await syncServiceTierStatus(ctx);
@@ -3,9 +3,10 @@ import { homedir } from "node:os";
3
3
  import { join } from "node:path";
4
4
  import { logWarning } from "../workflow-logger.js";
5
5
  import { debugTime } from "../debug-logger.js";
6
- import { loadPrompt } from "../prompt-loader.js";
6
+ import { loadPrompt, getTemplatesDir } from "../prompt-loader.js";
7
7
  import { readForensicsMarker } from "../forensics.js";
8
8
  import { resolveAllSkillReferences, renderPreferencesForSystemPrompt, loadEffectiveGSDPreferences } from "../preferences.js";
9
+ import { resolveSkillReference } from "../preferences-skills.js";
9
10
  import { resolveGsdRootFile, resolveSliceFile, resolveSlicePath, resolveTaskFile, resolveTaskFiles, resolveTasksDir, relSliceFile, relSlicePath, relTaskFile } from "../paths.js";
10
11
  import { hasSkillSnapshot, detectNewSkills, formatSkillsXml } from "../skill-discovery.js";
11
12
  import { getActiveAutoWorktreeContext } from "../auto-worktree.js";
@@ -15,6 +16,30 @@ import { formatOverridesSection, loadActiveOverrides, loadFile, parseContinue, p
15
16
  import { toPosixPath } from "../../shared/mod.js";
16
17
  import { markCmuxPromptShown, shouldPromptToEnableCmux } from "../../cmux/index.js";
17
18
  const gsdHome = process.env.GSD_HOME || join(homedir(), ".gsd");
19
/**
 * Bundled skill triggers — each entry pairs a task description with the
 * bare name of the skill to load for it. Paths are resolved at runtime
 * rather than hardcoded in the system prompt template, and only skills
 * that resolve are listed in the table. (#3575)
 */
const BUNDLED_SKILL_TRIGGERS = [
    { trigger: "Frontend UI - web components, pages, landing pages, dashboards, React/HTML/CSS, styling", skill: "frontend-design" },
    { trigger: "macOS or iOS apps - SwiftUI, Xcode, App Store", skill: "swiftui" },
    { trigger: "Debugging - complex bugs, failing tests, root-cause investigation after standard approaches fail", skill: "debug-like-expert" },
];
/**
 * Build the Markdown "Trigger | Skill to load" table for the system prompt.
 *
 * Each bundled skill name is resolved against the current working
 * directory; entries whose resolution method is "unresolved" are omitted.
 *
 * @returns {string} The Markdown table, or an italic install hint when no
 *   bundled skill resolves.
 */
function buildBundledSkillsTable() {
    const workingDir = process.cwd();
    const tableRows = BUNDLED_SKILL_TRIGGERS.flatMap(({ trigger, skill }) => {
        const resolved = resolveSkillReference(skill, workingDir);
        // Skill not installed — contribute no row to the prompt.
        if (resolved.method === "unresolved") {
            return [];
        }
        return [`| ${trigger} | \`${resolved.resolvedPath}\` |`];
    });
    if (tableRows.length === 0) {
        return "*No bundled skills found. Install skills to `~/.agents/skills/` or `~/.claude/skills/`.*";
    }
    return ["| Trigger | Skill to load |", "|---|---|", ...tableRows].join("\n");
}
18
43
  function warnDeprecatedAgentInstructions() {
19
44
  const paths = [
20
45
  join(gsdHome, "agent-instructions.md"),
@@ -32,7 +57,10 @@ export async function buildBeforeAgentStartResult(event, ctx) {
32
57
  if (!existsSync(join(process.cwd(), ".gsd")))
33
58
  return undefined;
34
59
  const stopContextTimer = debugTime("context-inject");
35
- const systemContent = loadPrompt("system");
60
+ const systemContent = loadPrompt("system", {
61
+ bundledSkillsTable: buildBundledSkillsTable(),
62
+ templatesDir: getTemplatesDir(),
63
+ });
36
64
  const loadedPreferences = loadEffectiveGSDPreferences();
37
65
  if (shouldPromptToEnableCmux(loadedPreferences?.preferences)) {
38
66
  markCmuxPromptShown();
@@ -75,6 +75,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([
75
75
  "experimental",
76
76
  "codebase",
77
77
  "slice_parallel",
78
+ "safety_harness",
78
79
  ]);
79
80
  /** Canonical list of all dispatch unit types. */
80
81
  export const KNOWN_UNIT_TYPES = [
@@ -47,6 +47,13 @@ function resolveExtensionDir() {
47
47
  const __extensionDir = resolveExtensionDir();
48
48
  const promptsDir = join(__extensionDir, "prompts");
49
49
  const templatesDir = join(__extensionDir, "templates");
50
/**
 * Expose the module's resolved templates directory so prompts can
 * interpolate the real path instead of a hardcoded
 * `~/.gsd/agent/extensions/gsd/templates/`. (#3575)
 *
 * @returns {string} The value of the module-level `templatesDir`.
 */
export function getTemplatesDir() {
    return templatesDir;
}
50
57
  // Cache all templates eagerly at module load — a running session uses the
51
58
  // template versions that were on disk at startup, immune to later overwrites.
52
59
  const templateCache = new Map();
@@ -24,13 +24,9 @@ Leave the project in a state where the next agent can immediately understand wha
24
24
 
25
25
  ## Skills
26
26
 
27
- GSD ships with bundled skills. Load the relevant skill file with the `read` tool before starting work when the task matches.
27
+ GSD ships with bundled skills. Load the relevant skill file with the `read` tool before starting work when the task matches. Use bare skill names — GSD resolves them to the correct path automatically.
28
28
 
29
- | Trigger | Skill to load |
30
- |---|---|
31
- | Frontend UI - web components, pages, landing pages, dashboards, React/HTML/CSS, styling | `~/.gsd/agent/skills/frontend-design/SKILL.md` |
32
- | macOS or iOS apps - SwiftUI, Xcode, App Store | `~/.gsd/agent/skills/swiftui/SKILL.md` |
33
- | Debugging - complex bugs, failing tests, root-cause investigation after standard approaches fail | `~/.gsd/agent/skills/debug-like-expert/SKILL.md` |
29
+ {{bundledSkillsTable}}
34
30
 
35
31
  ## Hard Rules
36
32
 
@@ -119,7 +115,7 @@ In all modes, slices commit sequentially on the active branch; there are no per-
119
115
  ### Artifact Templates
120
116
 
121
117
  Templates showing the expected format for each artifact type are in:
122
- `~/.gsd/agent/extensions/gsd/templates/`
118
+ `{{templatesDir}}`
123
119
 
124
120
  **Always read the relevant template before writing an artifact** to match the expected structure exactly. The parsers that read these files depend on specific formatting:
125
121