@ranger-testing/ranger-cli 1.0.13 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/README.md +28 -65
  2. package/build/cli.js +220 -277
  3. package/build/cli.js.map +1 -1
  4. package/build/commands/addEnv.js +1 -1
  5. package/build/commands/addEnv.js.map +1 -1
  6. package/build/commands/authEncrypt.js +7 -6
  7. package/build/commands/authEncrypt.js.map +1 -1
  8. package/build/commands/clean.js +1 -1
  9. package/build/commands/clean.js.map +1 -1
  10. package/build/commands/config.js +5 -4
  11. package/build/commands/config.js.map +1 -1
  12. package/build/commands/dataMcpServer.js +1 -1
  13. package/build/commands/dataMcpServer.js.map +1 -1
  14. package/build/commands/env.js +17 -10
  15. package/build/commands/env.js.map +1 -1
  16. package/build/commands/feature.js +208 -273
  17. package/build/commands/feature.js.map +1 -1
  18. package/build/commands/index.js +3 -0
  19. package/build/commands/index.js.map +1 -1
  20. package/build/commands/postEditHook.js +25 -0
  21. package/build/commands/postEditHook.js.map +1 -0
  22. package/build/commands/preCompactHook.js +85 -0
  23. package/build/commands/preCompactHook.js.map +1 -0
  24. package/build/commands/sessionStartHook.js +64 -0
  25. package/build/commands/sessionStartHook.js.map +1 -0
  26. package/build/commands/skillup.js +18 -21
  27. package/build/commands/skillup.js.map +1 -1
  28. package/build/commands/start.js +1 -1
  29. package/build/commands/start.js.map +1 -1
  30. package/build/commands/status.js +30 -44
  31. package/build/commands/status.js.map +1 -1
  32. package/build/commands/update.js +32 -40
  33. package/build/commands/update.js.map +1 -1
  34. package/build/commands/updateEnv.js +1 -1
  35. package/build/commands/updateEnv.js.map +1 -1
  36. package/build/commands/useEnv.js +1 -1
  37. package/build/commands/useEnv.js.map +1 -1
  38. package/build/commands/utils/browserSessionsApi.js +1 -1
  39. package/build/commands/utils/browserSessionsApi.js.map +1 -1
  40. package/build/commands/utils/cliSecret.js +1 -1
  41. package/build/commands/utils/environment.js +0 -6
  42. package/build/commands/utils/environment.js.map +1 -1
  43. package/build/commands/utils/featureApi.js +68 -24
  44. package/build/commands/utils/featureApi.js.map +1 -1
  45. package/build/commands/utils/featureReportGenerator.js +37 -3
  46. package/build/commands/utils/featureReportGenerator.js.map +1 -1
  47. package/build/commands/utils/keychain.js +1 -1
  48. package/build/commands/utils/keychain.js.map +1 -1
  49. package/build/commands/utils/localAgentInstallationsApi.js +1 -1
  50. package/build/commands/utils/mcpConfig.js +1 -1
  51. package/build/commands/utils/rangerRoot.js +30 -0
  52. package/build/commands/utils/rangerRoot.js.map +1 -0
  53. package/build/commands/utils/settings.js +7 -5
  54. package/build/commands/utils/settings.js.map +1 -1
  55. package/build/commands/utils/skillContent.js +28 -0
  56. package/build/commands/utils/skillContent.js.map +1 -0
  57. package/build/commands/utils/skills.js +1 -1
  58. package/build/commands/utils/skills.js.map +1 -1
  59. package/build/commands/utils/userApi.js +32 -0
  60. package/build/commands/utils/userApi.js.map +1 -0
  61. package/build/commands/verifyFeature.js +429 -104
  62. package/build/commands/verifyFeature.js.map +1 -1
  63. package/build/commands/verifyInBrowser.js +1 -1
  64. package/build/commands/verifyInBrowser.js.map +1 -1
  65. package/build/skills/bug-bash.md +31 -10
  66. package/build/skills/feature-tracker/SKILL.md +8 -30
  67. package/build/skills/feature-tracker/create.md +47 -38
  68. package/build/skills/feature-tracker/start.md +4 -4
  69. package/build/skills/feature-tracker/verify.md +10 -14
  70. package/package.json +5 -3
  71. package/scripts/postinstall.js +18 -0
  72. package/build/skills/feature-tracker/manage.md +0 -145
@@ -1,15 +1,16 @@
1
1
  import { query } from '@anthropic-ai/claude-agent-sdk';
2
2
  import { join, dirname } from 'path';
3
- import { readFile, readdir, appendFile, mkdir, rm } from 'fs/promises';
3
+ import { readFile, readdir, appendFile, mkdir, rm, stat } from 'fs/promises';
4
4
  import { existsSync } from 'fs';
5
5
  import { execSync } from 'child_process';
6
6
  import { tmpdir } from 'os';
7
7
  import inquirer from 'inquirer';
8
8
  import { loadSettings, resolveEnvVars, buildPlaywrightConfig, cleanupTempFiles, } from './utils/settings.js';
9
- import { createBrowserSession, updateBrowserSession, getUploadUrls, uploadTrace, uploadConversation, uploadScreenshot, buildTraceViewerUrl, getAnthropicApiKey, } from './utils/browserSessionsApi.js';
9
+ import { createBrowserSession, updateBrowserSession, getUploadUrls, uploadTrace, uploadConversation, uploadScreenshot, uploadVideo, buildTraceViewerUrl, getAnthropicApiKey, } from './utils/browserSessionsApi.js';
10
10
  import { getToken } from './utils/keychain.js';
11
11
  import { getActiveFeatureId } from './feature.js';
12
- import { getFeature, addChecklistItem, updateChecklistItem, } from './utils/featureApi.js';
12
+ import { getFeature, updateChecklistItem, startSession, } from './utils/featureApi.js';
13
+ import { getRangerDir } from './utils/rangerRoot.js';
13
14
  /**
14
15
  * Zip a directory and return the buffer
15
16
  */
@@ -26,7 +27,7 @@ async function zipDirectory(dirPath) {
26
27
  * Find the trace directory for a session
27
28
  */
28
29
  function getTraceDirectory(sessionId) {
29
- return join(process.cwd(), '.ranger', 'sessions', sessionId);
30
+ return join(getRangerDir(), 'sessions', sessionId);
30
31
  }
31
32
  /**
32
33
  * Get the conversation file path for a session
@@ -34,24 +35,132 @@ function getTraceDirectory(sessionId) {
34
35
  function getConversationFilePath(sessionId) {
35
36
  return join(tmpdir(), 'ranger-browser-sessions', sessionId, 'conversation.jsonl');
36
37
  }
38
+ /**
39
+ * Load videos from a session's videos directory
40
+ */
41
+ async function loadSessionVideos(sessionDir) {
42
+ const videosDir = join(sessionDir, 'videos');
43
+ if (!existsSync(videosDir)) {
44
+ return [];
45
+ }
46
+ const files = await readdir(videosDir);
47
+ const videoFiles = files.filter((f) => f.toLowerCase().endsWith('.webm'));
48
+ return videoFiles.map((filename) => ({
49
+ filename,
50
+ path: join(videosDir, filename),
51
+ }));
52
+ }
53
+ /**
54
+ * Get mock evaluation data for debug mode
55
+ */
56
+ function getMockEvaluation(outcome) {
57
+ const mockEvaluations = {
58
+ verified: {
59
+ success: true,
60
+ summary: '[DEBUG] Mock verification completed successfully.',
61
+ evaluation: 'verified',
62
+ evaluationReason: 'All checklist requirements were met.',
63
+ },
64
+ partial: {
65
+ success: false,
66
+ summary: '[DEBUG] Mock partial verification.',
67
+ evaluation: 'partial',
68
+ evaluationReason: 'Some requirements were not fully verified.',
69
+ issues: [
70
+ {
71
+ severity: 'MINOR',
72
+ type: 'OTHER',
73
+ description: 'Secondary feature not fully implemented',
74
+ },
75
+ ],
76
+ },
77
+ incomplete: {
78
+ success: false,
79
+ summary: '[DEBUG] Mock incomplete verification.',
80
+ evaluation: 'partial',
81
+ evaluationReason: 'Implementation is incomplete and needs additional work.',
82
+ issues: [
83
+ {
84
+ severity: 'MAJOR',
85
+ type: 'OTHER',
86
+ description: 'Feature is partially implemented but missing key functionality',
87
+ },
88
+ {
89
+ severity: 'MINOR',
90
+ type: 'OTHER',
91
+ description: 'UI elements present but not fully functional',
92
+ },
93
+ ],
94
+ },
95
+ blocked: {
96
+ success: false,
97
+ summary: '[DEBUG] Mock blocked verification.',
98
+ evaluation: 'blocked',
99
+ evaluationReason: 'HTTP 404 - Page not found.',
100
+ issues: [
101
+ {
102
+ severity: 'BLOCKER',
103
+ type: 'HTTP_404',
104
+ description: 'Target page returns 404 Not Found',
105
+ },
106
+ {
107
+ severity: 'MAJOR',
108
+ type: 'NAVIGATION_ERROR',
109
+ description: 'Unable to proceed due to missing page',
110
+ },
111
+ ],
112
+ },
113
+ failed: {
114
+ success: false,
115
+ summary: '[DEBUG] Mock failed verification.',
116
+ evaluation: 'failed',
117
+ evaluationReason: 'Browser automation failed with timeout error.',
118
+ issues: [
119
+ {
120
+ severity: 'BLOCKER',
121
+ type: 'OTHER',
122
+ description: 'Timeout waiting for element',
123
+ },
124
+ ],
125
+ },
126
+ };
127
+ return mockEvaluations[outcome];
128
+ }
129
+ /**
130
+ * Get the debug mode prompt for minimal browser interaction
131
+ */
132
+ function getDebugPrompt() {
133
+ return `You are testing browser automation. Your task is simple:
134
+
135
+ 1. Navigate to https://www.mozilla.org using browser_navigate
136
+ 2. Take a snapshot with browser_snapshot to see the page
137
+ 3. Take a screenshot named "01_mozilla-homepage.png" using browser_take_screenshot
138
+ 4. Return immediately with the structured output
139
+
140
+ Return your findings in the structured output format.`;
141
+ }
37
142
  /**
38
143
  * Prompt user to select a checklist item
39
144
  */
40
145
  async function selectChecklistItem(items) {
146
+ if (items.length === 0) {
147
+ return null;
148
+ }
41
149
  const choices = items.map((item, i) => {
42
150
  const emoji = item.status === 'verified'
43
151
  ? '\u2705'
44
- : item.status === 'blocked'
45
- ? '\ud83d\uded1'
46
- : item.status === 'canceled'
47
- ? '\u26d4'
48
- : '\u2b1c';
152
+ : item.status === 'incomplete'
153
+ ? '\ud83d\udfe0' // orange circle
154
+ : item.status === 'blocked'
155
+ ? '\ud83d\uded1'
156
+ : item.status === 'canceled'
157
+ ? '\u26d4'
158
+ : '\u2b1c';
49
159
  return {
50
160
  name: `${i + 1}. ${emoji} ${item.description}`,
51
161
  value: item.id,
52
162
  };
53
163
  });
54
- choices.push({ name: '+ Add new item', value: '__new__' });
55
164
  const { selected } = await inquirer.prompt([
56
165
  {
57
166
  type: 'list',
@@ -60,18 +169,49 @@ async function selectChecklistItem(items) {
60
169
  choices,
61
170
  },
62
171
  ]);
63
- if (selected === '__new__') {
64
- return { item: null, addNew: true };
172
+ return items.find((i) => i.id === selected) || null;
173
+ }
174
+ /**
175
+ * Handle incomplete verification - check if all other items are terminal and prompt user
176
+ */
177
+ async function handleIncompleteItem(featureId, incompleteItem, result) {
178
+ const feature = await getFeature(featureId);
179
+ const sessionItems = feature.checklistItems.filter((i) => i.sessionId === feature.currentSessionId);
180
+ const otherItems = sessionItems.filter((i) => i.id !== incompleteItem.id);
181
+ const allOthersTerminal = otherItems.every((i) => i.status === 'verified' ||
182
+ i.status === 'blocked' ||
183
+ i.status === 'canceled' ||
184
+ i.status === 'incomplete');
185
+ console.log(`\n${'='.repeat(60)}`);
186
+ console.log(`INCOMPLETE - Verification found issues`);
187
+ console.log(`${'='.repeat(60)}`);
188
+ // Display structured list of issues
189
+ if (result.issues && result.issues.length > 0) {
190
+ console.log(`\nIssues found:`);
191
+ for (const issue of result.issues) {
192
+ console.log(` • ${issue.description}`);
193
+ }
65
194
  }
66
- return {
67
- item: items.find((i) => i.id === selected) || null,
68
- addNew: false,
69
- };
195
+ else if (result.evaluationReason) {
196
+ console.log(`\nReason: ${result.evaluationReason}`);
197
+ }
198
+ console.log(`\nNext steps:`);
199
+ console.log(` 1. Fix the issues above in your code`);
200
+ console.log(` 2. Run 'ranger verify-feature' again to re-verify`);
201
+ if (allOthersTerminal && otherItems.length > 0) {
202
+ console.log(`\nAll other checklist items are complete.`);
203
+ console.log(`If you're done for now, run 'ranger feature conclude-session' to end this session.`);
204
+ }
205
+ console.log(`${'='.repeat(60)}\n`);
70
206
  }
71
207
  /**
72
208
  * Verify a checklist item in the browser
73
209
  */
74
- export async function verifyFeature(url, options) {
210
+ export async function verifyFeature(options) {
211
+ const isDebugMode = !!options.debugOutcome;
212
+ if (isDebugMode) {
213
+ console.log(`\n[DEBUG MODE] Running minimal browser test with outcome: ${options.debugOutcome}`);
214
+ }
75
215
  // 1. Check for active feature
76
216
  const featureId = await getActiveFeatureId();
77
217
  if (!featureId) {
@@ -80,77 +220,104 @@ export async function verifyFeature(url, options) {
80
220
  // Load feature details
81
221
  const feature = await getFeature(featureId);
82
222
  console.log(`\nActive feature: ${feature.name} (${featureId})`);
223
+ // Filter to only items in the current session
224
+ const currentSessionId = feature.currentSessionId;
225
+ const currentSessionItems = currentSessionId
226
+ ? feature.checklistItems.filter((item) => item.sessionId === currentSessionId)
227
+ : feature.checklistItems;
83
228
  // 2. Determine which checklist item we're verifying
84
229
  let checklistItem = null;
85
230
  let taskDescription = options.task;
86
- if (options.newItem) {
87
- // Create a new item with the provided description
88
- checklistItem = await addChecklistItem(featureId, {
89
- description: options.newItem,
90
- });
91
- console.log(`Created new checklist item: ${checklistItem.description}`);
92
- if (!taskDescription) {
93
- taskDescription = options.newItem;
94
- }
95
- }
96
- else if (options.item !== undefined) {
97
- // Use specified item index
231
+ if (options.item !== undefined) {
232
+ // Use specified item index (1-based, relative to current session items)
98
233
  const itemIndex = options.item - 1; // 1-based to 0-based
99
- if (itemIndex < 0 || itemIndex >= feature.checklistItems.length) {
100
- throw new Error(`Invalid item index: ${options.item}. Feature has ${feature.checklistItems.length} items.`);
234
+ if (itemIndex < 0 || itemIndex >= currentSessionItems.length) {
235
+ throw new Error(`Invalid item index: ${options.item}. Current session has ${currentSessionItems.length} items.`);
101
236
  }
102
- checklistItem = feature.checklistItems[itemIndex];
237
+ checklistItem = currentSessionItems[itemIndex];
103
238
  if (!taskDescription) {
104
239
  taskDescription = checklistItem.description;
105
240
  }
106
241
  }
107
242
  else {
108
- // Interactive selection
109
- const { item, addNew } = await selectChecklistItem(feature.checklistItems);
110
- if (addNew) {
111
- const { description } = await inquirer.prompt([
112
- {
113
- type: 'input',
114
- name: 'description',
115
- message: 'Enter new item description:',
116
- validate: (input) => input.trim() ? true : 'Description is required',
117
- },
118
- ]);
119
- checklistItem = await addChecklistItem(featureId, {
120
- description: description.trim(),
243
+ // Check if running in non-TTY environment (CI, scripts, Claude Code, etc.)
244
+ const isInteractive = process.stdin.isTTY && process.stdout.isTTY;
245
+ if (!isInteractive) {
246
+ // Non-TTY mode: require --item flag, show available items
247
+ console.log('\nNon-interactive mode detected. The --item flag is required.');
248
+ console.log('\nAvailable checklist items for current session:');
249
+ currentSessionItems.forEach((item, i) => {
250
+ const emoji = item.status === 'verified'
251
+ ? '\u2705'
252
+ : item.status === 'incomplete'
253
+ ? '\ud83d\udfe0' // orange circle
254
+ : item.status === 'blocked'
255
+ ? '\ud83d\uded1'
256
+ : item.status === 'canceled'
257
+ ? '\u26d4'
258
+ : '\u2b1c';
259
+ console.log(` ${i + 1}. ${emoji} ${item.description}`);
121
260
  });
122
- console.log(`Created new checklist item: ${checklistItem.description}`);
123
- if (!taskDescription) {
124
- taskDescription = checklistItem.description;
125
- }
261
+ console.log('\nUsage: ranger verify-feature --item <number>');
262
+ console.log('Example: ranger verify-feature --item 1');
263
+ throw new Error('The --item flag is required in non-interactive mode. See available items above.');
126
264
  }
127
- else {
128
- checklistItem = item;
129
- if (!taskDescription && checklistItem) {
130
- taskDescription = checklistItem.description;
131
- }
265
+ // Interactive selection (show only current session items)
266
+ checklistItem = await selectChecklistItem(currentSessionItems);
267
+ if (!taskDescription && checklistItem) {
268
+ taskDescription = checklistItem.description;
132
269
  }
133
270
  }
134
271
  if (!checklistItem) {
135
- throw new Error('No checklist item selected');
272
+ throw new Error('No checklist item selected. Create items when creating the feature with -c flag.');
136
273
  }
137
274
  if (!taskDescription) {
138
275
  throw new Error('No task description provided');
139
276
  }
140
277
  console.log(`\nVerifying: ${checklistItem.description}`);
141
278
  console.log(`Task: ${taskDescription}`);
142
- // 3. Load active environment
143
- const activeEnvPath = join(process.cwd(), '.ranger', 'active-env.txt');
144
- if (!existsSync(activeEnvPath)) {
145
- throw new Error('No active environment. Run: ranger use <env-name>');
279
+ // Start the session if it's in ready status
280
+ if (feature.currentSession &&
281
+ feature.currentSession.status === 'ready' &&
282
+ feature.currentSessionId) {
283
+ try {
284
+ await startSession(featureId, feature.currentSessionId);
285
+ }
286
+ catch (error) {
287
+ // Ignore if session is already started (race condition)
288
+ const message = error instanceof Error ? error.message : String(error);
289
+ if (!message.includes('already')) {
290
+ throw error;
291
+ }
292
+ }
146
293
  }
147
- const activeEnv = await readFile(activeEnvPath, 'utf-8').then((s) => s.trim());
148
- const envDir = join(process.cwd(), '.ranger', activeEnv);
294
+ // Update checklist item status to verification_in_progress
295
+ await updateChecklistItem(featureId, checklistItem.id, {
296
+ status: 'verification_in_progress',
297
+ });
298
+ // 3. Determine which environment to use (same pattern as verifyInBrowser)
299
+ let activeEnv;
300
+ if (options.env) {
301
+ activeEnv = options.env;
302
+ }
303
+ else {
304
+ const activeEnvPath = join(getRangerDir(), 'active-env.txt');
305
+ if (!existsSync(activeEnvPath)) {
306
+ throw new Error('No active environment. Run: ranger use <env-name>');
307
+ }
308
+ activeEnv = await readFile(activeEnvPath, 'utf-8').then((s) => s.trim());
309
+ }
310
+ const envDir = join(getRangerDir(), activeEnv);
149
311
  if (!existsSync(envDir)) {
150
312
  throw new Error(`Environment not found at ${envDir}. Run: ranger add env ${activeEnv}`);
151
313
  }
152
314
  const settings = await loadSettings(activeEnv);
153
315
  const resolvedSettings = resolveEnvVars(settings);
316
+ // Get base URL from settings
317
+ const url = resolvedSettings.baseUrl;
318
+ if (!url) {
319
+ throw new Error(`No baseUrl configured for environment "${activeEnv}". Run: ranger config set ${activeEnv} baseUrl <url>`);
320
+ }
154
321
  // 4. Create browser session
155
322
  const token = await getToken();
156
323
  if (!token) {
@@ -161,6 +328,8 @@ export async function verifyFeature(url, options) {
161
328
  settings: resolvedSettings,
162
329
  task: taskDescription,
163
330
  url,
331
+ featureId,
332
+ checklistItemId: checklistItem.id,
164
333
  });
165
334
  console.log(`Browser session created: ${browserSession.id}`);
166
335
  const configResult = await buildPlaywrightConfig(resolvedSettings, activeEnv, browserSession?.id);
@@ -195,23 +364,79 @@ export async function verifyFeature(url, options) {
195
364
  throw new Error(errorMsg);
196
365
  }
197
366
  // 5. UI Verifier + Evaluation Agent prompt
198
- const verifierPrompt = `You are a Feature Verifier. Your job is to verify a checklist item by executing a UI flow and evaluating whether it adequately completes the checklist item.
367
+ let verifierPrompt;
368
+ if (isDebugMode) {
369
+ verifierPrompt = getDebugPrompt();
370
+ }
371
+ else {
372
+ const notesSection = checklistItem.notes
373
+ ? `\n\n## Additional Notes\n${checklistItem.notes}`
374
+ : '';
375
+ verifierPrompt = `You are a Feature Verifier. Your job is to verify a checklist item by executing a UI flow and evaluating whether it adequately completes the checklist item.
199
376
 
200
377
  ## Checklist Item to Verify
201
- ${checklistItem.description}
378
+ ${checklistItem.description}${notesSection}
202
379
 
203
380
  ## Task to Execute
204
381
  ${taskDescription}
205
382
 
206
- ## URL
207
- ${url}
383
+ CRITICAL URL REQUIREMENT:
384
+ Your base URL is: ${url}
385
+ - You may ONLY navigate to paths under this base URL (same protocol, host, and port)
386
+ - For example, if the base URL is "http://localhost:3000", you can navigate to "http://localhost:3000/home", "http://localhost:3000/settings", etc.
387
+ - DO NOT navigate to any different domain, host, or port under any circumstances
388
+ - IGNORE any URLs from product documentation (mcp__ranger__get_product_docs) that have a different base URL
389
+ - If documentation or code diffs suggest a path exists (e.g., "/dashboard"), you may navigate to that path ONLY under the base URL above
390
+ - The base URL above is the ONLY authorized environment for this verification
208
391
 
209
392
  ## Instructions
210
- 1. Navigate to the URL using browser_navigate
393
+ 1. Navigate to the URL above using browser_navigate
211
394
  2. Take a snapshot with browser_snapshot to see the page
212
- 3. Execute the task step-by-step using browser tools
213
- 4. Document any issues found (bugs, errors, unexpected behavior)
214
- 5. After completing the verification, evaluate whether the result adequately verifies the checklist item
395
+ 3. **IMMEDIATELY check for blocking HTTP errors before proceeding**
396
+ 4. Execute the task step-by-step using browser tools
397
+ 5. **Take screenshots at key moments** (see Screenshot Guidelines below)
398
+ 6. Document any issues found (bugs, errors, unexpected behavior)
399
+ 7. After completing the verification, evaluate whether the result adequately verifies the checklist item
400
+
401
+ ## Screenshot Guidelines - IMPORTANT
402
+ Take screenshots throughout the verification flow so a human can review it for completeness. Screenshots are your evidence trail.
403
+
404
+ **When to take screenshots (use browser_take_screenshot):**
405
+ - After initial page load (capture starting state)
406
+ - Before and after clicking buttons or submitting forms
407
+ - When important UI elements appear (modals, notifications, loading states)
408
+ - After navigation to new pages
409
+ - When verifying specific elements exist
410
+ - At the final state showing the completed action
411
+
412
+ **Screenshot naming:**
413
+ - Use descriptive filenames: "01_login-page-loaded.png", "02_form-filled.png", "03_dashboard-visible.png"
414
+ - Number prefixes (01_, 02_, etc.) help maintain chronological order
415
+ - For KEY MOMENTS that prove the checklist item is complete, prefix with "key_": "key_04_success-message.png", "key_05_final-state.png"
416
+ - The "key_" prefix marks screenshots as high-priority evidence for human reviewers
417
+
418
+ **Aim for 3-6 screenshots per verification** to document the complete flow. Mark 1-2 of the most important ones with the "key_" prefix.
419
+
420
+ ## Critical: Early Error Detection
421
+ After step 2 (taking initial snapshot), IMMEDIATELY check for blocking HTTP errors:
422
+
423
+ **Blocking errors to detect:**
424
+ - HTTP 404: "404", "Not Found", "Page not found", "does not exist"
425
+ - HTTP 500: "500", "Internal Server Error", "Server Error", "Something went wrong"
426
+ - HTTP 400: "400", "Bad Request", "Invalid request"
427
+
428
+ **Also check for:**
429
+ - Framework error pages (Next.js error boundary, React error page, "Application error")
430
+ - Completely blank/empty pages with no content
431
+ - "Cannot GET /path" messages
432
+
433
+ **If ANY blocking error is detected:**
434
+ 1. DO NOT continue with the task
435
+ 2. Return IMMEDIATELY with evaluation: "blocked"
436
+ 3. Set evaluationReason to describe the specific error (e.g., "HTTP 404 - Page not found at /dashboard")
437
+ 4. Include the error in issues array with severity: "BLOCKER" and appropriate type (HTTP_404, HTTP_500, HTTP_400, or NAVIGATION_ERROR)
438
+
439
+ This early exit prevents wasting time on tasks that cannot succeed due to fundamental errors.
215
440
 
216
441
  ## Evaluation Criteria
217
442
  - VERIFIED: The task completed successfully and the checklist item requirements are fully met
@@ -220,6 +445,7 @@ ${url}
220
445
  - FAILED: The task could not be completed due to errors
221
446
 
222
447
  Return your findings in the structured output format with your evaluation.`;
448
+ }
223
449
  const outputSchema = {
224
450
  type: 'object',
225
451
  properties: {
@@ -239,6 +465,16 @@ Return your findings in the structured output format with your evaluation.`;
239
465
  type: 'string',
240
466
  enum: ['BLOCKER', 'MAJOR', 'MINOR'],
241
467
  },
468
+ type: {
469
+ type: 'string',
470
+ enum: [
471
+ 'HTTP_404',
472
+ 'HTTP_500',
473
+ 'HTTP_400',
474
+ 'NAVIGATION_ERROR',
475
+ 'OTHER',
476
+ ],
477
+ },
242
478
  description: { type: 'string' },
243
479
  screenshot: { type: 'string' },
244
480
  },
@@ -264,7 +500,6 @@ Return your findings in the structured output format with your evaluation.`;
264
500
  type: 'json_schema',
265
501
  schema: outputSchema,
266
502
  },
267
- maxTurns: 25,
268
503
  env: {
269
504
  ...process.env,
270
505
  ANTHROPIC_API_KEY: anthropicApiKey,
@@ -275,6 +510,8 @@ Return your findings in the structured output format with your evaluation.`;
275
510
  // 7. Collect messages
276
511
  let finalResult = null;
277
512
  let agentError = null;
513
+ // Fallback: capture StructuredOutput tool call input in case SDK fails to populate structured_output
514
+ let lastStructuredOutputInput = null;
278
515
  const conversationFilePath = getConversationFilePath(browserSession.id);
279
516
  const conversationDir = dirname(conversationFilePath);
280
517
  await mkdir(conversationDir, { recursive: true });
@@ -297,13 +534,25 @@ Return your findings in the structured output format with your evaluation.`;
297
534
  // Ignore
298
535
  }
299
536
  const msg = message;
537
+ // Capture StructuredOutput tool call input as fallback
538
+ // This handles SDK bug where structured_output is not populated in result
539
+ if (msg.type === 'assistant' && msg.message?.content) {
540
+ for (const block of msg.message.content) {
541
+ if (block.type === 'tool_use' &&
542
+ block.name === 'StructuredOutput' &&
543
+ block.input) {
544
+ lastStructuredOutputInput =
545
+ block.input;
546
+ }
547
+ }
548
+ }
300
549
  if (msg.error) {
301
550
  let errorText = msg.error;
302
551
  if (msg.message?.content &&
303
552
  Array.isArray(msg.message.content)) {
304
553
  const texts = msg.message.content
305
554
  .filter((c) => c.type === 'text')
306
- .map((c) => c.text)
555
+ .map((c) => c.text || '')
307
556
  .filter(Boolean);
308
557
  if (texts.length > 0) {
309
558
  errorText = texts.join(' ');
@@ -318,7 +567,15 @@ Return your findings in the structured output format with your evaluation.`;
318
567
  message.structured_output;
319
568
  }
320
569
  else if (message.subtype !== 'success') {
321
- if (!agentError) {
570
+ // SDK bug workaround: If we got error_during_execution but have
571
+ // a StructuredOutput tool call, use that instead
572
+ if (lastStructuredOutputInput &&
573
+ message.errors?.length === 0) {
574
+ finalResult = lastStructuredOutputInput;
575
+ // Clear the error since we actually succeeded
576
+ agentError = null;
577
+ }
578
+ else if (!agentError) {
322
579
  agentError =
323
580
  message.errors?.join(', ') ||
324
581
  'Unknown error';
@@ -334,27 +591,60 @@ Return your findings in the structured output format with your evaluation.`;
334
591
  agentError = error instanceof Error ? error.message : String(error);
335
592
  }
336
593
  const durationMs = Date.now() - startTime;
337
- // 8. Upload trace and update session
594
+ // 8. Upload trace, videos, screenshots with metadata, and update session
338
595
  let traceDownloadUrl;
339
596
  try {
340
597
  const traceDir = getTraceDirectory(browserSession.id);
341
598
  if (existsSync(traceDir)) {
342
599
  const files = await readdir(traceDir);
343
600
  if (files.length > 0) {
601
+ // Upload trace zip
344
602
  const traceUrls = await getUploadUrls(browserSession.id, 'trace.zip', 'zip');
345
603
  const traceBuffer = await zipDirectory(traceDir);
346
604
  await uploadTrace(traceUrls.uploadUrl, traceBuffer);
347
605
  traceDownloadUrl = traceUrls.downloadUrl;
606
+ // Upload videos from videos/ subdirectory
607
+ const videos = await loadSessionVideos(traceDir);
608
+ for (const video of videos) {
609
+ try {
610
+ const videoBuffer = await readFile(video.path);
611
+ const videoUrls = await getUploadUrls(browserSession.id, video.filename, 'webm');
612
+ await uploadVideo(videoUrls.uploadUrl, videoBuffer);
613
+ }
614
+ catch {
615
+ // Ignore individual video upload errors
616
+ }
617
+ }
618
+ // Upload screenshots (same approach as main, with metadata)
348
619
  const pngFiles = files.filter((f) => f.toLowerCase().endsWith('.png'));
349
620
  for (const pngFile of pngFiles) {
350
621
  try {
351
622
  const pngPath = join(traceDir, pngFile);
352
623
  const pngBuffer = await readFile(pngPath);
353
- const pngUrls = await getUploadUrls(browserSession.id, pngFile, 'png');
624
+ const pngStat = await stat(pngPath);
625
+ // Detect "key_" prefix for high-priority screenshots
626
+ const isKeyFrame = pngFile
627
+ .toLowerCase()
628
+ .startsWith('key_');
629
+ const displayName = pngFile
630
+ .replace(/\.png$/i, '')
631
+ .replace(/^key_/i, '')
632
+ .replace(/^\d+_/, '')
633
+ .replace(/-/g, ' ');
634
+ const pngUrls = await getUploadUrls(browserSession.id, pngFile, 'png', {
635
+ metadata: {
636
+ name: displayName,
637
+ description: isKeyFrame
638
+ ? 'Key moment captured during verification'
639
+ : 'Screenshot captured during verification',
640
+ highPriority: isKeyFrame,
641
+ timestamp: pngStat.mtime.toISOString(),
642
+ },
643
+ });
354
644
  await uploadScreenshot(pngUrls.uploadUrl, pngBuffer);
355
645
  }
356
646
  catch {
357
- // Ignore
647
+ // Ignore individual screenshot upload errors
358
648
  }
359
649
  }
360
650
  }
@@ -390,39 +680,74 @@ Return your findings in the structured output format with your evaluation.`;
390
680
  catch {
391
681
  // Ignore upload errors
392
682
  }
393
- // 9. Update checklist item based on evaluation
394
- // Use typedResult from outer scope for the evaluation
395
- const resultForEval = finalResult;
396
- if (resultForEval && checklistItem) {
397
- const evaluation = resultForEval.evaluation;
398
- if (evaluation === 'verified') {
399
- await updateChecklistItem(featureId, checklistItem.id, {
400
- status: 'verified',
401
- browserSessionId: browserSession.id,
402
- });
403
- console.log(`\n\u2705 Checklist item verified!`);
404
- }
405
- else if (evaluation === 'blocked') {
406
- await updateChecklistItem(featureId, checklistItem.id, {
407
- status: 'blocked',
408
- browserSessionId: browserSession.id,
409
- blockedReason: resultForEval.evaluationReason,
410
- });
411
- console.log(`\n\ud83d\uded1 Checklist item blocked: ${resultForEval.evaluationReason}`);
683
+ // 9. Determine the result to use for evaluation
684
+ // In debug mode, use mock evaluation; otherwise use agent result
685
+ let resultForEval;
686
+ if (isDebugMode && options.debugOutcome) {
687
+ const mockEval = getMockEvaluation(options.debugOutcome);
688
+ resultForEval = {
689
+ ...mockEval,
690
+ sessionId: browserSession.id,
691
+ sessionDir: getTraceDirectory(browserSession.id),
692
+ durationMs,
693
+ traceViewerUrl: traceDownloadUrl
694
+ ? buildTraceViewerUrl(traceDownloadUrl)
695
+ : undefined,
696
+ checklistItemId: checklistItem.id,
697
+ };
698
+ console.log(`\n[DEBUG MODE] Using mock evaluation: ${options.debugOutcome}`);
699
+ }
700
+ else {
701
+ const typedResult = finalResult;
702
+ if (agentError && !typedResult) {
703
+ throw new Error(`Verification failed: ${agentError}`);
412
704
  }
413
- else if (evaluation === 'partial' || evaluation === 'failed') {
414
- // Keep pending but link session
415
- await updateChecklistItem(featureId, checklistItem.id, {
416
- browserSessionId: browserSession.id,
417
- });
418
- console.log(`\n\u26a0\ufe0f ${evaluation === 'partial' ? 'Partial verification' : 'Verification failed'}: ${resultForEval.evaluationReason}`);
705
+ if (!typedResult) {
706
+ throw new Error('No result received from agent');
419
707
  }
708
+ resultForEval = typedResult;
709
+ }
710
+ // 10. Update checklist item based on evaluation
711
+ const evaluation = resultForEval.evaluation;
712
+ if (evaluation === 'verified') {
713
+ await updateChecklistItem(featureId, checklistItem.id, {
714
+ status: 'verified',
715
+ browserSessionId: browserSession.id,
716
+ });
717
+ console.log(`\n\u2705 Checklist item verified!`);
420
718
  }
421
- if (agentError && !resultForEval) {
422
- throw new Error(`Verification failed: ${agentError}`);
719
+ else if (evaluation === 'blocked') {
720
+ await updateChecklistItem(featureId, checklistItem.id, {
721
+ status: 'blocked',
722
+ browserSessionId: browserSession.id,
723
+ blockedReason: resultForEval.evaluationReason,
724
+ });
725
+ // Enhanced output for Claude Code
726
+ console.log(`\n${'='.repeat(60)}`);
727
+ console.log(`BLOCKING ISSUE DETECTED - Debug Required`);
728
+ console.log(`${'='.repeat(60)}`);
729
+ console.log(`\nIssue: ${resultForEval.evaluationReason}`);
730
+ if (resultForEval.issues?.length) {
731
+ console.log(`\nDetails:`);
732
+ for (const issue of resultForEval.issues) {
733
+ const typeStr = issue.type ? ` (${issue.type})` : '';
734
+ console.log(` - [${issue.severity}]${typeStr} ${issue.description}`);
735
+ }
736
+ }
737
+ if (resultForEval.traceViewerUrl) {
738
+ console.log(`\nTrace: ${resultForEval.traceViewerUrl}`);
739
+ }
740
+ console.log(`\nSuggested action: Debug this issue in your code, then run verify-feature again.`);
741
+ console.log(`${'='.repeat(60)}\n`);
423
742
  }
424
- if (!resultForEval) {
425
- throw new Error('No result received from agent');
743
+ else if (evaluation === 'partial' || evaluation === 'failed') {
744
+ // Mark as incomplete - verification happened but requirements not fully met
745
+ await updateChecklistItem(featureId, checklistItem.id, {
746
+ status: 'incomplete',
747
+ browserSessionId: browserSession.id,
748
+ });
749
+ // Check if other items are terminal and prompt user
750
+ await handleIncompleteItem(featureId, checklistItem, resultForEval);
426
751
  }
427
752
  return resultForEval;
428
753
  }