testchimp-runner-core 0.0.35 → 0.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/package.json +6 -1
  2. package/plandocs/BEFORE_AFTER_VERIFICATION.md +0 -148
  3. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +0 -144
  4. package/plandocs/CREDIT_CALLBACK_ARCHITECTURE.md +0 -253
  5. package/plandocs/HUMAN_LIKE_IMPROVEMENTS.md +0 -642
  6. package/plandocs/IMPLEMENTATION_STATUS.md +0 -108
  7. package/plandocs/INTEGRATION_COMPLETE.md +0 -322
  8. package/plandocs/MULTI_AGENT_ARCHITECTURE_REVIEW.md +0 -844
  9. package/plandocs/ORCHESTRATOR_MVP_SUMMARY.md +0 -539
  10. package/plandocs/PHASE1_ABSTRACTION_COMPLETE.md +0 -241
  11. package/plandocs/PHASE1_FINAL_STATUS.md +0 -210
  12. package/plandocs/PHASE_1_COMPLETE.md +0 -165
  13. package/plandocs/PHASE_1_SUMMARY.md +0 -184
  14. package/plandocs/PLANNING_SESSION_SUMMARY.md +0 -372
  15. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +0 -120
  16. package/plandocs/PROMPT_SANITY_CHECK.md +0 -120
  17. package/plandocs/SCRIPT_CLEANUP_FEATURE.md +0 -201
  18. package/plandocs/SCRIPT_GENERATION_ARCHITECTURE.md +0 -364
  19. package/plandocs/SELECTOR_IMPROVEMENTS.md +0 -139
  20. package/plandocs/SESSION_SUMMARY_v0.0.33.md +0 -151
  21. package/plandocs/TROUBLESHOOTING_SESSION.md +0 -72
  22. package/plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md +0 -336
  23. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +0 -396
  24. package/plandocs/WHATS_NEW_v0.0.33.md +0 -183
  25. package/plandocs/exploratory-mode-support-v2.plan.md +0 -953
  26. package/plandocs/exploratory-mode-support.plan.md +0 -928
  27. package/plandocs/journey-id-tracking-addendum.md +0 -227
  28. package/releasenotes/RELEASE_0.0.26.md +0 -165
  29. package/releasenotes/RELEASE_0.0.27.md +0 -236
  30. package/releasenotes/RELEASE_0.0.28.md +0 -286
  31. package/src/auth-config.ts +0 -84
  32. package/src/credit-usage-service.ts +0 -188
  33. package/src/env-loader.ts +0 -103
  34. package/src/execution-service.ts +0 -996
  35. package/src/file-handler.ts +0 -104
  36. package/src/index.ts +0 -432
  37. package/src/llm-facade.ts +0 -821
  38. package/src/llm-provider.ts +0 -53
  39. package/src/model-constants.ts +0 -35
  40. package/src/orchestrator/decision-parser.ts +0 -139
  41. package/src/orchestrator/index.ts +0 -58
  42. package/src/orchestrator/orchestrator-agent.ts +0 -1282
  43. package/src/orchestrator/orchestrator-prompts.ts +0 -786
  44. package/src/orchestrator/page-som-handler.ts +0 -1565
  45. package/src/orchestrator/som-types.ts +0 -188
  46. package/src/orchestrator/tool-registry.ts +0 -184
  47. package/src/orchestrator/tools/check-page-ready.ts +0 -75
  48. package/src/orchestrator/tools/extract-data.ts +0 -92
  49. package/src/orchestrator/tools/index.ts +0 -15
  50. package/src/orchestrator/tools/inspect-page.ts +0 -42
  51. package/src/orchestrator/tools/recall-history.ts +0 -72
  52. package/src/orchestrator/tools/refresh-som-markers.ts +0 -69
  53. package/src/orchestrator/tools/take-screenshot.ts +0 -128
  54. package/src/orchestrator/tools/verify-action-result.ts +0 -159
  55. package/src/orchestrator/tools/view-previous-screenshot.ts +0 -103
  56. package/src/orchestrator/types.ts +0 -291
  57. package/src/playwright-mcp-service.ts +0 -224
  58. package/src/progress-reporter.ts +0 -144
  59. package/src/prompts.ts +0 -842
  60. package/src/providers/backend-proxy-llm-provider.ts +0 -91
  61. package/src/providers/local-llm-provider.ts +0 -38
  62. package/src/scenario-service.ts +0 -252
  63. package/src/scenario-worker-class.ts +0 -1110
  64. package/src/script-utils.ts +0 -203
  65. package/src/types.ts +0 -239
  66. package/src/utils/browser-utils.ts +0 -348
  67. package/src/utils/coordinate-converter.ts +0 -162
  68. package/src/utils/page-info-retry.ts +0 -65
  69. package/src/utils/page-info-utils.ts +0 -285
  70. package/testchimp-runner-core-0.0.35.tgz +0 -0
  71. package/tsconfig.json +0 -19
package/src/orchestrator/tools/refresh-som-markers.ts +0 -69
@@ -1,69 +0,0 @@
1
- import { Tool, ToolParameter, ToolExecutionContext } from '../tool-registry';
2
- import { ToolResult } from '../types';
3
-
4
- /**
5
- * Tool to refresh SoM markers when they appear outdated or misaligned
6
- */
7
- export class RefreshSomMarkersTool implements Tool {
8
- name = 'refresh_som_markers';
9
- description = 'Manually refresh the Set-of-Marks visual markers on the page. Use when: (1) Markers appear misaligned with actual UI elements, (2) Page content has changed but markers are stale (e.g., after dynamic content loads), (3) You suspect markers are from a previous page state. Returns updated screenshot with fresh markers.';
10
-
11
- parameters: ToolParameter[] = [
12
- {
13
- name: 'reason',
14
- type: 'string',
15
- description: 'Why you need to refresh markers. Examples: "dropdown expanded but markers still show closed state", "new content loaded but not marked", "markers seem to point to wrong elements"',
16
- required: true
17
- }
18
- ];
19
-
20
- async execute(params: Record<string, any>, context: ToolExecutionContext): Promise<ToolResult> {
21
- const { logger, page, somHandler } = context;
22
- const reason = params.reason || 'Markers appear outdated';
23
-
24
- if (!somHandler) {
25
- return {
26
- success: false,
27
- error: 'SoM mode not enabled - refresh markers tool unavailable.'
28
- };
29
- }
30
-
31
- if (!page) {
32
- return {
33
- success: false,
34
- error: 'No page context available.'
35
- };
36
- }
37
-
38
- try {
39
- logger?.(`[RefreshSomMarkers] Refreshing markers due to: ${reason}`, 'log');
40
-
41
- // Ensure somHandler has the latest page reference
42
- somHandler.setPage(page);
43
-
44
- // Re-scan page and update markers
45
- await somHandler.updateSom();
46
- logger?.(`[RefreshSomMarkers] ✓ Markers updated`, 'log');
47
-
48
- // Capture fresh screenshot with new markers (viewport only - cheaper than full page)
49
- const freshScreenshot = await somHandler.getScreenshot(true, false, 60);
50
- logger?.(`[RefreshSomMarkers] ✓ Fresh screenshot captured (viewport)`, 'log');
51
-
52
- return {
53
- success: true,
54
- data: {
55
- screenshot: freshScreenshot,
56
- reason
57
- },
58
- learning: `SoM markers refreshed. New screenshot shows current page state with updated element markers. Reason: ${reason}`
59
- };
60
- } catch (error: any) {
61
- logger?.(`[RefreshSomMarkers] ✗ Failed: ${error.message}`, 'error');
62
- return {
63
- success: false,
64
- error: `Failed to refresh markers: ${error.message}`
65
- };
66
- }
67
- }
68
- }
69
-
package/src/orchestrator/tools/take-screenshot.ts +0 -128
@@ -1,128 +0,0 @@
1
- /**
2
- * Take Screenshot Tool
3
- * Captures current page state visually and analyzes it with DOM snapshot
4
- */
5
-
6
- import { Tool, ToolParameter, ToolExecutionContext } from '../tool-registry';
7
- import { ToolResult } from '../types';
8
- import { getEnhancedPageInfo } from '../../utils/page-info-utils';
9
- import { VISION_MODEL } from '../../model-constants';
10
-
11
- export class TakeScreenshotTool implements Tool {
12
- name = 'take_screenshot';
13
- description = 'Capture a screenshot and analyze it with DOM snapshot to get actionable selector recommendations. Use when you need to see the actual page visually to find the right elements. Returns text-based analysis with selector suggestions.';
14
-
15
- // LLM facade will be injected
16
- private llmFacade?: any;
17
-
18
- setLLMFacade(llmFacade: any): void {
19
- this.llmFacade = llmFacade;
20
- }
21
-
22
- parameters: ToolParameter[] = [
23
- {
24
- name: 'isFullPage',
25
- type: 'boolean',
26
- description: 'If true, captures entire page (scrolling). If false, captures only viewport. DEFAULT: true (recommended for finding elements that may be below fold)',
27
- required: false,
28
- default: true
29
- },
30
- {
31
- name: 'purpose',
32
- type: 'string',
33
- description: 'What you are trying to find or understand from the screenshot',
34
- required: false
35
- }
36
- ];
37
-
38
- async execute(params: Record<string, any>, context: ToolExecutionContext): Promise<ToolResult> {
39
- const { page, logger } = context;
40
- const isFullPage = params.isFullPage !== undefined ? params.isFullPage : true; // Default: true (capture full page)
41
- const purpose = params.purpose || 'Analyze page structure and elements';
42
-
43
- try {
44
- logger?.(`[TakeScreenshot] Capturing ${isFullPage ? 'full page' : 'viewport'} screenshot for: ${purpose}`, 'log');
45
-
46
- // Capture screenshot (JPEG 60 quality for smaller size)
47
- const screenshotBuffer = await page.screenshot({
48
- fullPage: isFullPage,
49
- type: 'jpeg',
50
- quality: 60
51
- });
52
-
53
- // Convert to data URL
54
- const base64 = screenshotBuffer.toString('base64');
55
- const dataUrl = `data:image/jpeg;base64,${base64}`;
56
-
57
- logger?.(`[TakeScreenshot] ✓ Screenshot captured (${Math.round(base64.length / 1024)}KB), analyzing with DOM...`, 'log');
58
-
59
- // Get DOM snapshot for correlation (already has ARIA tree + interactive elements with bboxes)
60
- const pageInfo = await getEnhancedPageInfo(page);
61
-
62
- // Analyze screenshot with structured DOM via LLM
63
- let analysis = 'Screenshot captured. Use DOM snapshot in context to find selectors.';
64
-
65
- if (this.llmFacade) {
66
- try {
67
- logger?.(`[TakeScreenshot] Calling LLM for vision analysis...`, 'log');
68
-
69
- const analysisPrompt = `Analyze screenshot WITH DOM structure to identify correct Playwright selectors.
70
-
71
- PURPOSE: ${purpose}
72
-
73
- INTERACTIVE ELEMENTS (with positions and suggested selectors):
74
- ${pageInfo.formattedElements}
75
-
76
- ARIA TREE (hierarchical structure):
77
- ${JSON.stringify(pageInfo.ariaSnapshot, null, 2).substring(0, 2000)}
78
-
79
- TASK:
80
- 1. Look at screenshot - identify the visual elements you need for: ${purpose}
81
- 2. Match visual position with bounding boxes above
82
- 3. Recommend SEMANTIC SELECTORS FIRST: getByRole, getByLabel, getByPlaceholder, getByText
83
- 4. AVOID auto-generated IDs with unicode (e.g., #«r3»-form-item)
84
-
85
- Output format:
86
- "For [visual element description]:
87
- Try: [semantic selector from list - prefer getByRole/getByLabel]
88
- Or: [alternative selector]"
89
-
90
- Be concise. Only 2-3 recommendations. Prioritize user-friendly semantic selectors.`;
91
-
92
-
93
- const llmResponse = await this.llmFacade.llmProvider.callLLM({
94
- model: VISION_MODEL,
95
- systemPrompt: 'You are a vision analysis expert for web automation. Analyze screenshots with DOM snapshots to recommend working Playwright selectors. ALWAYS prioritize semantic selectors (getByRole, getByLabel, getByText) over CSS selectors with auto-generated IDs.',
96
- userPrompt: analysisPrompt,
97
- imageUrl: dataUrl
98
- });
99
-
100
- analysis = llmResponse.answer || analysis;
101
- logger?.(`[TakeScreenshot] ✓ Vision analysis complete`, 'log');
102
-
103
- } catch (error: any) {
104
- logger?.(`[TakeScreenshot] ⚠ Vision analysis failed, returning raw screenshot: ${error.message}`, 'warn');
105
- }
106
- }
107
-
108
- return {
109
- success: true,
110
- data: {
111
- screenshot: dataUrl,
112
- isFullPage,
113
- size: base64.length,
114
- interactiveElementCount: pageInfo.interactiveElements.length
115
- },
116
- learning: analysis // Text-based analysis with selector recommendations
117
- };
118
- } catch (error: any) {
119
- logger?.(`[TakeScreenshot] ✗ Failed: ${error.message}`, 'error');
120
- return {
121
- success: false,
122
- error: `Screenshot capture failed: ${error.message}`
123
- };
124
- }
125
- }
126
- }
127
-
128
-
package/src/orchestrator/tools/verify-action-result.ts +0 -159
@@ -1,159 +0,0 @@
1
- /**
2
- * Verify Action Result Tool
3
- *
4
- * Compares before/after screenshots to verify if an action achieved its goal
5
- * USE SPARINGLY - expensive (sends 2 images to vision model)
6
- */
7
-
8
- import { Tool, ToolParameter, ToolExecutionContext } from '../tool-registry';
9
- import { ToolResult } from '../types';
10
- import { LabeledImage } from '../../llm-provider';
11
-
12
- export class VerifyActionResultTool implements Tool {
13
- name = 'verify_action_result';
14
-
15
- description = `Verify if your previous action achieved its goal by comparing before/after screenshots.
16
-
17
- ⚠️ USE SPARINGLY - EXPENSIVE (2 images sent to vision model)
18
-
19
- When to use:
20
- - After coordinate click to verify expected UI change occurred
21
- - After clicking element when unsure if goal achieved (no clear feedback)
22
- - To verify navigation or modal appeared
23
-
24
- How it works:
25
- 1. Takes screenshot BEFORE you run your next command
26
- 2. You run command (click, fill, etc.)
27
- 3. Takes screenshot AFTER command executes
28
- 4. Compares both and tells you if expected change happened
29
-
30
- Parameters:
31
- - expectedChange: What should have changed (e.g., "Dashboard page loaded", "Modal appeared", "Form submitted")
32
-
33
- Returns: { verified: boolean, reasoning: string, changes: string[] }
34
-
35
- Example:
36
- {
37
- "toolCalls": [{"name": "verify_action_result", "params": {"expectedChange": "Dashboard page loaded with data grid"}}],
38
- "toolReasoning": "Need to verify coordinate click navigated to correct page",
39
- "needsToolResults": true // IMPORTANT: Wait for verification before deciding status
40
- }`;
41
-
42
- parameters: ToolParameter[] = [
43
- {
44
- name: 'expectedChange',
45
- type: 'string',
46
- description: 'What UI change you expect to see (e.g., "Modal opened", "Page navigated", "New form appeared")',
47
- required: true
48
- }
49
- ];
50
-
51
- // LLM facade will be injected
52
- private llmFacade?: any;
53
-
54
- setLLMFacade(llmFacade: any): void {
55
- this.llmFacade = llmFacade;
56
- }
57
-
58
- async execute(params: Record<string, any>, context: ToolExecutionContext): Promise<ToolResult> {
59
- const { page, logger } = context;
60
- const expectedChange = params.expectedChange || 'Expected UI change';
61
-
62
- logger?.(`[VerifyActionResult] Capturing before/after screenshots...`);
63
-
64
- try {
65
- // Capture BEFORE screenshot (viewport only, JPEG 60% quality for cost efficiency)
66
- const beforeScreenshot = await page.screenshot({
67
- encoding: 'base64',
68
- fullPage: false,
69
- type: 'jpeg',
70
- quality: 60
71
- });
72
- const beforeDataUrl = `data:image/jpeg;base64,${beforeScreenshot}`;
73
-
74
- logger?.(`[VerifyActionResult] ✓ BEFORE screenshot captured (JPEG 60%)`);
75
- logger?.(`[VerifyActionResult] ⏳ Waiting 1.5s for UI to settle after your previous action...`);
76
-
77
- // Wait for UI to settle after previous action
78
- await page.waitForTimeout(1500);
79
-
80
- // Capture AFTER screenshot
81
- const afterScreenshot = await page.screenshot({
82
- encoding: 'base64',
83
- fullPage: false,
84
- type: 'jpeg',
85
- quality: 60
86
- });
87
- const afterDataUrl = `data:image/jpeg;base64,${afterScreenshot}`;
88
-
89
- logger?.(`[VerifyActionResult] ✓ AFTER screenshot captured (JPEG 60%)`);
90
- logger?.(`[VerifyActionResult] 🔍 Calling vision model to compare...`);
91
-
92
- if (!this.llmFacade) {
93
- throw new Error('LLM facade not initialized');
94
- }
95
-
96
- // Call LLM with both screenshots
97
- const request = {
98
- model: 'gpt-5-mini', // Vision model
99
- systemPrompt: 'You are a visual verification expert for web automation. Compare before/after screenshots to determine if an action achieved its expected outcome.',
100
- userPrompt: `Expected change: ${expectedChange}
101
-
102
- Compare the [BEFORE] and [AFTER] screenshots.
103
-
104
- Did the expected change occur? Respond with JSON:
105
- {
106
- "verified": boolean,
107
- "reasoning": "What changed (or didn't change) between screenshots",
108
- "changes": ["Specific UI changes observed"]
109
- }
110
-
111
- Be objective:
112
- - Look for new elements, panels, forms, modals
113
- - Check for page navigation, URL changes, content changes
114
- - Note any visual indicators of success/failure
115
-
116
- Be strict: Only return true if you clearly see the expected change.`,
117
- images: [
118
- { label: 'BEFORE', dataUrl: beforeDataUrl },
119
- { label: 'AFTER', dataUrl: afterDataUrl }
120
- ]
121
- };
122
-
123
- const response = await this.llmFacade.llmProvider.callLLM(request);
124
-
125
- // Parse response
126
- const jsonMatch = response.answer.match(/\{[\s\S]*\}/);
127
- if (!jsonMatch) {
128
- logger?.(`[VerifyActionResult] ⚠️ Could not parse verification response`, 'warn');
129
- return {
130
- success: false,
131
- error: 'Failed to parse verification response'
132
- };
133
- }
134
-
135
- const parsed = JSON.parse(jsonMatch[0]);
136
-
137
- logger?.(`[VerifyActionResult] 📊 Result: ${parsed.verified ? '✅ VERIFIED' : '❌ NOT VERIFIED'}`);
138
- logger?.(`[VerifyActionResult] 💭 ${parsed.reasoning}`);
139
- if (parsed.changes && parsed.changes.length > 0) {
140
- logger?.(`[VerifyActionResult] 📝 Changes observed:`);
141
- parsed.changes.forEach((change: string) => logger?.(` - ${change}`));
142
- }
143
-
144
- return {
145
- success: true,
146
- data: parsed,
147
- learning: `Verification result: ${parsed.verified ? 'Goal achieved' : 'Goal not achieved'}. ${parsed.reasoning}`
148
- };
149
-
150
- } catch (error: any) {
151
- logger?.(`[VerifyActionResult] ❌ Error: ${error.message}`, 'error');
152
- return {
153
- success: false,
154
- error: error.message
155
- };
156
- }
157
- }
158
- }
159
-
package/src/orchestrator/tools/view-previous-screenshot.ts +0 -103
@@ -1,103 +0,0 @@
1
- /**
2
- * View Previous Screenshot Tool
3
- * Access the screenshot from the previous iteration for continuity reasoning
4
- */
5
-
6
- import { Tool, ToolParameter, ToolExecutionContext } from '../tool-registry';
7
- import { ToolResult } from '../types';
8
-
9
- export class ViewPreviousScreenshotTool implements Tool {
10
- name = 'view_previous_screenshot';
11
- description = 'View the screenshot from the PREVIOUS iteration to understand continuity. Common uses: (1) Verify coordinate-based clicks - look for magenta "clicked" marker to see where click landed, (2) Compare before/after states - see what changed after commands, (3) Check transient effects - see alerts/toasts that may have disappeared. Returns vision analysis of the previous screenshot.';
12
-
13
- // LLM facade for vision analysis
14
- private llmFacade?: any;
15
-
16
- setLLMFacade(llmFacade: any): void {
17
- this.llmFacade = llmFacade;
18
- }
19
-
20
- parameters: ToolParameter[] = [
21
- {
22
- name: 'purpose',
23
- type: 'string',
24
- description: 'Why you need to see the previous screenshot. Examples: "verify coord click accuracy", "check if error message appeared then disappeared", "compare before/after form submission"',
25
- required: true
26
- }
27
- ];
28
-
29
- async execute(params: Record<string, any>, context: ToolExecutionContext): Promise<ToolResult> {
30
- const { logger } = context;
31
- const purpose = params.purpose || 'Review previous page state';
32
-
33
- // Access previousSomScreenshot from context (passed through)
34
- const previousScreenshot = (context as any).previousSomScreenshot;
35
-
36
- if (!previousScreenshot) {
37
- return {
38
- success: false,
39
- error: 'No previous screenshot available (this is the first iteration)'
40
- };
41
- }
42
-
43
- try {
44
- logger?.(`[ViewPreviousScreenshot] Analyzing previous iteration screenshot for: ${purpose}`, 'log');
45
-
46
- // Analyze with vision LLM if available
47
- let analysis = 'Previous screenshot retrieved.';
48
-
49
- if (this.llmFacade) {
50
- try {
51
- const analysisPrompt = `Analyze the screenshot from the PREVIOUS iteration (before the most recent commands executed).
52
-
53
- PURPOSE: ${purpose}
54
-
55
- WHAT TO LOOK FOR:
56
- 1. **Coordinate verification**: If purpose mentions "coord" or "click", look for MAGENTA "clicked" marker (circle with yellow border)
57
- - Describe marker position relative to UI elements
58
- - Assess accuracy: "centered on button", "5% above target", etc.
59
-
60
- 2. **Before/after comparison**: Compare visual state with current page
61
- - What changed after commands executed?
62
- - New elements, removed elements, state changes?
63
-
64
- 3. **Transient effects**: Elements that may have appeared and disappeared
65
- - Alerts, toasts, error messages that are now gone
66
- - Loading states, spinners that finished
67
-
68
- 4. **General state**: Answer the specific question from purpose
69
-
70
- TASK: Provide concise, specific observations relevant to the purpose.`;
71
-
72
- const llmResponse = await this.llmFacade.llmProvider.callLLM({
73
- systemPrompt: 'You are analyzing a screenshot from a previous test iteration to help with continuity reasoning. Provide specific, actionable observations.',
74
- userPrompt: analysisPrompt,
75
- imageUrl: previousScreenshot
76
- });
77
-
78
- analysis = llmResponse.answer || analysis;
79
- logger?.(`[ViewPreviousScreenshot] ✓ Analysis complete`, 'log');
80
-
81
- } catch (error: any) {
82
- logger?.(`[ViewPreviousScreenshot] ⚠ Vision analysis failed: ${error.message}`, 'warn');
83
- }
84
- }
85
-
86
- return {
87
- success: true,
88
- data: {
89
- screenshotAvailable: true,
90
- purpose
91
- },
92
- learning: analysis
93
- };
94
- } catch (error: any) {
95
- logger?.(`[ViewPreviousScreenshot] ✗ Failed: ${error.message}`, 'error');
96
- return {
97
- success: false,
98
- error: `Failed to access previous screenshot: ${error.message}`
99
- };
100
- }
101
- }
102
- }
103
-