testchimp-runner-core 0.0.32 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/dist/llm-facade.d.ts.map +1 -1
  2. package/dist/llm-facade.js +7 -7
  3. package/dist/llm-facade.js.map +1 -1
  4. package/dist/llm-provider.d.ts +9 -0
  5. package/dist/llm-provider.d.ts.map +1 -1
  6. package/dist/model-constants.d.ts +16 -5
  7. package/dist/model-constants.d.ts.map +1 -1
  8. package/dist/model-constants.js +17 -6
  9. package/dist/model-constants.js.map +1 -1
  10. package/dist/orchestrator/index.d.ts +1 -1
  11. package/dist/orchestrator/index.d.ts.map +1 -1
  12. package/dist/orchestrator/index.js +3 -2
  13. package/dist/orchestrator/index.js.map +1 -1
  14. package/dist/orchestrator/orchestrator-agent.d.ts +0 -8
  15. package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
  16. package/dist/orchestrator/orchestrator-agent.js +206 -405
  17. package/dist/orchestrator/orchestrator-agent.js.map +1 -1
  18. package/dist/orchestrator/orchestrator-prompts.d.ts +20 -0
  19. package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
  20. package/dist/orchestrator/orchestrator-prompts.js +455 -0
  21. package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
  22. package/dist/orchestrator/tools/index.d.ts +2 -1
  23. package/dist/orchestrator/tools/index.d.ts.map +1 -1
  24. package/dist/orchestrator/tools/index.js +4 -2
  25. package/dist/orchestrator/tools/index.js.map +1 -1
  26. package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
  27. package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
  28. package/dist/orchestrator/tools/verify-action-result.js +140 -0
  29. package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
  30. package/dist/orchestrator/types.d.ts +26 -0
  31. package/dist/orchestrator/types.d.ts.map +1 -1
  32. package/dist/orchestrator/types.js.map +1 -1
  33. package/dist/prompts.d.ts.map +1 -1
  34. package/dist/prompts.js +87 -37
  35. package/dist/prompts.js.map +1 -1
  36. package/dist/scenario-worker-class.d.ts.map +1 -1
  37. package/dist/scenario-worker-class.js +4 -1
  38. package/dist/scenario-worker-class.js.map +1 -1
  39. package/dist/utils/coordinate-converter.d.ts +32 -0
  40. package/dist/utils/coordinate-converter.d.ts.map +1 -0
  41. package/dist/utils/coordinate-converter.js +130 -0
  42. package/dist/utils/coordinate-converter.js.map +1 -0
  43. package/package.json +1 -1
  44. package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
  45. package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
  46. package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
  47. package/plandocs/PHASE_1_COMPLETE.md +165 -0
  48. package/plandocs/PHASE_1_SUMMARY.md +184 -0
  49. package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
  50. package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
  51. package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
  52. package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
  53. package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
  54. package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
  55. package/src/llm-facade.ts +8 -8
  56. package/src/llm-provider.ts +11 -1
  57. package/src/model-constants.ts +17 -5
  58. package/src/orchestrator/index.ts +3 -2
  59. package/src/orchestrator/orchestrator-agent.ts +249 -424
  60. package/src/orchestrator/orchestrator-agent.ts.backup +1386 -0
  61. package/src/orchestrator/orchestrator-prompts.ts +474 -0
  62. package/src/orchestrator/tools/index.ts +2 -1
  63. package/src/orchestrator/tools/verify-action-result.ts +159 -0
  64. package/src/orchestrator/types.ts +48 -0
  65. package/src/prompts.ts +87 -37
  66. package/src/scenario-worker-class.ts +7 -2
  67. package/src/utils/coordinate-converter.ts +162 -0
  68. package/testchimp-runner-core-0.0.33.tgz +0 -0
  69. /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
  70. /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
  71. /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
@@ -0,0 +1,183 @@
1
+ # What's New in Runner-Core v0.0.33
2
+
3
+ ## Phase 1: Tactical Improvements - COMPLETE ✅
4
+
5
+ ---
6
+
7
+ ## 1. 📝 Note to Future Self (Cross-Step Memory)
8
+
9
+ **The agent can now leave notes that persist across the entire scenario journey.**
10
+
11
+ ### How it works:
12
+ ```typescript
13
+ // Step 1 - Login
14
+ Agent: "Cookie modal appears after 2s. Dismiss it before interacting."
15
+ → Stored in memory.latestNote
16
+
17
+ // Step 2 - Navigate to Dashboard
18
+ Agent reads note from Step 1
19
+ Agent: "Waiting 2s for cookie modal..."
20
+ → Dismisses modal proactively
21
+ ```
22
+
23
+ ### Scope:
24
+ - ✅ Across iterations (within same step)
25
+ - ✅ Across steps (entire scenario)
26
+ - ✅ Free-form text (agent decides what's important)
27
+
28
+ ### Example notes:
29
+ - **Tactical:** "Tried #menu, failed. Try SVG child next."
30
+ - **Strategic:** "This app uses shadow DOM. Prefer CSS selectors over getByRole."
31
+ - **Behavioral:** "Modals load after 2s delay. Wait before clicking."
32
+
33
+ ---
34
+
35
+ ## 2. 🎯 Percentage-Based Coordinate Fallback
36
+
37
+ **When selectors fail, use visual positioning as last resort.**
38
+
39
+ ### Precision:
40
+ - 3 decimal places (e.g., 15.755%, 8.500%)
41
+ - ~1 pixel accuracy on most screens
42
+ - Resolution-independent
43
+
44
+ ### Supported Actions:
45
+ - **Click:** `{action: "click", xPercent: 15.755, yPercent: 8.500}`
46
+ - **Fill:** `{action: "fill", xPercent: 30.000, yPercent: 25.000, value: "text"}`
47
+ - **Drag:** `{action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000}`
48
+ - **Hover, RightClick, DoubleClick, Scroll**
49
+
50
+ ### Auto-Activation:
51
+ - Triggers after 3 consecutive selector failures
52
+ - Limited to 2 coordinate attempts
53
+ - Then gives up (stuck)
54
+
55
+ ---
56
+
57
+ ## 3. ⚡ Optimized Iteration Budget
58
+
59
+ **Maximum 5 iterations per step** (down from 8)
60
+
61
+ ```
62
+ Iterations 1-3: Playwright selectors (3 attempts)
63
+ with note-to-self between each
64
+
65
+ Iterations 4-5: Coordinates (2 attempts max)
66
+ If both fail → stuck
67
+ ```
68
+
69
+ **Why:** Coordinates either work or don't - no point retrying 5+ times.
70
+
71
+ ---
72
+
73
+ ## 4. 🕐 Smart Timeout Handling (Earlier Fix)
74
+
75
+ **Navigation operations now have appropriate timeouts:**
76
+ - `waitForLoadState()`: 30 seconds (was 5s)
77
+ - `goto()`: 30 seconds
78
+ - Element operations: 5 seconds (unchanged)
79
+
80
+ **Automatic detection:** The code scans each command for navigation keywords and applies the longer timeout when it finds one.
81
+
82
+ ---
83
+
84
+ ## How Notes Work Across Steps
85
+
86
+ ### Example Scenario:
87
+
88
+ ```
89
+ Step 1: Login
90
+ Iteration 1: Fill username → Success
91
+ Iteration 2: Fill password → Success
92
+ Iteration 3: Click login → Success
93
+ Agent note: "Login redirects to dashboard. Cookie modal appears after 2s."
94
+
95
+ Step 2: Navigate to Settings
96
+ Reads note from Step 1: "Cookie modal appears after 2s"
97
+ Iteration 1:
98
+ - Wait 2s
99
+ - Dismiss modal
100
+ - Click Settings
101
+ → Success in 1 iteration! (note prevented wasted attempts)
102
+ ```
103
+
104
+ **Benefit:** Agent builds up knowledge about the application and uses it in future steps.
105
+
106
+ ---
107
+
108
+ ## Comparison: Before vs After
109
+
110
+ | Aspect | Before (v0.0.32) | After (v0.0.33) |
111
+ |--------|------------------|------------------|
112
+ | Iteration memory | None | Note to self (cross-step) |
113
+ | Selector fails | Give up or loop | Coordinate fallback |
114
+ | Max iterations | 8 per step | 5 per step |
115
+ | Timeout handling | 5s for all | 30s for navigation |
116
+ | Coordinate support | None | Full (click, fill, drag, etc.) |
117
+ | Average iterations | ~4 per step | ~2.5 per step (estimated) |
118
+
119
+ ---
120
+
121
+ ## Testing Recommendations
122
+
123
+ ### Test 1: Note Continuity
124
+ Create a scenario with repeated patterns:
125
+ ```
126
+ - Login
127
+ - Go to page A → encounter modal
128
+ - Go to page B → should handle modal proactively
129
+ ```
130
+
131
+ **Expected:** The "Go to page B" step handles the modal proactively, using the note left during the "Go to page A" step.
132
+
133
+ ### Test 2: Coordinate Fallback
134
+ Run PeopleHR scenario:
135
+ ```
136
+ - Click hamburger menu (SVG icon)
137
+ ```
138
+
139
+ **Expected:**
140
+ - Iterations 1-3: Try selectors (may fail)
141
+ - Iteration 4: Coordinates → succeeds
142
+ - Generated script: `await page.mouse.click(x, y);`
143
+
144
+ ### Test 3: Timeout Fix
145
+ Any scenario with:
146
+ ```
147
+ - await page.waitForLoadState('networkidle');
148
+ ```
149
+
150
+ **Expected:** No more 5s timeout errors.
151
+
152
+ ---
153
+
154
+ ## Migration
155
+
156
+ **No code changes needed!** Existing code works as-is with improvements.
157
+
158
+ **New response fields** (optional):
159
+ - `noteToFutureSelf`: string (agent can optionally include)
160
+ - `coordinateAction`: object (only when coordinate mode active)
161
+
162
+ ---
163
+
164
+ ## What's Next: Phase 2
165
+
166
+ Phase 2 will add a numbered element system for even better reliability:
167
+ - Iteration 1: Playwright selector (1 attempt)
168
+ - Iterations 2-3: Index commands CLICK[3] (2 attempts)
169
+ - Iterations 4-5: Coordinates (2 attempts)
170
+
171
+ **Target:** ~1.5 average iterations per step
172
+
173
+ ---
174
+
175
+ ## Status
176
+
177
+ ✅ **Built and Ready**
178
+ 📦 **Version:** v0.0.33
179
+ 🧪 **Status:** Ready for testing
180
+ 📊 **Expected Impact:** 30-40% reduction in iterations
181
+
182
+ **Test now to validate improvements before Phase 2!**
183
+
package/src/llm-facade.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { PROMPTS } from './prompts';
2
2
  import { PageInfo } from './utils/page-info-utils';
3
3
  import { StepOperation } from './types';
4
- import { DEFAULT_MODEL, VISION_MODEL } from './model-constants';
4
+ import { DEFAULT_MODEL, DEFAULT_SIMPLER_MODEL, VISION_MODEL } from './model-constants';
5
5
  import { LLMProvider, LLMRequest, LLMResponse } from './llm-provider';
6
6
 
7
7
  // LLM Response interfaces
@@ -140,7 +140,7 @@ export class LLMFacade {
140
140
  /**
141
141
  * Generate a test name from scenario description
142
142
  */
143
- async generateTestName(scenario: string, model: string = DEFAULT_MODEL): Promise<string> {
143
+ async generateTestName(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<string> {
144
144
  this.log('Generating test name with LLM...');
145
145
 
146
146
  const request: LLMRequest = {
@@ -163,7 +163,7 @@ export class LLMFacade {
163
163
  /**
164
164
  * Generate hashtags for semantic grouping
165
165
  */
166
- async generateHashtags(scenario: string, model: string = 'gpt-4o-mini'): Promise<string[]> {
166
+ async generateHashtags(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<string[]> {
167
167
  this.log('Generating hashtags with LLM...');
168
168
 
169
169
  const request: LLMRequest = {
@@ -280,7 +280,7 @@ Respond ONLY with valid JSON:
280
280
  errorMessage: string,
281
281
  attemptCount: number,
282
282
  pageInfo: any,
283
- model: string = DEFAULT_MODEL
283
+ model: string = DEFAULT_SIMPLER_MODEL
284
284
  ): Promise<ScreenshotNeedResponse> {
285
285
  this.log('Assessing screenshot need with LLM...');
286
286
 
@@ -479,7 +479,7 @@ Respond ONLY with valid JSON:
479
479
  /**
480
480
  * Break down scenario into steps
481
481
  */
482
- async breakdownScenario(scenario: string, model: string = DEFAULT_MODEL): Promise<ScenarioStep[]> {
482
+ async breakdownScenario(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<ScenarioStep[]> {
483
483
  this.log('Breaking down scenario with LLM...');
484
484
  this.log(`📝 INPUT SCENARIO: ${scenario}`);
485
485
 
@@ -571,7 +571,7 @@ Respond ONLY with valid JSON:
571
571
  /**
572
572
  * Parse script into steps for AI repair
573
573
  */
574
- async parseScriptIntoSteps(script: string, model: string = 'gpt-4o-mini'): Promise<Array<{ description: string; code: string; success?: boolean; error?: string }>> {
574
+ async parseScriptIntoSteps(script: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<Array<{ description: string; code: string; success?: boolean; error?: string }>> {
575
575
  this.log('Parsing script into steps with LLM...');
576
576
 
577
577
  const request: LLMRequest = {
@@ -657,7 +657,7 @@ Respond ONLY with valid JSON:
657
657
  async assessRepairConfidence(
658
658
  originalScript: string,
659
659
  updatedScript: string,
660
- model: string = DEFAULT_MODEL
660
+ model: string = DEFAULT_SIMPLER_MODEL
661
661
  ): Promise<RepairConfidenceResponse> {
662
662
  const request: LLMRequest = {
663
663
  model,
@@ -676,7 +676,7 @@ Respond ONLY with valid JSON:
676
676
  originalScript: string,
677
677
  updatedScript: string,
678
678
  newRepairAdvice: string,
679
- model: string = 'gpt-4o-mini'
679
+ model: string = DEFAULT_SIMPLER_MODEL
680
680
  ): Promise<string> {
681
681
  const request: LLMRequest = {
682
682
  model,
@@ -3,6 +3,15 @@
3
3
  * Abstraction for LLM calls - allows different implementations (backend proxy, local, etc.)
4
4
  */
5
5
 
6
+ /**
7
+ * Labeled image for multi-image LLM requests
8
+ * Used for before/after comparisons or multi-screenshot analysis
9
+ */
10
+ export interface LabeledImage {
11
+ label: string; // e.g., "Before", "After", "Screenshot 1"
12
+ dataUrl: string; // Base64 data URL (data:image/png;base64,...)
13
+ }
14
+
6
15
  /**
7
16
  * LLM request - camelCase for TypeScript
8
17
  */
@@ -10,7 +19,8 @@ export interface LLMRequest {
10
19
  model?: string;
11
20
  systemPrompt?: string;
12
21
  userPrompt?: string;
13
- imageUrl?: string; // For vision models (data URL)
22
+ imageUrl?: string; // For vision models (data URL) - backward compatible, single image
23
+ images?: LabeledImage[]; // For multi-image requests (e.g., before/after comparison)
14
24
  }
15
25
 
16
26
  /**
@@ -5,14 +5,26 @@
5
5
  */
6
6
 
7
7
  /**
8
- * Default model for general LLM tasks:
9
- * - Scenario breakdown
10
- * - Command generation
11
- * - Goal completion checks
12
- * - Screenshot need assessment
8
+ * Default model for complex reasoning tasks:
9
+ * - Command generation (Playwright selectors)
10
+ * - Goal completion checks (critical decisions)
11
+ * - Repair suggestions (complex logic)
12
+ * - Agent orchestration (decision-making)
13
13
  */
14
14
  export const DEFAULT_MODEL = 'gpt-5-mini';
15
15
 
16
+ /**
17
+ * Simpler model for structured/guided tasks:
18
+ * - Scenario breakdown (structured splitting)
19
+ * - Screenshot need assessment (yes/no decision)
20
+ * - Repair confidence assessment (comparison task)
21
+ * - Test name generation (text formatting)
22
+ * - Hashtag generation (semantic tags)
23
+ * - Script parsing (extract steps)
24
+ * - Final script merging (text concatenation)
25
+ */
26
+ export const DEFAULT_SIMPLER_MODEL = 'gpt-4o-mini';
27
+
16
28
  /**
17
29
  * Vision-capable model for visual diagnostics:
18
30
  * - Screenshot analysis
@@ -8,13 +8,14 @@ export { OrchestratorAgent } from './orchestrator-agent';
8
8
  // Tool system
9
9
  export { ToolRegistry, Tool, ToolParameter, ToolExecutionContext } from './tool-registry';
10
10
 
11
- // 5 information-gathering tools (state changes via Playwright commands)
11
+ // 6 information-gathering tools (state changes via Playwright commands)
12
12
  export {
13
13
  TakeScreenshotTool,
14
14
  RecallHistoryTool,
15
15
  InspectPageTool,
16
16
  CheckPageReadyTool,
17
- ExtractDataTool
17
+ ExtractDataTool,
18
+ VerifyActionResultTool
18
19
  } from './tools';
19
20
 
20
21
  // Types