testchimp-runner-core 0.0.32 → 0.0.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm-facade.d.ts.map +1 -1
- package/dist/llm-facade.js +7 -7
- package/dist/llm-facade.js.map +1 -1
- package/dist/llm-provider.d.ts +9 -0
- package/dist/llm-provider.d.ts.map +1 -1
- package/dist/model-constants.d.ts +16 -5
- package/dist/model-constants.d.ts.map +1 -1
- package/dist/model-constants.js +17 -6
- package/dist/model-constants.js.map +1 -1
- package/dist/orchestrator/index.d.ts +1 -1
- package/dist/orchestrator/index.d.ts.map +1 -1
- package/dist/orchestrator/index.js +3 -2
- package/dist/orchestrator/index.js.map +1 -1
- package/dist/orchestrator/orchestrator-agent.d.ts +0 -8
- package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -1
- package/dist/orchestrator/orchestrator-agent.js +206 -405
- package/dist/orchestrator/orchestrator-agent.js.map +1 -1
- package/dist/orchestrator/orchestrator-prompts.d.ts +20 -0
- package/dist/orchestrator/orchestrator-prompts.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-prompts.js +455 -0
- package/dist/orchestrator/orchestrator-prompts.js.map +1 -0
- package/dist/orchestrator/tools/index.d.ts +2 -1
- package/dist/orchestrator/tools/index.d.ts.map +1 -1
- package/dist/orchestrator/tools/index.js +4 -2
- package/dist/orchestrator/tools/index.js.map +1 -1
- package/dist/orchestrator/tools/verify-action-result.d.ts +17 -0
- package/dist/orchestrator/tools/verify-action-result.d.ts.map +1 -0
- package/dist/orchestrator/tools/verify-action-result.js +140 -0
- package/dist/orchestrator/tools/verify-action-result.js.map +1 -0
- package/dist/orchestrator/types.d.ts +26 -0
- package/dist/orchestrator/types.d.ts.map +1 -1
- package/dist/orchestrator/types.js.map +1 -1
- package/dist/prompts.d.ts.map +1 -1
- package/dist/prompts.js +87 -37
- package/dist/prompts.js.map +1 -1
- package/dist/scenario-worker-class.d.ts.map +1 -1
- package/dist/scenario-worker-class.js +4 -1
- package/dist/scenario-worker-class.js.map +1 -1
- package/dist/utils/coordinate-converter.d.ts +32 -0
- package/dist/utils/coordinate-converter.d.ts.map +1 -0
- package/dist/utils/coordinate-converter.js +130 -0
- package/dist/utils/coordinate-converter.js.map +1 -0
- package/package.json +1 -1
- package/plandocs/BEFORE_AFTER_VERIFICATION.md +148 -0
- package/plandocs/COORDINATE_MODE_DIAGNOSIS.md +144 -0
- package/plandocs/IMPLEMENTATION_STATUS.md +108 -0
- package/plandocs/PHASE_1_COMPLETE.md +165 -0
- package/plandocs/PHASE_1_SUMMARY.md +184 -0
- package/plandocs/PROMPT_OPTIMIZATION_ANALYSIS.md +120 -0
- package/plandocs/PROMPT_SANITY_CHECK.md +120 -0
- package/plandocs/SESSION_SUMMARY_v0.0.33.md +151 -0
- package/plandocs/TROUBLESHOOTING_SESSION.md +72 -0
- package/plandocs/VISUAL_AGENT_EVOLUTION_PLAN.md +396 -0
- package/plandocs/WHATS_NEW_v0.0.33.md +183 -0
- package/src/llm-facade.ts +8 -8
- package/src/llm-provider.ts +11 -1
- package/src/model-constants.ts +17 -5
- package/src/orchestrator/index.ts +3 -2
- package/src/orchestrator/orchestrator-agent.ts +249 -424
- package/src/orchestrator/orchestrator-agent.ts.backup +1386 -0
- package/src/orchestrator/orchestrator-prompts.ts +474 -0
- package/src/orchestrator/tools/index.ts +2 -1
- package/src/orchestrator/tools/verify-action-result.ts +159 -0
- package/src/orchestrator/types.ts +48 -0
- package/src/prompts.ts +87 -37
- package/src/scenario-worker-class.ts +7 -2
- package/src/utils/coordinate-converter.ts +162 -0
- package/testchimp-runner-core-0.0.33.tgz +0 -0
- /package/{CREDIT_CALLBACK_ARCHITECTURE.md → plandocs/CREDIT_CALLBACK_ARCHITECTURE.md} +0 -0
- /package/{INTEGRATION_COMPLETE.md → plandocs/INTEGRATION_COMPLETE.md} +0 -0
- /package/{VISION_DIAGNOSTICS_IMPROVEMENTS.md → plandocs/VISION_DIAGNOSTICS_IMPROVEMENTS.md} +0 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# What's New in Runner-Core v0.0.33
|
|
2
|
+
|
|
3
|
+
## Phase 1: Tactical Improvements - COMPLETE ✅
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 1. 📝 Note to Future Self (Cross-Step Memory)
|
|
8
|
+
|
|
9
|
+
**The agent can now leave notes that persist across the entire scenario journey.**
|
|
10
|
+
|
|
11
|
+
### How it works:
|
|
12
|
+
```typescript
|
|
13
|
+
// Step 1 - Login
|
|
14
|
+
Agent: "Cookie modal appears after 2s. Dismiss it before interacting."
|
|
15
|
+
→ Stored in memory.latestNote
|
|
16
|
+
|
|
17
|
+
// Step 2 - Navigate to Dashboard
|
|
18
|
+
Agent reads note from Step 1
|
|
19
|
+
Agent: "Waiting 2s for cookie modal..."
|
|
20
|
+
→ Dismisses modal proactively
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Scope:
|
|
24
|
+
- ✅ Across iterations (within same step)
|
|
25
|
+
- ✅ Across steps (entire scenario)
|
|
26
|
+
- ✅ Free-form text (agent decides what's important)
|
|
27
|
+
|
|
28
|
+
### Example notes:
|
|
29
|
+
- **Tactical:** "Tried #menu, failed. Try SVG child next."
|
|
30
|
+
- **Strategic:** "This app uses shadow DOM. Prefer CSS selectors over getByRole."
|
|
31
|
+
- **Behavioral:** "Modals load after 2s delay. Wait before clicking."
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 2. 🎯 Percentage-Based Coordinate Fallback
|
|
36
|
+
|
|
37
|
+
**When selectors fail, use visual positioning as last resort.**
|
|
38
|
+
|
|
39
|
+
### Precision:
|
|
40
|
+
- 3 decimal places (e.g., 15.755%, 8.500%)
|
|
41
|
+
- ~1 pixel accuracy on most screens
|
|
42
|
+
- Resolution-independent
|
|
43
|
+
|
|
44
|
+
### Supported Actions:
|
|
45
|
+
- **Click:** `{action: "click", xPercent: 15.755, yPercent: 8.500}`
|
|
46
|
+
- **Fill:** `{action: "fill", xPercent: 30.000, yPercent: 25.000, value: "text"}`
|
|
47
|
+
- **Drag:** `{action: "drag", xPercent: 10.000, yPercent: 50.000, toXPercent: 60.000, toYPercent: 50.000}`
|
|
48
|
+
- **Hover, RightClick, DoubleClick, Scroll**
|
|
49
|
+
|
|
50
|
+
### Auto-Activation:
|
|
51
|
+
- Triggers after 3 consecutive selector failures
|
|
52
|
+
- Limited to 2 coordinate attempts
|
|
53
|
+
- Then gives up (stuck)
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 3. ⚡ Optimized Iteration Budget
|
|
58
|
+
|
|
59
|
+
**Maximum 5 iterations per step** (down from 8)
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
Iterations 1-3: Playwright selectors (3 attempts)
|
|
63
|
+
with note-to-self between each
|
|
64
|
+
|
|
65
|
+
Iterations 4-5: Coordinates (2 attempts max)
|
|
66
|
+
If both fail → stuck
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Why:** Coordinates either work or don't - no point retrying 5+ times.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## 4. 🕐 Smart Timeout Handling (Earlier Fix)
|
|
74
|
+
|
|
75
|
+
**Navigation operations now have appropriate timeouts:**
|
|
76
|
+
- `waitForLoadState()`: 30 seconds (was 5s)
|
|
77
|
+
- `goto()`: 30 seconds
|
|
78
|
+
- Element operations: 5 seconds (unchanged)
|
|
79
|
+
|
|
80
|
+
**Detected automatically:** The code scans each command for navigation keywords to decide which timeout applies.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## How Notes Work Across Steps
|
|
85
|
+
|
|
86
|
+
### Example Scenario:
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
Step 1: Login
|
|
90
|
+
Iteration 1: Fill username → Success
|
|
91
|
+
Iteration 2: Fill password → Success
|
|
92
|
+
Iteration 3: Click login → Success
|
|
93
|
+
Agent note: "Login redirects to dashboard. Cookie modal appears after 2s."
|
|
94
|
+
|
|
95
|
+
Step 2: Navigate to Settings
|
|
96
|
+
Reads note from Step 1: "Cookie modal appears after 2s"
|
|
97
|
+
Iteration 1:
|
|
98
|
+
- Wait 2s
|
|
99
|
+
- Dismiss modal
|
|
100
|
+
- Click Settings
|
|
101
|
+
→ Success in 1 iteration! (note prevented wasted attempts)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Benefit:** Agent builds up knowledge about the application and uses it in future steps.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Comparison: Before vs After
|
|
109
|
+
|
|
110
|
+
| Aspect | Before (v0.0.32) | After (v0.0.33) |
|
|
111
|
+
|--------|------------------|------------------|
|
|
112
|
+
| Iteration memory | None | Note to self (cross-step) |
|
|
113
|
+
| Selector fails | Give up or loop | Coordinate fallback |
|
|
114
|
+
| Max iterations | 8 per step | 5 per step |
|
|
115
|
+
| Timeout handling | 5s for all | 30s for navigation |
|
|
116
|
+
| Coordinate support | None | Full (click, fill, drag, etc.) |
|
|
117
|
+
| Average iterations | ~4 per step | ~2.5 per step (estimated) |
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Testing Recommendations
|
|
122
|
+
|
|
123
|
+
### Test 1: Note Continuity
|
|
124
|
+
Create a scenario with repeated patterns:
|
|
125
|
+
```
|
|
126
|
+
- Login
|
|
127
|
+
- Go to page A → encounter modal
|
|
128
|
+
- Go to page B → should handle modal proactively
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Expected:** The "page B" step learns from the note left during the "page A" step and handles the modal proactively.
|
|
132
|
+
|
|
133
|
+
### Test 2: Coordinate Fallback
|
|
134
|
+
Run PeopleHR scenario:
|
|
135
|
+
```
|
|
136
|
+
- Click hamburger menu (SVG icon)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
**Expected:**
|
|
140
|
+
- Iterations 1-3: Try selectors (may fail)
|
|
141
|
+
- Iteration 4: Coordinates → succeeds
|
|
142
|
+
- Generated script: `await page.mouse.click(x, y);`
|
|
143
|
+
|
|
144
|
+
### Test 3: Timeout Fix
|
|
145
|
+
Any scenario with:
|
|
146
|
+
```
|
|
147
|
+
- await page.waitForLoadState('networkidle');
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Expected:** No more 5s timeout errors.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Migration
|
|
155
|
+
|
|
156
|
+
**No code changes needed!** Existing code works as-is and picks up the improvements automatically.
|
|
157
|
+
|
|
158
|
+
**New response fields** (optional):
|
|
159
|
+
- `noteToFutureSelf`: string (agent can optionally include)
|
|
160
|
+
- `coordinateAction`: object (only when coordinate mode active)
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## What's Next: Phase 2
|
|
165
|
+
|
|
166
|
+
Phase 2 will add a numbered element system for even better reliability:
|
|
167
|
+
- Iteration 1: Playwright selector (1 attempt)
|
|
168
|
+
- Iterations 2-3: Index commands CLICK[3] (2 attempts)
|
|
169
|
+
- Iterations 4-5: Coordinates (2 attempts)
|
|
170
|
+
|
|
171
|
+
**Target:** ~1.5 average iterations per step
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Status
|
|
176
|
+
|
|
177
|
+
✅ **Built and Ready**
|
|
178
|
+
📦 **Version:** v0.0.33
|
|
179
|
+
🧪 **Status:** Ready for testing
|
|
180
|
+
📊 **Expected Impact:** 30-40% reduction in iterations
|
|
181
|
+
|
|
182
|
+
**Test now to validate improvements before Phase 2!**
|
|
183
|
+
|
package/src/llm-facade.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { PROMPTS } from './prompts';
|
|
2
2
|
import { PageInfo } from './utils/page-info-utils';
|
|
3
3
|
import { StepOperation } from './types';
|
|
4
|
-
import { DEFAULT_MODEL, VISION_MODEL } from './model-constants';
|
|
4
|
+
import { DEFAULT_MODEL, DEFAULT_SIMPLER_MODEL, VISION_MODEL } from './model-constants';
|
|
5
5
|
import { LLMProvider, LLMRequest, LLMResponse } from './llm-provider';
|
|
6
6
|
|
|
7
7
|
// LLM Response interfaces
|
|
@@ -140,7 +140,7 @@ export class LLMFacade {
|
|
|
140
140
|
/**
|
|
141
141
|
* Generate a test name from scenario description
|
|
142
142
|
*/
|
|
143
|
-
async generateTestName(scenario: string, model: string =
|
|
143
|
+
async generateTestName(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<string> {
|
|
144
144
|
this.log('Generating test name with LLM...');
|
|
145
145
|
|
|
146
146
|
const request: LLMRequest = {
|
|
@@ -163,7 +163,7 @@ export class LLMFacade {
|
|
|
163
163
|
/**
|
|
164
164
|
* Generate hashtags for semantic grouping
|
|
165
165
|
*/
|
|
166
|
-
async generateHashtags(scenario: string, model: string =
|
|
166
|
+
async generateHashtags(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<string[]> {
|
|
167
167
|
this.log('Generating hashtags with LLM...');
|
|
168
168
|
|
|
169
169
|
const request: LLMRequest = {
|
|
@@ -280,7 +280,7 @@ Respond ONLY with valid JSON:
|
|
|
280
280
|
errorMessage: string,
|
|
281
281
|
attemptCount: number,
|
|
282
282
|
pageInfo: any,
|
|
283
|
-
model: string =
|
|
283
|
+
model: string = DEFAULT_SIMPLER_MODEL
|
|
284
284
|
): Promise<ScreenshotNeedResponse> {
|
|
285
285
|
this.log('Assessing screenshot need with LLM...');
|
|
286
286
|
|
|
@@ -479,7 +479,7 @@ Respond ONLY with valid JSON:
|
|
|
479
479
|
/**
|
|
480
480
|
* Break down scenario into steps
|
|
481
481
|
*/
|
|
482
|
-
async breakdownScenario(scenario: string, model: string =
|
|
482
|
+
async breakdownScenario(scenario: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<ScenarioStep[]> {
|
|
483
483
|
this.log('Breaking down scenario with LLM...');
|
|
484
484
|
this.log(`📝 INPUT SCENARIO: ${scenario}`);
|
|
485
485
|
|
|
@@ -571,7 +571,7 @@ Respond ONLY with valid JSON:
|
|
|
571
571
|
/**
|
|
572
572
|
* Parse script into steps for AI repair
|
|
573
573
|
*/
|
|
574
|
-
async parseScriptIntoSteps(script: string, model: string =
|
|
574
|
+
async parseScriptIntoSteps(script: string, model: string = DEFAULT_SIMPLER_MODEL): Promise<Array<{ description: string; code: string; success?: boolean; error?: string }>> {
|
|
575
575
|
this.log('Parsing script into steps with LLM...');
|
|
576
576
|
|
|
577
577
|
const request: LLMRequest = {
|
|
@@ -657,7 +657,7 @@ Respond ONLY with valid JSON:
|
|
|
657
657
|
async assessRepairConfidence(
|
|
658
658
|
originalScript: string,
|
|
659
659
|
updatedScript: string,
|
|
660
|
-
model: string =
|
|
660
|
+
model: string = DEFAULT_SIMPLER_MODEL
|
|
661
661
|
): Promise<RepairConfidenceResponse> {
|
|
662
662
|
const request: LLMRequest = {
|
|
663
663
|
model,
|
|
@@ -676,7 +676,7 @@ Respond ONLY with valid JSON:
|
|
|
676
676
|
originalScript: string,
|
|
677
677
|
updatedScript: string,
|
|
678
678
|
newRepairAdvice: string,
|
|
679
|
-
model: string =
|
|
679
|
+
model: string = DEFAULT_SIMPLER_MODEL
|
|
680
680
|
): Promise<string> {
|
|
681
681
|
const request: LLMRequest = {
|
|
682
682
|
model,
|
package/src/llm-provider.ts
CHANGED
|
@@ -3,6 +3,15 @@
|
|
|
3
3
|
* Abstraction for LLM calls - allows different implementations (backend proxy, local, etc.)
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Labeled image for multi-image LLM requests
|
|
8
|
+
* Used for before/after comparisons or multi-screenshot analysis
|
|
9
|
+
*/
|
|
10
|
+
export interface LabeledImage {
|
|
11
|
+
label: string; // e.g., "Before", "After", "Screenshot 1"
|
|
12
|
+
dataUrl: string; // Base64 data URL (data:image/png;base64,...)
|
|
13
|
+
}
|
|
14
|
+
|
|
6
15
|
/**
|
|
7
16
|
* LLM request - camelCase for TypeScript
|
|
8
17
|
*/
|
|
@@ -10,7 +19,8 @@ export interface LLMRequest {
|
|
|
10
19
|
model?: string;
|
|
11
20
|
systemPrompt?: string;
|
|
12
21
|
userPrompt?: string;
|
|
13
|
-
imageUrl?: string;
|
|
22
|
+
imageUrl?: string; // For vision models (data URL) - backward compatible, single image
|
|
23
|
+
images?: LabeledImage[]; // For multi-image requests (e.g., before/after comparison)
|
|
14
24
|
}
|
|
15
25
|
|
|
16
26
|
/**
|
package/src/model-constants.ts
CHANGED
|
@@ -5,14 +5,26 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
|
-
* Default model for
|
|
9
|
-
* -
|
|
10
|
-
* -
|
|
11
|
-
* -
|
|
12
|
-
* -
|
|
8
|
+
* Default model for complex reasoning tasks:
|
|
9
|
+
* - Command generation (Playwright selectors)
|
|
10
|
+
* - Goal completion checks (critical decisions)
|
|
11
|
+
* - Repair suggestions (complex logic)
|
|
12
|
+
* - Agent orchestration (decision-making)
|
|
13
13
|
*/
|
|
14
14
|
export const DEFAULT_MODEL = 'gpt-5-mini';
|
|
15
15
|
|
|
16
|
+
/**
|
|
17
|
+
* Simpler model for structured/guided tasks:
|
|
18
|
+
* - Scenario breakdown (structured splitting)
|
|
19
|
+
* - Screenshot need assessment (yes/no decision)
|
|
20
|
+
* - Repair confidence assessment (comparison task)
|
|
21
|
+
* - Test name generation (text formatting)
|
|
22
|
+
* - Hashtag generation (semantic tags)
|
|
23
|
+
* - Script parsing (extract steps)
|
|
24
|
+
* - Final script merging (text concatenation)
|
|
25
|
+
*/
|
|
26
|
+
export const DEFAULT_SIMPLER_MODEL = 'gpt-4o-mini';
|
|
27
|
+
|
|
16
28
|
/**
|
|
17
29
|
* Vision-capable model for visual diagnostics:
|
|
18
30
|
* - Screenshot analysis
|
|
@@ -8,13 +8,14 @@ export { OrchestratorAgent } from './orchestrator-agent';
|
|
|
8
8
|
// Tool system
|
|
9
9
|
export { ToolRegistry, Tool, ToolParameter, ToolExecutionContext } from './tool-registry';
|
|
10
10
|
|
|
11
|
-
//
|
|
11
|
+
// 6 information-gathering tools (state changes via Playwright commands)
|
|
12
12
|
export {
|
|
13
13
|
TakeScreenshotTool,
|
|
14
14
|
RecallHistoryTool,
|
|
15
15
|
InspectPageTool,
|
|
16
16
|
CheckPageReadyTool,
|
|
17
|
-
ExtractDataTool
|
|
17
|
+
ExtractDataTool,
|
|
18
|
+
VerifyActionResultTool
|
|
18
19
|
} from './tools';
|
|
19
20
|
|
|
20
21
|
// Types
|