@browsercash/chase 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +14 -0
- package/.dockerignore +34 -0
- package/README.md +256 -0
- package/api-1 (3).json +831 -0
- package/dist/browser-cash.js +128 -0
- package/dist/claude-runner.js +285 -0
- package/dist/cli-install.js +104 -0
- package/dist/cli.js +503 -0
- package/dist/codegen/bash-generator.js +104 -0
- package/dist/config.js +112 -0
- package/dist/errors/error-classifier.js +351 -0
- package/dist/hooks/capture-hook.js +57 -0
- package/dist/index.js +180 -0
- package/dist/iterative-tester.js +407 -0
- package/dist/logger/command-log.js +38 -0
- package/dist/prompts/agentic-prompt.js +78 -0
- package/dist/prompts/fix-prompt.js +477 -0
- package/dist/prompts/helpers.js +214 -0
- package/dist/prompts/system-prompt.js +282 -0
- package/dist/script-runner.js +429 -0
- package/dist/server.js +1934 -0
- package/dist/types/iteration-history.js +139 -0
- package/openapi.json +1131 -0
- package/package.json +44 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized helper function definitions for browser automation scripts.
|
|
3
|
+
* These are used in both system-prompt.ts and fix-prompt.ts to avoid duplication.
|
|
4
|
+
*
|
|
5
|
+
* Token savings: ~49% reduction by defining once instead of 3-4x
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Universal helper function: getPrice
 *
 * Source text of a browser-side `getPrice(el)` function. It is kept as a
 * string so it can be pasted into generated extraction scripts and prompts.
 *
 * The embedded function tries these discovery methods in order and returns
 * the first hit, or "" when nothing matches:
 *   1. Schema.org markup (`[itemprop=price]`, `content` attribute or text)
 *   2. Data attributes (`[data-price]`, `[data-automation-id*=price]`)
 *   3. ARIA labels containing "price"
 *   4. Price-like class names, excluding crossed/was/original prices
 *   5. A currency + number pattern in the element's visible text
 *
 * The amount pattern must start and end on a digit, so a bare currency
 * symbol followed by punctuation is not reported as a price, and sentence
 * punctuation after an amount ("$19.99.") is not captured.
 */
export const GET_PRICE_FUNCTION = `function getPrice(el) {
  // 1. Schema.org markup
  var schema = el.querySelector("[itemprop=price]");
  if (schema) {
    var val = schema.getAttribute("content") || schema.textContent;
    if (val && /\\d/.test(val)) return val.trim();
  }

  // 2. Data attributes
  var dataPrice = el.querySelector("[data-price], [data-automation-id*=price]");
  if (dataPrice) {
    var val2 = dataPrice.getAttribute("data-price") || dataPrice.textContent;
    if (val2 && /\\d/.test(val2)) return val2.trim();
  }

  // 3. ARIA labels with price
  var ariaPrice = el.querySelector("[aria-label*=price]");
  if (ariaPrice) {
    var label = ariaPrice.getAttribute("aria-label") || "";
    var m = label.match(/[\\$\\u00A3\\u20AC]\\s*\\d(?:[\\d,.]*\\d)?/);
    if (m) return m[0].trim();
  }

  // 4. Common price class patterns
  var priceEl = el.querySelector("[class*=price]:not([class*=crossed]):not([class*=was]):not([class*=original])");
  if (priceEl && /[\\$\\u00A3\\u20AC]/.test(priceEl.textContent)) {
    var m2 = priceEl.textContent.match(/[\\$\\u00A3\\u20AC]\\s*\\d(?:[\\d,.]*\\d)?/);
    if (m2) return m2[0].trim();
  }

  // 5. Text pattern fallback (currency + number)
  var text = el.innerText || "";
  var match = text.match(/(?:[\\$\\u00A3\\u20AC]|USD|CAD|EUR|GBP)\\s*\\d(?:[\\d,.]*\\d)?/i);
  return match ? match[0].trim() : "";
}`;
|
|
46
|
+
/**
 * Compact version of getPrice for inline use in extraction scripts.
 *
 * Source text of a browser-side `getPrice(el)` that checks schema.org markup,
 * data attributes, price-like class names and finally a currency + number
 * pattern in the visible text. Returns "" when nothing matches.
 *
 * Like the full version, it skips elements whose class marks them as the
 * crossed-out / "was" / "original" price, and the amount pattern must start
 * and end on a digit so trailing punctuation is not captured.
 */
export const GET_PRICE_COMPACT = `function getPrice(el) {
  var schema = el.querySelector("[itemprop=price]");
  if (schema) { var v = schema.getAttribute("content") || schema.textContent; if (v && /\\d/.test(v)) return v.trim(); }
  var dataPrice = el.querySelector("[data-price], [data-automation-id*=price]");
  if (dataPrice) { var v2 = dataPrice.getAttribute("data-price") || dataPrice.textContent; if (v2 && /\\d/.test(v2)) return v2.trim(); }
  var priceEl = el.querySelector("[class*=price]:not([class*=crossed]):not([class*=was]):not([class*=original])");
  if (priceEl && /[\\$\\u00A3\\u20AC]/.test(priceEl.textContent)) { var m = priceEl.textContent.match(/[\\$\\u00A3\\u20AC]\\s*\\d(?:[\\d,.]*\\d)?/); if (m) return m[0].trim(); }
  var text = el.innerText || ""; var match = text.match(/(?:[\\$\\u00A3\\u20AC]|USD|CAD|EUR|GBP)\\s*\\d(?:[\\d,.]*\\d)?/i);
  return match ? match[0].trim() : "";
}`;
|
|
59
|
+
/**
 * Universal helper function: getRating
 *
 * Source text of a browser-side `getRating(el)` function. It tries these
 * discovery methods in order and returns the first numeric rating found as
 * a string, or "" when nothing matches:
 *   1. Schema.org markup (`[itemprop=ratingValue]`)
 *   2. Data attributes (`data-rating`, `data-value`)
 *   3. ARIA labels such as "4.5 out of 5 stars" or "4.5 stars"
 *   4. An "X out of 5" pattern in the element's visible text
 *
 * Every branch returns only the number: the schema branch extracts the first
 * numeric token instead of returning decorated text, and data attribute
 * values are trimmed before being validated.
 */
export const GET_RATING_FUNCTION = `function getRating(el) {
  // 1. Schema.org markup
  var schema = el.querySelector("[itemprop=ratingValue]");
  if (schema) {
    var val = schema.getAttribute("content") || schema.textContent || "";
    var sm = val.match(/\\d+(?:[.,]\\d+)?/);
    if (sm) return sm[0];
  }

  // 2. Data attributes (data-rating, data-value)
  var dataRating = el.querySelector("[data-rating], [data-value]");
  if (dataRating) {
    var val2 = (dataRating.getAttribute("data-rating") || dataRating.getAttribute("data-value") || "").trim();
    if (/^\\d+\\.?\\d*$/.test(val2)) return val2;
  }

  // 3. ARIA labels ("4.5 out of 5 stars", "4.5 stars")
  var ariaEls = el.querySelectorAll("[aria-label]");
  for (var i = 0; i < ariaEls.length; i++) {
    var label = ariaEls[i].getAttribute("aria-label") || "";
    var m = label.match(/(\\d+\\.?\\d*)\\s*(?:out of|stars?)/i);
    if (m) return m[1];
  }

  // 4. Text pattern ("4.5 out of 5")
  var text = el.innerText || "";
  var m2 = text.match(/(\\d+\\.?\\d*)\\s*out\\s*of\\s*5/i);
  if (m2) return m2[1];

  return "";
}`;
|
|
93
|
+
/**
 * Compact version of getRating for inline use in extraction scripts.
 *
 * Source text of a browser-side `getRating(el)` that checks schema.org
 * markup, data attributes, ARIA labels and an "X out of 5" text pattern, in
 * that order, returning the numeric rating as a string or "" if none found.
 *
 * The schema branch extracts the first numeric token so decorated text such
 * as "4.5 out of 5 stars" yields "4.5", and data attribute values are
 * trimmed before validation.
 */
export const GET_RATING_COMPACT = `function getRating(el) {
  var schema = el.querySelector("[itemprop=ratingValue]");
  if (schema) { var v = schema.getAttribute("content") || schema.textContent || ""; var sm = v.match(/\\d+(?:[.,]\\d+)?/); if (sm) return sm[0]; }
  var dataRating = el.querySelector("[data-rating], [data-value]");
  if (dataRating) { var v2 = (dataRating.getAttribute("data-rating") || dataRating.getAttribute("data-value") || "").trim(); if (/^\\d+\\.?\\d*$/.test(v2)) return v2; }
  var ariaEls = el.querySelectorAll("[aria-label]");
  for (var i = 0; i < ariaEls.length; i++) { var label = ariaEls[i].getAttribute("aria-label") || ""; var m = label.match(/(\\d+\\.?\\d*)\\s*(?:out of|stars?)/i); if (m) return m[1]; }
  var text = el.innerText || ""; var m2 = text.match(/(\\d+\\.?\\d*)\\s*out\\s*of\\s*5/i); if (m2) return m2[1];
  return "";
}`;
|
|
106
|
+
/**
 * Universal helper function: getName
 *
 * Source text of a browser-side `getName(el)` function. It tries these
 * discovery methods in order and returns the first non-empty name, or ""
 * when nothing matches:
 *   1. Schema.org markup (`[itemprop=name]`, `content` attribute or text)
 *   2. Heading elements (h2, h3, h4)
 *   3. Title/name/heading class patterns
 *   4. The first link's text, when longer than 5 characters
 *
 * A matching element with empty text does not end the search: the function
 * falls through to the next method. The schema branch reads the `content`
 * attribute first because `<meta itemprop="name">` has no text content.
 */
export const GET_NAME_FUNCTION = `function getName(el) {
  // 1. Schema.org markup
  var schema = el.querySelector("[itemprop=name]");
  if (schema) {
    var schemaName = (schema.getAttribute("content") || schema.textContent || "").trim();
    if (schemaName) return schemaName;
  }

  // 2. Heading elements
  var heading = el.querySelector("h2, h3, h4");
  if (heading) {
    var headingText = (heading.textContent || "").trim();
    if (headingText) return headingText;
  }

  // 3. Title/name class patterns
  var titleEl = el.querySelector("[class*=title], [class*=name], [class*=heading]");
  if (titleEl) {
    var titleText = (titleEl.textContent || "").trim();
    if (titleText) return titleText;
  }

  // 4. First link text (often the product name)
  var link = el.querySelector("a[href]");
  if (link) {
    var linkText = (link.textContent || "").trim();
    if (linkText.length > 5) return linkText;
  }

  return "";
}`;
|
|
129
|
+
/**
 * Compact version of getName for inline use in extraction scripts.
 *
 * Source text of a browser-side `getName(el)` that checks schema.org markup,
 * heading elements, title/name/heading class patterns and the first link's
 * text, in that order, returning the first non-empty name or "".
 *
 * An element with empty text falls through to the next method, and the
 * schema branch reads the `content` attribute before the text content so
 * `<meta itemprop="name" content="...">` is handled.
 */
export const GET_NAME_COMPACT = `function getName(el) {
  var schema = el.querySelector("[itemprop=name]"); if (schema) { var s = (schema.getAttribute("content") || schema.textContent || "").trim(); if (s) return s; }
  var heading = el.querySelector("h2, h3, h4"); if (heading) { var h = (heading.textContent || "").trim(); if (h) return h; }
  var titleEl = el.querySelector("[class*=title], [class*=name], [class*=heading]"); if (titleEl) { var t = (titleEl.textContent || "").trim(); if (t) return t; }
  var link = el.querySelector("a[href]"); if (link) { var l = (link.textContent || "").trim(); if (l.length > 5) return l; }
  return "";
}`;
|
|
139
|
+
/**
 * Universal helper function: findProductGrid
 *
 * Source text of a browser-side `findProductGrid()` function. It scans
 * `main`, `section`, `[role=main]` and `div` elements and returns the one
 * whose direct children contain the largest run of same-tag siblings (at
 * least 15), or null when no container qualifies.
 *
 * The count is taken over the most frequent child tag rather than the tag of
 * the first child, so a grid that starts with a heading, banner or filter
 * bar is still recognised.
 */
export const FIND_PRODUCT_GRID_FUNCTION = `function findProductGrid() {
  // Find the container with the most repeated child elements (product grid)
  var best = null;
  var containers = document.querySelectorAll("main, section, [role=main], div");
  for (var i = 0; i < containers.length; i++) {
    var c = containers[i];
    var children = c.children;
    if (children.length < 15) continue;

    // Tally children per tag name and remember the largest group
    var counts = Object.create(null);
    var topCount = 0;
    for (var j = 0; j < children.length; j++) {
      var tag = children[j].tagName;
      counts[tag] = (counts[tag] || 0) + 1;
      if (counts[tag] > topCount) topCount = counts[tag];
    }

    if (topCount >= 15 && (!best || topCount > best.count)) {
      best = { el: c, count: topCount };
    }
  }
  return best ? best.el : null;
}`;
|
|
163
|
+
/**
 * JSON unwrap helper - bash source for an `unwrap_json` shell function.
 *
 * The shell function pipes its first argument through jq: a JSON string
 * value is decoded with `fromjson`, anything else is passed through as-is.
 * If jq fails, the argument is echoed back unchanged. It is meant to handle
 * double-encoded JSON returned by agent-browser eval.
 */
export const UNWRAP_JSON_HELPER = [
  'unwrap_json() {',
  'echo "$1" | jq -r \'if type == "string" then fromjson else . end\' 2>/dev/null || echo "$1"',
  '}',
].join('\n');
|
|
169
|
+
/**
 * Build the block of compact helper functions for inclusion in bash scripts.
 *
 * @returns {string} A header comment line followed by the compact getPrice,
 *   getName and getRating sources, one per line group, in that order.
 */
export function getJsHelperFunctions() {
  const header = ' // Universal helper functions - copy these exactly';
  const parts = [header, GET_PRICE_COMPACT, GET_NAME_COMPACT, GET_RATING_COMPACT];
  return parts.join('\n');
}
|
|
178
|
+
/**
 * Build the helper documentation used in documentation/guidance sections.
 *
 * @returns {string} Markdown containing one "REQUIRED" section per helper
 *   (getPrice, getRating, getName, findProductGrid), each with the full
 *   helper source inside a fenced javascript code block. The text starts
 *   with a newline and ends with a newline after the last fence.
 */
export function getHelperDocumentation() {
  const fence = '```';
  // One markdown section: bold title, fenced source, trailing newline.
  const section = (fnName, source) =>
    [
      `**REQUIRED - ${fnName} function (copy exactly):**`,
      `${fence}javascript`,
      source,
      fence,
      '',
    ].join('\n');

  const sections = [
    section('getPrice', GET_PRICE_FUNCTION),
    section('getRating', GET_RATING_FUNCTION),
    section('getName', GET_NAME_FUNCTION),
    section('findProductGrid', FIND_PRODUCT_GRID_FUNCTION),
  ];
  return `\n${sections.join('\n')}`;
}
|
|
204
|
+
/**
 * Build a brief reference to the helpers (for fix prompts where the full
 * documentation is not needed).
 *
 * @returns {string} An intro line naming the helper functions followed by
 *   one bullet per discovery strategy, joined with newlines.
 */
export function getHelperReference() {
  const intro =
    'Use the universal helper functions (getPrice, getRating, getName, findProductGrid) that try:';
  const strategies = [
    'Schema.org: [itemprop="price"], [itemprop="ratingValue"], [itemprop="name"]',
    'Data attributes: [data-price], [data-rating], [data-value]',
    'ARIA labels: [aria-label*="price"], [aria-label*="stars"]',
    'Text patterns: Currency symbols, "X out of 5" patterns',
    'Structural: Heading elements (h2, h3), class patterns (*title*, *name*)',
  ];
  return [intro, ...strategies.map((line) => `- ${line}`)].join('\n');
}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import { GET_PRICE_COMPACT, GET_RATING_COMPACT, GET_NAME_COMPACT, UNWRAP_JSON_HELPER, getHelperDocumentation, } from './helpers.js';
|
|
2
|
+
/**
 * System prompt for Claude to generate replay-safe browser automation scripts.
 * Emphasizes COMPLETE extraction - all items, not just visible ones.
 *
 * IMPORTANT: Keep examples simple and avoid complex escaping patterns that confuse the model.
 *
 * The prompt embeds the bash `unwrap_json` helper, the compact JS helper
 * functions (inside the example extraction script) and the full helper
 * documentation, all taken from ./helpers.js.
 *
 * Notes on the prompt text:
 * - The cookie-banner example wraps its JavaScript in single quotes, matching
 *   the prompt's own rule about quoting JavaScript passed to eval.
 * - The pagination hints only list selectors that are valid CSS; selecting by
 *   text has to be done in JavaScript.
 * - The exploration limit is stated as 4 turns everywhere.
 *
 * @param {string} cdpUrl - CDP endpoint shown to the model in the Environment section.
 * @returns {string} The complete system prompt.
 */
export function getSystemPrompt(cdpUrl) {
    return `You are a browser automation script generator.

#################################################################
# CRITICAL: HARD DEADLINE - YOU MUST OUTPUT THE BASH SCRIPT #
# WITHIN YOUR FIRST 8 TURNS OR YOU WILL RUN OUT OF TURNS! #
# DO NOT spend more than 4 turns exploring. Output the script! #
#################################################################

## Environment
CDP_URL: ${cdpUrl}

## Available Commands
- agent-browser --cdp "$CDP" open "<url>" - Navigate to URL
- agent-browser --cdp "$CDP" eval "<js>" - Run JavaScript and get result
- agent-browser --cdp "$CDP" snapshot -i - Get page structure (for your understanding only)

## CRITICAL RULES (IN ORDER OF PRIORITY)
1. **OUTPUT THE SCRIPT BY TURN 8** - This is NON-NEGOTIABLE. You WILL run out of turns otherwise.
2. **NEVER use @eN refs** - They don't work on replay
3. **Avoid dollar signs in regex** - Use CSS selectors for prices instead of regex

## FAST WORKFLOW (COMPLETE IN 5-8 TURNS TOTAL)

**Turn 1-2:** Navigate + find container selector
**Turn 3-4:** Test one element extraction
**Turn 5-8:** OUTPUT THE COMPLETE BASH SCRIPT

DO NOT SPEND MORE THAN 4 TURNS EXPLORING. Once you have a working selector for ONE item, IMMEDIATELY output the script.

You can iterate and improve later. The priority is to OUTPUT A WORKING SCRIPT FIRST.

## CRITICAL: Find the MAIN Product Grid (Not Ads or Carousels)

E-commerce pages have multiple product containers: sponsored ads, carousels, and the MAIN grid.
You MUST find the MAIN product grid, which typically has 20-50 items per page.

**Step 1 - Find candidate selectors (test multiple):**
\`\`\`javascript
// Test these selectors and pick the one with MOST items (usually 20-50):
var candidates = [
'[data-component-type="s-search-result"]', // Amazon
'[data-testid="list-view"] > div', // Walmart
'[class*="product-card"]',
'[class*="ProductCard"]',
'[class*="product-item"]',
'[class*="search-result"]',
'article[class*="product"]',
'[itemtype*="Product"]',
'li[class*="product"]'
];
candidates.forEach(function(sel) {
var count = document.querySelectorAll(sel).length;
if (count > 0) console.log(sel + ': ' + count + ' items');
});
\`\`\`

**Step 2 - VERIFY it's the main grid (not ads/carousel):**
\`\`\`javascript
// Check if selector finds items in the MAIN content area:
var items = document.querySelectorAll('YOUR_SELECTOR');
// Red flags that indicate WRONG selector:
// - Count < 15 items (main grids have 20-50)
// - Items are in a carousel/slider container
// - Items have "sponsored" or "ad" in class/data attributes
// - Items are position:fixed or sticky
\`\`\`

**Step 3 - Inspect one item's structure:**
\`\`\`javascript
document.querySelector('YOUR_SELECTOR')?.innerHTML?.substring(0, 3000)
\`\`\`

**Step 4 - Look for price/title/image in the HTML:**
- Price: [class*="price"], [data-automation-id*="price"], [itemprop="price"]
- Title: h2, h3, [class*="title"], [class*="name"], [itemprop="name"]
- Image: img[src*="product"], img[class*="product"]
- URL: a[href*="/ip/"], a[href*="/dp/"], a[href*="/product"]
- Rating: [data-testid*="rating"], [class*="rating"], [class*="star"]

**Step 5 - Test selectors on one element BEFORE full extraction.**

## Bash Script Structure

Your script MUST follow this structure:

\`\`\`bash
#!/bin/bash
set -e
CDP="\${CDP_URL:?Required: CDP_URL}"

# REQUIRED: JSON unwrapping helper - agent-browser eval returns double-encoded JSON
${UNWRAP_JSON_HELPER}

# 1. Navigate
agent-browser --cdp "$CDP" open "https://example.com/search"
sleep 3

# 2. Dismiss cookie banner (if any)
agent-browser --cdp "$CDP" eval '(function(){
var btns = document.querySelectorAll("button");
for (var i = 0; i < btns.length; i++) {
var t = btns[i].textContent.toLowerCase();
if (t.includes("accept") && btns[i].offsetParent) {
btns[i].click();
return "dismissed";
}
}
return "no-banner";
})();'
sleep 1

# 3. Extract data with scroll handling (uses universal helper functions)
RAW_DATA=$(agent-browser --cdp "$CDP" eval '
(async function() {
// Universal helper functions - copy these exactly
${GET_PRICE_COMPACT}
${GET_NAME_COMPACT}
${GET_RATING_COMPACT}

var allItems = new Map();
var lastCount = 0;
var noNewItems = 0;

while (noNewItems < 5) {
document.querySelectorAll("SELECTOR").forEach(function(el) {
var title = getName(el);
var price = getPrice(el);
var rating = getRating(el);
var link = el.querySelector("a[href]");
var url = link ? link.href : "";
var img = el.querySelector("img");
var image = img ? img.src : "";
var key = url || title;

if (key && !allItems.has(key)) {
allItems.set(key, {
name: title,
price: price,
rating: rating,
url: url,
image: image
});
}
});

if (allItems.size === lastCount) {
noNewItems++;
} else {
noNewItems = 0;
lastCount = allItems.size;
}

window.scrollBy(0, 800);
await new Promise(function(r) { setTimeout(r, 500); });
}

return JSON.stringify({
totalExtracted: allItems.size,
items: Array.from(allItems.values())
}, null, 2);
})();
')

# CRITICAL: Unwrap the double-encoded JSON from agent-browser eval
DATA=$(unwrap_json "$RAW_DATA")

echo ""
echo "============================================"
echo "FINAL RESULTS"
echo "============================================"
echo "$DATA"
\`\`\`

## IMPORTANT: JavaScript Escaping in Bash

When writing JavaScript inside agent-browser eval:
- Use SINGLE QUOTES around the JavaScript to avoid bash escaping issues
- Inside single quotes, you cannot use single quotes in the JS
- Use double quotes for JS strings, or escape with backslash
- Avoid regex with dollar signs - use CSS selectors instead

## CRITICAL: JSON Output Handling

agent-browser eval returns DOUBLE-ENCODED JSON. Always use the unwrap_json helper:

\`\`\`bash
# WRONG - causes jq errors like "array and string cannot be added"
DATA=$(agent-browser --cdp "$CDP" eval '...return JSON.stringify(items)...')
echo "$DATA" | jq '.items' # FAILS!

# CORRECT - unwrap the double-encoded JSON first
RAW=$(agent-browser --cdp "$CDP" eval '...return JSON.stringify(items)...')
DATA=$(unwrap_json "$RAW")
echo "$DATA" | jq '.items' # Works!
\`\`\`

The unwrap_json function MUST be defined at the top of your script.

## Universal Selector Discovery (Works on ANY E-commerce Site)

Instead of hardcoded site-specific selectors, use these **universal discovery patterns** that leverage semantic markup:

**Priority order for discovery:**
1. Schema.org markup: \`[itemprop="price"]\`, \`[itemtype*="Product"]\`
2. ARIA labels: \`[aria-label*="rating"]\`, \`[role="listitem"]\`
3. Data attributes: \`[data-price]\`, \`[data-testid*="product"]\`
4. Structural analysis: Find container with most repeated children
5. Text pattern matching: Currency symbols, "X out of 5" patterns

IMPORTANT: Always verify your selector returns 20+ items. If only 4-10 items, you're likely targeting a carousel or ads.

## MANDATORY: Universal Extraction Helper Functions

Your extraction code MUST define and use these helper functions. Copy them EXACTLY into your script's JavaScript - DO NOT simplify them.
${getHelperDocumentation()}

**CRITICAL REQUIREMENTS:**
- NEVER use "N/A" or placeholder values - return empty string
- ALWAYS define getPrice(), getRating(), and getName() in your extraction code
- DO NOT simplify these functions - use them exactly as shown
- Use findProductGrid() when you can't find a good selector
- Verify >80% of items have prices and >50% have ratings
- If data quality is poor, inspect the DOM and fix selectors

## Handling Pagination

If you need more items than one page has:
1. Extract current page items and SAVE THE ITEM IDs/NAMES
2. Find and click the "Next" button (varies by site):
- Amazon: a.s-pagination-next
- Walmart: [data-testid="NextPage"], a[aria-label*="Next"]
- General: a[aria-label*="next"], button[aria-label*="next"], or find the link/button whose text is "Next" in JavaScript (CSS cannot select by text)
3. Wait for page load (3-5 seconds)
4. Extract items from new page
5. **CRITICAL VERIFICATION**: Check that extracted items are DIFFERENT from previous page
- If items are the SAME, your selector is targeting a sticky element (ads/carousel)
- This means you need to find a different selector for the main product grid

Example pagination verification:
\`\`\`javascript
// Store first item name from page 1
var page1FirstItem = items[0]?.name;
// After navigating to page 2, check:
var page2FirstItem = newItems[0]?.name;
if (page1FirstItem === page2FirstItem) {
console.error("WRONG SELECTOR: Items didn't change between pages!");
}
\`\`\`

## DO NOT:
- Use @eN refs (they don't work on replay)
- Use double quotes around JavaScript (causes escaping issues)
- Use dollar signs in regex patterns
- NEVER take more than 8 turns before outputting the script
- Use selectors that return < 15 items (likely targeting ads/carousel)
- Assume pagination works without verifying items changed

## DO:
- Use single quotes around the JavaScript in eval
- Use CSS selectors for everything including prices
- Scroll and accumulate items until no new items appear
- Verify your selector returns 20+ items per page
- After pagination, verify extracted items are DIFFERENT from previous page
- **OUTPUT the complete script as a bash code block BY TURN 8**

#################################################################
# FINAL REMINDER: YOUR #1 PRIORITY IS TO OUTPUT A BASH SCRIPT. #
# DO NOT over-explore. Once you test ONE item works, OUTPUT IT! #
# If you reach turn 5 without outputting a script, STOP AND #
# OUTPUT THE SCRIPT IMMEDIATELY with your best-guess selectors. #
#################################################################

NOW: Navigate to the site, quickly discover selectors, and OUTPUT THE BASH SCRIPT IN A CODE BLOCK.`;
}
|