@dyyz1993/agent-browser 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +907 -0
- package/bin/agent-browser-darwin-arm64 +0 -0
- package/bin/agent-browser.js +120 -0
- package/dist/__tests__/e2e/utils/test-helpers.d.ts +5 -0
- package/dist/__tests__/e2e/utils/test-helpers.d.ts.map +1 -0
- package/dist/__tests__/e2e/utils/test-helpers.js +22 -0
- package/dist/__tests__/e2e/utils/test-helpers.js.map +1 -0
- package/dist/__tests__/test-iframe.d.ts +2 -0
- package/dist/__tests__/test-iframe.d.ts.map +1 -0
- package/dist/__tests__/test-iframe.js +52 -0
- package/dist/__tests__/test-iframe.js.map +1 -0
- package/dist/__tests__/utils/parseCli.d.ts +20 -0
- package/dist/__tests__/utils/parseCli.d.ts.map +1 -0
- package/dist/__tests__/utils/parseCli.js +1086 -0
- package/dist/__tests__/utils/parseCli.js.map +1 -0
- package/dist/actions.d.ts +50 -0
- package/dist/actions.d.ts.map +1 -0
- package/dist/actions.js +2164 -0
- package/dist/actions.js.map +1 -0
- package/dist/browser.d.ts +556 -0
- package/dist/browser.d.ts.map +1 -0
- package/dist/browser.js +2599 -0
- package/dist/browser.js.map +1 -0
- package/dist/cli/commands.d.ts +8 -0
- package/dist/cli/commands.d.ts.map +1 -0
- package/dist/cli/commands.js +1038 -0
- package/dist/cli/commands.js.map +1 -0
- package/dist/cli/connection.d.ts +50 -0
- package/dist/cli/connection.d.ts.map +1 -0
- package/dist/cli/connection.js +595 -0
- package/dist/cli/connection.js.map +1 -0
- package/dist/cli/flags.d.ts +36 -0
- package/dist/cli/flags.d.ts.map +1 -0
- package/dist/cli/flags.js +206 -0
- package/dist/cli/flags.js.map +1 -0
- package/dist/cli/help.d.ts +4 -0
- package/dist/cli/help.d.ts.map +1 -0
- package/dist/cli/help.js +1024 -0
- package/dist/cli/help.js.map +1 -0
- package/dist/cli/output.d.ts +14 -0
- package/dist/cli/output.d.ts.map +1 -0
- package/dist/cli/output.js +456 -0
- package/dist/cli/output.js.map +1 -0
- package/dist/cli-new.d.ts +3 -0
- package/dist/cli-new.d.ts.map +1 -0
- package/dist/cli-new.js +308 -0
- package/dist/cli-new.js.map +1 -0
- package/dist/cli-old.d.ts +3 -0
- package/dist/cli-old.d.ts.map +1 -0
- package/dist/cli-old.js +1101 -0
- package/dist/cli-old.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +403 -0
- package/dist/cli.js.map +1 -0
- package/dist/content-detection.d.ts +18 -0
- package/dist/content-detection.d.ts.map +1 -0
- package/dist/content-detection.js +68 -0
- package/dist/content-detection.js.map +1 -0
- package/dist/daemon.d.ts +55 -0
- package/dist/daemon.d.ts.map +1 -0
- package/dist/daemon.js +426 -0
- package/dist/daemon.js.map +1 -0
- package/dist/diff.d.ts +42 -0
- package/dist/diff.d.ts.map +1 -0
- package/dist/diff.js +166 -0
- package/dist/diff.js.map +1 -0
- package/dist/human-mouse.d.ts +31 -0
- package/dist/human-mouse.d.ts.map +1 -0
- package/dist/human-mouse.js +184 -0
- package/dist/human-mouse.js.map +1 -0
- package/dist/ios-actions.d.ts +11 -0
- package/dist/ios-actions.d.ts.map +1 -0
- package/dist/ios-actions.js +228 -0
- package/dist/ios-actions.js.map +1 -0
- package/dist/ios-manager.d.ts +266 -0
- package/dist/ios-manager.d.ts.map +1 -0
- package/dist/ios-manager.js +1076 -0
- package/dist/ios-manager.js.map +1 -0
- package/dist/message-bridge.d.ts +10 -0
- package/dist/message-bridge.d.ts.map +1 -0
- package/dist/message-bridge.js +60 -0
- package/dist/message-bridge.js.map +1 -0
- package/dist/protocol.d.ts +26 -0
- package/dist/protocol.d.ts.map +1 -0
- package/dist/protocol.js +912 -0
- package/dist/protocol.js.map +1 -0
- package/dist/recorder/binding.d.ts +24 -0
- package/dist/recorder/binding.d.ts.map +1 -0
- package/dist/recorder/binding.js +215 -0
- package/dist/recorder/binding.js.map +1 -0
- package/dist/recorder/index.d.ts +4 -0
- package/dist/recorder/index.d.ts.map +1 -0
- package/dist/recorder/index.js +4 -0
- package/dist/recorder/index.js.map +1 -0
- package/dist/recorder/inject.js +1913 -0
- package/dist/recorder/recorder.d.ts +19 -0
- package/dist/recorder/recorder.d.ts.map +1 -0
- package/dist/recorder/recorder.js +101 -0
- package/dist/recorder/recorder.js.map +1 -0
- package/dist/recorder/store.d.ts +22 -0
- package/dist/recorder/store.d.ts.map +1 -0
- package/dist/recorder/store.js +150 -0
- package/dist/recorder/store.js.map +1 -0
- package/dist/recorder/types.d.ts +73 -0
- package/dist/recorder/types.d.ts.map +1 -0
- package/dist/recorder/types.js +5 -0
- package/dist/recorder/types.js.map +1 -0
- package/dist/snapshot.d.ts +81 -0
- package/dist/snapshot.d.ts.map +1 -0
- package/dist/snapshot.js +1348 -0
- package/dist/snapshot.js.map +1 -0
- package/dist/stream-server-standalone.d.ts +38 -0
- package/dist/stream-server-standalone.d.ts.map +1 -0
- package/dist/stream-server-standalone.js +494 -0
- package/dist/stream-server-standalone.js.map +1 -0
- package/dist/stream-server.d.ts +214 -0
- package/dist/stream-server.d.ts.map +1 -0
- package/dist/stream-server.js +811 -0
- package/dist/stream-server.js.map +1 -0
- package/dist/types.d.ts +914 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/dist/types.js.map +1 -0
- package/dist/viewer-html.d.ts +2 -0
- package/dist/viewer-html.d.ts.map +1 -0
- package/dist/viewer-html.js +185 -0
- package/dist/viewer-html.js.map +1 -0
- package/dist/viewer-script.d.ts +47 -0
- package/dist/viewer-script.d.ts.map +1 -0
- package/dist/viewer-script.js +586 -0
- package/dist/viewer-script.js.map +1 -0
- package/package.json +86 -0
- package/scripts/build-all-platforms.sh +68 -0
- package/scripts/check-version-sync.js +39 -0
- package/scripts/check_goods_container.js +35 -0
- package/scripts/check_page_content.js +36 -0
- package/scripts/click_applause_rate.js +30 -0
- package/scripts/copy-native.js +36 -0
- package/scripts/copy-recorder.js +21 -0
- package/scripts/e2e-test-recorder.ts +584 -0
- package/scripts/explore_jd_page.js +31 -0
- package/scripts/extract_all_jd_data.js +80 -0
- package/scripts/extract_jd_product_detail.js +62 -0
- package/scripts/extract_jd_products_correct_links.js +78 -0
- package/scripts/extract_jd_products_final.js +80 -0
- package/scripts/extract_jd_reviews.js +48 -0
- package/scripts/extract_jd_seafood_final.js +78 -0
- package/scripts/extract_multiple_products.js +77 -0
- package/scripts/extract_products_no_scroll.js +68 -0
- package/scripts/extract_products_simple.js +68 -0
- package/scripts/find_applause_rate.js +26 -0
- package/scripts/find_jd_links.js +28 -0
- package/scripts/find_main_content.js +20 -0
- package/scripts/find_product_cards.js +38 -0
- package/scripts/find_root_content.js +26 -0
- package/scripts/find_unique_products.js +55 -0
- package/scripts/get_jd_product_detail.js +16 -0
- package/scripts/get_jd_products.js +23 -0
- package/scripts/get_jd_seafood_products.js +44 -0
- package/scripts/get_product_details_from_images.js +54 -0
- package/scripts/postinstall.js +235 -0
- package/scripts/scroll_and_get_products.js +47 -0
- package/scripts/scroll_deep_and_find.js +45 -0
- package/scripts/sync-version.js +69 -0
- package/scripts/verify-baidu-enter.ts +116 -0
- package/skills/agent-browser/SKILL.md +310 -0
- package/skills/agent-browser/references/authentication.md +198 -0
- package/skills/agent-browser/references/commands.md +471 -0
- package/skills/agent-browser/references/data-extraction.md +377 -0
- package/skills/agent-browser/references/proxy-support.md +188 -0
- package/skills/agent-browser/references/session-management.md +197 -0
- package/skills/agent-browser/references/snapshot-refs.md +379 -0
- package/skills/agent-browser/references/video-recording.md +173 -0
- package/skills/agent-browser/templates/api-interception.sh +53 -0
- package/skills/agent-browser/templates/authenticated-session.sh +97 -0
- package/skills/agent-browser/templates/capture-workflow.sh +69 -0
- package/skills/agent-browser/templates/data-extraction.sh +210 -0
- package/skills/agent-browser/templates/form-automation.sh +62 -0
- package/skills/skill-creator/LICENSE.txt +202 -0
- package/skills/skill-creator/SKILL.md +356 -0
- package/skills/skill-creator/references/output-patterns.md +82 -0
- package/skills/skill-creator/references/workflows.md +28 -0
- package/skills/skill-creator/scripts/init_skill.py +303 -0
- package/skills/skill-creator/scripts/package_skill.py +113 -0
- package/skills/skill-creator/scripts/quick_validate.py +95 -0
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
# 数据提取模式指南
|
|
2
|
+
|
|
3
|
+
本文档总结了 agent-browser 的五种核心数据提取模式,适用于不同类型的网站架构。
|
|
4
|
+
|
|
5
|
+
## 模式一:DOM 元素提取
|
|
6
|
+
|
|
7
|
+
适用于静态渲染页面,直接从 HTML 元素提取数据。
|
|
8
|
+
|
|
9
|
+
### 基础示例
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
agent-browser open "https://example.com/products"
|
|
13
|
+
agent-browser eval '
|
|
14
|
+
const items = document.querySelectorAll(".product-item");
|
|
15
|
+
const products = Array.from(items).map(item => ({
|
|
16
|
+
name: item.querySelector("h3")?.textContent?.trim(),
|
|
17
|
+
price: item.querySelector(".price")?.textContent?.trim(),
|
|
18
|
+
link: item.querySelector("a")?.href
|
|
19
|
+
}));
|
|
20
|
+
JSON.stringify(products, null, 2);
|
|
21
|
+
'
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### 实际案例:Twitter 推文提取
|
|
25
|
+
|
|
26
|
+
```javascript
|
|
27
|
+
const tweets = [];
|
|
28
|
+
const items = document.querySelectorAll(".timeline-item");
|
|
29
|
+
|
|
30
|
+
for (let i = 0; i < Math.min(10, items.length); i++) {
|
|
31
|
+
const el = items[i];
|
|
32
|
+
const textEl = el.querySelector(".tweet-content, .content");
|
|
33
|
+
const linkEl = el.querySelector("a[href*=\"status\"]");
|
|
34
|
+
|
|
35
|
+
tweets.push({
|
|
36
|
+
id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || "",
|
|
37
|
+
text: textEl?.textContent?.trim() || "",
|
|
38
|
+
author: el.querySelector(".fullname")?.textContent?.trim(),
|
|
39
|
+
time: el.querySelector("time")?.getAttribute("datetime")
|
|
40
|
+
});
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
JSON.stringify({ tweets_count: tweets.length, tweets }, null, 2);
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 最佳实践
|
|
47
|
+
|
|
48
|
+
- 使用 `?.` 可选链避免空指针错误
|
|
49
|
+
- 提供多个备选选择器:`.tweet-content, .content, .tweet-text`
|
|
50
|
+
- 限制输出长度:`text.substring(0, 1000)`
|
|
51
|
+
- 使用 `JSON.stringify()` 输出结构化数据
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## 模式二:JavaScript 全局变量提取
|
|
56
|
+
|
|
57
|
+
适用于 SPA(单页应用),从 `window` 对象提取预加载的状态数据。
|
|
58
|
+
|
|
59
|
+
### 常见全局变量
|
|
60
|
+
|
|
61
|
+
| 网站/框架 | 变量名 | 用途 |
|
|
62
|
+
|-----------|--------|------|
|
|
63
|
+
| 小红书 | `__INITIAL_STATE__` | 前端应用初始状态 |
|
|
64
|
+
| 电商网站 | `dataLayer` | GTM 数据层 |
|
|
65
|
+
| 商品详情 | `productConfig`, `variations` | 商品配置 |
|
|
66
|
+
| 通用 | `__NEXT_DATA__`, `__NUXT__` | SSR 数据 |
|
|
67
|
+
|
|
68
|
+
### 实际案例:小红书搜索结果
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
agent-browser eval '
|
|
72
|
+
const feeds = window.__INITIAL_STATE__?.search?.feeds?._value || {};
|
|
73
|
+
const posts = Object.values(feeds).filter(f => f.noteCard).map(f => ({
|
|
74
|
+
id: f.id,
|
|
75
|
+
user: {
|
|
76
|
+
name: f.noteCard.user.nickname,
|
|
77
|
+
avatar: f.noteCard.user.avatar
|
|
78
|
+
},
|
|
79
|
+
stats: {
|
|
80
|
+
likes: parseInt(f.noteCard.interactInfo.likedCount) || 0,
|
|
81
|
+
comments: parseInt(f.noteCard.interactInfo.commentCount) || 0
|
|
82
|
+
},
|
|
83
|
+
cover: f.noteCard.cover?.urlDefault,
|
|
84
|
+
link: "https://www.xiaohongshu.com/explore/" + f.id
|
|
85
|
+
}));
|
|
86
|
+
JSON.stringify({ count: posts.length, posts });
|
|
87
|
+
'
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 实际案例:电商商品详情(dataLayer)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
agent-browser eval '
|
|
94
|
+
const dl = window.dataLayer || [];
|
|
95
|
+
const viewItem = dl.find(e => e.event === "view_item");
|
|
96
|
+
const item = viewItem?.ecommerce?.items?.[0] || {};
|
|
97
|
+
|
|
98
|
+
const product = {
|
|
99
|
+
name: document.querySelector("h1")?.textContent?.trim() || item.item_name,
|
|
100
|
+
productId: item.item_id,
|
|
101
|
+
price: item.price,
|
|
102
|
+
currency: viewItem?.ecommerce?.currency || "HKD",
|
|
103
|
+
color: item.color,
|
|
104
|
+
category: item.item_category,
|
|
105
|
+
stockStatus: item.stock_status
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
JSON.stringify(product, null, 2);
|
|
109
|
+
'
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### 最佳实践
|
|
113
|
+
|
|
114
|
+
- 使用可选链:`window.__INITIAL_STATE__?.search?.feeds?._value`
|
|
115
|
+
- 提供默认值:`|| {}` 或 `|| []`
|
|
116
|
+
- 结合 DOM 提取作为补充数据源
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## 模式三:API 拦截捕获
|
|
121
|
+
|
|
122
|
+
适用于反爬严格的网站,被动捕获 XHR/Fetch 响应。
|
|
123
|
+
|
|
124
|
+
### 工作原理
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
128
|
+
│ 打开空白页 │ --> │ 启动监听器 │ --> │ 导航目标页 │
|
|
129
|
+
│ about:blank │ │ wait --req │ │ 触发 API │
|
|
130
|
+
└─────────────┘ └─────────────┘ └─────────────┘
|
|
131
|
+
│
|
|
132
|
+
v
|
|
133
|
+
┌─────────────┐
|
|
134
|
+
│ 捕获响应 │
|
|
135
|
+
│ 保存 JSON │
|
|
136
|
+
└─────────────┘
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### 实际案例:抖音用户视频
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
TARGET_URL="https://www.douyin.com/user/MS4wLjABAAA..."
|
|
143
|
+
OUTPUT_FILE="/tmp/douyin-videos.json"
|
|
144
|
+
|
|
145
|
+
agent-browser open "about:blank"
|
|
146
|
+
|
|
147
|
+
(agent-browser wait --request "aweme/post" --timeout 30000 > /tmp/response.json) &
|
|
148
|
+
WAIT_PID=$!
|
|
149
|
+
sleep 1
|
|
150
|
+
|
|
151
|
+
agent-browser open "$TARGET_URL"
|
|
152
|
+
wait $WAIT_PID
|
|
153
|
+
|
|
154
|
+
jq '{
|
|
155
|
+
user: .aweme_list[0].author.nickname,
|
|
156
|
+
videos: .aweme_list[:10] | map({
|
|
157
|
+
id: .aweme_id,
|
|
158
|
+
desc: .desc,
|
|
159
|
+
stats: {
|
|
160
|
+
likes: .statistics.digg_count,
|
|
161
|
+
comments: .statistics.comment_count
|
|
162
|
+
}
|
|
163
|
+
})
|
|
164
|
+
}' /tmp/response.json > "$OUTPUT_FILE"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### 常见 API 模式
|
|
168
|
+
|
|
169
|
+
| 平台 | 请求特征 | 数据路径 |
|
|
170
|
+
|------|---------|---------|
|
|
171
|
+
| 抖音 | `aweme/post` | `.aweme_list` |
|
|
172
|
+
| 小红书 | `api/snsweb` | `.data` |
|
|
173
|
+
| 电商 | `api/product` | `.product` |
|
|
174
|
+
| 通用 | `demandware`, `graphql` | 根据响应结构 |
|
|
175
|
+
|
|
176
|
+
### 最佳实践
|
|
177
|
+
|
|
178
|
+
- 先打开空白页再启动监听
|
|
179
|
+
- 使用后台进程 `&` 和 `wait`
|
|
180
|
+
- 设置合理的超时时间
|
|
181
|
+
- 使用 `jq` 处理 JSON 输出
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## 模式四:滚动加载采集
|
|
186
|
+
|
|
187
|
+
适用于无限滚动列表,循环滚动 + 去重合并。
|
|
188
|
+
|
|
189
|
+
### 工作流程
|
|
190
|
+
|
|
191
|
+
```
|
|
192
|
+
┌──────────────────────────────────────────────────────┐
|
|
193
|
+
│ 滚动采集循环 │
|
|
194
|
+
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
|
195
|
+
│ │ 提取数据 │ -> │ 合并去重 │ -> │ 滚动加载 │ │
|
|
196
|
+
│ └──────────┘ └──────────┘ └──────────┘ │
|
|
197
|
+
│ │ │ │ │
|
|
198
|
+
│ v v v │
|
|
199
|
+
│ 当前数据 累计数据 触发加载 │
|
|
200
|
+
│ │ │
|
|
201
|
+
│ <──────── 检测终止条件 ────────┘ │
|
|
202
|
+
└──────────────────────────────────────────────────────┘
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### 实际案例:抖音关注列表
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
ALL_USERS="[]"
|
|
209
|
+
PREV_COUNT=0
|
|
210
|
+
|
|
211
|
+
for i in {1..30}; do
|
|
212
|
+
CURRENT=$(agent-browser eval 'JSON.stringify(
|
|
213
|
+
Array.from(document.querySelectorAll("a"))
|
|
214
|
+
.filter(a => a.href.includes("/user/"))
|
|
215
|
+
.map(a => ({ name: a.textContent.trim(), url: a.href }))
|
|
216
|
+
.filter(u => u.name.length > 0)
|
|
217
|
+
)')
|
|
218
|
+
|
|
219
|
+
ALL_USERS=$(printf '%s\n' "$ALL_USERS" "$CURRENT" | python3 -c "
|
|
220
|
+
import sys, json
|
|
221
|
+
data = []
|
|
222
|
+
for line in sys.stdin:
|
|
223
|
+
try:
|
|
224
|
+
d = json.loads(line.strip())
|
|
225
|
+
if isinstance(d, list): data.extend(d)
|
|
226
|
+
except: pass
|
|
227
|
+
seen = set()
|
|
228
|
+
unique = [u for u in data if u['url'] not in seen and not seen.add(u['url'])]
|
|
229
|
+
print(json.dumps(unique, ensure_ascii=False))
|
|
230
|
+
")
|
|
231
|
+
|
|
232
|
+
COUNT=$(echo "$ALL_USERS" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
|
|
233
|
+
echo "已采集: $COUNT 个用户"
|
|
234
|
+
|
|
235
|
+
[ "$COUNT" -eq "$PREV_COUNT" ] && [ "$COUNT" -gt 0 ] && break
|
|
236
|
+
PREV_COUNT=$COUNT
|
|
237
|
+
|
|
238
|
+
agent-browser scroll down 300
|
|
239
|
+
sleep 1
|
|
240
|
+
done
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### 终止条件检测
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
# 方式一:数量不变
|
|
247
|
+
[ "$COUNT" -eq "$PREV_COUNT" ] && break
|
|
248
|
+
|
|
249
|
+
# 方式二:检测 API 标志
|
|
250
|
+
HAS_MORE=$(agent-browser eval 'window.__INITIAL_STATE__?.hasMore')
|
|
251
|
+
[ "$HAS_MORE" = "false" ] && break
|
|
252
|
+
|
|
253
|
+
# 方式三:检测 DOM 元素
|
|
254
|
+
BOTTOM=$(agent-browser eval 'document.querySelector(".no-more") !== null')
|
|
255
|
+
[ "$BOTTOM" = "true" ] && break
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### 最佳实践
|
|
259
|
+
|
|
260
|
+
- 使用 URL 或 ID 作为去重键
|
|
261
|
+
- 设置最大循环次数防止无限循环
|
|
262
|
+
- 滚动后适当等待加载
|
|
263
|
+
- 输出进度便于监控
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 模式五:iframe 嵌套操作
|
|
268
|
+
|
|
269
|
+
适用于嵌入式登录、第三方组件等场景。
|
|
270
|
+
|
|
271
|
+
### Frame 路径语法
|
|
272
|
+
|
|
273
|
+
```
|
|
274
|
+
#outer-iframe # 单层 iframe(按 ID/Name)
|
|
275
|
+
#0 # 第一个 iframe(按索引)
|
|
276
|
+
#outer/login # 嵌套 iframe(父/子)
|
|
277
|
+
#0/1/2 # 多层嵌套(按索引)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### 实际案例:嵌套 iframe 登录
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
agent-browser open "https://example.com/embedded-login"
|
|
284
|
+
|
|
285
|
+
# 查看主页面 iframe 结构
|
|
286
|
+
agent-browser snapshot
|
|
287
|
+
|
|
288
|
+
# 切换到嵌套 iframe
|
|
289
|
+
agent-browser snapshot --in-frame "#outer-iframe/login-frame"
|
|
290
|
+
|
|
291
|
+
# 在 iframe 内操作
|
|
292
|
+
agent-browser fill '#username' 'admin' --in-frame "#outer-iframe/login-frame"
|
|
293
|
+
agent-browser fill '#password' 'password' --in-frame "#outer-iframe/login-frame"
|
|
294
|
+
agent-browser click 'button' --in-frame "#outer-iframe/login-frame"
|
|
295
|
+
|
|
296
|
+
# 验证结果
|
|
297
|
+
agent-browser snapshot --in-frame "#outer-iframe/login-frame"
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### 最佳实践
|
|
301
|
+
|
|
302
|
+
- 先 snapshot 主页面了解 iframe 结构
|
|
303
|
+
- 使用 CSS 选择器或索引定位 iframe
|
|
304
|
+
- 每次操作后重新 snapshot 验证状态
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## 代理配置
|
|
309
|
+
|
|
310
|
+
所有脚本都支持代理,用于访问受限网站:
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
# 方式一:环境变量
|
|
314
|
+
export https_proxy=http://127.0.0.1:7890
|
|
315
|
+
agent-browser open "https://target-site.com"
|
|
316
|
+
|
|
317
|
+
# 方式二:命令行参数
|
|
318
|
+
agent-browser --proxy "http://127.0.0.1:7890" open "https://target-site.com"
|
|
319
|
+
|
|
320
|
+
# 方式三:SOCKS5 代理
|
|
321
|
+
export https_proxy=socks5://127.0.0.1:1080
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## 完整脚本模板
|
|
327
|
+
|
|
328
|
+
### 数据提取脚本结构
|
|
329
|
+
|
|
330
|
+
```bash
|
|
331
|
+
#!/bin/bash
|
|
332
|
+
# 用法: ./extract-data.sh [目标URL] [输出文件]
|
|
333
|
+
|
|
334
|
+
TARGET_URL="${1:-https://default.example.com}"
|
|
335
|
+
OUTPUT_FILE="${2:-/tmp/output.json}"
|
|
336
|
+
|
|
337
|
+
echo "=== 1. 关闭旧会话 ==="
|
|
338
|
+
agent-browser close 2>/dev/null
|
|
339
|
+
sleep 1
|
|
340
|
+
|
|
341
|
+
echo ""
|
|
342
|
+
echo "=== 2. 配置代理并打开页面 ==="
|
|
343
|
+
export https_proxy=http://127.0.0.1:7890
|
|
344
|
+
agent-browser open "$TARGET_URL"
|
|
345
|
+
sleep 2
|
|
346
|
+
|
|
347
|
+
echo ""
|
|
348
|
+
echo "=== 3. 提取数据 ==="
|
|
349
|
+
agent-browser eval '
|
|
350
|
+
// 数据提取逻辑
|
|
351
|
+
const data = { /* ... */ };
|
|
352
|
+
JSON.stringify(data, null, 2);
|
|
353
|
+
' > "$OUTPUT_FILE"
|
|
354
|
+
|
|
355
|
+
echo ""
|
|
356
|
+
echo "=== 4. 显示结果 ==="
|
|
357
|
+
cat "$OUTPUT_FILE"
|
|
358
|
+
|
|
359
|
+
echo ""
|
|
360
|
+
echo "=== 5. 关闭浏览器 ==="
|
|
361
|
+
agent-browser close
|
|
362
|
+
|
|
363
|
+
echo ""
|
|
364
|
+
echo "=== 采集完成 ==="
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
---
|
|
368
|
+
|
|
369
|
+
## 选择指南
|
|
370
|
+
|
|
371
|
+
| 场景 | 推荐模式 | 原因 |
|
|
372
|
+
|------|---------|------|
|
|
373
|
+
| 静态页面 | DOM 提取 | 简单直接 |
|
|
374
|
+
| SPA 应用 | JS 变量提取 | 数据已预加载 |
|
|
375
|
+
| 反爬严格 | API 拦截 | 绕过前端限制 |
|
|
376
|
+
| 无限列表 | 滚动加载 | 完整采集 |
|
|
377
|
+
| 嵌套组件 | iframe 操作 | 跨域访问 |
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Proxy Support
|
|
2
|
+
|
|
3
|
+
Proxy configuration for geo-testing, rate limiting avoidance, and corporate environments.
|
|
4
|
+
|
|
5
|
+
**Related**: [commands.md](commands.md) for global options, [SKILL.md](../SKILL.md) for quick start.
|
|
6
|
+
|
|
7
|
+
## Contents
|
|
8
|
+
|
|
9
|
+
- [Basic Proxy Configuration](#basic-proxy-configuration)
|
|
10
|
+
- [Authenticated Proxy](#authenticated-proxy)
|
|
11
|
+
- [SOCKS Proxy](#socks-proxy)
|
|
12
|
+
- [Proxy Bypass](#proxy-bypass)
|
|
13
|
+
- [Common Use Cases](#common-use-cases)
|
|
14
|
+
- [Verifying Proxy Connection](#verifying-proxy-connection)
|
|
15
|
+
- [Troubleshooting](#troubleshooting)
|
|
16
|
+
- [Best Practices](#best-practices)
|
|
17
|
+
|
|
18
|
+
## Basic Proxy Configuration
|
|
19
|
+
|
|
20
|
+
Set proxy via environment variable before starting:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# HTTP proxy
|
|
24
|
+
export HTTP_PROXY="http://proxy.example.com:8080"
|
|
25
|
+
agent-browser open https://example.com
|
|
26
|
+
|
|
27
|
+
# HTTPS proxy
|
|
28
|
+
export HTTPS_PROXY="https://proxy.example.com:8080"
|
|
29
|
+
agent-browser open https://example.com
|
|
30
|
+
|
|
31
|
+
# Both
|
|
32
|
+
export HTTP_PROXY="http://proxy.example.com:8080"
|
|
33
|
+
export HTTPS_PROXY="http://proxy.example.com:8080"
|
|
34
|
+
agent-browser open https://example.com
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Authenticated Proxy
|
|
38
|
+
|
|
39
|
+
For proxies requiring authentication:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Include credentials in URL
|
|
43
|
+
export HTTP_PROXY="http://username:password@proxy.example.com:8080"
|
|
44
|
+
agent-browser open https://example.com
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## SOCKS Proxy
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# SOCKS5 proxy
|
|
51
|
+
export ALL_PROXY="socks5://proxy.example.com:1080"
|
|
52
|
+
agent-browser open https://example.com
|
|
53
|
+
|
|
54
|
+
# SOCKS5 with auth
|
|
55
|
+
export ALL_PROXY="socks5://user:pass@proxy.example.com:1080"
|
|
56
|
+
agent-browser open https://example.com
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Proxy Bypass
|
|
60
|
+
|
|
61
|
+
Skip proxy for specific domains:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Bypass proxy for local addresses
|
|
65
|
+
export NO_PROXY="localhost,127.0.0.1,.internal.company.com"
|
|
66
|
+
agent-browser open https://internal.company.com # Direct connection
|
|
67
|
+
agent-browser open https://external.com # Via proxy
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Common Use Cases
|
|
71
|
+
|
|
72
|
+
### Geo-Location Testing
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
#!/bin/bash
|
|
76
|
+
# Test site from different regions using geo-located proxies
|
|
77
|
+
|
|
78
|
+
PROXIES=(
|
|
79
|
+
"http://us-proxy.example.com:8080"
|
|
80
|
+
"http://eu-proxy.example.com:8080"
|
|
81
|
+
"http://asia-proxy.example.com:8080"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
for proxy in "${PROXIES[@]}"; do
|
|
85
|
+
export HTTP_PROXY="$proxy"
|
|
86
|
+
export HTTPS_PROXY="$proxy"
|
|
87
|
+
|
|
88
|
+
region="${proxy#*://}"; region="${region%%.*}"
|
|
89
|
+
echo "Testing from: $region"
|
|
90
|
+
|
|
91
|
+
agent-browser --session "$region" open https://example.com
|
|
92
|
+
agent-browser --session "$region" screenshot "./screenshots/$region.png"
|
|
93
|
+
agent-browser --session "$region" close
|
|
94
|
+
done
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Rotating Proxies for Scraping
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
#!/bin/bash
|
|
101
|
+
# Rotate through proxy list to avoid rate limiting
|
|
102
|
+
|
|
103
|
+
PROXY_LIST=(
|
|
104
|
+
"http://proxy1.example.com:8080"
|
|
105
|
+
"http://proxy2.example.com:8080"
|
|
106
|
+
"http://proxy3.example.com:8080"
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
URLS=(
|
|
110
|
+
"https://site.com/page1"
|
|
111
|
+
"https://site.com/page2"
|
|
112
|
+
"https://site.com/page3"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
for i in "${!URLS[@]}"; do
|
|
116
|
+
proxy_index=$((i % ${#PROXY_LIST[@]}))
|
|
117
|
+
export HTTP_PROXY="${PROXY_LIST[$proxy_index]}"
|
|
118
|
+
export HTTPS_PROXY="${PROXY_LIST[$proxy_index]}"
|
|
119
|
+
|
|
120
|
+
agent-browser open "${URLS[$i]}"
|
|
121
|
+
agent-browser get text body > "output-$i.txt"
|
|
122
|
+
agent-browser close
|
|
123
|
+
|
|
124
|
+
sleep 1 # Polite delay
|
|
125
|
+
done
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Corporate Network Access
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
#!/bin/bash
|
|
132
|
+
# Access internal sites via corporate proxy
|
|
133
|
+
|
|
134
|
+
export HTTP_PROXY="http://corpproxy.company.com:8080"
|
|
135
|
+
export HTTPS_PROXY="http://corpproxy.company.com:8080"
|
|
136
|
+
export NO_PROXY="localhost,127.0.0.1,.company.com"
|
|
137
|
+
|
|
138
|
+
# External sites go through proxy
|
|
139
|
+
agent-browser open https://external-vendor.com
|
|
140
|
+
|
|
141
|
+
# Internal sites bypass proxy
|
|
142
|
+
agent-browser open https://intranet.company.com
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Verifying Proxy Connection
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Check your apparent IP
|
|
149
|
+
agent-browser open https://httpbin.org/ip
|
|
150
|
+
agent-browser get text body
|
|
151
|
+
# Should show proxy's IP, not your real IP
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Troubleshooting
|
|
155
|
+
|
|
156
|
+
### Proxy Connection Failed
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# Test proxy connectivity first
|
|
160
|
+
curl -x http://proxy.example.com:8080 https://httpbin.org/ip
|
|
161
|
+
|
|
162
|
+
# Check if proxy requires auth
|
|
163
|
+
export HTTP_PROXY="http://user:pass@proxy.example.com:8080"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### SSL/TLS Errors Through Proxy
|
|
167
|
+
|
|
168
|
+
Some proxies perform SSL inspection. If you encounter certificate errors:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
# For testing only - not recommended for production
|
|
172
|
+
agent-browser open https://example.com --ignore-https-errors
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Slow Performance
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Use proxy only when necessary
|
|
179
|
+
export NO_PROXY="*.cdn.com,*.static.com" # Direct CDN access
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Best Practices
|
|
183
|
+
|
|
184
|
+
1. **Use environment variables** - Don't hardcode proxy credentials
|
|
185
|
+
2. **Set NO_PROXY appropriately** - Avoid routing local traffic through proxy
|
|
186
|
+
3. **Test proxy before automation** - Verify connectivity with simple requests
|
|
187
|
+
4. **Handle proxy failures gracefully** - Implement retry logic for unstable proxies
|
|
188
|
+
5. **Rotate proxies for large scraping jobs** - Distribute load and avoid bans
|