@dyyz1993/agent-browser 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/LICENSE +202 -0
  2. package/README.md +907 -0
  3. package/bin/agent-browser-darwin-arm64 +0 -0
  4. package/bin/agent-browser.js +120 -0
  5. package/dist/__tests__/e2e/utils/test-helpers.d.ts +5 -0
  6. package/dist/__tests__/e2e/utils/test-helpers.d.ts.map +1 -0
  7. package/dist/__tests__/e2e/utils/test-helpers.js +22 -0
  8. package/dist/__tests__/e2e/utils/test-helpers.js.map +1 -0
  9. package/dist/__tests__/test-iframe.d.ts +2 -0
  10. package/dist/__tests__/test-iframe.d.ts.map +1 -0
  11. package/dist/__tests__/test-iframe.js +52 -0
  12. package/dist/__tests__/test-iframe.js.map +1 -0
  13. package/dist/__tests__/utils/parseCli.d.ts +20 -0
  14. package/dist/__tests__/utils/parseCli.d.ts.map +1 -0
  15. package/dist/__tests__/utils/parseCli.js +1086 -0
  16. package/dist/__tests__/utils/parseCli.js.map +1 -0
  17. package/dist/actions.d.ts +50 -0
  18. package/dist/actions.d.ts.map +1 -0
  19. package/dist/actions.js +2164 -0
  20. package/dist/actions.js.map +1 -0
  21. package/dist/browser.d.ts +556 -0
  22. package/dist/browser.d.ts.map +1 -0
  23. package/dist/browser.js +2599 -0
  24. package/dist/browser.js.map +1 -0
  25. package/dist/cli/commands.d.ts +8 -0
  26. package/dist/cli/commands.d.ts.map +1 -0
  27. package/dist/cli/commands.js +1038 -0
  28. package/dist/cli/commands.js.map +1 -0
  29. package/dist/cli/connection.d.ts +50 -0
  30. package/dist/cli/connection.d.ts.map +1 -0
  31. package/dist/cli/connection.js +595 -0
  32. package/dist/cli/connection.js.map +1 -0
  33. package/dist/cli/flags.d.ts +36 -0
  34. package/dist/cli/flags.d.ts.map +1 -0
  35. package/dist/cli/flags.js +206 -0
  36. package/dist/cli/flags.js.map +1 -0
  37. package/dist/cli/help.d.ts +4 -0
  38. package/dist/cli/help.d.ts.map +1 -0
  39. package/dist/cli/help.js +1024 -0
  40. package/dist/cli/help.js.map +1 -0
  41. package/dist/cli/output.d.ts +14 -0
  42. package/dist/cli/output.d.ts.map +1 -0
  43. package/dist/cli/output.js +456 -0
  44. package/dist/cli/output.js.map +1 -0
  45. package/dist/cli-new.d.ts +3 -0
  46. package/dist/cli-new.d.ts.map +1 -0
  47. package/dist/cli-new.js +308 -0
  48. package/dist/cli-new.js.map +1 -0
  49. package/dist/cli-old.d.ts +3 -0
  50. package/dist/cli-old.d.ts.map +1 -0
  51. package/dist/cli-old.js +1101 -0
  52. package/dist/cli-old.js.map +1 -0
  53. package/dist/cli.d.ts +3 -0
  54. package/dist/cli.d.ts.map +1 -0
  55. package/dist/cli.js +403 -0
  56. package/dist/cli.js.map +1 -0
  57. package/dist/content-detection.d.ts +18 -0
  58. package/dist/content-detection.d.ts.map +1 -0
  59. package/dist/content-detection.js +68 -0
  60. package/dist/content-detection.js.map +1 -0
  61. package/dist/daemon.d.ts +55 -0
  62. package/dist/daemon.d.ts.map +1 -0
  63. package/dist/daemon.js +426 -0
  64. package/dist/daemon.js.map +1 -0
  65. package/dist/diff.d.ts +42 -0
  66. package/dist/diff.d.ts.map +1 -0
  67. package/dist/diff.js +166 -0
  68. package/dist/diff.js.map +1 -0
  69. package/dist/human-mouse.d.ts +31 -0
  70. package/dist/human-mouse.d.ts.map +1 -0
  71. package/dist/human-mouse.js +184 -0
  72. package/dist/human-mouse.js.map +1 -0
  73. package/dist/ios-actions.d.ts +11 -0
  74. package/dist/ios-actions.d.ts.map +1 -0
  75. package/dist/ios-actions.js +228 -0
  76. package/dist/ios-actions.js.map +1 -0
  77. package/dist/ios-manager.d.ts +266 -0
  78. package/dist/ios-manager.d.ts.map +1 -0
  79. package/dist/ios-manager.js +1076 -0
  80. package/dist/ios-manager.js.map +1 -0
  81. package/dist/message-bridge.d.ts +10 -0
  82. package/dist/message-bridge.d.ts.map +1 -0
  83. package/dist/message-bridge.js +60 -0
  84. package/dist/message-bridge.js.map +1 -0
  85. package/dist/protocol.d.ts +26 -0
  86. package/dist/protocol.d.ts.map +1 -0
  87. package/dist/protocol.js +912 -0
  88. package/dist/protocol.js.map +1 -0
  89. package/dist/recorder/binding.d.ts +24 -0
  90. package/dist/recorder/binding.d.ts.map +1 -0
  91. package/dist/recorder/binding.js +215 -0
  92. package/dist/recorder/binding.js.map +1 -0
  93. package/dist/recorder/index.d.ts +4 -0
  94. package/dist/recorder/index.d.ts.map +1 -0
  95. package/dist/recorder/index.js +4 -0
  96. package/dist/recorder/index.js.map +1 -0
  97. package/dist/recorder/inject.js +1913 -0
  98. package/dist/recorder/recorder.d.ts +19 -0
  99. package/dist/recorder/recorder.d.ts.map +1 -0
  100. package/dist/recorder/recorder.js +101 -0
  101. package/dist/recorder/recorder.js.map +1 -0
  102. package/dist/recorder/store.d.ts +22 -0
  103. package/dist/recorder/store.d.ts.map +1 -0
  104. package/dist/recorder/store.js +150 -0
  105. package/dist/recorder/store.js.map +1 -0
  106. package/dist/recorder/types.d.ts +73 -0
  107. package/dist/recorder/types.d.ts.map +1 -0
  108. package/dist/recorder/types.js +5 -0
  109. package/dist/recorder/types.js.map +1 -0
  110. package/dist/snapshot.d.ts +81 -0
  111. package/dist/snapshot.d.ts.map +1 -0
  112. package/dist/snapshot.js +1348 -0
  113. package/dist/snapshot.js.map +1 -0
  114. package/dist/stream-server-standalone.d.ts +38 -0
  115. package/dist/stream-server-standalone.d.ts.map +1 -0
  116. package/dist/stream-server-standalone.js +494 -0
  117. package/dist/stream-server-standalone.js.map +1 -0
  118. package/dist/stream-server.d.ts +214 -0
  119. package/dist/stream-server.d.ts.map +1 -0
  120. package/dist/stream-server.js +811 -0
  121. package/dist/stream-server.js.map +1 -0
  122. package/dist/types.d.ts +914 -0
  123. package/dist/types.d.ts.map +1 -0
  124. package/dist/types.js +4 -0
  125. package/dist/types.js.map +1 -0
  126. package/dist/viewer-html.d.ts +2 -0
  127. package/dist/viewer-html.d.ts.map +1 -0
  128. package/dist/viewer-html.js +185 -0
  129. package/dist/viewer-html.js.map +1 -0
  130. package/dist/viewer-script.d.ts +47 -0
  131. package/dist/viewer-script.d.ts.map +1 -0
  132. package/dist/viewer-script.js +586 -0
  133. package/dist/viewer-script.js.map +1 -0
  134. package/package.json +86 -0
  135. package/scripts/build-all-platforms.sh +68 -0
  136. package/scripts/check-version-sync.js +39 -0
  137. package/scripts/check_goods_container.js +35 -0
  138. package/scripts/check_page_content.js +36 -0
  139. package/scripts/click_applause_rate.js +30 -0
  140. package/scripts/copy-native.js +36 -0
  141. package/scripts/copy-recorder.js +21 -0
  142. package/scripts/e2e-test-recorder.ts +584 -0
  143. package/scripts/explore_jd_page.js +31 -0
  144. package/scripts/extract_all_jd_data.js +80 -0
  145. package/scripts/extract_jd_product_detail.js +62 -0
  146. package/scripts/extract_jd_products_correct_links.js +78 -0
  147. package/scripts/extract_jd_products_final.js +80 -0
  148. package/scripts/extract_jd_reviews.js +48 -0
  149. package/scripts/extract_jd_seafood_final.js +78 -0
  150. package/scripts/extract_multiple_products.js +77 -0
  151. package/scripts/extract_products_no_scroll.js +68 -0
  152. package/scripts/extract_products_simple.js +68 -0
  153. package/scripts/find_applause_rate.js +26 -0
  154. package/scripts/find_jd_links.js +28 -0
  155. package/scripts/find_main_content.js +20 -0
  156. package/scripts/find_product_cards.js +38 -0
  157. package/scripts/find_root_content.js +26 -0
  158. package/scripts/find_unique_products.js +55 -0
  159. package/scripts/get_jd_product_detail.js +16 -0
  160. package/scripts/get_jd_products.js +23 -0
  161. package/scripts/get_jd_seafood_products.js +44 -0
  162. package/scripts/get_product_details_from_images.js +54 -0
  163. package/scripts/postinstall.js +235 -0
  164. package/scripts/scroll_and_get_products.js +47 -0
  165. package/scripts/scroll_deep_and_find.js +45 -0
  166. package/scripts/sync-version.js +69 -0
  167. package/scripts/verify-baidu-enter.ts +116 -0
  168. package/skills/agent-browser/SKILL.md +310 -0
  169. package/skills/agent-browser/references/authentication.md +198 -0
  170. package/skills/agent-browser/references/commands.md +471 -0
  171. package/skills/agent-browser/references/data-extraction.md +377 -0
  172. package/skills/agent-browser/references/proxy-support.md +188 -0
  173. package/skills/agent-browser/references/session-management.md +197 -0
  174. package/skills/agent-browser/references/snapshot-refs.md +379 -0
  175. package/skills/agent-browser/references/video-recording.md +173 -0
  176. package/skills/agent-browser/templates/api-interception.sh +53 -0
  177. package/skills/agent-browser/templates/authenticated-session.sh +97 -0
  178. package/skills/agent-browser/templates/capture-workflow.sh +69 -0
  179. package/skills/agent-browser/templates/data-extraction.sh +210 -0
  180. package/skills/agent-browser/templates/form-automation.sh +62 -0
  181. package/skills/skill-creator/LICENSE.txt +202 -0
  182. package/skills/skill-creator/SKILL.md +356 -0
  183. package/skills/skill-creator/references/output-patterns.md +82 -0
  184. package/skills/skill-creator/references/workflows.md +28 -0
  185. package/skills/skill-creator/scripts/init_skill.py +303 -0
  186. package/skills/skill-creator/scripts/package_skill.py +113 -0
  187. package/skills/skill-creator/scripts/quick_validate.py +95 -0
@@ -0,0 +1,377 @@
1
+ # 数据提取模式指南
2
+
3
+ 本文档总结了 agent-browser 的五种核心数据提取模式,适用于不同类型的网站架构。
4
+
5
+ ## 模式一:DOM 元素提取
6
+
7
+ 适用于静态渲染页面,直接从 HTML 元素提取数据。
8
+
9
+ ### 基础示例
10
+
11
+ ```bash
12
+ agent-browser open "https://example.com/products"
13
+ agent-browser eval '
14
+ const items = document.querySelectorAll(".product-item");
15
+ const products = Array.from(items).map(item => ({
16
+ name: item.querySelector("h3")?.textContent?.trim(),
17
+ price: item.querySelector(".price")?.textContent?.trim(),
18
+ link: item.querySelector("a")?.href
19
+ }));
20
+ JSON.stringify(products, null, 2);
21
+ '
22
+ ```
23
+
24
+ ### 实际案例:Twitter 推文提取
25
+
26
+ ```javascript
27
+ const tweets = [];
28
+ const items = document.querySelectorAll(".timeline-item");
29
+
30
+ for (let i = 0; i < Math.min(10, items.length); i++) {
31
+ const el = items[i];
32
+ const textEl = el.querySelector(".tweet-content, .content");
33
+ const linkEl = el.querySelector("a[href*=\"status\"]");
34
+
35
+ tweets.push({
36
+ id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || "",
37
+ text: textEl?.textContent?.trim() || "",
38
+ author: el.querySelector(".fullname")?.textContent?.trim(),
39
+ time: el.querySelector("time")?.getAttribute("datetime")
40
+ });
41
+ }
42
+
43
+ JSON.stringify({ tweets_count: tweets.length, tweets }, null, 2);
44
+ ```
45
+
46
+ ### 最佳实践
47
+
48
+ - 使用 `?.` 可选链避免空指针错误
49
+ - 提供多个备选选择器:`.tweet-content, .content, .tweet-text`
50
+ - 限制输出长度:`text.substring(0, 1000)`
51
+ - 使用 `JSON.stringify()` 输出结构化数据
52
+
53
+ ---
54
+
55
+ ## 模式二:JavaScript 全局变量提取
56
+
57
+ 适用于 SPA(单页应用),从 `window` 对象提取预加载的状态数据。
58
+
59
+ ### 常见全局变量
60
+
61
+ | 网站/框架 | 变量名 | 用途 |
62
+ |-----------|--------|------|
63
+ | 小红书 | `__INITIAL_STATE__` | React 状态 |
64
+ | 电商网站 | `dataLayer` | GTM 数据层 |
65
+ | 商品详情 | `productConfig`, `variations` | 商品配置 |
66
+ | 通用 | `__NEXT_DATA__`, `__NUXT__` | SSR 数据 |
67
+
68
+ ### 实际案例:小红书搜索结果
69
+
70
+ ```bash
71
+ agent-browser eval '
72
+ const feeds = window.__INITIAL_STATE__?.search?.feeds?._value || {};
73
+ const posts = Object.values(feeds).filter(f => f.noteCard).map(f => ({
74
+ id: f.id,
75
+ user: {
76
+ name: f.noteCard.user.nickname,
77
+ avatar: f.noteCard.user.avatar
78
+ },
79
+ stats: {
80
+ likes: parseInt(f.noteCard.interactInfo.likedCount) || 0,
81
+ comments: parseInt(f.noteCard.interactInfo.commentCount) || 0
82
+ },
83
+ cover: f.noteCard.cover?.urlDefault,
84
+ link: "https://www.xiaohongshu.com/explore/" + f.id
85
+ }));
86
+ JSON.stringify({ count: posts.length, posts });
87
+ '
88
+ ```
89
+
90
+ ### 实际案例:电商商品详情(dataLayer)
91
+
92
+ ```bash
93
+ agent-browser eval '
94
+ const dl = window.dataLayer || [];
95
+ const viewItem = dl.find(e => e.event === "view_item");
96
+ const item = viewItem?.ecommerce?.items?.[0] || {};
97
+
98
+ const product = {
99
+ name: document.querySelector("h1")?.textContent?.trim() || item.item_name,
100
+ productId: item.item_id,
101
+ price: item.price,
102
+ currency: viewItem?.ecommerce?.currency || "HKD",
103
+ color: item.color,
104
+ category: item.item_category,
105
+ stockStatus: item.stock_status
106
+ };
107
+
108
+ JSON.stringify(product, null, 2);
109
+ '
110
+ ```
111
+
112
+ ### 最佳实践
113
+
114
+ - 使用可选链:`window.__INITIAL_STATE__?.search?.feeds?._value`
115
+ - 提供默认值:`|| {}` 或 `|| []`
116
+ - 结合 DOM 提取作为补充数据源
117
+
118
+ ---
119
+
120
+ ## 模式三:API 拦截捕获
121
+
122
+ 适用于反爬严格的网站,被动捕获 XHR/Fetch 响应。
123
+
124
+ ### 工作原理
125
+
126
+ ```
127
+ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
128
+ │ 打开空白页 │ --> │ 启动监听器 │ --> │ 导航目标页 │
129
+ │ about:blank │ │ wait --req │ │ 触发 API │
130
+ └─────────────┘ └─────────────┘ └─────────────┘
131
+
132
+ v
133
+ ┌─────────────┐
134
+ │ 捕获响应 │
135
+ │ 保存 JSON │
136
+ └─────────────┘
137
+ ```
138
+
139
+ ### 实际案例:抖音用户视频
140
+
141
+ ```bash
142
+ TARGET_URL="https://www.douyin.com/user/MS4wLjABAAA..."
143
+ OUTPUT_FILE="/tmp/douyin-videos.json"
144
+
145
+ agent-browser open "about:blank"
146
+
147
+ (agent-browser wait --request "aweme/post" --timeout 30000 > /tmp/response.json) &
148
+ WAIT_PID=$!
149
+ sleep 1
150
+
151
+ agent-browser open "$TARGET_URL"
152
+ wait $WAIT_PID
153
+
154
+ jq '{
155
+ user: .aweme_list[0].author.nickname,
156
+ videos: .aweme_list[:10] | map({
157
+ id: .aweme_id,
158
+ desc: .desc,
159
+ stats: {
160
+ likes: .statistics.digg_count,
161
+ comments: .statistics.comment_count
162
+ }
163
+ })
164
+ }' /tmp/response.json > "$OUTPUT_FILE"
165
+ ```
166
+
167
+ ### 常见 API 模式
168
+
169
+ | 平台 | 请求特征 | 数据路径 |
170
+ |------|---------|---------|
171
+ | 抖音 | `aweme/post` | `.aweme_list` |
172
+ | 小红书 | `api/snsweb` | `.data` |
173
+ | 电商 | `api/product` | `.product` |
174
+ | 通用 | `demandware`, `graphql` | 根据响应结构 |
175
+
176
+ ### 最佳实践
177
+
178
+ - 先打开空白页再启动监听
179
+ - 使用后台进程 `&` 和 `wait`
180
+ - 设置合理的超时时间
181
+ - 使用 `jq` 处理 JSON 输出
182
+
183
+ ---
184
+
185
+ ## 模式四:滚动加载采集
186
+
187
+ 适用于无限滚动列表,循环滚动 + 去重合并。
188
+
189
+ ### 工作流程
190
+
191
+ ```
192
+ ┌──────────────────────────────────────────────────────┐
193
+ │ 滚动采集循环 │
194
+ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
195
+ │ │ 提取数据 │ -> │ 合并去重 │ -> │ 滚动加载 │ │
196
+ │ └──────────┘ └──────────┘ └──────────┘ │
197
+ │ │ │ │ │
198
+ │ v v v │
199
+ │ 当前数据 累计数据 触发加载 │
200
+ │ │ │
201
+ │ <──────── 检测终止条件 ────────┘ │
202
+ └──────────────────────────────────────────────────────┘
203
+ ```
204
+
205
+ ### 实际案例:抖音关注列表
206
+
207
+ ```bash
208
+ ALL_USERS="[]"
209
+ PREV_COUNT=0
210
+
211
+ for i in {1..30}; do
212
+ CURRENT=$(agent-browser eval 'JSON.stringify(
213
+ Array.from(document.querySelectorAll("a"))
214
+ .filter(a => a.href.includes("/user/"))
215
+ .map(a => ({ name: a.textContent.trim(), url: a.href }))
216
+ .filter(u => u.name.length > 0)
217
+ )')
218
+
219
+ ALL_USERS=$(printf '%s\n' "$ALL_USERS" "$CURRENT" | python3 -c "
220
+ import sys, json
221
+ data = []
222
+ for line in sys.stdin:
223
+ try:
224
+ d = json.loads(line.strip())
225
+ if isinstance(d, list): data.extend(d)
226
+ except: pass
227
+ seen = set()
228
+ unique = [u for u in data if u['url'] not in seen and not seen.add(u['url'])]
229
+ print(json.dumps(unique, ensure_ascii=False))
230
+ ")
231
+
232
+ COUNT=$(echo "$ALL_USERS" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
233
+ echo "已采集: $COUNT 个用户"
234
+
235
+ [ "$COUNT" -eq "$PREV_COUNT" ] && [ "$COUNT" -gt 0 ] && break
236
+ PREV_COUNT=$COUNT
237
+
238
+ agent-browser scroll down 300
239
+ sleep 1
240
+ done
241
+ ```
242
+
243
+ ### 终止条件检测
244
+
245
+ ```bash
246
+ # 方式一:数量不变
247
+ [ "$COUNT" -eq "$PREV_COUNT" ] && break
248
+
249
+ # 方式二:检测 API 标志
250
+ HAS_MORE=$(agent-browser eval 'window.__INITIAL_STATE__?.hasMore')
251
+ [ "$HAS_MORE" = "false" ] && break
252
+
253
+ # 方式三:检测 DOM 元素
254
+ BOTTOM=$(agent-browser eval 'document.querySelector(".no-more") !== null')
255
+ [ "$BOTTOM" = "true" ] && break
256
+ ```
257
+
258
+ ### 最佳实践
259
+
260
+ - 使用 URL 或 ID 作为去重键
261
+ - 设置最大循环次数防止无限循环
262
+ - 滚动后适当等待加载
263
+ - 输出进度便于监控
264
+
265
+ ---
266
+
267
+ ## 模式五:iframe 嵌套操作
268
+
269
+ 适用于嵌入式登录、第三方组件等场景。
270
+
271
+ ### Frame 路径语法
272
+
273
+ ```
274
+ #outer-iframe # 单层 iframe(按 ID/Name)
275
+ #0 # 第一个 iframe(按索引)
276
+ #outer/login # 嵌套 iframe(父/子)
277
+ #0/1/2 # 多层嵌套(按索引)
278
+ ```
279
+
280
+ ### 实际案例:嵌套 iframe 登录
281
+
282
+ ```bash
283
+ agent-browser open "https://example.com/embedded-login"
284
+
285
+ # 查看主页面 iframe 结构
286
+ agent-browser snapshot
287
+
288
+ # 切换到嵌套 iframe
289
+ agent-browser snapshot --in-frame "#outer-iframe/login-frame"
290
+
291
+ # 在 iframe 内操作
292
+ agent-browser fill '#username' 'admin' --in-frame "#outer-iframe/login-frame"
293
+ agent-browser fill '#password' 'password' --in-frame "#outer-iframe/login-frame"
294
+ agent-browser click 'button' --in-frame "#outer-iframe/login-frame"
295
+
296
+ # 验证结果
297
+ agent-browser snapshot --in-frame "#outer-iframe/login-frame"
298
+ ```
299
+
300
+ ### 最佳实践
301
+
302
+ - 先 snapshot 主页面了解 iframe 结构
303
+ - 使用 CSS 选择器或索引定位 iframe
304
+ - 每次操作后重新 snapshot 验证状态
305
+
306
+ ---
307
+
308
+ ## 代理配置
309
+
310
+ 所有脚本都支持代理,用于访问受限网站:
311
+
312
+ ```bash
313
+ # 方式一:环境变量
314
+ export https_proxy=http://127.0.0.1:7890
315
+ agent-browser open "https://target-site.com"
316
+
317
+ # 方式二:命令行参数
318
+ agent-browser --proxy "http://127.0.0.1:7890" open "https://target-site.com"
319
+
320
+ # 方式三:SOCKS5 代理
321
+ export https_proxy=socks5://127.0.0.1:1080
322
+ ```
323
+
324
+ ---
325
+
326
+ ## 完整脚本模板
327
+
328
+ ### 数据提取脚本结构
329
+
330
+ ```bash
331
+ #!/bin/bash
332
+ # 用法: ./extract-data.sh [目标URL] [输出文件]
333
+
334
+ TARGET_URL="${1:-https://default.example.com}"
335
+ OUTPUT_FILE="${2:-/tmp/output.json}"
336
+
337
+ echo "=== 1. 关闭旧会话 ==="
338
+ agent-browser close 2>/dev/null
339
+ sleep 1
340
+
341
+ echo ""
342
+ echo "=== 2. 配置代理并打开页面 ==="
343
+ export https_proxy=http://127.0.0.1:7890
344
+ agent-browser open "$TARGET_URL"
345
+ sleep 2
346
+
347
+ echo ""
348
+ echo "=== 3. 提取数据 ==="
349
+ agent-browser eval '
350
+ // 数据提取逻辑
351
+ const data = { /* ... */ };
352
+ JSON.stringify(data, null, 2);
353
+ ' > "$OUTPUT_FILE"
354
+
355
+ echo ""
356
+ echo "=== 4. 显示结果 ==="
357
+ cat "$OUTPUT_FILE"
358
+
359
+ echo ""
360
+ echo "=== 5. 关闭浏览器 ==="
361
+ agent-browser close
362
+
363
+ echo ""
364
+ echo "=== 采集完成 ==="
365
+ ```
366
+
367
+ ---
368
+
369
+ ## 选择指南
370
+
371
+ | 场景 | 推荐模式 | 原因 |
372
+ |------|---------|------|
373
+ | 静态页面 | DOM 提取 | 简单直接 |
374
+ | SPA 应用 | JS 变量提取 | 数据已预加载 |
375
+ | 反爬严格 | API 拦截 | 绕过前端限制 |
376
+ | 无限列表 | 滚动加载 | 完整采集 |
377
+ | 嵌套组件 | iframe 操作 | 跨域访问 |
@@ -0,0 +1,188 @@
1
+ # Proxy Support
2
+
3
+ Proxy configuration for geo-testing, rate limiting avoidance, and corporate environments.
4
+
5
+ **Related**: [commands.md](commands.md) for global options, [SKILL.md](../SKILL.md) for quick start.
6
+
7
+ ## Contents
8
+
9
+ - [Basic Proxy Configuration](#basic-proxy-configuration)
10
+ - [Authenticated Proxy](#authenticated-proxy)
11
+ - [SOCKS Proxy](#socks-proxy)
12
+ - [Proxy Bypass](#proxy-bypass)
13
+ - [Common Use Cases](#common-use-cases)
14
+ - [Verifying Proxy Connection](#verifying-proxy-connection)
15
+ - [Troubleshooting](#troubleshooting)
16
+ - [Best Practices](#best-practices)
17
+
18
+ ## Basic Proxy Configuration
19
+
20
+ Set proxy via environment variable before starting:
21
+
22
+ ```bash
23
+ # HTTP proxy
24
+ export HTTP_PROXY="http://proxy.example.com:8080"
25
+ agent-browser open https://example.com
26
+
27
+ # HTTPS proxy
28
+ export HTTPS_PROXY="https://proxy.example.com:8080"
29
+ agent-browser open https://example.com
30
+
31
+ # Both
32
+ export HTTP_PROXY="http://proxy.example.com:8080"
33
+ export HTTPS_PROXY="http://proxy.example.com:8080"
34
+ agent-browser open https://example.com
35
+ ```
36
+
37
+ ## Authenticated Proxy
38
+
39
+ For proxies requiring authentication:
40
+
41
+ ```bash
42
+ # Include credentials in URL
43
+ export HTTP_PROXY="http://username:password@proxy.example.com:8080"
44
+ agent-browser open https://example.com
45
+ ```
46
+
47
+ ## SOCKS Proxy
48
+
49
+ ```bash
50
+ # SOCKS5 proxy
51
+ export ALL_PROXY="socks5://proxy.example.com:1080"
52
+ agent-browser open https://example.com
53
+
54
+ # SOCKS5 with auth
55
+ export ALL_PROXY="socks5://user:pass@proxy.example.com:1080"
56
+ agent-browser open https://example.com
57
+ ```
58
+
59
+ ## Proxy Bypass
60
+
61
+ Skip proxy for specific domains:
62
+
63
+ ```bash
64
+ # Bypass proxy for local addresses
65
+ export NO_PROXY="localhost,127.0.0.1,.internal.company.com"
66
+ agent-browser open https://internal.company.com # Direct connection
67
+ agent-browser open https://external.com # Via proxy
68
+ ```
69
+
70
+ ## Common Use Cases
71
+
72
+ ### Geo-Location Testing
73
+
74
+ ```bash
75
+ #!/bin/bash
76
+ # Test site from different regions using geo-located proxies
77
+
78
+ PROXIES=(
79
+ "http://us-proxy.example.com:8080"
80
+ "http://eu-proxy.example.com:8080"
81
+ "http://asia-proxy.example.com:8080"
82
+ )
83
+
84
+ for proxy in "${PROXIES[@]}"; do
85
+ export HTTP_PROXY="$proxy"
86
+ export HTTPS_PROXY="$proxy"
87
+
88
+ region=$(echo "$proxy" | sed -E 's#^[^:]+://([^.]+).*#\1#')
89
+ echo "Testing from: $region"
90
+
91
+ agent-browser --session "$region" open https://example.com
92
+ agent-browser --session "$region" screenshot "./screenshots/$region.png"
93
+ agent-browser --session "$region" close
94
+ done
95
+ ```
96
+
97
+ ### Rotating Proxies for Scraping
98
+
99
+ ```bash
100
+ #!/bin/bash
101
+ # Rotate through proxy list to avoid rate limiting
102
+
103
+ PROXY_LIST=(
104
+ "http://proxy1.example.com:8080"
105
+ "http://proxy2.example.com:8080"
106
+ "http://proxy3.example.com:8080"
107
+ )
108
+
109
+ URLS=(
110
+ "https://site.com/page1"
111
+ "https://site.com/page2"
112
+ "https://site.com/page3"
113
+ )
114
+
115
+ for i in "${!URLS[@]}"; do
116
+ proxy_index=$((i % ${#PROXY_LIST[@]}))
117
+ export HTTP_PROXY="${PROXY_LIST[$proxy_index]}"
118
+ export HTTPS_PROXY="${PROXY_LIST[$proxy_index]}"
119
+
120
+ agent-browser open "${URLS[$i]}"
121
+ agent-browser get text body > "output-$i.txt"
122
+ agent-browser close
123
+
124
+ sleep 1 # Polite delay
125
+ done
126
+ ```
127
+
128
+ ### Corporate Network Access
129
+
130
+ ```bash
131
+ #!/bin/bash
132
+ # Access internal sites via corporate proxy
133
+
134
+ export HTTP_PROXY="http://corpproxy.company.com:8080"
135
+ export HTTPS_PROXY="http://corpproxy.company.com:8080"
136
+ export NO_PROXY="localhost,127.0.0.1,.company.com"
137
+
138
+ # External sites go through proxy
139
+ agent-browser open https://external-vendor.com
140
+
141
+ # Internal sites bypass proxy
142
+ agent-browser open https://intranet.company.com
143
+ ```
144
+
145
+ ## Verifying Proxy Connection
146
+
147
+ ```bash
148
+ # Check your apparent IP
149
+ agent-browser open https://httpbin.org/ip
150
+ agent-browser get text body
151
+ # Should show proxy's IP, not your real IP
152
+ ```
153
+
154
+ ## Troubleshooting
155
+
156
+ ### Proxy Connection Failed
157
+
158
+ ```bash
159
+ # Test proxy connectivity first
160
+ curl -x http://proxy.example.com:8080 https://httpbin.org/ip
161
+
162
+ # If the proxy requires authentication, include credentials in the URL
163
+ export HTTP_PROXY="http://user:pass@proxy.example.com:8080"
164
+ ```
165
+
166
+ ### SSL/TLS Errors Through Proxy
167
+
168
+ Some proxies perform SSL inspection. If you encounter certificate errors:
169
+
170
+ ```bash
171
+ # For testing only - not recommended for production
172
+ agent-browser open https://example.com --ignore-https-errors
173
+ ```
174
+
175
+ ### Slow Performance
176
+
177
+ ```bash
178
+ # Use proxy only when necessary
179
+ export NO_PROXY="*.cdn.com,*.static.com" # Direct CDN access
180
+ ```
181
+
182
+ ## Best Practices
183
+
184
+ 1. **Use environment variables** - Don't hardcode proxy credentials
185
+ 2. **Set NO_PROXY appropriately** - Avoid routing local traffic through proxy
186
+ 3. **Test proxy before automation** - Verify connectivity with simple requests
187
+ 4. **Handle proxy failures gracefully** - Implement retry logic for unstable proxies
188
+ 5. **Rotate proxies for large scraping jobs** - Distribute load and avoid bans