@xbrowser/web-automation 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.ts +278 -0
- package/package.json +23 -0
package/index.ts
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import type { XCLIAPI } from '@dyyz1993/xcli-core';
|
|
3
|
+
import { ok, fail } from '@dyyz1993/xcli-core';
|
|
4
|
+
|
|
5
|
+
export default function (xcli: XCLIAPI): void {
|
|
6
|
+
// Plugin-level metadata for the generic web-automation site. `url` is left
// empty here; each command registered on `site` takes its target URL as a
// parameter instead.
const siteOptions = {
  name: 'web-automation',
  url: '',
  description: '通用网页自动化 - 搜索、提取、分页采集',
  requiresLogin: false,
};
const site = xcli.createSite(siteOptions);
|
|
12
|
+
|
|
13
|
+
// `extract` command: navigate to `url` and pull structured content out of the DOM.
//
// Two modes, selected by whether `fields` is non-empty:
//  - fields mode: each field's selector is run against the whole document
//    (the top-level `selector` parameter is not used in this mode); the result
//    has one entry per field holding a value for every matching element.
//  - generic mode: every descendant of the first `selector` match (falling
//    back to <body> when nothing matches) that has text is listed with its tag,
//    its text (capped at 500 characters) and, for <a>/<img>, `href`/`src`.
//
// Both modes return through `ok(...)` so callers get the same result wrapper
// regardless of which branch ran.
site.command('extract', {
  description: '从指定URL提取页面结构化内容',
  scope: 'browser',
  result: z.any(),
  parameters: z.object({
    url: z.string().describe('目标页面URL'),
    selector: z.string().optional().default('body').describe('CSS选择器,默认body'),
    fields: z
      .array(
        z.object({
          name: z.string().describe('字段名'),
          selector: z.string().describe('CSS选择器'),
          attribute: z.string().optional().describe('提取属性值(如href),留空则取textContent'),
        })
      )
      .optional()
      .describe('自定义提取字段列表'),
  }),
  examples: [
    {
      cmd: 'xbrowser web-automation extract --url "https://news.ycombinator.com" --fields \'[{"name":"title","selector":".titleline > a"},{"name":"link","selector":".titleline > a","attribute":"href"}]\'',
      description: '提取 Hacker News 标题和链接',
    },
  ],
  handler: async (params, ctx) => {
    // The Playwright page is read from the context under the `page` key; the
    // handler cannot do anything without it.
    const page = (ctx as Record<string, unknown>).page as import('playwright').Page | undefined;
    if (!page) throw new Error('需要浏览器页面上下文');

    const { url, selector, fields } = params;

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    // Wait for network activity to settle before reading the DOM.
    await page.waitForLoadState('networkidle');

    if (fields && fields.length > 0) {
      // Runs inside the page: one result entry per requested field.
      const data = await page.evaluate((fieldDefs: typeof fields) => {
        return fieldDefs.map((field) => {
          const matches = Array.from(document.querySelectorAll(field.selector));
          return {
            field: field.name,
            // Missing attributes and empty text both collapse to ''.
            values: matches.map((el) =>
              field.attribute
                ? el.getAttribute(field.attribute) || ''
                : el.textContent?.trim() || ''
            ),
          };
        });
      }, fields);

      // Previously this branch returned a bare `{ data, tips }` object, unlike
      // every other return path in this file; route it through `ok` as well.
      return ok(data, [`从 ${url} 提取了 ${fields.length} 个字段`]);
    }

    // Runs inside the page: flat list of all descendants that carry text.
    const content = await page.evaluate((sel: string) => {
      const root = document.querySelector(sel) || document.body;
      const items: Array<{ tag: string; text: string; href?: string; src?: string }> = [];

      const walk = (el: Element) => {
        const tag = el.tagName.toLowerCase();
        // textContent includes the text of nested elements, so a parent and
        // its children can report overlapping text.
        const text = el.textContent?.trim().slice(0, 500) || '';
        const item: (typeof items)[0] = { tag, text };

        if (tag === 'a') item.href = (el as HTMLAnchorElement).href;
        if (tag === 'img') item.src = (el as HTMLImageElement).src;

        // Skip text-less elements and non-content tags.
        if (text && !['script', 'style', 'noscript'].includes(tag)) {
          items.push(item);
        }
      };

      root.querySelectorAll('*').forEach(walk);
      return items;
    }, selector);

    return ok(content, [`从 ${url} 的 "${selector}" 中提取了 ${content.length} 个元素`]);
  },
});
|
|
92
|
+
|
|
93
|
+
// `paginate` command: scrape a list page by page.
//
// Starting at `url`, every element matching `itemSelector` becomes one row;
// each field is resolved relative to that item (first match only) and is ''
// when the element or attribute is missing. After each page the first element
// matching `nextSelector` is clicked, until `maxPages` pages have been read or
// no visible next button is found. `delay` (ms) is waited after every click.
site.command('paginate', {
  description: '分页采集:自动翻页并提取数据',
  scope: 'browser',
  result: z.any(),
  parameters: z.object({
    url: z.string().describe('起始页URL'),
    nextSelector: z
      .string()
      .default('.n, .next, [rel="next"]')
      .describe('下一页按钮选择器'),
    itemSelector: z.string().describe('每条数据的容器选择器'),
    fields: z
      .array(
        z.object({
          name: z.string(),
          selector: z.string(),
          attribute: z.string().optional(),
        })
      )
      .describe('要提取的字段'),
    maxPages: z.number().optional().default(5).describe('最大翻页数'),
    delay: z.number().optional().default(1000).describe('翻页间隔(ms)'),
  }),
  examples: [
    {
      cmd: 'xbrowser web-automation paginate --url "https://example.com/list" --item-selector ".item" --fields \'[{"name":"title","selector":"h3"}]\' --max-pages 3',
      description: '翻页采集3页数据',
    },
  ],
  handler: async (params, ctx) => {
    // The Playwright page is read from the context under the `page` key.
    const page = (ctx as Record<string, unknown>).page as import('playwright').Page | undefined;
    if (!page) throw new Error('需要浏览器页面上下文');

    const { url, nextSelector, itemSelector, fields, maxPages, delay } = params;
    const allData: Record<string, string>[] = [];
    // Pages actually scraped. The summary used to estimate this as
    // ceil(rows / 10), which is only right when every page has exactly 10 items.
    let pagesCollected = 0;

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    for (let p = 1; p <= maxPages; p++) {
      // Runs inside the page: one row per item container.
      const pageData = await page.evaluate(
        (opts: { itemSel: string; fieldDefs: typeof fields }) => {
          const items = document.querySelectorAll(opts.itemSel);
          return Array.from(items).map((item) => {
            const row: Record<string, string> = {};
            for (const field of opts.fieldDefs) {
              const el = item.querySelector(field.selector);
              if (!el) {
                row[field.name] = '';
                continue;
              }
              row[field.name] = field.attribute
                ? el.getAttribute(field.attribute) || ''
                : el.textContent?.trim() || '';
            }
            return row;
          });
        },
        { itemSel: itemSelector, fieldDefs: fields }
      );

      allData.push(...pageData);
      pagesCollected = p;

      // Only look for a next button when another page is still wanted.
      if (p < maxPages) {
        const nextBtn = page.locator(nextSelector).first();
        // A failing visibility check is treated the same as "no next button".
        const isVisible = await nextBtn.isVisible().catch(() => false);
        if (!isVisible) break;

        await nextBtn.click();
        await page.waitForLoadState('domcontentloaded');
        await page.waitForTimeout(delay);
      }
    }

    return ok(allData, [
      `采集 ${pagesCollected} 页,共 ${allData.length} 条数据`,
      `字段: ${fields.map((f) => f.name).join(', ')}`,
    ]);
  },
});
|
|
172
|
+
|
|
173
|
+
// `fill-and-submit` command: open `url`, fill each listed input in order, click
// the submit control, then report the URL and title the page ends up on.
// With `waitForNavigation` enabled the click is paired with a navigation wait
// whose failure is deliberately ignored.
site.command('fill-and-submit', {
  description: '填写表单并提交',
  scope: 'browser',
  result: z.any(),
  parameters: z.object({
    url: z.string().describe('表单页面URL'),
    fields: z
      .array(
        z.object({
          selector: z.string().describe('输入框选择器'),
          value: z.string().describe('填入值'),
        })
      )
      .describe('表单字段列表'),
    submitSelector: z
      .string()
      .default('button[type="submit"], input[type="submit"]')
      .describe('提交按钮选择器'),
    waitForNavigation: z.boolean().optional().default(true).describe('是否等待页面跳转'),
  }),
  examples: [
    {
      cmd: 'xbrowser web-automation fill-and-submit --url "https://example.com/form" --fields \'[{"selector":"#name","value":"John"},{"selector":"#email","value":"john@test.com"}]\' --submit-selector "#submit"',
      description: '填写并提交表单',
    },
  ],
  handler: async (params, ctx) => {
    // The Playwright page is read from the context under the `page` key.
    const browserPage = (ctx as Record<string, unknown>).page as
      | import('playwright').Page
      | undefined;
    if (!browserPage) throw new Error('需要浏览器页面上下文');

    const { url, fields: inputs, submitSelector, waitForNavigation } = params;

    await browserPage.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    await browserPage.waitForLoadState('networkidle');

    // Inputs are filled strictly one after another, in the order given.
    for (const { selector, value } of inputs) {
      await browserPage.fill(selector, value);
    }

    if (!waitForNavigation) {
      await browserPage.click(submitSelector);
    } else {
      // Start the navigation wait before clicking; a rejected wait is
      // swallowed here, while a failed click still rejects.
      const navigation = browserPage
        .waitForNavigation({ waitUntil: 'domcontentloaded' })
        .catch(() => {});
      await Promise.all([navigation, browserPage.click(submitSelector)]);
    }

    await browserPage.waitForLoadState('networkidle');

    const resultUrl = browserPage.url();
    const resultTitle = await browserPage.title();
    const summary = {
      submittedUrl: url,
      resultUrl,
      resultTitle,
      fieldsFilled: inputs.length,
    };

    return ok(summary, [`表单已提交,跳转到: ${resultUrl}`]);
  },
});
|
|
234
|
+
|
|
235
|
+
// `screenshot` command: open `url` and return a screenshot as a base64 string.
// With `selector`, only the first matching element is captured; otherwise the
// page is captured, and `fullPage` is passed through to Playwright's
// page.screenshot.
// `size` in the result is the length of the base64 string, not the byte size
// of the image; the tip reports that same length divided by 1024.
site.command('screenshot', {
  description: '截取网页截图',
  scope: 'browser',
  result: z.any(),
  parameters: z.object({
    url: z.string().describe('目标URL'),
    fullPage: z.boolean().optional().default(false).describe('是否全页截图'),
    selector: z.string().optional().describe('只截取指定元素'),
  }),
  examples: [
    {
      cmd: 'xbrowser web-automation screenshot --url "https://example.com" --full-page true',
      description: '全页截图',
    },
  ],
  handler: async (params, ctx) => {
    // The Playwright page is read from the context under the `page` key.
    const page = (ctx as Record<string, unknown>).page as import('playwright').Page | undefined;
    if (!page) throw new Error('需要浏览器页面上下文');

    const { url, fullPage, selector } = params;

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    await page.waitForLoadState('networkidle');

    // `.first()` makes a multi-match selector capture its first element, in
    // line with the first-match handling of the next button in `paginate`;
    // a bare locator is strict and would throw when several elements match.
    const buffer = selector
      ? await page.locator(selector).first().screenshot()
      : await page.screenshot({ fullPage });
    const base64 = buffer.toString('base64');

    return ok(
      {
        url,
        fullPage,
        imageBase64: base64,
        size: base64.length,
      },
      [`截图完成,大小 ${(base64.length / 1024).toFixed(1)}KB`]
    );
  },
});
|
|
278
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@xbrowser/web-automation",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "通用网页自动化 - 搜索、提取、分页采集",
|
|
5
|
+
"main": "index.ts",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"xbrowser",
|
|
8
|
+
"xbrowser-plugin",
|
|
9
|
+
"automation"
|
|
10
|
+
],
|
|
11
|
+
"author": "dyyz1993",
|
|
12
|
+
"license": "MIT",
|
|
13
|
+
"xbrowser": {
|
|
14
|
+
"site": "",
|
|
15
|
+
"requiresLogin": false,
|
|
16
|
+
"commands": [
|
|
17
|
+
"extract",
|
|
18
|
+
"paginate",
|
|
19
|
+
"fill-and-submit",
|
|
20
|
+
"screenshot"
|
|
21
|
+
]
|
|
22
|
+
}
|
|
23
|
+
}
|