@pyrokine/mcp-chrome 1.1.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -54
- package/dist/anti-detection/behavior.d.ts +0 -8
- package/dist/anti-detection/behavior.d.ts.map +1 -1
- package/dist/anti-detection/behavior.js +0 -16
- package/dist/anti-detection/behavior.js.map +1 -1
- package/dist/cdp/client.d.ts +0 -2
- package/dist/cdp/client.d.ts.map +1 -1
- package/dist/cdp/client.js +30 -45
- package/dist/cdp/client.js.map +1 -1
- package/dist/cdp/launcher.d.ts +1 -8
- package/dist/cdp/launcher.d.ts.map +1 -1
- package/dist/cdp/launcher.js +11 -21
- package/dist/cdp/launcher.js.map +1 -1
- package/dist/core/auto-wait.d.ts +2 -2
- package/dist/core/auto-wait.d.ts.map +1 -1
- package/dist/core/auto-wait.js +1 -1
- package/dist/core/auto-wait.js.map +1 -1
- package/dist/core/errors.d.ts +10 -13
- package/dist/core/errors.d.ts.map +1 -1
- package/dist/core/errors.js +19 -25
- package/dist/core/errors.js.map +1 -1
- package/dist/core/locator.d.ts +6 -7
- package/dist/core/locator.d.ts.map +1 -1
- package/dist/core/locator.js +77 -31
- package/dist/core/locator.js.map +1 -1
- package/dist/core/retry.d.ts.map +1 -1
- package/dist/core/retry.js +1 -1
- package/dist/core/retry.js.map +1 -1
- package/dist/core/session.d.ts +37 -33
- package/dist/core/session.d.ts.map +1 -1
- package/dist/core/session.js +159 -116
- package/dist/core/session.js.map +1 -1
- package/dist/core/types.d.ts +25 -1
- package/dist/core/types.d.ts.map +1 -1
- package/dist/core/types.js +20 -0
- package/dist/core/types.js.map +1 -1
- package/dist/core/unified-session.d.ts +105 -67
- package/dist/core/unified-session.d.ts.map +1 -1
- package/dist/core/unified-session.js +347 -186
- package/dist/core/unified-session.js.map +1 -1
- package/dist/extension/bridge.d.ts +25 -19
- package/dist/extension/bridge.d.ts.map +1 -1
- package/dist/extension/bridge.js +29 -52
- package/dist/extension/bridge.js.map +1 -1
- package/dist/extension/http-server.d.ts +13 -11
- package/dist/extension/http-server.d.ts.map +1 -1
- package/dist/extension/http-server.js +101 -95
- package/dist/extension/http-server.js.map +1 -1
- package/dist/index.js +18 -64
- package/dist/index.js.map +1 -1
- package/dist/tools/browse.d.ts +3 -80
- package/dist/tools/browse.d.ts.map +1 -1
- package/dist/tools/browse.js +135 -291
- package/dist/tools/browse.js.map +1 -1
- package/dist/tools/cookies.d.ts +3 -71
- package/dist/tools/cookies.d.ts.map +1 -1
- package/dist/tools/cookies.js +75 -157
- package/dist/tools/cookies.js.map +1 -1
- package/dist/tools/evaluate.d.ts +3 -52
- package/dist/tools/evaluate.d.ts.map +1 -1
- package/dist/tools/evaluate.js +35 -86
- package/dist/tools/evaluate.js.map +1 -1
- package/dist/tools/extract.d.ts +5 -227
- package/dist/tools/extract.d.ts.map +1 -1
- package/dist/tools/extract.js +586 -184
- package/dist/tools/extract.js.map +1 -1
- package/dist/tools/index.d.ts +9 -9
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +9 -9
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/input.d.ts +3 -258
- package/dist/tools/input.d.ts.map +1 -1
- package/dist/tools/input.js +95 -147
- package/dist/tools/input.js.map +1 -1
- package/dist/tools/logs.d.ts +3 -51
- package/dist/tools/logs.d.ts.map +1 -1
- package/dist/tools/logs.js +47 -108
- package/dist/tools/logs.js.map +1 -1
- package/dist/tools/manage.d.ts +3 -64
- package/dist/tools/manage.d.ts.map +1 -1
- package/dist/tools/manage.js +243 -373
- package/dist/tools/manage.js.map +1 -1
- package/dist/tools/schema.d.ts +16 -182
- package/dist/tools/schema.d.ts.map +1 -1
- package/dist/tools/schema.js +70 -159
- package/dist/tools/schema.js.map +1 -1
- package/dist/tools/wait.d.ts +3 -221
- package/dist/tools/wait.d.ts.map +1 -1
- package/dist/tools/wait.js +74 -145
- package/dist/tools/wait.js.map +1 -1
- package/package.json +1 -1
package/dist/tools/extract.js
CHANGED
|
@@ -3,80 +3,44 @@
|
|
|
3
3
|
*
|
|
4
4
|
* 提取页面内容:
|
|
5
5
|
* - text: 文本内容
|
|
6
|
-
* - html: HTML
|
|
6
|
+
* - html: HTML 源码(可选附带图片元信息或图片数据)
|
|
7
7
|
* - attribute: 元素属性
|
|
8
8
|
* - screenshot: 截图
|
|
9
9
|
* - state: 页面状态(精简的可交互元素列表)
|
|
10
|
+
* - metadata: 页面元信息(title/og/jsonLd 等)
|
|
10
11
|
*/
|
|
11
|
-
import { writeFile } from 'fs/promises';
|
|
12
|
+
import { mkdir, writeFile } from 'fs/promises';
|
|
13
|
+
import { basename, dirname, extname, join } from 'path';
|
|
12
14
|
import { z } from 'zod';
|
|
13
|
-
import { formatErrorResponse, getSession, getUnifiedSession } from '../core/index.js';
|
|
14
|
-
import {
|
|
15
|
-
/**
|
|
16
|
-
|
|
17
|
-
*/
|
|
18
|
-
export const extractToolDefinition = {
|
|
19
|
-
name: 'extract',
|
|
20
|
-
description: '提取页面内容:文本、HTML、属性、截图、状态',
|
|
21
|
-
inputSchema: {
|
|
22
|
-
type: 'object',
|
|
23
|
-
properties: {
|
|
24
|
-
type: {
|
|
25
|
-
type: 'string',
|
|
26
|
-
enum: ['text', 'html', 'attribute', 'screenshot', 'state'],
|
|
27
|
-
description: '提取类型',
|
|
28
|
-
},
|
|
29
|
-
target: {
|
|
30
|
-
...targetJsonSchema,
|
|
31
|
-
description: '目标元素(attribute 必填;text/html 可选,省略则提取整个页面;screenshot/state 不需要)',
|
|
32
|
-
},
|
|
33
|
-
attribute: {
|
|
34
|
-
type: 'string',
|
|
35
|
-
description: '属性名(attribute)',
|
|
36
|
-
},
|
|
37
|
-
fullPage: {
|
|
38
|
-
type: 'boolean',
|
|
39
|
-
description: '是否全页面截图(screenshot)',
|
|
40
|
-
},
|
|
41
|
-
output: {
|
|
42
|
-
type: 'string',
|
|
43
|
-
description: '输出文件路径(可选)。若指定,结果写入文件;否则返回内容',
|
|
44
|
-
},
|
|
45
|
-
tabId: {
|
|
46
|
-
type: 'string',
|
|
47
|
-
description: '目标 Tab ID(可选,仅 Extension 模式)。不指定则使用当前 attach 的 tab。可操作非当前 attach 的 tab。CDP 模式下忽略此参数',
|
|
48
|
-
},
|
|
49
|
-
timeout: {
|
|
50
|
-
type: 'number',
|
|
51
|
-
description: '等待目标元素超时',
|
|
52
|
-
},
|
|
53
|
-
frame: {
|
|
54
|
-
oneOf: [{ type: 'string' }, { type: 'number' }],
|
|
55
|
-
description: 'iframe 定位(可选,仅 Extension 模式)。CSS 选择器(如 "iframe#main")或索引(如 0)。不指定则在主框架操作',
|
|
56
|
-
},
|
|
57
|
-
},
|
|
58
|
-
required: ['type'],
|
|
59
|
-
},
|
|
60
|
-
};
|
|
15
|
+
import { formatErrorResponse, formatResponse, getSession, getUnifiedSession } from '../core/index.js';
|
|
16
|
+
import { targetToFindParams, targetZodSchema } from './schema.js';
|
|
17
|
+
/** 无 output 时附录返回的最大图片数 */
|
|
18
|
+
const MAX_APPENDIX_IMAGES = 20;
|
|
61
19
|
/**
|
|
62
20
|
* extract 参数 schema
|
|
63
21
|
*/
|
|
64
22
|
const extractSchema = z.object({
|
|
65
|
-
type: z.enum(['text', 'html', 'attribute', 'screenshot', 'state'])
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
23
|
+
type: z.enum(['text', 'html', 'attribute', 'screenshot', 'state', 'metadata'])
|
|
24
|
+
.describe('提取类型'),
|
|
25
|
+
target: targetZodSchema.optional().describe('目标元素(attribute 必填;text/html 可选,省略则提取整个页面;screenshot/state/metadata 不需要)'),
|
|
26
|
+
attribute: z.string().optional().describe('属性名(attribute)'),
|
|
27
|
+
images: z.enum(['info', 'data']).optional().describe('图片提取模式(仅 html 类型有效)。info: 元信息(src/alt/尺寸);data: 含图片数据'),
|
|
28
|
+
fullPage: z.boolean().optional().describe('是否全页面截图(screenshot)'),
|
|
29
|
+
scale: z.number().optional().describe('截图缩放比例(screenshot fullPage)。默认 1,设为 0.5 可降低分辨率加速大页面截图'),
|
|
30
|
+
format: z.enum(['png', 'jpeg', 'webp']).optional().describe('截图格式(screenshot)。默认 png,jpeg/webp 体积更小,复杂页面推荐 jpeg 减少超时'),
|
|
31
|
+
quality: z.number().min(0).max(100).optional().describe('截图质量(screenshot,仅 jpeg/webp 有效)。0-100,推荐 80'),
|
|
32
|
+
output: z.string()
|
|
33
|
+
.optional()
|
|
34
|
+
.describe('输出文件路径(可选)。若指定,结果写入文件;否则返回内容。images=data 时作为输出目录路径'),
|
|
35
|
+
tabId: z.string().optional().describe('目标 Tab ID(可选,仅 Extension 模式)。不指定则使用当前 attach 的 tab。可操作非当前 attach 的 tab。CDP 模式下忽略此参数'),
|
|
36
|
+
timeout: z.number().optional().describe('等待目标元素超时'),
|
|
37
|
+
frame: z.union([z.string(), z.number()]).optional().describe('iframe 定位(可选,仅 Extension 模式)。CSS 选择器(如 "iframe#main")或索引(如 0)。不指定则在主框架操作'),
|
|
73
38
|
});
|
|
74
39
|
/**
|
|
75
40
|
* extract 工具处理器
|
|
76
41
|
*/
|
|
77
|
-
|
|
42
|
+
async function handleExtract(args) {
|
|
78
43
|
try {
|
|
79
|
-
const args = extractSchema.parse(params);
|
|
80
44
|
const unifiedSession = getUnifiedSession();
|
|
81
45
|
const useExtension = unifiedSession.isExtensionConnected();
|
|
82
46
|
const session = getSession();
|
|
@@ -93,66 +57,43 @@ export async function handleExtract(params) {
|
|
|
93
57
|
? await extractTextExtension(unifiedSession, args.target)
|
|
94
58
|
: await extractText(session, args.target, args.timeout);
|
|
95
59
|
if (args.output) {
|
|
96
|
-
await
|
|
97
|
-
return {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
type: 'text',
|
|
104
|
-
output: args.output,
|
|
105
|
-
size: text.length,
|
|
106
|
-
}),
|
|
107
|
-
},
|
|
108
|
-
],
|
|
109
|
-
};
|
|
60
|
+
await writeOutputFile(args.output, text, 'utf-8');
|
|
61
|
+
return formatResponse({
|
|
62
|
+
success: true,
|
|
63
|
+
type: 'text',
|
|
64
|
+
output: args.output,
|
|
65
|
+
size: text.length,
|
|
66
|
+
});
|
|
110
67
|
}
|
|
111
|
-
return {
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
success: true,
|
|
117
|
-
type: 'text',
|
|
118
|
-
content: text,
|
|
119
|
-
}),
|
|
120
|
-
},
|
|
121
|
-
],
|
|
122
|
-
};
|
|
68
|
+
return formatResponse({
|
|
69
|
+
success: true,
|
|
70
|
+
type: 'text',
|
|
71
|
+
content: text,
|
|
72
|
+
});
|
|
123
73
|
}
|
|
124
74
|
case 'html': {
|
|
75
|
+
// 带图片提取的增强路径
|
|
76
|
+
if (args.images) {
|
|
77
|
+
return await handleHtmlWithImages(unifiedSession, session, useExtension, args);
|
|
78
|
+
}
|
|
79
|
+
// 原有路径:纯 HTML
|
|
125
80
|
const html = useExtension
|
|
126
81
|
? await extractHtmlExtension(unifiedSession, args.target)
|
|
127
82
|
: await extractHTML(session, args.target, args.timeout);
|
|
128
83
|
if (args.output) {
|
|
129
|
-
await
|
|
130
|
-
return {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
type: 'html',
|
|
137
|
-
output: args.output,
|
|
138
|
-
size: html.length,
|
|
139
|
-
}),
|
|
140
|
-
},
|
|
141
|
-
],
|
|
142
|
-
};
|
|
84
|
+
await writeOutputFile(args.output, html, 'utf-8');
|
|
85
|
+
return formatResponse({
|
|
86
|
+
success: true,
|
|
87
|
+
type: 'html',
|
|
88
|
+
output: args.output,
|
|
89
|
+
size: html.length,
|
|
90
|
+
});
|
|
143
91
|
}
|
|
144
|
-
return {
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
success: true,
|
|
150
|
-
type: 'html',
|
|
151
|
-
content: html,
|
|
152
|
-
}),
|
|
153
|
-
},
|
|
154
|
-
],
|
|
155
|
-
};
|
|
92
|
+
return formatResponse({
|
|
93
|
+
success: true,
|
|
94
|
+
type: 'html',
|
|
95
|
+
content: html,
|
|
96
|
+
});
|
|
156
97
|
}
|
|
157
98
|
case 'attribute': {
|
|
158
99
|
if (!args.target) {
|
|
@@ -194,37 +135,106 @@ export async function handleExtract(params) {
|
|
|
194
135
|
else {
|
|
195
136
|
value = await extractAttribute(session, args.target, args.attribute, args.timeout);
|
|
196
137
|
}
|
|
197
|
-
return {
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
type: 'attribute',
|
|
204
|
-
attribute: args.attribute,
|
|
205
|
-
value,
|
|
206
|
-
}),
|
|
207
|
-
},
|
|
208
|
-
],
|
|
209
|
-
};
|
|
138
|
+
return formatResponse({
|
|
139
|
+
success: true,
|
|
140
|
+
type: 'attribute',
|
|
141
|
+
attribute: args.attribute,
|
|
142
|
+
value,
|
|
143
|
+
});
|
|
210
144
|
}
|
|
211
145
|
case 'screenshot': {
|
|
212
|
-
|
|
146
|
+
// 有 target 时获取元素区域用于裁剪(支持所有 target 类型)
|
|
147
|
+
let clip;
|
|
148
|
+
if (args.target) {
|
|
149
|
+
if (useExtension) {
|
|
150
|
+
const { selector, text, xpath, nth: nthParam } = targetToFindParams(args.target);
|
|
151
|
+
const nth = nthParam ?? 0;
|
|
152
|
+
const found = await unifiedSession.find(selector, text, xpath);
|
|
153
|
+
if (found.length > nth) {
|
|
154
|
+
const rect = found[nth].rect;
|
|
155
|
+
if (rect.width > 0 && rect.height > 0) {
|
|
156
|
+
// find() 返回视口绝对坐标(已包含 iframe 坐标修正)
|
|
157
|
+
clip = rect;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
else {
|
|
162
|
+
const { selector, text, xpath, nth: nthParam } = targetToFindParams(args.target);
|
|
163
|
+
const nth = nthParam ?? 0;
|
|
164
|
+
const rect = await session.evaluate(`function(selector, text, xpath, nth) {
|
|
165
|
+
function toRect(el) {
|
|
166
|
+
var r = el.getBoundingClientRect();
|
|
167
|
+
return {x: r.x, y: r.y, width: r.width, height: r.height};
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
function findByXPath(xp, n) {
|
|
171
|
+
var r = document.evaluate(xp, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
172
|
+
return r.snapshotLength > n ? r.snapshotItem(n) : null;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
function findBySelector(sel, txt, n) {
|
|
176
|
+
var els = document.querySelectorAll(sel);
|
|
177
|
+
var matchCount = 0;
|
|
178
|
+
for (var i = 0; i < els.length; ++i) {
|
|
179
|
+
var el = els[i];
|
|
180
|
+
if (txt) {
|
|
181
|
+
var content = (el.textContent || '').trim();
|
|
182
|
+
if (!content.includes(txt)) continue;
|
|
183
|
+
}
|
|
184
|
+
if (matchCount < n) { ++matchCount; continue; }
|
|
185
|
+
return el;
|
|
186
|
+
}
|
|
187
|
+
return null;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
function findByText(txt, n) {
|
|
191
|
+
var root = document.body || document.documentElement;
|
|
192
|
+
if (!root) return null;
|
|
193
|
+
var walker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
|
|
194
|
+
var matchCount = 0;
|
|
195
|
+
var el = walker.currentNode;
|
|
196
|
+
while (el) {
|
|
197
|
+
var content = (el.textContent || '').trim();
|
|
198
|
+
if (content && content.includes(txt)) {
|
|
199
|
+
if (matchCount < n) { ++matchCount; }
|
|
200
|
+
else { return el; }
|
|
201
|
+
}
|
|
202
|
+
el = walker.nextNode();
|
|
203
|
+
}
|
|
204
|
+
return null;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
var el = null;
|
|
208
|
+
if (xpath) {
|
|
209
|
+
el = findByXPath(xpath, nth);
|
|
210
|
+
} else if (selector) {
|
|
211
|
+
el = findBySelector(selector, text, nth);
|
|
212
|
+
} else if (text) {
|
|
213
|
+
el = findByText(text, nth);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
return el ? toRect(el) : null;
|
|
217
|
+
}`, [selector ?? null, text ?? null, xpath ?? null, nth]);
|
|
218
|
+
if (rect && rect.width > 0 && rect.height > 0) {
|
|
219
|
+
clip = rect;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
const base64 = await unifiedSession.screenshot({
|
|
224
|
+
fullPage: clip ? false : (args.fullPage ?? false),
|
|
225
|
+
scale: args.scale,
|
|
226
|
+
format: args.format,
|
|
227
|
+
quality: args.quality,
|
|
228
|
+
clip,
|
|
229
|
+
});
|
|
213
230
|
if (args.output) {
|
|
214
231
|
// 写入文件
|
|
215
|
-
await
|
|
216
|
-
return {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
success: true,
|
|
222
|
-
type: 'screenshot',
|
|
223
|
-
output: args.output,
|
|
224
|
-
}),
|
|
225
|
-
},
|
|
226
|
-
],
|
|
227
|
-
};
|
|
232
|
+
await writeOutputFile(args.output, Buffer.from(base64, 'base64'));
|
|
233
|
+
return formatResponse({
|
|
234
|
+
success: true,
|
|
235
|
+
type: 'screenshot',
|
|
236
|
+
output: args.output,
|
|
237
|
+
});
|
|
228
238
|
}
|
|
229
239
|
// 返回 base64 图片
|
|
230
240
|
return {
|
|
@@ -232,40 +242,52 @@ export async function handleExtract(params) {
|
|
|
232
242
|
{
|
|
233
243
|
type: 'image',
|
|
234
244
|
data: base64,
|
|
235
|
-
mimeType: '
|
|
245
|
+
mimeType: `image/${args.format === 'jpeg' ? 'jpeg' : args.format ?? 'png'}`,
|
|
236
246
|
},
|
|
237
247
|
],
|
|
238
248
|
};
|
|
239
249
|
}
|
|
240
250
|
case 'state': {
|
|
241
|
-
|
|
251
|
+
// 有 target 时获取子树的无障碍状态
|
|
252
|
+
let refId;
|
|
253
|
+
if (args.target && useExtension) {
|
|
254
|
+
const { selector, text, xpath, nth: nthParam } = targetToFindParams(args.target);
|
|
255
|
+
const nth = nthParam ?? 0;
|
|
256
|
+
const elements = await unifiedSession.find(selector, text, xpath);
|
|
257
|
+
if (elements.length > 0 && nth < elements.length) {
|
|
258
|
+
refId = elements[nth].refId;
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
const state = await unifiedSession.readPage(refId ? { refId } : undefined);
|
|
242
262
|
if (args.output) {
|
|
243
|
-
await
|
|
244
|
-
return {
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
success: true,
|
|
250
|
-
type: 'state',
|
|
251
|
-
output: args.output,
|
|
252
|
-
}),
|
|
253
|
-
},
|
|
254
|
-
],
|
|
255
|
-
};
|
|
263
|
+
await writeOutputFile(args.output, JSON.stringify(state, null, 2), 'utf-8');
|
|
264
|
+
return formatResponse({
|
|
265
|
+
success: true,
|
|
266
|
+
type: 'state',
|
|
267
|
+
output: args.output,
|
|
268
|
+
});
|
|
256
269
|
}
|
|
257
|
-
return {
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
270
|
+
return formatResponse({
|
|
271
|
+
success: true,
|
|
272
|
+
type: 'state',
|
|
273
|
+
state,
|
|
274
|
+
});
|
|
275
|
+
}
|
|
276
|
+
case 'metadata': {
|
|
277
|
+
const metadata = await unifiedSession.getMetadata();
|
|
278
|
+
if (args.output) {
|
|
279
|
+
await writeOutputFile(args.output, JSON.stringify(metadata, null, 2), 'utf-8');
|
|
280
|
+
return formatResponse({
|
|
281
|
+
success: true,
|
|
282
|
+
type: 'metadata',
|
|
283
|
+
output: args.output,
|
|
284
|
+
});
|
|
285
|
+
}
|
|
286
|
+
return formatResponse({
|
|
287
|
+
success: true,
|
|
288
|
+
type: 'metadata',
|
|
289
|
+
...metadata,
|
|
290
|
+
});
|
|
269
291
|
}
|
|
270
292
|
default:
|
|
271
293
|
return {
|
|
@@ -290,6 +312,354 @@ export async function handleExtract(params) {
|
|
|
290
312
|
return formatErrorResponse(error);
|
|
291
313
|
}
|
|
292
314
|
}
|
|
315
|
+
// ==================== HTML + 图片提取 ====================
|
|
316
|
+
/**
 * Write `data` to `path`, creating the parent directory chain first.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} data - Content handed to `writeFile`.
 * @param {string} [encoding] - Encoding handed to `writeFile`; omitted by callers that pass a Buffer.
 * @returns {Promise<void>}
 */
async function writeOutputFile(path, data, encoding) {
    const parentDir = dirname(path);
    // `recursive: true` makes this a no-op when the directory already exists.
    await mkdir(parentDir, { recursive: true });
    await writeFile(path, data, encoding);
}
|
|
321
|
+
/**
 * Handle `extract` requests of type `html` that also ask for images (`args.images`).
 *
 * Flow:
 *   1. Collect `{html, images}` for the target (or the whole page when no selector is available).
 *   2. `images === 'info'`: return (or write to `args.output`) the HTML plus image metadata.
 *   3. Otherwise (`'data'`): resolve image bytes via `fetchImageData`, then either write a
 *      directory under `args.output` or return an appendix-style response with inline images.
 *
 * @param unifiedSession Session object used for `evaluate`, `getHtmlWithImages` and image fetching.
 * @param session Session passed through to `extractHtmlWithImagesCdp` when the extension is not used.
 * @param useExtension Selects `unifiedSession.getHtmlWithImages` over the CDP helper.
 * @param args Parsed extract arguments; reads `target`, `images`, `output`, `timeout`.
 * @returns A tool response object (from `formatResponse` or `buildImageAppendixResponse`).
 */
async function handleHtmlWithImages(unifiedSession, session, useExtension, args) {
    // NOTE(review): only `selector` and `nth` are read from the target here; a target that
    // resolves to text/xpath only leaves `selector` undefined, which appears to fall through
    // to whole-page extraction — confirm this is intended.
    const { selector, nth: nthParam } = args.target
        ? targetToFindParams(args.target)
        : { selector: undefined, nth: undefined };
    const nth = nthParam ?? 0;
    let result;
    if (selector && nth > 0) {
        // nth > 0: use evaluate to pick the N-th element matching the selector.
        // This branch is taken regardless of `useExtension`.
        // NOTE(review): the meaning of the two `undefined` arguments depends on
        // `unifiedSession.evaluate`'s signature, which is not visible here.
        result = await unifiedSession.evaluate(`(function(s, n) {
            var els = document.querySelectorAll(s);
            if (n >= els.length) return {html: '', images: []};
            var root = els[n];
            var html = root.outerHTML;
            var imgList = [];
            if (root.tagName === 'IMG') imgList.push(root);
            root.querySelectorAll('img').forEach(function(img) { imgList.push(img); });
            var images = [];
            for (var i = 0; i < imgList.length; i++) {
                var img = imgList[i];
                images.push({index: i, src: img.src, dataSrc: (function() { var raw = img.dataset.src || img.dataset.lazySrc || img.dataset.original || ''; if (!raw) return ''; try { return new URL(raw, location.href).href } catch(e) { return raw } })(), alt: img.alt, width: img.width, height: img.height, naturalWidth: img.naturalWidth, naturalHeight: img.naturalHeight});
            }
            return {html: html, images: images};
        })`, undefined, undefined, [selector, nth]);
    }
    else {
        // First match (or whole page when `selector` is undefined).
        result = useExtension
            ? await unifiedSession.getHtmlWithImages(selector)
            : await extractHtmlWithImagesCdp(session, selector, args.timeout);
    }
    if (args.images === 'info') {
        // info mode: HTML + image metadata, no image bytes.
        const payload = { type: 'html', content: result.html, images: result.images };
        if (args.output) {
            // With `output`, the full payload goes to the file and only a summary is returned.
            await writeOutputFile(args.output, JSON.stringify(payload, null, 2), 'utf-8');
            return formatResponse({
                success: true,
                type: 'html',
                output: args.output,
                imageCount: result.images.length,
            });
        }
        return formatResponse({
            success: true,
            ...payload,
        });
    }
    // data mode: resolve the image bytes.
    // Without `output` the images are returned inline, so the number fetched is capped.
    const appendixMode = !args.output;
    const imageDataList = await fetchImageData(unifiedSession, result.images, appendixMode ? MAX_APPENDIX_IMAGES : undefined);
    if (args.output) {
        // Write a directory: `args.output` is treated as a directory path in this mode.
        await writeImageDirectory(args.output, result.html, result.images, imageDataList);
        return formatResponse({
            success: true,
            type: 'html',
            output: args.output,
            imageCount: result.images.length,
            index: join(args.output, 'index.json'),
        });
    }
    // No output: return as an MCP response with the images appended.
    return buildImageAppendixResponse(result.html, result.images, imageDataList);
}
|
|
387
|
+
/**
 * CDP mode: extract HTML plus image metadata.
 *
 * With a `selector`, a CSS locator is created on `session` and the script is run on the
 * located element (presumably bound as `this` by `evaluateOn` — confirm against the locator
 * API). Without a selector, the script runs against the whole document and `timeout` is
 * not used.
 *
 * @param session Session providing `createLocator` and `evaluate`.
 * @param selector Optional CSS selector of the root element.
 * @param timeout Optional locator timeout; only forwarded when not `undefined`.
 * @returns Whatever the session call yields; the scripts return
 *          `{html, images}` where each image entry carries
 *          `index, src, dataSrc, alt, width, height, naturalWidth, naturalHeight`.
 */
async function extractHtmlWithImagesCdp(session, selector, timeout) {
    if (selector) {
        const locator = session.createLocator({ css: selector }, timeout !== undefined ? { timeout } : undefined);
        // Element-scoped script: includes the root itself when it is an <img>,
        // followed by every descendant <img>. `dataSrc` is the lazy-load attribute
        // (data-src / data-lazy-src / data-original) resolved against the page URL.
        return locator.evaluateOn(`function() {
            var html = this.outerHTML;
            var imgList = [];
            if (this.tagName === 'IMG') imgList.push(this);
            this.querySelectorAll('img').forEach(function(img) { imgList.push(img); });
            var images = [];
            for (var i = 0; i < imgList.length; i++) {
                var img = imgList[i];
                images.push({index: i, src: img.src, dataSrc: (function() { var raw = img.dataset.src || img.dataset.lazySrc || img.dataset.original || ''; if (!raw) return ''; try { return new URL(raw, location.href).href } catch(e) { return raw } })(), alt: img.alt, width: img.width, height: img.height, naturalWidth: img.naturalWidth, naturalHeight: img.naturalHeight});
            }
            return {html: html, images: images};
        }`);
    }
    // Document-scoped script: full page HTML and every <img> in the document.
    return session.evaluate(`(function() {
        var html = document.documentElement.outerHTML;
        var imgs = document.querySelectorAll('img');
        var images = [];
        for (var i = 0; i < imgs.length; i++) {
            var img = imgs[i];
            images.push({index: i, src: img.src, dataSrc: (function() { var raw = img.dataset.src || img.dataset.lazySrc || img.dataset.original || ''; if (!raw) return ''; try { return new URL(raw, location.href).href } catch(e) { return raw } })(), alt: img.alt, width: img.width, height: img.height, naturalWidth: img.naturalWidth, naturalHeight: img.naturalHeight});
        }
        return {html: html, images: images};
    })()`);
}
|
|
417
|
+
/**
 * Resolve the binary content of each image as base64.
 *
 * The effective source of an image is `src`, falling back to `dataSrc`. Strategy:
 *   1. `data:` URL -> decoded in place (base64 or percent-encoded payloads,
 *      with or without extra parameters such as `;charset=utf-8`).
 *   2. `unifiedSession.getResourceContentBatch()` for `src` URLs (batched; per the
 *      original author's notes this is CDP Page.getResourceContent, read from the
 *      browser cache without a network request).
 *   3. Node.js `fetch` -> fallback for URLs the batch lookup did not return.
 *
 * @param unifiedSession Session manager; must provide `getResourceContentBatch(urls)`
 *                       returning a Map-like keyed by URL.
 * @param images Image metadata list (`src` / `dataSrc` are read).
 * @param limit Only the first N images are resolved (appendix-mode throttling);
 *              the rest get `{ base64: null }`. Defaults to all images.
 * @returns {Promise<Array<{base64: string|null, mimeType: string}>>} One entry per
 *          input image, in the same order. `base64` is null when the image could
 *          not be resolved.
 */
async function fetchImageData(unifiedSession, images, limit) {
    /**
     * Decode a `data:` URL. Returns null when the URL has no comma, no MIME type
     * or no payload, or when a percent-encoded payload is malformed.
     */
    const decodeDataUrl = (url) => {
        const commaAt = url.indexOf(',');
        if (commaAt === -1) {
            return null;
        }
        const [rawMime, ...params] = url.slice('data:'.length, commaAt).split(';');
        const mimeType = rawMime.trim();
        const payload = url.slice(commaAt + 1);
        if (!mimeType || !payload) {
            return null;
        }
        if (params.some((param) => param.trim().toLowerCase() === 'base64')) {
            return { base64: payload, mimeType };
        }
        try {
            // Non-base64 data URLs (typically inline SVG) are percent-encoded text.
            return { base64: Buffer.from(decodeURIComponent(payload), 'utf-8').toString('base64'), mimeType };
        }
        catch {
            return null;
        }
    };
    const effectiveLimit = limit ?? images.length;
    // Pass 1: resolve data: URLs and collect the (de-duplicated) URLs for the batch lookup.
    const preResolved = [];
    const cdpUrlSet = new Set();
    for (let i = 0; i < images.length; i++) {
        const img = images[i];
        const effectiveSrc = img.src || img.dataSrc;
        if (i >= effectiveLimit || !effectiveSrc) {
            preResolved.push({ base64: null, mimeType: 'image/png' });
            continue;
        }
        if (effectiveSrc.startsWith('data:')) {
            preResolved.push(decodeDataUrl(effectiveSrc) ?? { base64: null, mimeType: 'image/png' });
            continue;
        }
        if (!effectiveSrc.startsWith('http')) {
            // Neither data: nor http(s): nothing we can download.
            preResolved.push({ base64: null, mimeType: guessMimeType(effectiveSrc) });
            continue;
        }
        // Only a non-empty `src` (something the browser actually requested) goes through the batch lookup.
        if (img.src) {
            cdpUrlSet.add(img.src);
        }
        preResolved.push(null); // still needs to be resolved
    }
    // Pass 2: batched lookup.
    const cdpResults = await unifiedSession.getResourceContentBatch([...cdpUrlSet]);
    // Pass 3: assemble results; misses are queued for fetch, grouped by URL so each URL is downloaded once.
    const fetchUrlMap = new Map();
    const results = [];
    for (let i = 0; i < images.length; i++) {
        if (preResolved[i] !== null) {
            results.push(preResolved[i]);
            continue;
        }
        const img = images[i];
        const effectiveSrc = img.src || img.dataSrc;
        const mimeType = guessMimeType(effectiveSrc);
        // Try the batch result first.
        if (img.src && cdpResults.has(img.src)) {
            const resource = cdpResults.get(img.src);
            if (resource.base64Encoded) {
                results.push({ base64: resource.content, mimeType });
            }
            else {
                // Textual content: re-encode so every entry is base64.
                results.push({ base64: Buffer.from(resource.content).toString('base64'), mimeType });
            }
            continue;
        }
        // Needs the fetch fallback; the placeholder is overwritten on success.
        results.push({ base64: null, mimeType });
        const existing = fetchUrlMap.get(effectiveSrc);
        if (existing) {
            existing.indices.push(i);
        }
        else {
            fetchUrlMap.set(effectiveSrc, { mimeType, indices: [i] });
        }
    }
    // Pass 4: fetch fallback with bounded concurrency (workers pull from a shared cursor).
    if (fetchUrlMap.size > 0) {
        const fetchTasks = [...fetchUrlMap.entries()];
        let idx = 0;
        const next = async () => {
            while (idx < fetchTasks.length) {
                const [url, { mimeType, indices }] = fetchTasks[idx++];
                try {
                    const response = await fetch(url, { signal: AbortSignal.timeout(5000) });
                    if (response.ok) {
                        const buffer = Buffer.from(await response.arrayBuffer());
                        // `||` (not `??`): an empty content-type header must also fall back to the guess.
                        const contentType = response.headers.get('content-type')?.split(';')[0].trim() || mimeType;
                        const data = { base64: buffer.toString('base64'), mimeType: contentType };
                        for (const i of indices) {
                            results[i] = data;
                        }
                    }
                }
                catch {
                    // Best effort: a failed download leaves the placeholder (base64: null) in place.
                }
            }
        };
        await Promise.all(Array.from({ length: Math.min(6, fetchTasks.length) }, () => next()));
    }
    return results;
}
|
|
517
|
+
/**
 * Persist extracted HTML and its images as a directory.
 *
 * Layout produced:
 *   {outputDir}/
 *     content.html
 *     images/
 *       0-photo.jpg
 *       1-icon.png
 *     index.json
 *
 * @param outputDir Directory to create/populate.
 * @param html HTML written to `content.html`.
 * @param images Image metadata list (`index, src, dataSrc, alt, width, height` are read).
 * @param imageDataList Resolved image data, aligned with `images` by position.
 * @returns {Promise<void>}
 */
async function writeImageDirectory(outputDir, html, images, imageDataList) {
    const imagesDir = join(outputDir, 'images');
    await mkdir(imagesDir, { recursive: true });
    // HTML first
    await writeFile(join(outputDir, 'content.html'), html, 'utf-8');
    // Image files + index entries; images sharing a source reuse one file.
    const fileBySource = new Map(); // source URL -> relative file path
    const indexEntries = [];
    for (const [position, img] of images.entries()) {
        const data = imageDataList[position];
        const source = img.src || img.dataSrc;
        let file = null;
        if (data.base64) {
            if (fileBySource.has(source)) {
                file = fileBySource.get(source);
            }
            else {
                const extension = mimeToExt(data.mimeType);
                const stem = sanitizeFilename(source);
                const filename = `${position}-${stem}${extension}`;
                file = `images/${filename}`;
                const bytes = Buffer.from(data.base64, 'base64');
                await writeFile(join(imagesDir, filename), bytes);
                fileBySource.set(source, file);
            }
        }
        // `file` stays null for images whose data could not be resolved.
        indexEntries.push({
            index: img.index,
            src: source,
            alt: img.alt,
            width: img.width,
            height: img.height,
            file,
        });
    }
    // Index last, once every entry is known.
    const indexJson = JSON.stringify({
        html: 'content.html',
        images: indexEntries,
    }, null, 2);
    await writeFile(join(outputDir, 'index.json'), indexJson, 'utf-8');
}
|
|
571
|
+
/**
 * Build an appendix-style MCP response: the HTML summary followed by one
 * annotation per image and, where possible, the image itself.
 *
 * Shape of `content`:
 *   [text: JSON summary]
 *   [text: --- Images ---]
 *   [text: [0] url alt WxH]
 *   [image: base64 data]
 *   ...
 *
 * @param html Extracted HTML, embedded in the JSON summary.
 * @param images Image metadata list.
 * @param imageDataList Resolved image data, aligned with `images` by position.
 * @returns {{content: Array<object>}} Response object with text/image content blocks.
 */
function buildImageAppendixResponse(html, images, imageDataList) {
    // Leading JSON summary
    const summary = JSON.stringify({
        success: true,
        type: 'html',
        content: html,
        imageCount: images.length,
    });
    const content = [{ type: 'text', text: summary }];
    if (images.length === 0) {
        return { content };
    }
    content.push({ type: 'text', text: '\n--- Images ---' });
    /** Image block formats supported by the Claude API */
    const SUPPORTED_IMAGE_MIMES = new Set(['image/png', 'image/jpeg', 'image/gif', 'image/webp']);
    const limit = Math.min(images.length, MAX_APPENDIX_IMAGES);
    images.forEach((img, position) => {
        const data = imageDataList[position];
        const effectiveSrc = img.src || img.dataSrc;
        // Annotation line: prefer the intrinsic size, fall back to the layout size.
        let sizeStr;
        if (img.naturalWidth) {
            sizeStr = `${img.naturalWidth}×${img.naturalHeight}`;
        }
        else {
            sizeStr = `${img.width}×${img.height}`;
        }
        const altStr = img.alt ? ` alt="${img.alt}"` : '';
        content.push({ type: 'text', text: `\n[${img.index}] ${effectiveSrc}${altStr} ${sizeStr}` });
        // Attach the image only within the limit, when data exists, and when the
        // format is supported (SVG and similar get the annotation line only).
        const attachable = position < limit && data.base64 && SUPPORTED_IMAGE_MIMES.has(data.mimeType);
        if (attachable) {
            content.push({ type: 'image', data: data.base64, mimeType: data.mimeType });
        }
    });
    if (images.length > MAX_APPENDIX_IMAGES) {
        const notice = `\n(共 ${images.length} 张图片,仅前 ${MAX_APPENDIX_IMAGES} 张附带数据。使用 output 参数导出全部图片)`;
        content.push({ type: 'text', text: notice });
    }
    return { content };
}
|
|
621
|
+
// ==================== MIME / 文件名工具 ====================
|
|
622
|
+
/**
 * Guess an image MIME type from the file extension in a URL's path.
 *
 * The query string and fragment are ignored; relative URLs are accepted.
 * Matching is case-insensitive.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {string} The MIME type, or `'image/png'` when the extension is
 *          missing/unknown or the URL cannot be parsed.
 */
function guessMimeType(url) {
    let ext;
    try {
        // The dummy base lets relative URLs parse; only the pathname is used.
        ext = extname(new URL(url, 'http://x').pathname).toLowerCase();
    }
    catch {
        return 'image/png';
    }
    const map = {
        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
        // JPEG alias extensions
        '.jpe': 'image/jpeg', '.jfif': 'image/jpeg',
        '.pjpeg': 'image/jpeg', '.pjp': 'image/jpeg',
        '.png': 'image/png', '.gif': 'image/gif',
        '.webp': 'image/webp', '.svg': 'image/svg+xml',
        '.ico': 'image/x-icon', '.bmp': 'image/bmp',
        '.avif': 'image/avif',
        '.tif': 'image/tiff', '.tiff': 'image/tiff',
    };
    return map[ext] ?? 'image/png';
}
|
|
640
|
+
/**
 * Map an image MIME type to a file extension (with leading dot).
 *
 * The input is normalized first: parameters after `;` are dropped, surrounding
 * whitespace is trimmed and the type is lower-cased, since values may come
 * straight from HTTP headers or `data:` URLs.
 *
 * @param {string} mimeType - MIME type, e.g. `'image/jpeg'`.
 * @returns {string} The extension, or `'.png'` for unknown or non-string input.
 */
function mimeToExt(mimeType) {
    // A Map (not an object literal) so inherited keys such as 'constructor' never match.
    const map = new Map([
        ['image/jpeg', '.jpg'], ['image/png', '.png'],
        ['image/gif', '.gif'], ['image/webp', '.webp'],
        ['image/svg+xml', '.svg'], ['image/x-icon', '.ico'],
        ['image/bmp', '.bmp'], ['image/avif', '.avif'],
        // Common aliases seen in the wild
        ['image/jpg', '.jpg'], ['image/pjpeg', '.jpg'],
        ['image/vnd.microsoft.icon', '.ico'],
    ]);
    const normalized = typeof mimeType === 'string'
        ? mimeType.split(';')[0].trim().toLowerCase()
        : '';
    return map.get(normalized) ?? '.png';
}
|
|
650
|
+
/**
 * Derive a filesystem-safe filename stem from a URL.
 *
 * Takes the last path segment, drops its final extension, replaces every
 * character other than ASCII letters, digits, `_` and `-` with `_`, and keeps
 * at most the first 40 characters.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {string} The sanitized stem, or `'image'` when the URL cannot be
 *   parsed or the stem ends up empty.
 */
function sanitizeFilename(url) {
    let lastSegment;
    try {
        // Dummy base so relative URLs parse; only the path is of interest.
        lastSegment = basename(new URL(url, 'http://x').pathname);
    }
    catch {
        return 'image';
    }
    const withoutExtension = lastSegment.replace(/\.[^.]+$/, '');
    const safeStem = withoutExtension.replace(/[^a-zA-Z0-9_-]/g, '_').substring(0, 40);
    return safeStem === '' ? 'image' : safeStem;
}
|
|
662
|
+
// ==================== 原有提取函数 ====================
|
|
293
663
|
/**
|
|
294
664
|
* 提取文本内容
|
|
295
665
|
*/
|
|
@@ -309,10 +679,9 @@ async function extractText(session, target, timeout) {
|
|
|
309
679
|
/**
 * Extract HTML from the page.
 *
 * Without a target, the whole document's `outerHTML` is returned via
 * `session.evaluate`. With a target, a locator is created for it and the
 * matched element's `outerHTML` is read through `locator.evaluateOn`.
 *
 * @param session - Session exposing `createLocator` and `evaluate`.
 * @param target - Optional element target; falsy means "whole document".
 * @param {number} [timeout] - Forwarded to the locator as `{ timeout }` only
 *   when it is not `undefined`.
 * @returns {Promise<*>} Whatever the session / locator call resolves to.
 */
async function extractHTML(session, target, timeout) {
    // No target: serialise the entire document.
    if (!target) {
        return session.evaluate('document.documentElement.outerHTML');
    }
    const locatorOptions = timeout === undefined ? undefined : { timeout };
    const locator = session.createLocator(target, locatorOptions);
    const html = await locator.evaluateOn(`function() {
      return this.outerHTML;
    }`);
    return html;
}
|
|
@@ -331,17 +700,23 @@ async function extractAttribute(session, target, attribute, timeout) {
|
|
|
331
700
|
* 支持所有 Target 形式(css/xpath/text/role/label 等)
|
|
332
701
|
*/
|
|
333
702
|
/**
 * Extension mode: extract text for a target.
 *
 * The target is translated by `targetToFindParams` into selector / xpath /
 * text search parameters plus an optional zero-based `nth` index (default 0).
 * - No target, or a target that yields none of the three: `getText()`.
 * - CSS selector, first match: `getText(selector)`.
 * - CSS selector with `nth > 0`, xpath, or text: a script evaluated in the
 *   page picks the nth match and returns its `textContent`, or `''` when
 *   there are not enough matches.
 *
 * @param unifiedSession - Session exposing `getText` and `evaluate`.
 * @param target - Optional element target.
 * @returns {Promise<*>} Result of the underlying session call.
 */
async function extractTextExtension(unifiedSession, target) {
    if (!target) {
        return unifiedSession.getText();
    }
    const params = targetToFindParams(target);
    const nth = params.nth ?? 0;
    // Page-side scripts: each selects the nth match for its lookup strategy.
    const nthBySelectorScript = `(function(s, n) { var els = document.querySelectorAll(s); return n < els.length ? (els[n].textContent || '') : '' })`;
    const nthByXPathScript = `(function(xp, n) { var r = document.evaluate(xp, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); return n < r.snapshotLength ? (r.snapshotItem(n).textContent || '') : '' })`;
    // Matches elements that directly own a text node containing the needle.
    const nthByTextScript = `(function(t, n) { var els = document.querySelectorAll('*'); var found = []; for (var i = 0; i < els.length; i++) { var cn = els[i].childNodes; for (var j = 0; j < cn.length; j++) { if (cn[j].nodeType === 3 && cn[j].textContent && cn[j].textContent.includes(t)) { found.push(els[i]); break; } } } return n < found.length ? (found[n].textContent || '') : '' })`;
    if (params.selector) {
        return nth > 0
            ? unifiedSession.evaluate(nthBySelectorScript, undefined, undefined, [params.selector, nth])
            : unifiedSession.getText(params.selector);
    }
    // xpath / text targets are resolved inside the page via evaluate.
    if (params.xpath) {
        return unifiedSession.evaluate(nthByXPathScript, undefined, undefined, [params.xpath, nth]);
    }
    if (params.text) {
        return unifiedSession.evaluate(nthByTextScript, undefined, undefined, [params.text, nth]);
    }
    return unifiedSession.getText();
}
|
|
@@ -350,17 +725,23 @@ async function extractTextExtension(unifiedSession, target) {
|
|
|
350
725
|
* 支持所有 Target 形式(css/xpath/text/role/label 等)
|
|
351
726
|
*/
|
|
352
727
|
/**
 * Extension mode: extract HTML for a target.
 *
 * The target is translated by `targetToFindParams` into selector / xpath /
 * text search parameters plus an optional zero-based `nth` index (default 0).
 * - No target, or a target that yields none of the three:
 *   `getHtml(undefined, outer)`.
 * - CSS selector, first match: `getHtml(selector, outer)`.
 * - CSS selector with `nth > 0`, xpath, or text: a script evaluated in the
 *   page picks the nth match and returns its `outerHTML` / `innerHTML`, or
 *   `''` when there are not enough matches.
 *
 * @param unifiedSession - Session exposing `getHtml` and `evaluate`.
 * @param target - Optional element target.
 * @param {boolean} [outer=true] - `true` reads `outerHTML`, `false` reads
 *   `innerHTML`.
 * @returns {Promise<*>} Result of the underlying session call.
 */
async function extractHtmlExtension(unifiedSession, target, outer = true) {
    if (!target) {
        return unifiedSession.getHtml(undefined, outer);
    }
    const params = targetToFindParams(target);
    const nth = params.nth ?? 0;
    const prop = outer ? 'outerHTML' : 'innerHTML';
    // Page-side scripts: each selects the nth match and reads property `p`.
    const nthBySelectorScript = `(function(s, n, p) { var els = document.querySelectorAll(s); return n < els.length ? (els[n][p] || '') : '' })`;
    const nthByXPathScript = `(function(xp, n, p) { var r = document.evaluate(xp, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); return n < r.snapshotLength ? (r.snapshotItem(n)[p] || '') : '' })`;
    // Matches elements that directly own a text node containing the needle.
    const nthByTextScript = `(function(t, n, p) { var els = document.querySelectorAll('*'); var found = []; for (var i = 0; i < els.length; i++) { var cn = els[i].childNodes; for (var j = 0; j < cn.length; j++) { if (cn[j].nodeType === 3 && cn[j].textContent && cn[j].textContent.includes(t)) { found.push(els[i]); break; } } } return n < found.length ? (found[n][p] || '') : '' })`;
    if (params.selector) {
        return nth > 0
            ? unifiedSession.evaluate(nthBySelectorScript, undefined, undefined, [params.selector, nth, prop])
            : unifiedSession.getHtml(params.selector, outer);
    }
    if (params.xpath) {
        return unifiedSession.evaluate(nthByXPathScript, undefined, undefined, [params.xpath, nth, prop]);
    }
    if (params.text) {
        return unifiedSession.evaluate(nthByTextScript, undefined, undefined, [params.text, nth, prop]);
    }
    return unifiedSession.getHtml(undefined, outer);
}
|
|
@@ -368,16 +749,24 @@ async function extractHtmlExtension(unifiedSession, target, outer = true) {
|
|
|
368
749
|
* Extension 模式:提取属性
|
|
369
750
|
*/
|
|
370
751
|
async function extractAttributeExtension(unifiedSession, target, attribute) {
|
|
371
|
-
const { selector, text, xpath } = targetToFindParams(target);
|
|
752
|
+
const { selector, text, xpath, nth: nthParam } = targetToFindParams(target);
|
|
372
753
|
// xpath/text 定位需要先 find 得到 refId,再获取属性
|
|
373
754
|
if (xpath || text) {
|
|
374
755
|
const elements = await unifiedSession.find(selector, text, xpath);
|
|
375
756
|
if (elements.length > 0) {
|
|
376
|
-
|
|
757
|
+
const nth = nthParam ?? 0;
|
|
758
|
+
if (nth >= elements.length) {
|
|
759
|
+
throw new Error(`第 ${nth} 个匹配元素不存在(共 ${elements.length} 个)`);
|
|
760
|
+
}
|
|
761
|
+
return unifiedSession.getAttribute(undefined, elements[nth].refId, attribute);
|
|
377
762
|
}
|
|
378
763
|
return null;
|
|
379
764
|
}
|
|
380
765
|
if (selector) {
|
|
766
|
+
const nth = nthParam ?? 0;
|
|
767
|
+
if (nth > 0) {
|
|
768
|
+
return unifiedSession.evaluate(`(function(s, n, a) { var els = document.querySelectorAll(s); return n < els.length ? els[n].getAttribute(a) : null })`, undefined, undefined, [selector, nth, attribute]);
|
|
769
|
+
}
|
|
381
770
|
return unifiedSession.getAttribute(selector, undefined, attribute);
|
|
382
771
|
}
|
|
383
772
|
return null;
|
|
@@ -391,7 +780,8 @@ async function extractAttributeExtension(unifiedSession, target, attribute) {
|
|
|
391
780
|
async function waitForTargetExtension(unifiedSession, target, timeout) {
|
|
392
781
|
const startTime = Date.now();
|
|
393
782
|
const retryDelay = 100;
|
|
394
|
-
const { selector, text, xpath } = targetToFindParams(target);
|
|
783
|
+
const { selector, text, xpath, nth: nthParam } = targetToFindParams(target);
|
|
784
|
+
const nth = nthParam ?? 0;
|
|
395
785
|
let lastError = null;
|
|
396
786
|
while (true) {
|
|
397
787
|
const elapsed = Date.now() - startTime;
|
|
@@ -407,12 +797,15 @@ async function waitForTargetExtension(unifiedSession, target, timeout) {
|
|
|
407
797
|
try {
|
|
408
798
|
const remaining = timeout - elapsed;
|
|
409
799
|
const elements = await unifiedSession.find(selector, text, xpath, remaining);
|
|
410
|
-
if (elements.length >
|
|
800
|
+
if (elements.length > nth) {
|
|
411
801
|
return;
|
|
802
|
+
}
|
|
412
803
|
}
|
|
413
804
|
catch (err) {
|
|
414
805
|
// 暂时性错误(RPC 超时、发送失败、连接断开)可重试,其他确定性错误立即抛出
|
|
415
|
-
if (err instanceof
|
|
806
|
+
if (err instanceof
|
|
807
|
+
Error &&
|
|
808
|
+
/Request timeout|Failed to send|disconnect|未连接|stopped|replaced/i.test(err.message)) {
|
|
416
809
|
lastError = err;
|
|
417
810
|
await new Promise(r => setTimeout(r, retryDelay));
|
|
418
811
|
continue;
|
|
@@ -422,4 +815,13 @@ async function waitForTargetExtension(unifiedSession, target, timeout) {
|
|
|
422
815
|
await new Promise(r => setTimeout(r, retryDelay));
|
|
423
816
|
}
|
|
424
817
|
}
|
|
818
|
+
/**
 * Register the `extract` tool on the given server.
 *
 * The tool is registered under the name `'extract'` with `extractSchema` as
 * its input schema; incoming arguments are passed to `handleExtract`.
 *
 * @param server - Object exposing `registerTool(name, config, handler)`.
 */
export function registerExtractTool(server) {
    const toolConfig = {
        description: '提取页面内容:文本、HTML(可附带图片)、属性、截图、状态、页面元信息',
        inputSchema: extractSchema,
    };
    const onExtract = (args) => handleExtract(args);
    server.registerTool('extract', toolConfig, onExtract);
}
|
|
425
827
|
//# sourceMappingURL=extract.js.map
|