real-browser-mcp-server 1.1.7 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/cjs/index.js +384 -0
- package/{lib → dist/lib}/cjs/module/pageController.js +27 -29
- package/{lib → dist/lib}/cjs/module/turnstile.js +23 -12
- package/dist/src/ai/action-parser.js +229 -0
- package/dist/src/ai/core.js +367 -0
- package/dist/src/ai/element-finder.js +409 -0
- package/{src → dist/src}/ai/index.js +35 -50
- package/dist/src/ai/page-analyzer.js +264 -0
- package/dist/src/ai/selector-healer.js +215 -0
- package/dist/src/index.js +116 -0
- package/dist/src/mcp/handlers/browser.js +230 -0
- package/dist/src/mcp/handlers/dom.js +550 -0
- package/dist/src/mcp/handlers/extract.js +451 -0
- package/dist/src/mcp/handlers/helpers.js +514 -0
- package/dist/src/mcp/handlers/index.js +63 -0
- package/dist/src/mcp/handlers/misc.js +1224 -0
- package/dist/src/mcp/handlers/network.js +1134 -0
- package/dist/src/mcp/handlers/state.js +215 -0
- package/dist/src/mcp/handlers/vision.js +475 -0
- package/dist/src/mcp/index.js +166 -0
- package/dist/src/mcp/server.js +117 -0
- package/{src → dist/src}/mcp/tools.js +12 -11
- package/dist/src/shared/tools.js +598 -0
- package/{test → dist/test}/cjs/test.js +119 -169
- package/dist/test/mcp/smoke-test.js +131 -0
- package/lib/esm/module/pageController.mjs +21 -18
- package/lib/esm/module/turnstile.mjs +7 -0
- package/package.json +22 -11
- package/.github/ISSUE_TEMPLATE/general_issue.yaml +0 -58
- package/.github/SETUP.md +0 -111
- package/.github/workflows/publish.yml +0 -162
- package/Dockerfile +0 -78
- package/lib/cjs/adblocker.bin +0 -0
- package/lib/cjs/index.js +0 -396
- package/src/ai/action-parser.js +0 -269
- package/src/ai/core.js +0 -379
- package/src/ai/element-finder.js +0 -466
- package/src/ai/page-analyzer.js +0 -295
- package/src/ai/selector-healer.js +0 -236
- package/src/index.js +0 -128
- package/src/mcp/handlers.js +0 -5306
- package/src/mcp/index.js +0 -190
- package/src/mcp/server.js +0 -141
- package/src/shared/tools.js +0 -625
- package/test/esm/test.mjs +0 -299
- package/test/mcp/smoke-test.js +0 -141
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript interop helper: re-exports property `key` of `source` on `target`
// (optionally under the name `alias`). When Object.create exists the export is
// installed through a property descriptor; otherwise it is a plain assignment.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be reused as-is or has to be
        // replaced by a forwarding getter.
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        target[alias === undefined ? key : alias] = source[key];
    });
|
|
13
|
+
// TypeScript interop helper: stores `value` under the "default" key of
// `target`. With Object.create available the key is defined as an enumerable
// data property; otherwise it is assigned directly.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        var descriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", descriptor);
    }
    : function (target, value) {
        target["default"] = value;
    });
|
|
18
|
+
// TypeScript interop helper behind `import * as x`: modules flagged with
// __esModule are returned untouched; anything else is wrapped in a fresh object
// that re-exports every own key except "default" and carries the original
// value as its `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use: the native key lister when present, otherwise a
    // for-in scan restricted to own properties.
    var keyLister = null;
    function scanOwnKeys(obj) {
        var names = [];
        for (var name in obj) {
            if (Object.prototype.hasOwnProperty.call(obj, name)) {
                names[names.length] = name;
            }
        }
        return names;
    }
    function listOwnKeys(obj) {
        if (keyLister === null) {
            keyLister = Object.getOwnPropertyNames || scanOwnKeys;
        }
        return keyLister(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, names[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.extractHandlers = void 0;
|
|
37
|
+
// @ts-nocheck
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const fs = __importStar(require("fs"));
|
|
40
|
+
const state_1 = require("./state");
|
|
41
|
+
// Auto-generated extract handlers
|
|
42
|
+
exports.extractHandlers = {
|
|
43
|
+
async get_content(params = {}) {
|
|
44
|
+
const { page } = (0, state_1.requireBrowser)();
|
|
45
|
+
const { format = 'text', selector, rawHttpUrl } = params;
|
|
46
|
+
(0, state_1.notifyProgress)('get_content', 'started', `Extracting ${format} content${selector ? ` from ${selector}` : ''}`);
|
|
47
|
+
// === rawHttp mode: fetch raw HTTP response without JS rendering ===
|
|
48
|
+
if (format === 'rawHttp') {
|
|
49
|
+
const url = rawHttpUrl || page.url();
|
|
50
|
+
(0, state_1.notifyProgress)('get_content', 'in_progress', `Fetching raw HTTP (no JS) from: ${url}`);
|
|
51
|
+
try {
|
|
52
|
+
const cookies = await page.context().cookies(url);
|
|
53
|
+
const cookieStr = cookies.map(c => `${c.name}=${c.value}`).join('; ');
|
|
54
|
+
const response = await fetch(url, {
|
|
55
|
+
headers: {
|
|
56
|
+
'User-Agent': await page.evaluate(() => navigator.userAgent),
|
|
57
|
+
'Cookie': cookieStr,
|
|
58
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
59
|
+
'Referer': page.url()
|
|
60
|
+
},
|
|
61
|
+
redirect: 'follow'
|
|
62
|
+
});
|
|
63
|
+
const rawHtml = await response.text();
|
|
64
|
+
const renderedHtml = await page.content();
|
|
65
|
+
const diff = {
|
|
66
|
+
rawLength: rawHtml.length,
|
|
67
|
+
renderedLength: renderedHtml.length,
|
|
68
|
+
sizeDifference: renderedHtml.length - rawHtml.length,
|
|
69
|
+
jsLoadedContent: renderedHtml.length > rawHtml.length * 1.1
|
|
70
|
+
};
|
|
71
|
+
(0, state_1.notifyProgress)('get_content', 'completed', `Raw: ${diff.rawLength} chars, Rendered: ${diff.renderedLength} chars`);
|
|
72
|
+
return {
|
|
73
|
+
success: true, rawHtml, renderedHtml, diff,
|
|
74
|
+
url, finalUrl: response.url, statusCode: response.status, format: 'rawHttp'
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
catch (e) {
|
|
78
|
+
return { success: false, error: `Raw HTTP fetch failed: ${e.message}` };
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
let content;
|
|
82
|
+
// === markdown: real HTML→Markdown conversion (no external deps) ===
|
|
83
|
+
if (format === 'markdown') {
|
|
84
|
+
if (selector) {
|
|
85
|
+
const exists = await page.$(selector);
|
|
86
|
+
if (!exists) {
|
|
87
|
+
(0, state_1.notifyProgress)('get_content', 'error', `Element not found: ${selector}`);
|
|
88
|
+
return { success: false, error: `Element not found: ${selector}` };
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
content = await page.evaluate((sel) => {
|
|
92
|
+
const root = sel ? document.querySelector(sel) : document.body;
|
|
93
|
+
if (!root)
|
|
94
|
+
return '';
|
|
95
|
+
const skip = new Set(['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'CANVAS']);
|
|
96
|
+
const inline = (node) => {
|
|
97
|
+
let out = '';
|
|
98
|
+
node.childNodes.forEach(c => {
|
|
99
|
+
if (c.nodeType === 3) {
|
|
100
|
+
out += c.textContent.replace(/\s+/g, ' ');
|
|
101
|
+
return;
|
|
102
|
+
}
|
|
103
|
+
if (c.nodeType !== 1 || skip.has(c.tagName))
|
|
104
|
+
return;
|
|
105
|
+
const t = c.tagName;
|
|
106
|
+
if (t === 'A') {
|
|
107
|
+
const h = c.getAttribute('href') || '';
|
|
108
|
+
const x = inline(c).trim();
|
|
109
|
+
out += h ? `[${x}](${h})` : x;
|
|
110
|
+
}
|
|
111
|
+
else if (t === 'STRONG' || t === 'B')
|
|
112
|
+
out += `**${inline(c).trim()}**`;
|
|
113
|
+
else if (t === 'EM' || t === 'I')
|
|
114
|
+
out += `*${inline(c).trim()}*`;
|
|
115
|
+
else if (t === 'CODE')
|
|
116
|
+
out += '`' + c.textContent.trim() + '`';
|
|
117
|
+
else if (t === 'IMG') {
|
|
118
|
+
const a = c.getAttribute('alt') || '';
|
|
119
|
+
const s = c.getAttribute('src') || '';
|
|
120
|
+
if (s)
|
|
121
|
+
out += ``;
|
|
122
|
+
}
|
|
123
|
+
else if (t === 'BR')
|
|
124
|
+
out += '\n';
|
|
125
|
+
else
|
|
126
|
+
out += inline(c);
|
|
127
|
+
});
|
|
128
|
+
return out;
|
|
129
|
+
};
|
|
130
|
+
const lines = [];
|
|
131
|
+
const walk = (node) => {
|
|
132
|
+
node.childNodes.forEach(c => {
|
|
133
|
+
if (c.nodeType === 3) {
|
|
134
|
+
const x = c.textContent.trim();
|
|
135
|
+
if (x)
|
|
136
|
+
lines.push(x);
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
139
|
+
if (c.nodeType !== 1 || skip.has(c.tagName))
|
|
140
|
+
return;
|
|
141
|
+
const t = c.tagName;
|
|
142
|
+
if (/^H[1-6]$/.test(t))
|
|
143
|
+
lines.push('\n' + '#'.repeat(+t[1]) + ' ' + inline(c).trim() + '\n');
|
|
144
|
+
else if (t === 'P') {
|
|
145
|
+
const x = inline(c).trim();
|
|
146
|
+
if (x)
|
|
147
|
+
lines.push(x + '\n');
|
|
148
|
+
}
|
|
149
|
+
else if (t === 'UL' || t === 'OL') {
|
|
150
|
+
let i = 1;
|
|
151
|
+
c.querySelectorAll(':scope > li').forEach(li => lines.push((t === 'OL' ? (i++) + '. ' : '- ') + inline(li).trim()));
|
|
152
|
+
lines.push('');
|
|
153
|
+
}
|
|
154
|
+
else if (t === 'BLOCKQUOTE')
|
|
155
|
+
lines.push('> ' + inline(c).trim() + '\n');
|
|
156
|
+
else if (t === 'PRE')
|
|
157
|
+
lines.push('```\n' + c.textContent.trim() + '\n```\n');
|
|
158
|
+
else if (t === 'HR')
|
|
159
|
+
lines.push('\n---\n');
|
|
160
|
+
else if (['A', 'STRONG', 'B', 'EM', 'I', 'CODE', 'IMG', 'SPAN', 'LABEL'].includes(t)) {
|
|
161
|
+
const x = inline(c).trim();
|
|
162
|
+
if (x)
|
|
163
|
+
lines.push(x);
|
|
164
|
+
}
|
|
165
|
+
else
|
|
166
|
+
walk(c);
|
|
167
|
+
});
|
|
168
|
+
};
|
|
169
|
+
walk(root);
|
|
170
|
+
return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
|
|
171
|
+
}, selector || null);
|
|
172
|
+
}
|
|
173
|
+
else if (selector) {
|
|
174
|
+
const element = await page.$(selector);
|
|
175
|
+
if (!element) {
|
|
176
|
+
(0, state_1.notifyProgress)('get_content', 'error', `Element not found: ${selector}`);
|
|
177
|
+
return { success: false, error: `Element not found: ${selector}` };
|
|
178
|
+
}
|
|
179
|
+
if (format === 'html') {
|
|
180
|
+
content = await element.evaluate(el => el.outerHTML);
|
|
181
|
+
}
|
|
182
|
+
else {
|
|
183
|
+
content = await element.evaluate(el => el.textContent);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
else {
|
|
187
|
+
if (format === 'html') {
|
|
188
|
+
content = await page.content();
|
|
189
|
+
}
|
|
190
|
+
else {
|
|
191
|
+
content = await page.evaluate(() => document.body.innerText);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
(0, state_1.notifyProgress)('get_content', 'completed', `Extracted ${content.length} characters`, { format, length: content.length });
|
|
195
|
+
return {
|
|
196
|
+
success: true,
|
|
197
|
+
content,
|
|
198
|
+
url: page.url(),
|
|
199
|
+
format
|
|
200
|
+
};
|
|
201
|
+
},
|
|
202
|
+
async save_content_as_markdown(params) {
|
|
203
|
+
const { page } = (0, state_1.requireBrowser)();
|
|
204
|
+
const { filename, selector, includeImages = true, includeMeta = true } = params;
|
|
205
|
+
(0, state_1.notifyProgress)('save_content_as_markdown', 'started', `Saving to: ${filename}`);
|
|
206
|
+
let markdown = '';
|
|
207
|
+
if (includeMeta) {
|
|
208
|
+
const title = await page.title();
|
|
209
|
+
const url = page.url();
|
|
210
|
+
markdown += `# ${title}\n\n`;
|
|
211
|
+
markdown += `> Source: ${url}\n\n`;
|
|
212
|
+
}
|
|
213
|
+
const content = selector
|
|
214
|
+
? await page.$eval(selector, el => el.innerText)
|
|
215
|
+
: await page.evaluate(() => document.body.innerText);
|
|
216
|
+
markdown += content;
|
|
217
|
+
const outputPath = path.resolve(filename);
|
|
218
|
+
fs.writeFileSync(outputPath, markdown);
|
|
219
|
+
(0, state_1.notifyProgress)('save_content_as_markdown', 'completed', `Saved ${markdown.length} bytes to ${filename}`, { filename: outputPath, size: markdown.length });
|
|
220
|
+
return { success: true, filename: outputPath, size: markdown.length };
|
|
221
|
+
},
|
|
222
|
+
async extract_json(params = {}) {
|
|
223
|
+
const { page } = (0, state_1.requireBrowser)();
|
|
224
|
+
const { source = 'page', selector, jsonPath } = params;
|
|
225
|
+
(0, state_1.notifyProgress)('extract_json', 'started', `Extracting JSON from: ${source}`);
|
|
226
|
+
let jsonData = [];
|
|
227
|
+
if (source === 'ld+json') {
|
|
228
|
+
jsonData = await page.$$eval('script[type="application/ld+json"]', scripts => scripts.map(s => {
|
|
229
|
+
try {
|
|
230
|
+
return JSON.parse(s.textContent);
|
|
231
|
+
}
|
|
232
|
+
catch {
|
|
233
|
+
return null;
|
|
234
|
+
}
|
|
235
|
+
}).filter(Boolean));
|
|
236
|
+
}
|
|
237
|
+
else if (source === 'scripts') {
|
|
238
|
+
const content = await page.$$eval('script', scripts => scripts.map(s => s.textContent).join('\n'));
|
|
239
|
+
const jsonRegex = /\{[^{}]*\}|\[[^\[\]]*\]/g;
|
|
240
|
+
const matches = content.match(jsonRegex) || [];
|
|
241
|
+
jsonData = matches.slice(0, 20).map(m => {
|
|
242
|
+
try {
|
|
243
|
+
return JSON.parse(m);
|
|
244
|
+
}
|
|
245
|
+
catch {
|
|
246
|
+
return null;
|
|
247
|
+
}
|
|
248
|
+
}).filter(Boolean);
|
|
249
|
+
}
|
|
250
|
+
else if (selector) {
|
|
251
|
+
const text = await page.$eval(selector, el => el.textContent);
|
|
252
|
+
try {
|
|
253
|
+
jsonData = [JSON.parse(text)];
|
|
254
|
+
}
|
|
255
|
+
catch { }
|
|
256
|
+
}
|
|
257
|
+
(0, state_1.notifyProgress)('extract_json', 'completed', `Extracted ${jsonData.length} JSON objects`, { count: jsonData.length });
|
|
258
|
+
return { success: true, source, count: jsonData.length, data: jsonData };
|
|
259
|
+
},
|
|
260
|
+
async scrape_meta_tags(params = {}) {
|
|
261
|
+
const { page } = (0, state_1.requireBrowser)();
|
|
262
|
+
const { types = ['all'] } = params;
|
|
263
|
+
(0, state_1.notifyProgress)('scrape_meta_tags', 'started', 'Extracting meta tags...');
|
|
264
|
+
const meta = await page.evaluate(() => {
|
|
265
|
+
const result = { meta: {}, og: {}, twitter: {} };
|
|
266
|
+
document.querySelectorAll('meta').forEach(tag => {
|
|
267
|
+
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
|
268
|
+
const content = tag.getAttribute('content');
|
|
269
|
+
if (name && content) {
|
|
270
|
+
if (name.startsWith('og:')) {
|
|
271
|
+
result.og[name.replace('og:', '')] = content;
|
|
272
|
+
}
|
|
273
|
+
else if (name.startsWith('twitter:')) {
|
|
274
|
+
result.twitter[name.replace('twitter:', '')] = content;
|
|
275
|
+
}
|
|
276
|
+
else {
|
|
277
|
+
result.meta[name] = content;
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
});
|
|
281
|
+
result.title = document.title;
|
|
282
|
+
result.canonical = document.querySelector('link[rel="canonical"]')?.href;
|
|
283
|
+
return result;
|
|
284
|
+
});
|
|
285
|
+
const tagCount = Object.keys(meta.meta).length + Object.keys(meta.og).length + Object.keys(meta.twitter).length;
|
|
286
|
+
(0, state_1.notifyProgress)('scrape_meta_tags', 'completed', `Extracted ${tagCount} meta tags`, { tagCount });
|
|
287
|
+
return { success: true, ...meta };
|
|
288
|
+
},
|
|
289
|
+
async link_harvester(params = {}) {
|
|
290
|
+
const { page } = (0, state_1.requireBrowser)();
|
|
291
|
+
const { types = ['all'], selector, includeText = true, includeHidden = true, searchIframes = false } = params;
|
|
292
|
+
(0, state_1.notifyProgress)('link_harvester', 'started', 'Harvesting links (enhanced mode)...');
|
|
293
|
+
const currentHost = new URL(page.url()).hostname;
|
|
294
|
+
// Enhanced link extraction
|
|
295
|
+
const extractLinks = async (context) => {
|
|
296
|
+
return await context.evaluate(({ includeText, includeHidden }) => {
|
|
297
|
+
const allLinks = [];
|
|
298
|
+
const seenUrls = new Set();
|
|
299
|
+
const addLink = (href, text, source, element) => {
|
|
300
|
+
if (!href || seenUrls.has(href))
|
|
301
|
+
return;
|
|
302
|
+
if (!href.startsWith('http') && !href.startsWith('//'))
|
|
303
|
+
return;
|
|
304
|
+
// Handle protocol-relative URLs
|
|
305
|
+
if (href.startsWith('//')) {
|
|
306
|
+
href = window.location.protocol + href;
|
|
307
|
+
}
|
|
308
|
+
seenUrls.add(href);
|
|
309
|
+
allLinks.push({
|
|
310
|
+
href,
|
|
311
|
+
text: includeText ? (text || '').trim().substring(0, 100) : undefined,
|
|
312
|
+
source,
|
|
313
|
+
hidden: element ? (element.offsetParent === null ||
|
|
314
|
+
getComputedStyle(element).display === 'none' ||
|
|
315
|
+
getComputedStyle(element).visibility === 'hidden') : false
|
|
316
|
+
});
|
|
317
|
+
};
|
|
318
|
+
// 1. Standard anchor tags
|
|
319
|
+
document.querySelectorAll('a[href]').forEach(a => {
|
|
320
|
+
addLink(a.href, a.textContent, 'anchor', a);
|
|
321
|
+
});
|
|
322
|
+
// 2. Data attributes containing URLs
|
|
323
|
+
const dataAttrs = ['data-href', 'data-url', 'data-link', 'data-src', 'data-file', 'data-download'];
|
|
324
|
+
dataAttrs.forEach(attr => {
|
|
325
|
+
document.querySelectorAll(`[${attr}]`).forEach(el => {
|
|
326
|
+
const url = el.getAttribute(attr);
|
|
327
|
+
addLink(url, el.textContent, `${attr}`, el);
|
|
328
|
+
});
|
|
329
|
+
});
|
|
330
|
+
// 3. OnClick handlers with URLs
|
|
331
|
+
if (includeHidden) {
|
|
332
|
+
document.querySelectorAll('[onclick]').forEach(el => {
|
|
333
|
+
const onclick = el.getAttribute('onclick');
|
|
334
|
+
// Look for URL patterns in onclick
|
|
335
|
+
const urlMatches = onclick.match(/https?:\/\/[^\s"'<>]+/gi) || [];
|
|
336
|
+
urlMatches.forEach(url => {
|
|
337
|
+
addLink(url, el.textContent, 'onclick', el);
|
|
338
|
+
});
|
|
339
|
+
// Look for location.href assignments
|
|
340
|
+
const hrefMatch = onclick.match(/location\.href\s*=\s*['"]([^'"]+)['"]/);
|
|
341
|
+
if (hrefMatch) {
|
|
342
|
+
addLink(hrefMatch[1], el.textContent, 'onclick-location', el);
|
|
343
|
+
}
|
|
344
|
+
// Look for window.open calls
|
|
345
|
+
const openMatch = onclick.match(/window\.open\s*\(\s*['"]([^'"]+)['"]/);
|
|
346
|
+
if (openMatch) {
|
|
347
|
+
addLink(openMatch[1], el.textContent, 'onclick-window-open', el);
|
|
348
|
+
}
|
|
349
|
+
});
|
|
350
|
+
}
|
|
351
|
+
// 4. JavaScript href links
|
|
352
|
+
document.querySelectorAll('a[href^="javascript:"]').forEach(a => {
|
|
353
|
+
const href = a.getAttribute('href');
|
|
354
|
+
const urlMatch = href.match(/https?:\/\/[^\s"'<>]+/gi);
|
|
355
|
+
if (urlMatch) {
|
|
356
|
+
urlMatch.forEach(url => addLink(url, a.textContent, 'javascript-href', a));
|
|
357
|
+
}
|
|
358
|
+
});
|
|
359
|
+
// 5. Hidden inputs with URLs
|
|
360
|
+
document.querySelectorAll('input[type="hidden"]').forEach(input => {
|
|
361
|
+
const value = input.value;
|
|
362
|
+
if (value && (value.startsWith('http') || value.startsWith('//'))) {
|
|
363
|
+
addLink(value, input.name || input.id, 'hidden-input', input);
|
|
364
|
+
}
|
|
365
|
+
});
|
|
366
|
+
// 6. Script content analysis for URLs (limited for performance)
|
|
367
|
+
if (includeHidden) {
|
|
368
|
+
const scripts = [...document.querySelectorAll('script')].slice(0, 20);
|
|
369
|
+
scripts.forEach(script => {
|
|
370
|
+
const content = script.textContent || '';
|
|
371
|
+
// Look for download/stream URLs
|
|
372
|
+
const patterns = [
|
|
373
|
+
/["']?(https?:\/\/[^"'\s<>]+\.(mp4|mkv|avi|m3u8|mpd|zip|rar|pdf))[^"'\s<>]*["']?/gi,
|
|
374
|
+
/download[_-]?url\s*[:=]\s*["']([^"']+)["']/gi,
|
|
375
|
+
/file\s*[:=]\s*["']([^"']+)["']/gi
|
|
376
|
+
];
|
|
377
|
+
patterns.forEach(pattern => {
|
|
378
|
+
let match;
|
|
379
|
+
while ((match = pattern.exec(content)) !== null) {
|
|
380
|
+
addLink(match[1], 'script-extracted', 'script', null);
|
|
381
|
+
}
|
|
382
|
+
});
|
|
383
|
+
});
|
|
384
|
+
}
|
|
385
|
+
// 7. Meta refresh URLs
|
|
386
|
+
const metaRefresh = document.querySelector('meta[http-equiv="refresh"]');
|
|
387
|
+
if (metaRefresh) {
|
|
388
|
+
const content = metaRefresh.getAttribute('content');
|
|
389
|
+
const urlMatch = content?.match(/url=(.+)/i);
|
|
390
|
+
if (urlMatch) {
|
|
391
|
+
addLink(urlMatch[1].trim().replace(/['"]/g, ''), 'meta-refresh', 'meta', null);
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
// 8. Iframe sources
|
|
395
|
+
document.querySelectorAll('iframe[src]').forEach(iframe => {
|
|
396
|
+
addLink(iframe.src, 'iframe', 'iframe', iframe);
|
|
397
|
+
});
|
|
398
|
+
return allLinks;
|
|
399
|
+
}, { includeText, includeHidden }).catch(() => []);
|
|
400
|
+
};
|
|
401
|
+
let links = await extractLinks(page);
|
|
402
|
+
// Search iframes if enabled
|
|
403
|
+
if (searchIframes) {
|
|
404
|
+
const frames = page.frames();
|
|
405
|
+
for (let i = 1; i < frames.length && i < 5; i++) {
|
|
406
|
+
try {
|
|
407
|
+
const frame = frames[i];
|
|
408
|
+
if (frame.url() && frame.url() !== 'about:blank') {
|
|
409
|
+
const frameLinks = await extractLinks(frame);
|
|
410
|
+
frameLinks.forEach(link => link.source = `iframe:${link.source}`);
|
|
411
|
+
links = [...links, ...frameLinks];
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
catch (e) { }
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
// Filter by type
|
|
418
|
+
if (!types.includes('all')) {
|
|
419
|
+
links = links.filter(link => {
|
|
420
|
+
const isInternal = link.href.includes(currentHost);
|
|
421
|
+
const isMedia = /\.(jpg|jpeg|png|gif|mp4|mp3|mkv|avi|pdf|zip|rar|m3u8|mpd)/i.test(link.href);
|
|
422
|
+
const isDownload = /download|file|drive/i.test(link.href);
|
|
423
|
+
if (types.includes('internal') && isInternal)
|
|
424
|
+
return true;
|
|
425
|
+
if (types.includes('external') && !isInternal)
|
|
426
|
+
return true;
|
|
427
|
+
if (types.includes('media') && isMedia)
|
|
428
|
+
return true;
|
|
429
|
+
if (types.includes('download') && isDownload)
|
|
430
|
+
return true;
|
|
431
|
+
if (types.includes('hidden') && link.hidden)
|
|
432
|
+
return true;
|
|
433
|
+
return false;
|
|
434
|
+
});
|
|
435
|
+
}
|
|
436
|
+
// Remove hidden links if not requested
|
|
437
|
+
if (!includeHidden) {
|
|
438
|
+
links = links.filter(link => !link.hidden);
|
|
439
|
+
}
|
|
440
|
+
// Deduplicate
|
|
441
|
+
const seen = new Set();
|
|
442
|
+
links = links.filter(link => {
|
|
443
|
+
if (seen.has(link.href))
|
|
444
|
+
return false;
|
|
445
|
+
seen.add(link.href);
|
|
446
|
+
return true;
|
|
447
|
+
});
|
|
448
|
+
(0, state_1.notifyProgress)('link_harvester', 'completed', `Found ${links.length} links (including hidden)`, { count: links.length });
|
|
449
|
+
return { success: true, count: links.length, links };
|
|
450
|
+
}
|
|
451
|
+
};
|