brave-real-browser-mcp-server 2.7.6 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/extractors/content-type-extractors.js +314 -0
- package/dist/extractors/extractors.test.js +17 -0
- package/dist/extractors/multi-element-extractors.js +325 -0
- package/dist/extractors/smart-data-extractors.js +281 -0
- package/dist/utils/advanced-features.js +247 -0
- package/dist/utils/advanced-scraping.js +253 -0
- package/dist/utils/all-modules.test.js +86 -0
- package/dist/utils/auth-session.js +296 -0
- package/dist/utils/data-processing.js +301 -0
- package/dist/utils/data-processing.test.js +52 -0
- package/dist/utils/pagination.js +249 -0
- package/dist/utils/pagination.test.js +22 -0
- package/package.json +31 -2
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
 * Image Scraper - extract every image on the page together with its metadata.
 *
 * The callback passed to `page.evaluate` only uses browser globals
 * (`document`, `window`) and the single argument handed to it.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} [selector] - CSS selector of the elements to inspect;
 *   every `img` element is used when it is omitted or empty.
 * @returns {Promise<Array<object>>} One record per matched element with the keys
 *   `src`, `alt`, `title`, `width`, `height`, `naturalWidth`, `naturalHeight`,
 *   `loading`, `srcset`, `sizes` and `isVisible`.
 */
export async function scrapeImages(page, selector) {
    return await page.evaluate((sel) => {
        const matched = document.querySelectorAll(sel || 'img');
        return Array.from(matched, (el) => {
            const rect = el.getBoundingClientRect();
            const styles = window.getComputedStyle(el);
            return {
                src: el.src || el.getAttribute('src') || '',
                alt: el.alt || '',
                title: el.title || '',
                // Fall back to the rendered box when the width/height property is 0 or absent.
                width: el.width || rect.width,
                height: el.height || rect.height,
                // A custom selector may match non-<img> elements, which have no
                // natural* properties; report null so every record has the same keys.
                naturalWidth: el.naturalWidth ?? null,
                naturalHeight: el.naturalHeight ?? null,
                loading: el.loading || 'auto',
                srcset: el.srcset || '',
                sizes: el.sizes || '',
                isVisible: styles.display !== 'none' &&
                    styles.visibility !== 'hidden' &&
                    rect.width > 0 &&
                    rect.height > 0,
            };
        });
    }, selector || null);
}
|
|
31
|
+
/**
 * Link Harvester - collect every `a[href]` on the page and classify it.
 *
 * A link is an "anchor" when its authored `href` attribute starts with `#`,
 * "internal" when its resolved hostname equals the current page's hostname
 * (or is empty, or the URL cannot be parsed), and "external" otherwise.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {object} [options] - Category switches; each defaults to `true`.
 * @param {boolean} [options.includeInternal] - Keep internal links.
 * @param {boolean} [options.includeExternal] - Keep external links.
 * @param {boolean} [options.includeAnchors] - Keep same-page `#fragment` links.
 * @returns {Promise<{internal: object[], external: object[], anchors: object[], all: object[]}>}
 *   `internal`/`external` entries are `{ href, text, title }`, `anchors` entries are
 *   `{ href, text, target }`, and `all` lists every kept link as `{ href, text, type }`.
 */
export async function harvestLinks(page, options) {
    const opts = {
        includeInternal: true,
        includeExternal: true,
        includeAnchors: true,
        ...options
    };
    return await page.evaluate((config) => {
        const currentDomain = window.location.hostname;
        const internal = [];
        const external = [];
        const anchors = [];
        const all = [];
        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            // The selector can also match <a> elements whose href property is not a
            // plain string (e.g. inside SVG); skip those instead of throwing.
            if (typeof href !== 'string' || !href) {
                return;
            }
            const text = (link.innerText ?? link.textContent ?? '').trim();
            const title = link.getAttribute('title') || '';
            const target = link.getAttribute('target') || '';
            // `link.href` is already resolved to an absolute URL, so same-page
            // anchors can only be recognised from the attribute as authored.
            const rawHref = (link.getAttribute('href') || '').trim();
            if (rawHref.startsWith('#')) {
                if (config.includeAnchors) {
                    anchors.push({ href: rawHref, text, target });
                    all.push({ href: rawHref, text, type: 'anchor' });
                }
                return;
            }
            // Unparsable URLs are deliberately treated as internal.
            let isInternal = true;
            try {
                const { hostname } = new URL(href);
                isInternal = hostname === currentDomain || hostname === '';
            }
            catch {
                isInternal = true;
            }
            if (isInternal) {
                if (config.includeInternal) {
                    internal.push({ href, text, title });
                    all.push({ href, text, type: 'internal' });
                }
            }
            else if (config.includeExternal) {
                external.push({ href, text, title });
                all.push({ href, text, type: 'external' });
            }
        });
        return { internal, external, anchors, all };
    }, opts);
}
|
|
90
|
+
/**
 * Media Extractor - extract videos, audio files, iframes and embedded media.
 *
 * Iframes are labelled with a `platform` derived from the hostname of their
 * `src` (`youtube`, `vimeo`, `dailymotion`, `facebook`, `twitter`), or
 * `unknown` when the hostname matches none of them or the URL cannot be parsed.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @returns {Promise<{videos: object[], audio: object[], iframes: object[], embeds: object[]}>}
 *   Records describing each `video`, `audio`, `iframe` and `embed`/`object` element.
 */
export async function extractMedia(page) {
    return await page.evaluate(() => {
        const PLATFORM_DOMAINS = [
            ['youtube', ['youtube.com', 'youtu.be']],
            ['vimeo', ['vimeo.com']],
            ['dailymotion', ['dailymotion.com']],
            ['facebook', ['facebook.com']],
            ['twitter', ['twitter.com', 'x.com']],
        ];
        // Match on the hostname (exact domain or a subdomain of it). A plain
        // substring test would label e.g. dropbox.com or netflix.com as "x.com".
        const detectPlatform = (src) => {
            let host;
            try {
                host = new URL(src).hostname.toLowerCase();
            }
            catch {
                return 'unknown'; // empty or otherwise unparsable src
            }
            const hit = PLATFORM_DOMAINS.find(([, domains]) => domains.some((domain) => host === domain || host.endsWith(`.${domain}`)));
            return hit ? hit[0] : 'unknown';
        };
        // URLs of the <source> children of a media element, in document order.
        const sourcesOf = (mediaEl) => Array.from(mediaEl.querySelectorAll('source'), (source) => source.src);

        const videos = Array.from(document.querySelectorAll('video'), (video) => {
            const sources = sourcesOf(video);
            return {
                src: video.src || sources[0] || '',
                sources,
                poster: video.poster || '',
                width: video.width,
                height: video.height,
                controls: video.controls,
                autoplay: video.autoplay,
                loop: video.loop,
                muted: video.muted,
                duration: video.duration,
                currentTime: video.currentTime,
            };
        });
        const audio = Array.from(document.querySelectorAll('audio'), (audioEl) => {
            const sources = sourcesOf(audioEl);
            return {
                src: audioEl.src || sources[0] || '',
                sources,
                controls: audioEl.controls,
                autoplay: audioEl.autoplay,
                loop: audioEl.loop,
                muted: audioEl.muted,
                duration: audioEl.duration,
            };
        });
        const iframes = Array.from(document.querySelectorAll('iframe'), (iframe) => ({
            src: iframe.src,
            title: iframe.title || '',
            width: iframe.width,
            height: iframe.height,
            platform: detectPlatform(iframe.src),
            allowFullscreen: iframe.allowFullscreen,
        }));
        const embeds = Array.from(document.querySelectorAll('embed, object'), (embed) => ({
            src: embed.getAttribute('src') || embed.getAttribute('data') || '',
            type: embed.getAttribute('type') || '',
            width: embed.getAttribute('width') || '',
            height: embed.getAttribute('height') || '',
        }));
        return { videos, audio, iframes, embeds };
    });
}
|
|
176
|
+
/**
 * PDF Link Finder - find all downloadable file links on the page.
 *
 * Every `a[href]` is sorted into at most one bucket, checked in this order:
 * PDFs, office documents, archives, images (only when the link carries a
 * `download` attribute), then "other" (any remaining link with a `download`
 * attribute or an installer/disk-image extension).
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @returns {Promise<{pdfs: object[], documents: object[], archives: object[], images: object[], other: object[]}>}
 *   Each entry is `{ href, text, size }` (`size` comes from the `data-size`
 *   attribute, `undefined` when absent); all buckets except `pdfs` also carry `type`.
 */
export async function findDownloadableFiles(page) {
    return await page.evaluate(() => {
        const pdfs = [];
        const documents = [];
        const archives = [];
        const images = [];
        const other = [];
        // [bucket, extension pattern, whether a download attribute is required]
        const typedBuckets = [
            [documents, /\.(doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp)($|\?)/, false],
            [archives, /\.(zip|rar|7z|tar|gz|bz2)($|\?)/, false],
            [images, /\.(jpg|jpeg|png|gif|bmp|svg|webp|ico)($|\?)/, true],
        ];
        const installerPattern = /\.(exe|dmg|apk|deb|rpm|msi|iso)($|\?)/;
        const anyExtensionPattern = /\.([a-z0-9]+)($|\?)/;

        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            if (typeof href !== 'string' || !href) {
                return;
            }
            const text = (link.innerText ?? link.textContent ?? '').trim();
            // A bare `download` attribute has the value "", which is falsy, so its
            // presence has to be tested with hasAttribute rather than its value.
            const hasDownload = link.hasAttribute('download');
            const downloadName = (link.getAttribute('download') || '').toLowerCase();
            // Drop the fragment so "file.pdf#page=2" is still recognised by extension.
            const url = href.toLowerCase().split('#')[0];
            const fileInfo = { href, text, size: link.getAttribute('data-size') || undefined };

            if (url.endsWith('.pdf') || url.includes('.pdf?') || downloadName.endsWith('.pdf')) {
                pdfs.push(fileInfo);
                return;
            }
            for (const [bucket, pattern, requiresDownload] of typedBuckets) {
                const match = pattern.exec(url);
                if (match && (!requiresDownload || hasDownload)) {
                    bucket.push({ ...fileInfo, type: match[1] });
                    return;
                }
            }
            if (hasDownload || installerPattern.test(url)) {
                const match = anyExtensionPattern.exec(url);
                other.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
            }
        });
        return { pdfs, documents, archives, images, other };
    });
}
|
|
223
|
+
/**
 * Social Media Links Extractor - extract social media profile links.
 *
 * Each `a[href]` is assigned to the first network whose domain equals the
 * link's hostname or is a parent domain of it. Duplicate URLs are removed.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @returns {Promise<object>} Arrays of link URLs keyed by `facebook`, `twitter`,
 *   `instagram`, `linkedin`, `youtube`, `github`, `pinterest`, `tiktok` and
 *   `other` (`other` is part of the result shape but nothing is added to it).
 */
export async function extractSocialLinks(page) {
    return await page.evaluate(() => {
        const NETWORK_DOMAINS = [
            ['facebook', ['facebook.com']],
            ['twitter', ['twitter.com', 'x.com']],
            ['instagram', ['instagram.com']],
            ['linkedin', ['linkedin.com']],
            ['youtube', ['youtube.com', 'youtu.be']],
            ['github', ['github.com']],
            ['pinterest', ['pinterest.com']],
            ['tiktok', ['tiktok.com']],
        ];
        // Sets keep first-seen order while dropping duplicate URLs.
        const found = new Map(NETWORK_DOMAINS.map(([network]) => [network, new Set()]));

        document.querySelectorAll('a[href]').forEach((link) => {
            let host;
            try {
                host = new URL(link.href).hostname.toLowerCase();
            }
            catch {
                return; // not a parsable URL, so it cannot be attributed to a network
            }
            // Compare hostnames rather than substrings: "netflix.com" contains
            // "x.com", and share URLs often carry "facebook.com" in the query string.
            const hit = NETWORK_DOMAINS.find(([, domains]) => domains.some((domain) => host === domain || host.endsWith(`.${domain}`)));
            if (hit) {
                found.get(hit[0]).add(link.href);
            }
        });

        const socialLinks = {};
        for (const [network, urls] of found) {
            socialLinks[network] = Array.from(urls);
        }
        socialLinks.other = [];
        return socialLinks;
    });
}
|
|
274
|
+
/**
 * Email and Phone Extractor - extract contact information from the page.
 *
 * Emails and phone numbers are collected from the visible text of
 * `document.body` by pattern matching, and from `mailto:` / `tel:` links.
 * Duplicates are removed while keeping first-seen order.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @returns {Promise<{emails: string[], phones: string[], addresses: string[]}>}
 *   `addresses` is part of the result shape but is always empty: nothing in
 *   this function extracts postal addresses.
 */
export async function extractContactInfo(page) {
    return await page.evaluate(() => {
        // document.body can be null (e.g. before the document has a body).
        const text = document.body?.innerText ?? '';
        const addresses = [];
        // Link hrefs are percent-encoded; fall back to the raw value if decoding fails.
        const decode = (value) => {
            try {
                return decodeURIComponent(value);
            }
            catch {
                return value;
            }
        };

        // The TLD class must be [A-Za-z]; a "|" inside brackets is a literal pipe.
        const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
        const emails = new Set(text.match(emailRegex) ?? []);
        document.querySelectorAll('a[href^="mailto:"]').forEach((link) => {
            const recipients = link.href.replace('mailto:', '').split('?')[0];
            // A single mailto link may list several comma-separated recipients.
            recipients.split(',').forEach((recipient) => {
                const email = decode(recipient).trim();
                if (email) {
                    emails.add(email);
                }
            });
        });

        // Phone numbers in various formats: optional country code, then 3-3-4 digits.
        const phoneRegex = /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
        const phones = new Set(text.match(phoneRegex) ?? []);
        document.querySelectorAll('a[href^="tel:"]').forEach((link) => {
            const phone = decode(link.href.replace('tel:', '')).trim();
            if (phone) {
                phones.add(phone);
            }
        });

        return {
            emails: Array.from(emails),
            phones: Array.from(phones),
            addresses: addresses
        };
    });
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Basic tests for extractor modules
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
|
+
// Placeholder suites: one trivially passing test per extractor module.
const extractorSuites = [
    'Smart Data Extractors',
    'Multi-Element Extractors',
    'Content Type Extractors',
];
for (const suiteName of extractorSuites) {
    describe(suiteName, () => {
        it('should exist', () => {
            expect(true).toBe(true);
        });
    });
}
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
 * Batch Element Scraper - extract multiple similar elements (products, articles, etc.).
 *
 * For every element matching `containerSelector`, each entry of `fields` is
 * resolved with `querySelector` inside that container and read according to
 * the matched element's tag:
 *   - `IMG`              -> `{ src, alt, title }`
 *   - `A`                -> `{ text, href }`
 *   - `INPUT`/`TEXTAREA` -> the element's `value`
 *   - anything else      -> its trimmed text
 * A field with no matching element is `null`; containers where every field is
 * `null` are left out.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} containerSelector - CSS selector of the repeating containers.
 * @param {Object<string, string>} fields - Map of output field name to CSS
 *   selector (relative to the container). A missing map yields no results.
 * @returns {Promise<Array<object>>} One record per container with at least one field found.
 */
export async function batchScrapeElements(page, containerSelector, fields) {
    return await page.evaluate((container, fieldMap) => {
        // innerText only exists on HTML elements; SVG/MathML matches fall back to
        // textContent instead of throwing on `.trim()`.
        const textOf = (el) => (el.innerText ?? el.textContent ?? '').trim();
        const readField = (el) => {
            switch (el.tagName) {
                case 'IMG':
                    return {
                        src: el.src,
                        alt: el.alt,
                        title: el.getAttribute('title') || ''
                    };
                case 'A':
                    return { text: textOf(el), href: el.href };
                case 'INPUT':
                case 'TEXTAREA':
                    return el.value;
                default:
                    return textOf(el);
            }
        };
        const fieldEntries = Object.entries(fieldMap);
        const results = [];
        document.querySelectorAll(container).forEach((containerEl) => {
            const item = {};
            for (const [fieldName, selector] of fieldEntries) {
                const element = containerEl.querySelector(selector);
                item[fieldName] = element ? readField(element) : null;
            }
            // Only keep containers where at least one field was found.
            if (Object.values(item).some((value) => value !== null)) {
                results.push(item);
            }
        });
        return results;
    }, containerSelector, fields ?? {});
}
|
|
47
|
+
/**
 * Nested Data Extraction - extract hierarchical data, keeping parent-child relationships.
 *
 * For every element matching `parentSelector`, the parent's own data is read
 * and then every descendant matching `childSelector` is read as a child.
 * When a field map is empty (or nullish) the element's trimmed text is
 * returned as `{ content }`; otherwise only the mapped fields whose selector
 * matched are included.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} parentSelector - CSS selector of the parent elements.
 * @param {string} childSelector - CSS selector of children, relative to each parent.
 * @param {object} [options]
 * @param {Object<string, string>} [options.parentFields] - Field name to selector map for parents.
 * @param {Object<string, string>} [options.childFields] - Field name to selector map for children.
 * @param {number} [options.maxDepth] - Defaults to 3. Accepted and passed to the
 *   page callback, but the extraction logic in this function never reads it.
 * @returns {Promise<Array<{parent: object, children: object[]}>>} One entry per
 *   parent element. Children that produced no data are omitted.
 */
export async function extractNestedData(page, parentSelector, childSelector, options) {
    const opts = {
        parentFields: {},
        childFields: {},
        maxDepth: 3,
        ...options
    };
    return await page.evaluate((parentSel, childSel, config) => {
        // innerText only exists on HTML elements; fall back to textContent so
        // SVG/MathML matches do not throw on `.trim()`.
        const textOf = (el) => (el.innerText ?? el.textContent ?? '').trim();
        // Read either the mapped fields of `root` or, without a map, its text.
        const readData = (root, fieldMap) => {
            const entries = Object.entries(fieldMap ?? {});
            if (entries.length === 0) {
                return { content: textOf(root) };
            }
            const data = {};
            for (const [fieldName, selector] of entries) {
                const element = root.querySelector(selector);
                if (element) {
                    data[fieldName] = textOf(element);
                }
            }
            return data;
        };
        return Array.from(document.querySelectorAll(parentSel), (parentEl) => {
            const children = [];
            parentEl.querySelectorAll(childSel).forEach((childEl) => {
                const childData = readData(childEl, config.childFields);
                if (Object.keys(childData).length > 0) {
                    children.push(childData);
                }
            });
            return {
                parent: readData(parentEl, config.parentFields),
                children: children
            };
        });
    }, parentSelector, childSelector, opts);
}
|
|
104
|
+
/**
 * Attribute Harvester - collect attributes from every element matching a selector.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} selector - CSS selector of the elements to inspect.
 * @param {string[]} [attributes] - Attribute names to collect. When omitted or
 *   empty, every attribute present on the element is collected.
 * @returns {Promise<Array<object>>} One record per element: the collected
 *   attributes (requested attributes that are absent are left out) plus
 *   `_tagName` (lower-cased tag name) and `_textContent` (trimmed inner text,
 *   `''` when there is none).
 */
export async function harvestAttributes(page, selector, attributes) {
    return await page.evaluate((sel, attrs) => {
        const wantsSpecific = Boolean(attrs && attrs.length > 0);
        return Array.from(document.querySelectorAll(sel), (node) => {
            const record = {};
            if (wantsSpecific) {
                // Only the requested attributes, skipping those the element lacks.
                attrs.forEach((attrName) => {
                    const found = node.getAttribute(attrName);
                    if (found !== null) {
                        record[attrName] = found;
                    }
                });
            }
            else {
                // No explicit list: copy every attribute on the element.
                for (const { name, value } of Array.from(node.attributes)) {
                    record[name] = value;
                }
            }
            // Bookkeeping fields describing the element itself.
            record._tagName = node.tagName.toLowerCase();
            record._textContent = node.innerText?.trim() || '';
            return record;
        });
    }, selector, attributes || null);
}
|
|
136
|
+
/**
 * Deep Element Scraper - extract elements with their properties and, optionally,
 * their position, visibility and a fixed set of computed styles.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} selector - CSS selector of the elements to describe.
 * @param {object} [options]
 * @param {boolean} [options.includeStyles] - Defaults to false. Adds `styles`
 *   (color, backgroundColor, fontSize, fontFamily, fontWeight, textAlign,
 *   padding, margin, border).
 * @param {boolean} [options.includePosition] - Defaults to true. Adds `position`
 *   from the element's bounding client rect.
 * @param {boolean} [options.includeVisibility] - Defaults to true. Adds `visibility`
 *   (display, visibility, opacity, isVisible).
 * @returns {Promise<Array<object>>} One record per element with `tagName`, `id`
 *   (`null` when empty), `classes`, `textContent`, `attributes` and the optional sections.
 */
export async function deepScrapeElements(page, selector, options) {
    const settings = Object.assign({
        includeStyles: false,
        includePosition: true,
        includeVisibility: true,
    }, options);
    return await page.evaluate((sel, config) => {
        // Bounding box of the node as a plain object.
        const positionOf = (node) => {
            const { top, left, width, height, right, bottom } = node.getBoundingClientRect();
            return { top, left, width, height, right, bottom };
        };
        // Visible means: displayed, not hidden, and opacity above zero.
        const visibilityOf = (node) => {
            const { display, visibility, opacity } = window.getComputedStyle(node);
            const hidden = display === 'none' ||
                visibility === 'hidden' ||
                !(parseFloat(opacity) > 0);
            return { display, visibility, opacity, isVisible: !hidden };
        };
        // The subset of computed styles that is reported.
        const stylesOf = (node) => {
            const computed = window.getComputedStyle(node);
            return {
                color: computed.color,
                backgroundColor: computed.backgroundColor,
                fontSize: computed.fontSize,
                fontFamily: computed.fontFamily,
                fontWeight: computed.fontWeight,
                textAlign: computed.textAlign,
                padding: computed.padding,
                margin: computed.margin,
                border: computed.border,
            };
        };
        return Array.from(document.querySelectorAll(sel), (node) => {
            const info = {
                tagName: node.tagName.toLowerCase(),
                id: node.id || null,
                classes: Array.from(node.classList),
                textContent: node.innerText?.trim() || '',
                attributes: {},
            };
            for (const { name, value } of Array.from(node.attributes)) {
                info.attributes[name] = value;
            }
            if (config.includePosition) {
                info.position = positionOf(node);
            }
            if (config.includeVisibility) {
                info.visibility = visibilityOf(node);
            }
            if (config.includeStyles) {
                info.styles = stylesOf(node);
            }
            return info;
        });
    }, selector, settings);
}
|
|
205
|
+
/**
 * Smart Product Scraper - specialized scraper for e-commerce products.
 *
 * Within each container the first element matching each field selector is
 * read. Default selectors can be overridden through `customFields`.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} containerSelector - CSS selector of the product containers.
 * @param {Object<string, string>} [customFields] - Selector overrides for
 *   `title`, `price`, `image`, `link`, `rating` and `description`.
 * @returns {Promise<Array<object>>} Products with any of `title`, `price`,
 *   `priceNumeric`, `image` (`{ src, alt, srcset }`), `url`, `rating`,
 *   `ratingNumeric` and `description`. Containers with neither a title nor a
 *   URL are skipped.
 */
export async function scrapeProducts(page, containerSelector, customFields) {
    const defaultFields = {
        title: '[class*="title"], [class*="name"], h2, h3',
        price: '[class*="price"], [data-price]',
        image: 'img',
        link: 'a',
        rating: '[class*="rating"], [class*="stars"]',
        description: '[class*="desc"], p',
        ...customFields
    };
    return await page.evaluate((container, fields) => {
        // innerText only exists on HTML elements. Class-based selectors such as
        // [class*="stars"] can match SVG icons, so fall back to textContent
        // instead of throwing on `.trim()`.
        const textOf = (el) => (el.innerText ?? el.textContent ?? '').trim();
        // First number in the text, with optional thousands separators. It must
        // start on a digit (or be a bare ".99") so a stray comma is never taken
        // for the price.
        const PRICE_PATTERN = /\d[\d,]*(?:\.\d+)?|\.\d+/;
        const RATING_PATTERN = /(\d+\.?\d*)/;

        const products = [];
        document.querySelectorAll(container).forEach((containerEl) => {
            const product = {};

            const titleEl = containerEl.querySelector(fields.title);
            if (titleEl) {
                product.title = textOf(titleEl);
            }

            const priceEl = containerEl.querySelector(fields.price);
            if (priceEl) {
                const priceText = textOf(priceEl);
                product.price = priceText;
                const priceMatch = priceText.match(PRICE_PATTERN);
                if (priceMatch) {
                    product.priceNumeric = parseFloat(priceMatch[0].replace(/,/g, ''));
                }
            }

            const imageEl = containerEl.querySelector(fields.image);
            if (imageEl) {
                product.image = {
                    src: imageEl.src,
                    alt: imageEl.alt,
                    srcset: imageEl.srcset || null
                };
            }

            const linkEl = containerEl.querySelector(fields.link);
            if (linkEl) {
                product.url = linkEl.href;
            }

            const ratingEl = containerEl.querySelector(fields.rating);
            if (ratingEl) {
                const ratingText = textOf(ratingEl);
                product.rating = ratingText;
                const ratingMatch = ratingText.match(RATING_PATTERN);
                if (ratingMatch) {
                    product.ratingNumeric = parseFloat(ratingMatch[1]);
                }
            }

            const descEl = containerEl.querySelector(fields.description);
            if (descEl) {
                product.description = textOf(descEl);
            }

            // Only keep entries that can be identified by a title or a link.
            if (product.title || product.url) {
                products.push(product);
            }
        });
        return products;
    }, containerSelector, defaultFields);
}
|
|
277
|
+
/**
 * Smart Article Scraper - specialized scraper for articles/blog posts.
 *
 * Within each container the first element matching each field selector is
 * read: `image` becomes `{ src, alt }`, `link` becomes the element's `href`,
 * `date` prefers the `datetime` attribute over the text, and every other
 * field is the element's trimmed text.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn, ...args)`.
 * @param {string} containerSelector - CSS selector of the article containers.
 * @param {Object<string, string>} [customFields] - Extra fields or overrides
 *   for the defaults (`title`, `author`, `date`, `content`, `image`, `link`,
 *   `category`). Entries whose selector is empty or not a string are ignored.
 * @returns {Promise<Array<object>>} One record per container that has a title.
 */
export async function scrapeArticles(page, containerSelector, customFields) {
    const defaultFields = {
        title: 'h1, h2, h3, [class*="title"]',
        author: '[class*="author"], [rel="author"]',
        date: '[class*="date"], time',
        content: '[class*="content"], [class*="body"], article, p',
        image: 'img',
        link: 'a',
        category: '[class*="category"], [class*="tag"]',
        ...customFields
    };
    return await page.evaluate((container, fields) => {
        // innerText only exists on HTML elements. Class-based selectors can match
        // SVG nodes, so fall back to textContent instead of throwing on `.trim()`.
        const textOf = (el) => (el.innerText ?? el.textContent ?? '').trim();
        const readField = (fieldName, el) => {
            switch (fieldName) {
                case 'image':
                    return { src: el.src, alt: el.alt };
                case 'link':
                    return el.href;
                case 'date':
                    return el.getAttribute('datetime') || textOf(el);
                default:
                    return textOf(el);
            }
        };
        // An empty selector would make querySelector throw, so drop such entries.
        const fieldEntries = Object.entries(fields).filter(([, selector]) => typeof selector === 'string' && selector.trim() !== '');

        const articles = [];
        document.querySelectorAll(container).forEach((containerEl) => {
            const article = {};
            for (const [fieldName, selector] of fieldEntries) {
                const element = containerEl.querySelector(selector);
                if (element) {
                    article[fieldName] = readField(fieldName, element);
                }
            }
            // Only keep entries that have a title.
            if (article.title) {
                articles.push(article);
            }
        });
        return articles;
    }, containerSelector, defaultFields);
}
|