brave-real-browser-mcp-server 2.7.6 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
/**
 * Image Scraper - gather every image element together with its metadata.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} [selector] - CSS selector for the elements to inspect.
 *     When omitted (or otherwise falsy) every `img` element is inspected.
 * @returns {Promise<Array<object>>} One record per matched element holding its
 *     source, text attributes, dimensions, loading hints and a visibility flag.
 */
export async function scrapeImages(page, selector) {
    // The collector only touches `document`/`window` and receives its single
    // input as an argument, so it carries no references to this outer scope.
    const collectImageRecords = (cssSelector) => {
        const nodes = document.querySelectorAll(cssSelector ? cssSelector : 'img');
        const records = [];
        for (const node of Array.from(nodes)) {
            const box = node.getBoundingClientRect();
            const computed = window.getComputedStyle(node);
            const shownByCss = computed.display !== 'none' && computed.visibility !== 'hidden';
            const hasArea = box.width > 0 && box.height > 0;
            records.push({
                // Prefer the reflected property, then the raw attribute.
                src: node.src || node.getAttribute('src') || '',
                alt: node.alt || '',
                title: node.title || '',
                // Fall back to the rendered box when the property is 0/absent.
                width: node.width || box.width,
                height: node.height || box.height,
                naturalWidth: node.naturalWidth,
                naturalHeight: node.naturalHeight,
                loading: node.loading || 'auto',
                srcset: node.srcset || '',
                sizes: node.sizes || '',
                isVisible: shownByCss && hasArea
            });
        }
        return records;
    };
    const images = await page.evaluate(collectImageRecords, selector || null);
    return images;
}
31
/**
 * Link Harvester - Extract all links with classification
 *
 * Every `a[href]` element is placed in exactly one bucket:
 *   - anchors:  the authored `href` attribute starts with `#`
 *   - internal: the resolved hostname equals the current page's hostname,
 *               is empty, or the URL cannot be parsed
 *   - external: any other hostname
 * Each accepted link is also appended to `all` with its `type`.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {object} [options]
 * @param {boolean} [options.includeInternal=true] - Keep internal links.
 * @param {boolean} [options.includeExternal=true] - Keep external links.
 * @param {boolean} [options.includeAnchors=true] - Keep in-page anchor links.
 * @returns {Promise<{internal: object[], external: object[], anchors: object[], all: object[]}>}
 */
export async function harvestLinks(page, options) {
    const opts = {
        includeInternal: true,
        includeExternal: true,
        includeAnchors: true,
        ...options
    };
    return await page.evaluate((config) => {
        const currentDomain = window.location.hostname;
        const internal = [];
        const external = [];
        const anchors = [];
        const all = [];
        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            // SVG <a> elements expose `href` as an object, not a string;
            // skip them instead of letting one abort the whole harvest.
            if (typeof href !== 'string' || !href) {
                return;
            }
            const text = (link.innerText || '').trim();
            const title = link.getAttribute('title') || '';
            const target = link.getAttribute('target') || '';
            // `link.href` is the resolved absolute URL and therefore never
            // starts with '#'. The authored attribute value is what tells
            // an in-page anchor apart from a regular link.
            const rawHref = (link.getAttribute('href') || '').trim();
            if (rawHref.startsWith('#')) {
                if (config.includeAnchors) {
                    anchors.push({ href, text, target });
                    all.push({ href, text, type: 'anchor' });
                }
                return;
            }
            // Unparseable URLs are treated as internal.
            let isInternal = true;
            try {
                const { hostname } = new URL(href);
                isInternal = hostname === currentDomain || hostname === '';
            }
            catch (e) {
                // Invalid URL: deliberately fall through as internal.
            }
            if (isInternal) {
                if (config.includeInternal) {
                    internal.push({ href, text, title });
                    all.push({ href, text, type: 'internal' });
                }
            }
            else if (config.includeExternal) {
                external.push({ href, text, title });
                all.push({ href, text, type: 'external' });
            }
        });
        return { internal, external, anchors, all };
    }, opts);
}
90
/**
 * Media Extractor - Extract videos, audio files, and embedded media
 *
 * @param {object} page - Page handle; only `page.evaluate(fn)` is used.
 * @returns {Promise<{videos: object[], audio: object[], iframes: object[], embeds: object[]}>}
 *     `videos`/`audio` list element properties plus their `<source>` URLs,
 *     `iframes` carry a detected `platform` name (or 'unknown'), and
 *     `embeds` cover both `<embed>` and `<object>` elements.
 */
export async function extractMedia(page) {
    return await page.evaluate(() => {
        // Helpers live inside the callback because it only has access to
        // what it defines itself plus the DOM globals.
        const PLATFORM_DOMAINS = [
            ['youtube', ['youtube.com', 'youtu.be', 'youtube-nocookie.com']],
            ['vimeo', ['vimeo.com']],
            ['dailymotion', ['dailymotion.com']],
            ['facebook', ['facebook.com']],
            ['twitter', ['twitter.com', 'x.com']]
        ];
        // Match on the parsed hostname (exact domain or a subdomain of it).
        // A plain substring test on the URL would label e.g. dropbox.com or
        // netflix.com as 'twitter' because they contain "x.com".
        const detectPlatform = (src) => {
            let host;
            try {
                host = new URL(src).hostname.toLowerCase();
            }
            catch (e) {
                // Missing or unparseable src: no platform can be inferred.
                return 'unknown';
            }
            for (const [platform, domains] of PLATFORM_DOMAINS) {
                if (domains.some((domain) => host === domain || host.endsWith(`.${domain}`))) {
                    return platform;
                }
            }
            return 'unknown';
        };
        // URLs of the <source> children of a media element, in DOM order.
        const sourceUrls = (mediaEl) => Array.from(mediaEl.querySelectorAll('source'), (source) => source.src);

        const videos = Array.from(document.querySelectorAll('video'), (video) => {
            const sources = sourceUrls(video);
            return {
                src: video.src || sources[0] || '',
                sources,
                poster: video.poster || '',
                width: video.width,
                height: video.height,
                controls: video.controls,
                autoplay: video.autoplay,
                loop: video.loop,
                muted: video.muted,
                duration: video.duration,
                currentTime: video.currentTime
            };
        });
        const audio = Array.from(document.querySelectorAll('audio'), (audioEl) => {
            const sources = sourceUrls(audioEl);
            return {
                src: audioEl.src || sources[0] || '',
                sources,
                controls: audioEl.controls,
                autoplay: audioEl.autoplay,
                loop: audioEl.loop,
                muted: audioEl.muted,
                duration: audioEl.duration
            };
        });
        const iframes = Array.from(document.querySelectorAll('iframe'), (iframe) => ({
            src: iframe.src,
            title: iframe.title || '',
            width: iframe.width,
            height: iframe.height,
            platform: detectPlatform(iframe.src),
            allowFullscreen: iframe.allowFullscreen
        }));
        // <embed> uses `src`, <object> uses `data` for its resource URL.
        const embeds = Array.from(document.querySelectorAll('embed, object'), (embed) => ({
            src: embed.getAttribute('src') || embed.getAttribute('data') || '',
            type: embed.getAttribute('type') || '',
            width: embed.getAttribute('width') || '',
            height: embed.getAttribute('height') || ''
        }));
        return { videos, audio, iframes, embeds };
    });
}
176
/**
 * PDF Link Finder - Find all downloadable file links
 *
 * Classifies every `a[href]` link by file extension into PDFs, office
 * documents, archives, images (only when the link carries a `download`
 * attribute) and other downloads (installer-like extensions or any link with
 * a `download` attribute). Links matching none of these are ignored.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn)` is used.
 * @returns {Promise<{pdfs: object[], documents: object[], archives: object[], images: object[], other: object[]}>}
 *     Every entry has `href`, `text` and `size` (the `data-size` attribute or
 *     undefined); all buckets except `pdfs` also carry the matched `type`.
 */
export async function findDownloadableFiles(page) {
    return await page.evaluate(() => {
        const pdfs = [];
        const documents = [];
        const archives = [];
        const images = [];
        const other = [];
        // Checked in order; the first matching category wins.
        const categories = [
            { bucket: documents, pattern: /\.(doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp)($|\?)/, needsDownloadAttr: false },
            { bucket: archives, pattern: /\.(zip|rar|7z|tar|gz|bz2)($|\?)/, needsDownloadAttr: false },
            { bucket: images, pattern: /\.(jpg|jpeg|png|gif|bmp|svg|webp|ico)($|\?)/, needsDownloadAttr: true }
        ];
        const installerPattern = /\.(exe|dmg|apk|deb|rpm|msi|iso)($|\?)/;
        const anyExtensionPattern = /\.([a-z0-9]+)($|\?)/;

        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            // SVG <a> elements expose `href` as an object; skip them.
            if (typeof href !== 'string' || !href) {
                return;
            }
            const text = (link.innerText || '').trim();
            // A bare `download` attribute has the value '' (falsy), so its
            // presence must be tested with hasAttribute, not by value.
            const hasDownloadAttr = link.hasAttribute('download');
            const downloadName = (link.getAttribute('download') || '').toLowerCase();
            // Drop the fragment so "file.pdf#page=2" still ends in ".pdf".
            const url = href.toLowerCase().split('#')[0];
            const fileInfo = { href, text, size: link.getAttribute('data-size') || undefined };

            // PDF files
            if (url.endsWith('.pdf') || url.includes('.pdf?') || downloadName.endsWith('.pdf')) {
                pdfs.push(fileInfo);
                return;
            }
            // Documents, archives and (downloadable) images
            for (const { bucket, pattern, needsDownloadAttr } of categories) {
                const match = url.match(pattern);
                if (match && (!needsDownloadAttr || hasDownloadAttr)) {
                    bucket.push({ ...fileInfo, type: match[1] });
                    return;
                }
            }
            // Other downloadable files
            if (hasDownloadAttr || installerPattern.test(url)) {
                const match = url.match(anyExtensionPattern);
                other.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
            }
        });
        return { pdfs, documents, archives, images, other };
    });
}
223
/**
 * Social Media Links Extractor - Extract social media profile links
 *
 * @param {object} page - Page handle; only `page.evaluate(fn)` is used.
 * @returns {Promise<object>} Map from network name (facebook, twitter,
 *     instagram, linkedin, youtube, github, pinterest, tiktok, other) to a
 *     de-duplicated array of link URLs. `other` is part of the result shape
 *     but nothing is ever classified into it.
 */
export async function extractSocialLinks(page) {
    return await page.evaluate(() => {
        const NETWORK_DOMAINS = {
            facebook: ['facebook.com'],
            twitter: ['twitter.com', 'x.com'],
            instagram: ['instagram.com'],
            linkedin: ['linkedin.com'],
            youtube: ['youtube.com', 'youtu.be'],
            github: ['github.com'],
            pinterest: ['pinterest.com'],
            tiktok: ['tiktok.com']
        };
        const socialLinks = {
            facebook: [],
            twitter: [],
            instagram: [],
            linkedin: [],
            youtube: [],
            github: [],
            pinterest: [],
            tiktok: [],
            other: []
        };
        // Classify by parsed hostname (exact domain or subdomain). Substring
        // matching on the full URL would file dropbox.com / netflix.com under
        // twitter ("x.com") and react to domains inside query strings.
        const networkFor = (href) => {
            let host;
            try {
                host = new URL(href).hostname.toLowerCase();
            }
            catch (e) {
                // Unparseable URL: cannot belong to any known network.
                return null;
            }
            for (const [network, domains] of Object.entries(NETWORK_DOMAINS)) {
                if (domains.some((domain) => host === domain || host.endsWith(`.${domain}`))) {
                    return network;
                }
            }
            return null;
        };
        document.querySelectorAll('a[href]').forEach((link) => {
            // SVG <a> elements expose `href` as an object; skip them.
            if (typeof link.href !== 'string') {
                return;
            }
            const network = networkFor(link.href);
            if (network) {
                socialLinks[network].push(link.href);
            }
        });
        // Remove duplicates
        Object.keys(socialLinks).forEach((key) => {
            socialLinks[key] = Array.from(new Set(socialLinks[key]));
        });
        return socialLinks;
    });
}
274
/**
 * Email and Phone Extractor - Extract contact information from page
 *
 * Emails and phone numbers are collected from the visible body text with
 * regular expressions and from `mailto:` / `tel:` links, then de-duplicated.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn)` is used.
 * @returns {Promise<{emails: string[], phones: string[], addresses: string[]}>}
 *     `addresses` is part of the result shape but is always empty; no address
 *     extraction is implemented.
 */
export async function extractContactInfo(page) {
    return await page.evaluate(() => {
        // Percent-decode a link value, keeping it as-is when it is malformed.
        const safeDecode = (value) => {
            try {
                return decodeURIComponent(value);
            }
            catch (e) {
                return value;
            }
        };
        // A document without a body yields no text instead of throwing.
        const text = (document.body && document.body.innerText) || '';
        const emails = [];
        const phones = [];
        const addresses = [];
        // Extract emails. The TLD class is [A-Za-z]; a '|' inside a character
        // class is a literal pipe, not alternation, and must not be accepted.
        const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
        const emailMatches = text.match(emailRegex);
        if (emailMatches) {
            emails.push(...emailMatches);
        }
        // Also check mailto links; one link may list several recipients
        // separated by commas, followed by an optional ?query part.
        document.querySelectorAll('a[href^="mailto:"]').forEach((link) => {
            const recipients = link.href.replace(/^mailto:/i, '').split('?')[0];
            recipients.split(',').forEach((recipient) => {
                const email = safeDecode(recipient).trim();
                if (email) {
                    emails.push(email);
                }
            });
        });
        // Extract phone numbers (various formats)
        const phoneRegex = /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
        const phoneMatches = text.match(phoneRegex);
        if (phoneMatches) {
            phones.push(...phoneMatches);
        }
        // Also check tel links
        document.querySelectorAll('a[href^="tel:"]').forEach((link) => {
            const phone = safeDecode(link.href.replace(/^tel:/i, '')).trim();
            if (phone) {
                phones.push(phone);
            }
        });
        // Remove duplicates
        return {
            emails: Array.from(new Set(emails)),
            phones: Array.from(new Set(phones)),
            addresses: addresses
        };
    });
}
@@ -0,0 +1,17 @@
1
+ // Basic tests for extractor modules
2
+ import { describe, it, expect } from 'vitest';
3
// Placeholder suites: one per extractor module, each holding a single
// always-passing smoke test until real coverage is written.
for (const suiteName of [
    'Smart Data Extractors',
    'Multi-Element Extractors',
    'Content Type Extractors',
]) {
    describe(suiteName, () => {
        it('should exist', () => {
            expect(true).toBe(true);
        });
    });
}
@@ -0,0 +1,325 @@
1
/**
 * Batch Element Scraper - pull the same set of fields out of every repeated
 * container on the page (product cards, article teasers, ...).
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} containerSelector - CSS selector matching each container.
 * @param {Object<string, string>} fields - Field name -> CSS selector,
 *     resolved relative to each container.
 * @returns {Promise<Array<object>>} One object per container in which at
 *     least one field was found; fields without a match are `null`.
 */
export async function batchScrapeElements(page, containerSelector, fields) {
    const scrapeBatch = (rootSelector, selectorsByField) => {
        // Turn a matched element into the value stored for its field; the
        // shape depends on the element's tag.
        const readValue = (node) => {
            if (!node) {
                return null;
            }
            switch (node.tagName) {
                case 'IMG':
                    return {
                        src: node.src,
                        alt: node.alt,
                        title: node.getAttribute('title') || ''
                    };
                case 'A':
                    return {
                        text: node.innerText.trim(),
                        href: node.href
                    };
                case 'INPUT':
                case 'TEXTAREA':
                    return node.value;
                default:
                    return node.innerText.trim();
            }
        };
        const rows = [];
        for (const root of Array.from(document.querySelectorAll(rootSelector))) {
            const row = {};
            for (const [name, css] of Object.entries(selectorsByField)) {
                row[name] = readValue(root.querySelector(css));
            }
            // Containers where nothing at all matched are dropped.
            if (Object.values(row).some((value) => value !== null)) {
                rows.push(row);
            }
        }
        return rows;
    };
    const scraped = await page.evaluate(scrapeBatch, containerSelector, fields);
    return scraped;
}
47
/**
 * Nested Data Extraction - read parent elements together with the child
 * elements found inside each of them, keeping the parent/child grouping.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} parentSelector - CSS selector for the parent elements.
 * @param {string} childSelector - CSS selector for children, resolved inside
 *     each parent.
 * @param {object} [options]
 * @param {Object<string, string>} [options.parentFields={}] - Field name ->
 *     selector inside the parent; when empty the parent's trimmed text is
 *     stored under `content`.
 * @param {Object<string, string>} [options.childFields={}] - Same, per child.
 * @param {number} [options.maxDepth=3] - Forwarded with the options but not
 *     read anywhere in this function.
 * @returns {Promise<Array<{parent: object, children: object[]}>>}
 */
export async function extractNestedData(page, parentSelector, childSelector, options) {
    const settings = Object.assign({ parentFields: {}, childFields: {}, maxDepth: 3 }, options);
    const collectTree = (parentSel, childSel, config) => {
        // With no field map: { content: <trimmed text> }. Otherwise one
        // entry per field whose selector matched inside `root`.
        const readFields = (root, fieldSelectors) => {
            const names = Object.keys(fieldSelectors);
            if (names.length === 0) {
                return { content: root.innerText.trim() };
            }
            const values = {};
            for (const name of names) {
                const hit = root.querySelector(fieldSelectors[name]);
                if (hit) {
                    values[name] = hit.innerText.trim();
                }
            }
            return values;
        };
        const tree = [];
        for (const parentNode of Array.from(document.querySelectorAll(parentSel))) {
            const parent = readFields(parentNode, config.parentFields);
            const children = [];
            for (const childNode of Array.from(parentNode.querySelectorAll(childSel))) {
                const child = readFields(childNode, config.childFields);
                // Children that produced no data at all are left out.
                if (Object.keys(child).length > 0) {
                    children.push(child);
                }
            }
            tree.push({ parent, children });
        }
        return tree;
    };
    const nested = await page.evaluate(collectTree, parentSelector, childSelector, settings);
    return nested;
}
104
/**
 * Attribute Harvester - read attributes from every element matching a selector.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} selector - CSS selector for the elements to inspect.
 * @param {string[]} [attributes] - Attribute names to read. When omitted or
 *     empty, every attribute present on the element is read.
 * @returns {Promise<Array<object>>} One object per element mapping attribute
 *     name to value, plus `_tagName` (lower-cased) and `_textContent`
 *     (trimmed `innerText`, or '' when unavailable).
 */
export async function harvestAttributes(page, selector, attributes) {
    const collectAttributes = (sel, attrs) => {
        const requested = attrs && attrs.length > 0 ? attrs : null;
        return Array.from(document.querySelectorAll(sel)).map((node) => {
            const bag = {};
            if (requested) {
                // Only the requested names, and only when actually present.
                for (const name of requested) {
                    const value = node.getAttribute(name);
                    if (value !== null) {
                        bag[name] = value;
                    }
                }
            }
            else {
                for (const attr of Array.from(node.attributes)) {
                    bag[attr.name] = attr.value;
                }
            }
            bag._tagName = node.tagName.toLowerCase();
            bag._textContent = node.innerText?.trim() || '';
            return bag;
        });
    };
    const harvested = await page.evaluate(collectAttributes, selector, attributes || null);
    return harvested;
}
136
/**
 * Deep Element Scraper - describe matching elements in detail: tag, id,
 * classes, text, attributes and, optionally, geometry, visibility and a
 * fixed set of computed style properties.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} selector - CSS selector for the elements to describe.
 * @param {object} [options]
 * @param {boolean} [options.includeStyles=false] - Add a `styles` object.
 * @param {boolean} [options.includePosition=true] - Add the bounding rect as `position`.
 * @param {boolean} [options.includeVisibility=true] - Add a `visibility`
 *     object with display/visibility/opacity and a derived `isVisible` flag.
 * @returns {Promise<Array<object>>} One description per matched element.
 */
export async function deepScrapeElements(page, selector, options) {
    const settings = Object.assign({ includeStyles: false, includePosition: true, includeVisibility: true }, options);
    const inspectElements = (cssSelector, flags) => {
        const describeNode = (node) => {
            const attributes = {};
            for (const { name, value } of Array.from(node.attributes)) {
                attributes[name] = value;
            }
            const record = {
                tagName: node.tagName.toLowerCase(),
                id: node.id || null,
                classes: Array.from(node.classList),
                textContent: node.innerText?.trim() || '',
                attributes
            };
            if (flags.includePosition) {
                const { top, left, width, height, right, bottom } = node.getBoundingClientRect();
                record.position = { top, left, width, height, right, bottom };
            }
            if (flags.includeVisibility) {
                const { display, visibility, opacity } = window.getComputedStyle(node);
                // Visible means: rendered, not hidden, and not fully transparent.
                const isVisible = display !== 'none' && visibility !== 'hidden' && parseFloat(opacity) > 0;
                record.visibility = { display, visibility, opacity, isVisible };
            }
            if (flags.includeStyles) {
                const computed = window.getComputedStyle(node);
                record.styles = {
                    color: computed.color,
                    backgroundColor: computed.backgroundColor,
                    fontSize: computed.fontSize,
                    fontFamily: computed.fontFamily,
                    fontWeight: computed.fontWeight,
                    textAlign: computed.textAlign,
                    padding: computed.padding,
                    margin: computed.margin,
                    border: computed.border
                };
            }
            return record;
        };
        return Array.from(document.querySelectorAll(cssSelector)).map((node) => describeNode(node));
    };
    const described = await page.evaluate(inspectElements, selector, settings);
    return described;
}
205
/**
 * Smart Product Scraper - Specialized scraper for e-commerce products
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} containerSelector - CSS selector matching each product container.
 * @param {Object<string, string>} [customFields] - Overrides for the default
 *     selectors (keys: title, price, image, link, rating, description), each
 *     resolved relative to a container.
 * @returns {Promise<Array<object>>} One object per container that yielded a
 *     title or a url. Possible keys: title, price, priceNumeric, image
 *     ({src, alt, srcset}), url, rating, ratingNumeric, description.
 */
export async function scrapeProducts(page, containerSelector, customFields) {
    const defaultFields = {
        title: '[class*="title"], [class*="name"], h2, h3',
        price: '[class*="price"], [data-price]',
        image: 'img',
        link: 'a',
        rating: '[class*="rating"], [class*="stars"]',
        description: '[class*="desc"], p',
        ...customFields
    };
    return await page.evaluate((container, fields) => {
        // Trimmed visible text; '' for elements that expose no innerText.
        const textOf = (el) => (el.innerText || '').trim();
        const products = [];
        document.querySelectorAll(container).forEach((containerEl) => {
            const product = {};
            // Extract title
            const titleEl = containerEl.querySelector(fields.title);
            if (titleEl) {
                product.title = textOf(titleEl);
            }
            // Extract price
            const priceEl = containerEl.querySelector(fields.price);
            if (priceEl) {
                const priceText = textOf(priceEl);
                product.price = priceText;
                // The number must start with a digit; otherwise a stray comma
                // in the label ("Now, only $1,299.99") would be taken as the
                // match and parse to NaN.
                const priceMatch = priceText.match(/\d[\d,]*(?:\.\d+)?/);
                if (priceMatch) {
                    const numeric = parseFloat(priceMatch[0].replace(/,/g, ''));
                    if (Number.isFinite(numeric)) {
                        product.priceNumeric = numeric;
                    }
                }
            }
            // Extract image
            const imageEl = containerEl.querySelector(fields.image);
            if (imageEl) {
                product.image = {
                    src: imageEl.src,
                    alt: imageEl.alt,
                    srcset: imageEl.srcset || null
                };
            }
            // Extract link
            const linkEl = containerEl.querySelector(fields.link);
            if (linkEl) {
                product.url = linkEl.href;
            }
            // Extract rating
            const ratingEl = containerEl.querySelector(fields.rating);
            if (ratingEl) {
                const ratingText = textOf(ratingEl);
                product.rating = ratingText;
                // First number in the text, e.g. "4.5" from "4.5 out of 5".
                const ratingMatch = ratingText.match(/(\d+\.?\d*)/);
                if (ratingMatch) {
                    product.ratingNumeric = parseFloat(ratingMatch[1]);
                }
            }
            // Extract description
            const descEl = containerEl.querySelector(fields.description);
            if (descEl) {
                product.description = textOf(descEl);
            }
            // Only add if at least title or link exists
            if (product.title || product.url) {
                products.push(product);
            }
        });
        return products;
    }, containerSelector, defaultFields);
}
277
/**
 * Smart Article Scraper - pull the usual article/blog-post fields out of
 * every container matching a selector.
 *
 * @param {object} page - Page handle; only `page.evaluate(fn, ...args)` is used.
 * @param {string} containerSelector - CSS selector matching each article container.
 * @param {Object<string, string>} [customFields] - Extra or overriding
 *     field name -> selector pairs, merged over the defaults (title, author,
 *     date, content, image, link, category).
 * @returns {Promise<Array<object>>} One object per container that has a
 *     non-empty title. `image` is {src, alt}, `link` is the element's href,
 *     `date` prefers the `datetime` attribute, everything else is trimmed text.
 */
export async function scrapeArticles(page, containerSelector, customFields) {
    const selectors = Object.assign({
        title: 'h1, h2, h3, [class*="title"]',
        author: '[class*="author"], [rel="author"]',
        date: '[class*="date"], time',
        content: '[class*="content"], [class*="body"], article, p',
        image: 'img',
        link: 'a',
        category: '[class*="category"], [class*="tag"]'
    }, customFields);
    const collectArticles = (rootSelector, fieldSelectors) => {
        // How a matched element is converted depends on the field's name.
        const readField = (name, node) => {
            switch (name) {
                case 'image':
                    return { src: node.src, alt: node.alt };
                case 'link':
                    return node.href;
                case 'date':
                    return node.getAttribute('datetime') || node.innerText.trim();
                default:
                    return node.innerText.trim();
            }
        };
        const found = [];
        for (const root of Array.from(document.querySelectorAll(rootSelector))) {
            const entry = {};
            for (const name of Object.keys(fieldSelectors)) {
                const node = root.querySelector(fieldSelectors[name]);
                if (node) {
                    entry[name] = readField(name, node);
                }
            }
            // Containers without a title are not reported.
            if (entry.title) {
                found.push(entry);
            }
        }
        return found;
    };
    const articles = await page.evaluate(collectArticles, containerSelector, selectors);
    return articles;
}