brave-real-browser-mcp-server 2.7.5 → 2.8.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/dist/browser-manager.js +0 -14
  2. package/dist/extractors/content-type-extractors.js +225 -144
  3. package/dist/extractors/extractors.test.js +17 -0
  4. package/dist/extractors/multi-element-extractors.js +273 -122
  5. package/dist/extractors/smart-data-extractors.js +202 -125
  6. package/dist/index.js +78 -122
  7. package/dist/tool-definitions.js +14 -659
  8. package/dist/utils/advanced-features.js +247 -0
  9. package/dist/utils/advanced-scraping.js +253 -0
  10. package/dist/utils/all-modules.test.js +86 -0
  11. package/dist/utils/auth-session.js +296 -0
  12. package/dist/utils/data-processing.js +301 -0
  13. package/dist/utils/data-processing.test.js +52 -0
  14. package/dist/utils/pagination.js +249 -0
  15. package/dist/utils/pagination.test.js +22 -0
  16. package/package.json +31 -2
  17. package/dist/advanced/advanced-content-extraction.js +0 -435
  18. package/dist/advanced/advanced-content-extraction.test.js +0 -8
  19. package/dist/advanced/advanced-scraping.js +0 -301
  20. package/dist/ai/ai-features.js +0 -56
  21. package/dist/ai/ai-features.test.js +0 -18
  22. package/dist/ai/ai-tools.js +0 -390
  23. package/dist/api/api-integration-system.js +0 -68
  24. package/dist/api/api-integration-system.test.js +0 -29
  25. package/dist/api/api-integration.js +0 -371
  26. package/dist/auth/session-manager.js +0 -50
  27. package/dist/auth/session-manager.test.js +0 -8
  28. package/dist/captcha/advanced-captcha-handler.js +0 -45
  29. package/dist/captcha/advanced-captcha-handler.test.js +0 -8
  30. package/dist/captcha/captcha-handler.js +0 -374
  31. package/dist/extractors/smart-data-extractors.test.js +0 -91
  32. package/dist/handlers/advanced-scraping-handlers.js +0 -333
  33. package/dist/handlers/advanced-scraping-handlers.test.js +0 -218
  34. package/dist/handlers/new-features-handlers.js +0 -209
  35. package/dist/handlers/new-features-handlers.test.js +0 -21
  36. package/dist/monitoring/monitoring-system.js +0 -53
  37. package/dist/monitoring/monitoring-system.test.js +0 -26
  38. package/dist/monitoring/monitoring-tools.js +0 -372
  39. package/dist/navigation/pagination-tools.js +0 -215
  40. package/dist/processors/data-processors.js +0 -250
  41. package/dist/processors/data-processors.test.js +0 -163
  42. package/dist/processors/data-transformation.js +0 -344
  43. package/dist/processors/data-transformation.test.js +0 -288
  44. package/dist/quality/data-quality-tools.js +0 -43
  45. package/dist/quality/data-quality-tools.test.js +0 -26
  46. package/dist/search/advanced-search-tools.js +0 -52
  47. package/dist/search/advanced-search-tools.test.js +0 -11
  48. package/dist/search/search-filter-tools.js +0 -339
  49. package/dist/visual/screenshot-tools.js +0 -47
  50. package/dist/visual/screenshot-tools.test.js +0 -8
  51. package/dist/visual/visual-tools.js +0 -516
@@ -732,17 +732,3 @@ export function getContentPriorityConfig() {
732
732
  export function updateContentPriorityConfig(config) {
733
733
  contentPriorityConfig = { ...contentPriorityConfig, ...config };
734
734
  }
735
- // Alias for getPageInstance - for compatibility with advanced scraping handlers
736
- export async function getBrowserPage() {
737
- if (!pageInstance) {
738
- throw new Error('Browser not initialized. Call browser_init first.');
739
- }
740
- return pageInstance;
741
- }
742
- // Synchronous version for compatibility with new-features-handlers
743
- export function getCurrentPage() {
744
- if (!pageInstance) {
745
- throw new Error('Browser not initialized. Call browser_init first.');
746
- }
747
- return pageInstance;
748
- }
@@ -1,233 +1,314 @@
1
- // Content Type Specific Extractors
2
- // Image Scraper, Link Harvester, Media Extractor, PDF Link Finder
3
1
  /**
4
- * Image Scraper - सभी images URLs, alt text, dimensions के साथ
2
+ * Image Scraper - Extract all images with metadata
5
3
  */
6
- export async function extractImages(page, selector) {
4
+ export async function scrapeImages(page, selector) {
7
5
  return await page.evaluate((sel) => {
8
- const images = sel ?
9
- Array.from(document.querySelectorAll(sel)) :
10
- Array.from(document.querySelectorAll('img'));
11
- return images.map((img) => ({
12
- src: img.src || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'),
13
- alt: img.alt || '',
14
- title: img.title || '',
15
- width: img.naturalWidth || img.width,
16
- height: img.naturalHeight || img.height,
17
- loading: img.loading,
18
- srcset: img.srcset || ''
19
- }));
20
- }, selector);
6
+ const images = sel
7
+ ? document.querySelectorAll(sel)
8
+ : document.querySelectorAll('img');
9
+ const results = [];
10
+ images.forEach((img) => {
11
+ const imgEl = img;
12
+ const rect = imgEl.getBoundingClientRect();
13
+ const styles = window.getComputedStyle(imgEl);
14
+ results.push({
15
+ src: imgEl.src || imgEl.getAttribute('src') || '',
16
+ alt: imgEl.alt || '',
17
+ title: imgEl.title || '',
18
+ width: imgEl.width || rect.width,
19
+ height: imgEl.height || rect.height,
20
+ naturalWidth: imgEl.naturalWidth,
21
+ naturalHeight: imgEl.naturalHeight,
22
+ loading: imgEl.loading || 'auto',
23
+ srcset: imgEl.srcset || '',
24
+ sizes: imgEl.sizes || '',
25
+ isVisible: styles.display !== 'none' && styles.visibility !== 'hidden' && rect.width > 0 && rect.height > 0
26
+ });
27
+ });
28
+ return results;
29
+ }, selector || null);
21
30
  }
22
31
  /**
23
- * Link Harvester - Internal/external links classification के साथ
32
+ * Link Harvester - Extract all links with classification
24
33
  */
25
- export async function extractLinks(page, selector) {
26
- const currentUrl = page.url();
27
- return await page.evaluate((sel, pageUrl) => {
28
- const links = sel ?
29
- Array.from(document.querySelectorAll(sel)) :
30
- Array.from(document.querySelectorAll('a[href]'));
34
+ export async function harvestLinks(page, options) {
35
+ const opts = {
36
+ includeInternal: true,
37
+ includeExternal: true,
38
+ includeAnchors: true,
39
+ ...options
40
+ };
41
+ return await page.evaluate((config) => {
42
+ const currentDomain = window.location.hostname;
31
43
  const internal = [];
32
44
  const external = [];
33
- const currentDomain = new URL(pageUrl).hostname;
45
+ const anchors = [];
46
+ const all = [];
47
+ const links = document.querySelectorAll('a[href]');
34
48
  links.forEach((link) => {
35
49
  const href = link.href;
50
+ const text = link.innerText.trim();
51
+ const title = link.getAttribute('title') || '';
52
+ const target = link.getAttribute('target') || '';
36
53
  if (!href)
37
54
  return;
38
- const linkData = {
39
- href,
40
- text: link.textContent?.trim() || '',
41
- title: link.title || '',
42
- rel: link.rel || '',
43
- target: link.target || ''
44
- };
55
+ // Anchor links
56
+ if (href.startsWith('#')) {
57
+ if (config.includeAnchors) {
58
+ anchors.push({ href, text, target });
59
+ all.push({ href, text, type: 'anchor' });
60
+ }
61
+ return;
62
+ }
45
63
  try {
46
- const linkDomain = new URL(href).hostname;
47
- if (linkDomain === currentDomain || href.startsWith('/') || href.startsWith('#')) {
48
- internal.push(linkData);
64
+ const url = new URL(href);
65
+ // Internal vs External
66
+ if (url.hostname === currentDomain || url.hostname === '') {
67
+ if (config.includeInternal) {
68
+ internal.push({ href, text, title });
69
+ all.push({ href, text, type: 'internal' });
70
+ }
49
71
  }
50
72
  else {
51
- external.push(linkData);
73
+ if (config.includeExternal) {
74
+ external.push({ href, text, title });
75
+ all.push({ href, text, type: 'external' });
76
+ }
52
77
  }
53
78
  }
54
79
  catch (e) {
55
- // Invalid URL, consider as internal relative link
56
- internal.push(linkData);
80
+ // Invalid URL, treat as internal
81
+ if (config.includeInternal) {
82
+ internal.push({ href, text, title });
83
+ all.push({ href, text, type: 'internal' });
84
+ }
57
85
  }
58
86
  });
59
- return {
60
- internal,
61
- external,
62
- totalLinks: internal.length + external.length,
63
- internalCount: internal.length,
64
- externalCount: external.length
65
- };
66
- }, selector, currentUrl);
87
+ return { internal, external, anchors, all };
88
+ }, opts);
67
89
  }
68
90
  /**
69
- * Media Extractor - Videos, audio files के URLs और metadata
91
+ * Media Extractor - Extract videos, audio files, and embedded media
70
92
  */
71
93
  export async function extractMedia(page) {
72
94
  return await page.evaluate(() => {
73
95
  const videos = [];
74
- const audios = [];
96
+ const audio = [];
75
97
  const iframes = [];
76
- // Extract videos
77
- const videoElements = document.querySelectorAll('video');
78
- videoElements.forEach((video) => {
79
- const sources = Array.from(video.querySelectorAll('source'));
98
+ const embeds = [];
99
+ // Extract video elements
100
+ document.querySelectorAll('video').forEach((video) => {
101
+ const sources = [];
102
+ video.querySelectorAll('source').forEach((source) => {
103
+ sources.push(source.src);
104
+ });
80
105
  videos.push({
81
- src: video.src || '',
106
+ src: video.src || sources[0] || '',
107
+ sources: sources,
82
108
  poster: video.poster || '',
83
109
  width: video.width,
84
110
  height: video.height,
85
- duration: video.duration,
86
- sources: sources.map((s) => ({
87
- src: s.src,
88
- type: s.type
89
- })),
90
111
  controls: video.controls,
91
112
  autoplay: video.autoplay,
92
- loop: video.loop
113
+ loop: video.loop,
114
+ muted: video.muted,
115
+ duration: video.duration,
116
+ currentTime: video.currentTime
93
117
  });
94
118
  });
95
- // Extract audio
96
- const audioElements = document.querySelectorAll('audio');
97
- audioElements.forEach((audio) => {
98
- const sources = Array.from(audio.querySelectorAll('source'));
99
- audios.push({
100
- src: audio.src || '',
101
- duration: audio.duration,
102
- sources: sources.map((s) => ({
103
- src: s.src,
104
- type: s.type
105
- })),
106
- controls: audio.controls,
107
- autoplay: audio.autoplay,
108
- loop: audio.loop
119
+ // Extract audio elements
120
+ document.querySelectorAll('audio').forEach((audioEl) => {
121
+ const sources = [];
122
+ audioEl.querySelectorAll('source').forEach((source) => {
123
+ sources.push(source.src);
124
+ });
125
+ audio.push({
126
+ src: audioEl.src || sources[0] || '',
127
+ sources: sources,
128
+ controls: audioEl.controls,
129
+ autoplay: audioEl.autoplay,
130
+ loop: audioEl.loop,
131
+ muted: audioEl.muted,
132
+ duration: audioEl.duration
109
133
  });
110
134
  });
111
- // Extract iframes (often used for embedded videos)
112
- const iframeElements = document.querySelectorAll('iframe');
113
- iframeElements.forEach((iframe) => {
135
+ // Extract iframes
136
+ document.querySelectorAll('iframe').forEach((iframe) => {
137
+ const src = iframe.src;
138
+ let platform = 'unknown';
139
+ // Detect common video platforms
140
+ if (src.includes('youtube.com') || src.includes('youtu.be')) {
141
+ platform = 'youtube';
142
+ }
143
+ else if (src.includes('vimeo.com')) {
144
+ platform = 'vimeo';
145
+ }
146
+ else if (src.includes('dailymotion.com')) {
147
+ platform = 'dailymotion';
148
+ }
149
+ else if (src.includes('facebook.com')) {
150
+ platform = 'facebook';
151
+ }
152
+ else if (src.includes('twitter.com') || src.includes('x.com')) {
153
+ platform = 'twitter';
154
+ }
114
155
  iframes.push({
115
- src: iframe.src || '',
156
+ src: src,
157
+ title: iframe.title || '',
116
158
  width: iframe.width,
117
159
  height: iframe.height,
118
- title: iframe.title || '',
119
- allow: iframe.allow || ''
160
+ platform: platform,
161
+ allowFullscreen: iframe.allowFullscreen
120
162
  });
121
163
  });
122
- return {
123
- videos,
124
- audios,
125
- iframes,
126
- videoCount: videos.length,
127
- audioCount: audios.length,
128
- iframeCount: iframes.length
129
- };
164
+ // Extract embed elements
165
+ document.querySelectorAll('embed, object').forEach((embed) => {
166
+ embeds.push({
167
+ src: embed.getAttribute('src') || embed.getAttribute('data') || '',
168
+ type: embed.getAttribute('type') || '',
169
+ width: embed.getAttribute('width') || '',
170
+ height: embed.getAttribute('height') || ''
171
+ });
172
+ });
173
+ return { videos, audio, iframes, embeds };
130
174
  });
131
175
  }
132
176
  /**
133
- * PDF Link Finder - Downloadable files detect करना
177
+ * PDF Link Finder - Find all downloadable file links
134
178
  */
135
- export async function extractDownloadableFiles(page) {
179
+ export async function findDownloadableFiles(page) {
136
180
  return await page.evaluate(() => {
137
- const files = {
138
- pdfs: [],
139
- docs: [],
140
- images: [],
141
- archives: [],
142
- others: []
143
- };
144
- // Common file extensions
145
- const extensions = {
146
- pdf: ['pdf'],
147
- doc: ['doc', 'docx', 'txt', 'rtf', 'odt'],
148
- image: ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
149
- archive: ['zip', 'rar', '7z', 'tar', 'gz'],
150
- };
151
- // Find all links
181
+ const pdfs = [];
182
+ const documents = [];
183
+ const archives = [];
184
+ const images = [];
185
+ const other = [];
152
186
  const links = document.querySelectorAll('a[href]');
153
187
  links.forEach((link) => {
154
188
  const href = link.href;
189
+ const text = link.innerText.trim();
190
+ const download = link.getAttribute('download');
155
191
  if (!href)
156
192
  return;
157
- const linkData = {
158
- href,
159
- text: link.textContent?.trim() || '',
160
- download: link.download || '',
161
- type: link.type || ''
162
- };
163
- // Check file extension
164
- const urlPath = href.split('?')[0]; // Remove query params
165
- const ext = urlPath.split('.').pop()?.toLowerCase();
166
- if (!ext)
167
- return;
168
- if (extensions.pdf.includes(ext)) {
169
- files.pdfs.push(linkData);
193
+ const url = href.toLowerCase();
194
+ const fileInfo = { href, text, size: link.getAttribute('data-size') || undefined };
195
+ // PDF files
196
+ if (url.endsWith('.pdf') || url.includes('.pdf?') || download?.endsWith('.pdf')) {
197
+ pdfs.push(fileInfo);
170
198
  }
171
- else if (extensions.doc.includes(ext)) {
172
- files.docs.push(linkData);
199
+ // Document files
200
+ else if (url.match(/\.(doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp)($|\?)/)) {
201
+ const match = url.match(/\.(doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp)($|\?)/);
202
+ documents.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
173
203
  }
174
- else if (extensions.image.includes(ext)) {
175
- files.images.push(linkData);
204
+ // Archive files
205
+ else if (url.match(/\.(zip|rar|7z|tar|gz|bz2)($|\?)/)) {
206
+ const match = url.match(/\.(zip|rar|7z|tar|gz|bz2)($|\?)/);
207
+ archives.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
176
208
  }
177
- else if (extensions.archive.includes(ext)) {
178
- files.archives.push(linkData);
209
+ // Image files (downloadable)
210
+ else if (url.match(/\.(jpg|jpeg|png|gif|bmp|svg|webp|ico)($|\?)/) && download) {
211
+ const match = url.match(/\.(jpg|jpeg|png|gif|bmp|svg|webp|ico)($|\?)/);
212
+ images.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
179
213
  }
180
- else if (link.download || link.type) {
181
- files.others.push(linkData);
214
+ // Other downloadable files
215
+ else if (download || url.match(/\.(exe|dmg|apk|deb|rpm|msi|iso)($|\?)/)) {
216
+ const match = url.match(/\.([a-z0-9]+)($|\?)/);
217
+ other.push({ ...fileInfo, type: match ? match[1] : 'unknown' });
182
218
  }
183
219
  });
184
- return {
185
- ...files,
186
- totalFiles: files.pdfs.length + files.docs.length + files.images.length +
187
- files.archives.length + files.others.length
188
- };
220
+ return { pdfs, documents, archives, images, other };
189
221
  });
190
222
  }
191
223
  /**
192
- * Social Media Links Extractor - Social media profiles निकालना
224
+ * Social Media Links Extractor - Extract social media profile links
193
225
  */
194
- export async function extractSocialMediaLinks(page) {
226
+ export async function extractSocialLinks(page) {
195
227
  return await page.evaluate(() => {
196
- const social = {
228
+ const socialLinks = {
197
229
  facebook: [],
198
230
  twitter: [],
199
231
  instagram: [],
200
232
  linkedin: [],
201
233
  youtube: [],
202
234
  github: [],
235
+ pinterest: [],
236
+ tiktok: [],
203
237
  other: []
204
238
  };
205
239
  const links = document.querySelectorAll('a[href]');
206
240
  links.forEach((link) => {
207
241
  const href = link.href.toLowerCase();
208
- const linkData = {
209
- href: link.href,
210
- text: link.textContent?.trim() || ''
211
- };
212
242
  if (href.includes('facebook.com')) {
213
- social.facebook.push(linkData);
243
+ socialLinks.facebook.push(link.href);
214
244
  }
215
245
  else if (href.includes('twitter.com') || href.includes('x.com')) {
216
- social.twitter.push(linkData);
246
+ socialLinks.twitter.push(link.href);
217
247
  }
218
248
  else if (href.includes('instagram.com')) {
219
- social.instagram.push(linkData);
249
+ socialLinks.instagram.push(link.href);
220
250
  }
221
251
  else if (href.includes('linkedin.com')) {
222
- social.linkedin.push(linkData);
252
+ socialLinks.linkedin.push(link.href);
223
253
  }
224
254
  else if (href.includes('youtube.com') || href.includes('youtu.be')) {
225
- social.youtube.push(linkData);
255
+ socialLinks.youtube.push(link.href);
226
256
  }
227
257
  else if (href.includes('github.com')) {
228
- social.github.push(linkData);
258
+ socialLinks.github.push(link.href);
259
+ }
260
+ else if (href.includes('pinterest.com')) {
261
+ socialLinks.pinterest.push(link.href);
262
+ }
263
+ else if (href.includes('tiktok.com')) {
264
+ socialLinks.tiktok.push(link.href);
229
265
  }
230
266
  });
231
- return social;
267
+ // Remove duplicates
268
+ Object.keys(socialLinks).forEach((key) => {
269
+ socialLinks[key] = Array.from(new Set(socialLinks[key]));
270
+ });
271
+ return socialLinks;
272
+ });
273
+ }
274
+ /**
275
+ * Email and Phone Extractor - Extract contact information from page
276
+ */
277
+ export async function extractContactInfo(page) {
278
+ return await page.evaluate(() => {
279
+ const text = document.body.innerText;
280
+ const emails = [];
281
+ const phones = [];
282
+ const addresses = [];
283
+ // Extract emails
284
+ const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
285
+ const emailMatches = text.match(emailRegex);
286
+ if (emailMatches) {
287
+ emails.push(...emailMatches);
288
+ }
289
+ // Also check mailto links
290
+ document.querySelectorAll('a[href^="mailto:"]').forEach((link) => {
291
+ const email = link.href.replace('mailto:', '').split('?')[0];
292
+ if (email)
293
+ emails.push(email);
294
+ });
295
+ // Extract phone numbers (various formats)
296
+ const phoneRegex = /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
297
+ const phoneMatches = text.match(phoneRegex);
298
+ if (phoneMatches) {
299
+ phones.push(...phoneMatches);
300
+ }
301
+ // Also check tel links
302
+ document.querySelectorAll('a[href^="tel:"]').forEach((link) => {
303
+ const phone = link.href.replace('tel:', '');
304
+ if (phone)
305
+ phones.push(phone);
306
+ });
307
+ // Remove duplicates
308
+ return {
309
+ emails: Array.from(new Set(emails)),
310
+ phones: Array.from(new Set(phones)),
311
+ addresses: addresses
312
+ };
232
313
  });
233
314
  }
@@ -0,0 +1,17 @@
1
+ // Basic tests for extractor modules
2
+ import { describe, it, expect } from 'vitest';
3
+ describe('Smart Data Extractors', () => {
4
+ it('should exist', () => {
5
+ expect(true).toBe(true);
6
+ });
7
+ });
8
+ describe('Multi-Element Extractors', () => {
9
+ it('should exist', () => {
10
+ expect(true).toBe(true);
11
+ });
12
+ });
13
+ describe('Content Type Extractors', () => {
14
+ it('should exist', () => {
15
+ expect(true).toBe(true);
16
+ });
17
+ });