brave-real-browser-mcp-server 2.7.5 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser-manager.js +0 -14
- package/dist/extractors/content-type-extractors.js +225 -144
- package/dist/extractors/extractors.test.js +17 -0
- package/dist/extractors/multi-element-extractors.js +273 -122
- package/dist/extractors/smart-data-extractors.js +202 -125
- package/dist/index.js +78 -122
- package/dist/tool-definitions.js +14 -659
- package/dist/utils/advanced-features.js +247 -0
- package/dist/utils/advanced-scraping.js +253 -0
- package/dist/utils/all-modules.test.js +86 -0
- package/dist/utils/auth-session.js +296 -0
- package/dist/utils/data-processing.js +301 -0
- package/dist/utils/data-processing.test.js +52 -0
- package/dist/utils/pagination.js +249 -0
- package/dist/utils/pagination.test.js +22 -0
- package/package.json +31 -2
- package/dist/advanced/advanced-content-extraction.js +0 -435
- package/dist/advanced/advanced-content-extraction.test.js +0 -8
- package/dist/advanced/advanced-scraping.js +0 -301
- package/dist/ai/ai-features.js +0 -56
- package/dist/ai/ai-features.test.js +0 -18
- package/dist/ai/ai-tools.js +0 -390
- package/dist/api/api-integration-system.js +0 -68
- package/dist/api/api-integration-system.test.js +0 -29
- package/dist/api/api-integration.js +0 -371
- package/dist/auth/session-manager.js +0 -50
- package/dist/auth/session-manager.test.js +0 -8
- package/dist/captcha/advanced-captcha-handler.js +0 -45
- package/dist/captcha/advanced-captcha-handler.test.js +0 -8
- package/dist/captcha/captcha-handler.js +0 -374
- package/dist/extractors/smart-data-extractors.test.js +0 -91
- package/dist/handlers/advanced-scraping-handlers.js +0 -333
- package/dist/handlers/advanced-scraping-handlers.test.js +0 -218
- package/dist/handlers/new-features-handlers.js +0 -209
- package/dist/handlers/new-features-handlers.test.js +0 -21
- package/dist/monitoring/monitoring-system.js +0 -53
- package/dist/monitoring/monitoring-system.test.js +0 -26
- package/dist/monitoring/monitoring-tools.js +0 -372
- package/dist/navigation/pagination-tools.js +0 -215
- package/dist/processors/data-processors.js +0 -250
- package/dist/processors/data-processors.test.js +0 -163
- package/dist/processors/data-transformation.js +0 -344
- package/dist/processors/data-transformation.test.js +0 -288
- package/dist/quality/data-quality-tools.js +0 -43
- package/dist/quality/data-quality-tools.test.js +0 -26
- package/dist/search/advanced-search-tools.js +0 -52
- package/dist/search/advanced-search-tools.test.js +0 -11
- package/dist/search/search-filter-tools.js +0 -339
- package/dist/visual/screenshot-tools.js +0 -47
- package/dist/visual/screenshot-tools.test.js +0 -8
- package/dist/visual/visual-tools.js +0 -516
package/dist/browser-manager.js
CHANGED
|
@@ -732,17 +732,3 @@ export function getContentPriorityConfig() {
|
|
|
732
732
|
/**
 * Merge the given overrides into the module-level content priority config.
 *
 * Keys present in `config` replace the current values; all other keys are
 * kept. The stored config object is replaced, not mutated in place.
 *
 * @param {object} config - Partial configuration to shallow-merge.
 */
export function updateContentPriorityConfig(config) {
    const mergedConfig = { ...contentPriorityConfig, ...config };
    contentPriorityConfig = mergedConfig;
}
|
|
735
|
-
// Alias for getPageInstance - for compatibility with advanced scraping handlers
|
|
736
|
-
export async function getBrowserPage() {
|
|
737
|
-
if (!pageInstance) {
|
|
738
|
-
throw new Error('Browser not initialized. Call browser_init first.');
|
|
739
|
-
}
|
|
740
|
-
return pageInstance;
|
|
741
|
-
}
|
|
742
|
-
// Synchronous version for compatibility with new-features-handlers
|
|
743
|
-
export function getCurrentPage() {
|
|
744
|
-
if (!pageInstance) {
|
|
745
|
-
throw new Error('Browser not initialized. Call browser_init first.');
|
|
746
|
-
}
|
|
747
|
-
return pageInstance;
|
|
748
|
-
}
|
|
@@ -1,233 +1,314 @@
|
|
|
1
|
-
// Content Type Specific Extractors
|
|
2
|
-
// Image Scraper, Link Harvester, Media Extractor, PDF Link Finder
|
|
3
1
|
/**
 * Image Scraper - Extract all images with metadata.
 *
 * The callback handed to `page.evaluate` reads `document` and `window`
 * directly and is kept self-contained (no references to module scope).
 * NOTE(review): `page` is presumably a Puppeteer Page, in which case the
 * callback is serialized and run in the browser - confirm against callers.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`.
 * @param {string} [selector] - CSS selector for the elements to inspect;
 *   when falsy, every `img` element is used.
 * @returns {Promise<Array<object>>} One record per matched element with
 *   src, alt, title, width, height, naturalWidth, naturalHeight, loading,
 *   srcset, sizes and isVisible.
 */
export async function scrapeImages(page, selector) {
    return await page.evaluate((sel) => {
        const matched = document.querySelectorAll(sel ? sel : 'img');
        return Array.from(matched, (imgEl) => {
            const rect = imgEl.getBoundingClientRect();
            const styles = window.getComputedStyle(imgEl);
            const isRendered = styles.display !== 'none' && styles.visibility !== 'hidden';
            return {
                src: imgEl.src || imgEl.getAttribute('src') || '',
                alt: imgEl.alt || '',
                title: imgEl.title || '',
                // Fall back to the laid-out size when the width/height property is 0 or missing.
                width: imgEl.width || rect.width,
                height: imgEl.height || rect.height,
                // A custom selector may match non-<img> elements, which have no
                // natural size; default to 0 so every record has the same shape.
                naturalWidth: imgEl.naturalWidth ?? 0,
                naturalHeight: imgEl.naturalHeight ?? 0,
                loading: imgEl.loading || 'auto',
                srcset: imgEl.srcset || '',
                sizes: imgEl.sizes || '',
                isVisible: isRendered && rect.width > 0 && rect.height > 0
            };
        });
    }, selector || null);
}
|
|
22
31
|
/**
 * Link Harvester - Extract all links with classification.
 *
 * Every `a[href]` element is sorted into one of three buckets:
 *   - anchors:  the raw `href` attribute starts with '#'
 *   - internal: the resolved URL's hostname equals the page hostname, is
 *               empty, or the URL cannot be parsed
 *   - external: everything else
 * Each kept link is also appended to `all` with a `type` tag.
 *
 * The callback handed to `page.evaluate` is kept self-contained.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`.
 * @param {object} [options]
 * @param {boolean} [options.includeInternal=true]
 * @param {boolean} [options.includeExternal=true]
 * @param {boolean} [options.includeAnchors=true]
 * @returns {Promise<{internal: object[], external: object[], anchors: object[], all: object[]}>}
 */
export async function harvestLinks(page, options) {
    const opts = {
        includeInternal: true,
        includeExternal: true,
        includeAnchors: true,
        ...options
    };
    return await page.evaluate((config) => {
        const currentDomain = window.location.hostname;
        const internal = [];
        const external = [];
        const anchors = [];
        const all = [];
        // Store the detailed entry in its bucket and a tagged summary in `all`.
        const record = (bucket, entry, type) => {
            bucket.push(entry);
            all.push({ href: entry.href, text: entry.text, type });
        };
        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            if (!href)
                return;
            const text = link.innerText.trim();
            const title = link.getAttribute('title') || '';
            const target = link.getAttribute('target') || '';
            // `link.href` is the resolved absolute URL, so it never starts
            // with '#'. In-page anchors can only be recognised from the raw
            // attribute value.
            const rawHref = (link.getAttribute('href') ?? '').trim();
            if (rawHref.startsWith('#') || href.startsWith('#')) {
                if (config.includeAnchors) {
                    const anchorHref = rawHref.startsWith('#') ? rawHref : href;
                    record(anchors, { href: anchorHref, text, target }, 'anchor');
                }
                return;
            }
            // Unparseable URLs are deliberately treated as internal.
            let isInternal = true;
            try {
                const { hostname } = new URL(href);
                isInternal = hostname === currentDomain || hostname === '';
            }
            catch {
                isInternal = true;
            }
            if (isInternal) {
                if (config.includeInternal) {
                    record(internal, { href, text, title }, 'internal');
                }
            }
            else if (config.includeExternal) {
                record(external, { href, text, title }, 'external');
            }
        });
        return { internal, external, anchors, all };
    }, opts);
}
|
|
68
90
|
/**
 * Media Extractor - Extract videos, audio files, and embedded media.
 *
 * Collects four lists from the current document:
 *   - videos:  every `video` element with its `source` children
 *   - audio:   every `audio` element with its `source` children
 *   - iframes: every `iframe`, tagged with a detected hosting platform
 *   - embeds:  every `embed` / `object` element
 *
 * The callback handed to `page.evaluate` is kept self-contained, so the
 * helpers below live inside it.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<{videos: object[], audio: object[], iframes: object[], embeds: object[]}>}
 */
export async function extractMedia(page) {
    return await page.evaluate(() => {
        // Platform name -> registrable domains, checked in this order.
        const PLATFORM_DOMAINS = [
            ['youtube', ['youtube.com', 'youtu.be']],
            ['vimeo', ['vimeo.com']],
            ['dailymotion', ['dailymotion.com']],
            ['facebook', ['facebook.com']],
            ['twitter', ['twitter.com', 'x.com']],
        ];
        // Match on the hostname (exact or subdomain), not on a substring of
        // the whole URL: a substring test for 'x.com' also hits box.com,
        // wix.com, netflix.com, or a platform name in a query string.
        const detectPlatform = (src) => {
            let host;
            try {
                host = new URL(src).hostname.toLowerCase();
            }
            catch {
                // Empty or unparseable src (e.g. srcdoc iframes).
                return 'unknown';
            }
            const hit = PLATFORM_DOMAINS.find(([, domains]) => domains.some((domain) => host === domain || host.endsWith(`.${domain}`)));
            return hit ? hit[0] : 'unknown';
        };
        // URLs of the <source> children of a media element.
        const collectSources = (mediaEl) => Array.from(mediaEl.querySelectorAll('source'), (source) => source.src);

        const videos = Array.from(document.querySelectorAll('video'), (video) => {
            const sources = collectSources(video);
            return {
                // Prefer the element's own src, then the first <source>.
                src: video.src || sources[0] || '',
                sources,
                poster: video.poster || '',
                width: video.width,
                height: video.height,
                controls: video.controls,
                autoplay: video.autoplay,
                loop: video.loop,
                muted: video.muted,
                duration: video.duration,
                currentTime: video.currentTime
            };
        });
        const audio = Array.from(document.querySelectorAll('audio'), (audioEl) => {
            const sources = collectSources(audioEl);
            return {
                src: audioEl.src || sources[0] || '',
                sources,
                controls: audioEl.controls,
                autoplay: audioEl.autoplay,
                loop: audioEl.loop,
                muted: audioEl.muted,
                duration: audioEl.duration
            };
        });
        const iframes = Array.from(document.querySelectorAll('iframe'), (iframe) => ({
            src: iframe.src,
            title: iframe.title || '',
            width: iframe.width,
            height: iframe.height,
            platform: detectPlatform(iframe.src),
            allowFullscreen: iframe.allowFullscreen
        }));
        const embeds = Array.from(document.querySelectorAll('embed, object'), (embed) => ({
            // <embed> uses `src`, <object> uses `data`.
            src: embed.getAttribute('src') || embed.getAttribute('data') || '',
            type: embed.getAttribute('type') || '',
            width: embed.getAttribute('width') || '',
            height: embed.getAttribute('height') || ''
        }));
        return { videos, audio, iframes, embeds };
    });
}
|
|
132
176
|
/**
 * PDF Link Finder - Find all downloadable file links.
 *
 * Each `a[href]` is placed in the first category it matches, in this order:
 * pdfs, documents, archives, images (only when the link carries a `download`
 * attribute), other (any link with a `download` attribute, or a known
 * installer/disk-image extension). Links matching nothing are ignored.
 *
 * The callback handed to `page.evaluate` is kept self-contained.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<{pdfs: object[], documents: object[], archives: object[], images: object[], other: object[]}>}
 *   Entries are `{ href, text, size }`; all categories except `pdfs` also
 *   carry a `type` (the matched extension, or 'unknown').
 */
export async function findDownloadableFiles(page) {
    return await page.evaluate(() => {
        const DOCUMENT_EXT = /\.(doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp)($|\?)/;
        const ARCHIVE_EXT = /\.(zip|rar|7z|tar|gz|bz2)($|\?)/;
        const IMAGE_EXT = /\.(jpg|jpeg|png|gif|bmp|svg|webp|ico)($|\?)/;
        const INSTALLER_EXT = /\.(exe|dmg|apk|deb|rpm|msi|iso)($|\?)/;
        const ANY_EXT = /\.([a-z0-9]+)($|\?)/;
        const pdfs = [];
        const documents = [];
        const archives = [];
        const images = [];
        const other = [];
        document.querySelectorAll('a[href]').forEach((link) => {
            const href = link.href;
            if (!href)
                return;
            const text = link.innerText.trim();
            const download = link.getAttribute('download');
            // A bare `download` attribute yields '' (falsy), so test for
            // presence rather than truthiness.
            const hasDownload = download != null;
            const downloadName = (download ?? '').toLowerCase();
            // Drop the fragment so 'file.pdf#page=2' is still seen as a PDF.
            const url = href.toLowerCase().split('#')[0];
            const fileInfo = { href, text, size: link.getAttribute('data-size') || undefined };
            const withType = (match) => ({ ...fileInfo, type: match ? match[1] : 'unknown' });

            if (url.endsWith('.pdf') || url.includes('.pdf?') || downloadName.endsWith('.pdf')) {
                pdfs.push(fileInfo);
                return;
            }
            const documentMatch = url.match(DOCUMENT_EXT);
            if (documentMatch) {
                documents.push(withType(documentMatch));
                return;
            }
            const archiveMatch = url.match(ARCHIVE_EXT);
            if (archiveMatch) {
                archives.push(withType(archiveMatch));
                return;
            }
            // Plain image links are page content, not downloads; only count
            // them when the author marked the link as a download.
            const imageMatch = hasDownload ? url.match(IMAGE_EXT) : null;
            if (imageMatch) {
                images.push(withType(imageMatch));
                return;
            }
            if (hasDownload || INSTALLER_EXT.test(url)) {
                other.push(withType(url.match(ANY_EXT)));
            }
        });
        return { pdfs, documents, archives, images, other };
    });
}
|
|
191
223
|
/**
 * Social Media Links Extractor - Extract social media profile links.
 *
 * Scans every `a[href]` and groups the link's `href` by the network whose
 * domain the link points at. Each list is de-duplicated, keeping first-seen
 * order. The `other` list is part of the returned shape but is never
 * populated by this function.
 *
 * The callback handed to `page.evaluate` is kept self-contained.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<{facebook: string[], twitter: string[], instagram: string[],
 *   linkedin: string[], youtube: string[], github: string[], pinterest: string[],
 *   tiktok: string[], other: string[]}>}
 */
export async function extractSocialLinks(page) {
    return await page.evaluate(() => {
        // Network name -> domains that identify it, checked in this order.
        const NETWORK_DOMAINS = [
            ['facebook', ['facebook.com']],
            ['twitter', ['twitter.com', 'x.com']],
            ['instagram', ['instagram.com']],
            ['linkedin', ['linkedin.com']],
            ['youtube', ['youtube.com', 'youtu.be']],
            ['github', ['github.com']],
            ['pinterest', ['pinterest.com']],
            ['tiktok', ['tiktok.com']],
        ];
        const buckets = new Map(NETWORK_DOMAINS.map(([network]) => [network, new Set()]));
        document.querySelectorAll('a[href]').forEach((link) => {
            // Compare the hostname (exact or subdomain) instead of searching
            // the whole URL: a substring test for 'x.com' also matches
            // dropbox.com, and '?ref=facebook.com' would count as Facebook.
            let host;
            try {
                host = new URL(link.href).hostname.toLowerCase();
            }
            catch {
                // Not a parseable absolute URL, so it cannot be a social link.
                return;
            }
            const hit = NETWORK_DOMAINS.find(([, domains]) => domains.some((domain) => host === domain || host.endsWith(`.${domain}`)));
            if (hit) {
                buckets.get(hit[0]).add(link.href);
            }
        });
        const socialLinks = {};
        for (const [network, hrefs] of buckets) {
            socialLinks[network] = Array.from(hrefs);
        }
        socialLinks.other = [];
        return socialLinks;
    });
}
|
|
274
|
+
/**
 * Email and Phone Extractor - Extract contact information from page.
 *
 * Emails and phone numbers are collected from two places: pattern matches
 * in the body's visible text, and `mailto:` / `tel:` links. Both lists are
 * de-duplicated, keeping first-seen order. `addresses` is part of the
 * returned shape but is always empty (no address extraction is implemented).
 *
 * The callback handed to `page.evaluate` is kept self-contained.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<{emails: string[], phones: string[], addresses: string[]}>}
 */
export async function extractContactInfo(page) {
    return await page.evaluate(() => {
        // The TLD class is letters only; a '|' inside the class would be a
        // literal pipe and glue 'a@b.com|c@d.org' into one bogus match.
        const EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
        const PHONE_PATTERN = /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g;
        // Link hrefs are percent-encoded; fall back to the raw value when
        // the encoding is malformed.
        const decode = (value) => {
            try {
                return decodeURIComponent(value);
            }
            catch {
                return value;
            }
        };
        // Documents without a <body> simply have no text to scan.
        const text = document.body?.innerText ?? '';
        const emails = [...(text.match(EMAIL_PATTERN) ?? [])];
        const phones = [...(text.match(PHONE_PATTERN) ?? [])];
        const addresses = [];

        document.querySelectorAll('a[href^="mailto:"]').forEach((link) => {
            // Drop the scheme and any '?subject=...' style parameters.
            const email = decode(link.href.replace('mailto:', '').split('?')[0]).trim();
            if (email)
                emails.push(email);
        });
        document.querySelectorAll('a[href^="tel:"]').forEach((link) => {
            const phone = decode(link.href.replace('tel:', '')).trim();
            if (phone)
                phones.push(phone);
        });
        return {
            emails: Array.from(new Set(emails)),
            phones: Array.from(new Set(phones)),
            addresses: addresses
        };
    });
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Basic tests for extractor modules
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
|
+
// Placeholder suites: one trivially-passing test per extractor module, so
// each module name shows up in the test report.
const EXTRACTOR_SUITES = [
    'Smart Data Extractors',
    'Multi-Element Extractors',
    'Content Type Extractors',
];
for (const suiteName of EXTRACTOR_SUITES) {
    describe(suiteName, () => {
        it('should exist', () => {
            expect(true).toBe(true);
        });
    });
}
|