mailpop 1.0.8 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/extractor.js +22 -5
- package/dist/index.js +2 -1
- package/dist/utils/validators.js +9 -1
- package/package.json +1 -1
package/dist/extractor.js
CHANGED
|
@@ -2,9 +2,9 @@ import { load } from 'cheerio';
|
|
|
2
2
|
import { normalizeEmail } from './utils/normalize.js';
|
|
3
3
|
import { isValidEmail } from './utils/validators.js';
|
|
4
4
|
// Standard email regex for searching inside strings
|
|
5
|
-
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
|
|
5
|
+
// Finds email-shaped substrings anywhere in a string. The final label is
// limited to 2-10 letters and must be followed by a word boundary.
// The `g` flag makes exec() stateful through lastIndex; the scanning code
// resets lastIndex to 0 before each pass.
// NOTE(review): TLDs longer than 10 letters (e.g. ".photography") can no
// longer be matched as the final label - confirm this is intended.
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10}\b/g;
|
|
6
6
|
// Obfuscated email regex matching "name [at] domain [dot] com", "name(at)domain(dot)com", "name AT domain DOT com"
|
|
7
|
-
const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,})/gi;
|
|
7
|
+
// Capture groups: 1 = local part, 2 = domain, 3 = TLD (2-10 letters, followed
// by a word boundary). The "at"/"dot" separators are accepted bracketed,
// parenthesised, or as bare words surrounded by whitespace.
// Flags: `g` (stateful lastIndex) and `i` (so "AT"/"DOT" also match).
const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,10})\b/gi;
|
|
8
8
|
// Base64 candidate regex for extracting potential base64 encoded strings
|
|
9
9
|
// Matches runs of 12-80 base64-alphabet characters with up to two optional
// "=" padding characters, bounded by word boundaries on both sides.
// NOTE(review): because "=" is not a word character, the trailing \b only
// holds after the padding when a word character follows it; in the usual case
// (padding followed by whitespace, punctuation or end of input) the match
// backtracks and the "=" characters are left out of the capture - confirm the
// decoder tolerates unpadded input.
const BASE64_CANDIDATE_REGEX = /\b[a-zA-Z0-9+/]{12,80}={0,2}\b/g;
|
|
10
10
|
/**
|
|
@@ -133,6 +133,23 @@ export function classifyEmailType(email) {
|
|
|
133
133
|
}
|
|
134
134
|
return 'personal';
|
|
135
135
|
}
|
|
136
|
+
/**
 * Flattens the inner HTML of every element in a cheerio selection to plain
 * text, putting a space where each tag used to be so that text belonging to
 * neighbouring elements is never glued together.
 *
 * @param elem - Cheerio selection to read; may be missing or empty.
 * @param $ - Cheerio root function used to wrap each matched node.
 * @returns The tag-free text after it has been passed through
 *          decodeUnicodeEntities, or '' when there is nothing to read.
 */
export function extractTextWithSpaces(elem, $) {
    const hasNodes = Boolean(elem) && elem.length !== 0;
    if (!hasNodes) {
        return '';
    }

    // Gather the inner HTML of each matched node; a null/empty result from
    // html() contributes an empty fragment.
    const fragments = [];
    elem.each((_index, node) => {
        const inner = $(node).html();
        fragments.push(inner ? inner : '');
    });

    // Whole <script> and <style> blocks are dropped first so their contents
    // never reach the text; every remaining tag is then replaced by a space.
    const scriptBlock = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
    const styleBlock = /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi;
    const anyTag = /<[^>]+>/g;

    const withoutScripts = fragments.join(' ').replace(scriptBlock, ' ');
    const withoutStyles = withoutScripts.replace(styleBlock, ' ');
    return decodeUnicodeEntities(withoutStyles.replace(anyTag, ' '));
}
|
|
136
153
|
/**
|
|
137
154
|
* Extracts all unique emails from a given HTML string and URL.
|
|
138
155
|
*/
|
|
@@ -219,7 +236,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
|
|
|
219
236
|
// 3. Header section extraction
|
|
220
237
|
const headerElem = $('header, [id*="header"], [class*="header"]');
|
|
221
238
|
if (headerElem.length > 0) {
|
|
222
|
-
const headerText =
|
|
239
|
+
const headerText = extractTextWithSpaces(headerElem, $);
|
|
223
240
|
let match;
|
|
224
241
|
EMAIL_REGEX.lastIndex = 0;
|
|
225
242
|
while ((match = EMAIL_REGEX.exec(headerText)) !== null) {
|
|
@@ -229,7 +246,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
|
|
|
229
246
|
// 4. Footer section extraction
|
|
230
247
|
const footerElem = $('footer, [id*="footer"], [class*="footer"]');
|
|
231
248
|
if (footerElem.length > 0) {
|
|
232
|
-
const footerText =
|
|
249
|
+
const footerText = extractTextWithSpaces(footerElem, $);
|
|
233
250
|
let match;
|
|
234
251
|
EMAIL_REGEX.lastIndex = 0;
|
|
235
252
|
while ((match = EMAIL_REGEX.exec(footerText)) !== null) {
|
|
@@ -273,7 +290,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
|
|
|
273
290
|
}
|
|
274
291
|
});
|
|
275
292
|
// 7. Visible Body Text & Obfuscated matches in body
|
|
276
|
-
const bodyText =
|
|
293
|
+
const bodyText = extractTextWithSpaces($('body'), $);
|
|
277
294
|
// Standard matches in body text
|
|
278
295
|
let bodyMatch;
|
|
279
296
|
EMAIL_REGEX.lastIndex = 0;
|
package/dist/index.js
CHANGED
|
@@ -234,7 +234,8 @@ Options:
|
|
|
234
234
|
let discoveryMethod = result.selectedEmail ? result.selectedEmail.discoveryMethod : '';
|
|
235
235
|
// If no email detected, try to fall back to hello@domain
|
|
236
236
|
if (!selectedEmail) {
|
|
237
|
-
const
|
|
237
|
+
const fallbackDomain = normalizeDomain(target.domain);
|
|
238
|
+
const fallbackEmail = `hello@${fallbackDomain}`;
|
|
238
239
|
const isFallbackValid = await verifyEmailFallback(fallbackEmail);
|
|
239
240
|
if (isFallbackValid) {
|
|
240
241
|
selectedEmail = fallbackEmail;
|
package/dist/utils/validators.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { normalizeDomain } from './normalize.js';
|
|
2
2
|
import { config } from '../config.js';
|
|
3
3
|
import dns from 'dns/promises';
|
|
4
|
-
const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
|
|
4
|
+
// Whole-string email shape check (anchored with ^ and $); the final label
// must be 2-10 letters. No `g` flag, so test() carries no lastIndex state.
// NOTE(review): addresses whose TLD is longer than 10 letters are rejected -
// confirm this is intended.
const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10}$/;
|
|
5
|
+
// Upper bound on the full address length; isValidEmail rejects anything
// longer before running the regex. Presumably the commonly cited SMTP
// address limit - TODO confirm.
const MAX_EMAIL_LENGTH = 254;
|
|
6
|
+
// Upper bound on the part before "@"; isValidEmail rejects longer local
// parts. Presumably the SMTP local-part limit - TODO confirm.
const MAX_LOCAL_PART_LENGTH = 64;
|
|
5
7
|
const REJECTED_PREFIXES = [
|
|
6
8
|
'noreply',
|
|
7
9
|
'no-reply',
|
|
@@ -34,6 +36,9 @@ const REJECTED_DOMAINS = [
|
|
|
34
36
|
* @param email - The email to check.
|
|
35
37
|
*/
|
|
36
38
|
export function isValidEmail(email) {
|
|
39
|
+
if (email.length > MAX_EMAIL_LENGTH) {
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
37
42
|
if (!EMAIL_REGEX.test(email)) {
|
|
38
43
|
return false;
|
|
39
44
|
}
|
|
@@ -43,6 +48,9 @@ export function isValidEmail(email) {
|
|
|
43
48
|
}
|
|
44
49
|
const localPart = parts[0].toLowerCase().trim();
|
|
45
50
|
const domainPart = parts[1].toLowerCase().trim();
|
|
51
|
+
if (localPart.length > MAX_LOCAL_PART_LENGTH) {
|
|
52
|
+
return false;
|
|
53
|
+
}
|
|
46
54
|
// Reject blacklisted prefixes
|
|
47
55
|
if (REJECTED_PREFIXES.includes(localPart)) {
|
|
48
56
|
return false;
|