mailpop 1.0.9 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/extractor.js CHANGED
@@ -2,9 +2,9 @@ import { load } from 'cheerio';
2
2
  import { normalizeEmail } from './utils/normalize.js';
3
3
  import { isValidEmail } from './utils/validators.js';
4
4
  // Standard email regex for searching inside strings
5
- const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
5
+ const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10}\b/g;
6
6
  // Obfuscated email regex matching "name [at] domain [dot] com", "name(at)domain(dot)com", "name AT domain DOT com"
7
- const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,})/gi;
7
+ const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,10})\b/gi;
8
8
  // Base64 candidate regex for extracting potential base64 encoded strings
9
9
  const BASE64_CANDIDATE_REGEX = /\b[a-zA-Z0-9+/]{12,80}={0,2}\b/g;
10
10
  /**
@@ -133,6 +133,23 @@ export function classifyEmailType(email) {
133
133
  }
134
134
  return 'personal';
135
135
  }
136
/**
 * Extract text from a cheerio selection, inserting spaces between elements
 * to prevent adjacent tags from concatenating their text content.
 *
 * The inner HTML of every matched node is joined with spaces, then
 * <script>/<style> blocks, HTML comments and all remaining tags are each
 * replaced by a single space before the result is passed through
 * decodeUnicodeEntities.
 *
 * @param {object} elem - Selection exposing `length` and `each(callback)`.
 * @param {Function} $ - Function that wraps a matched node and exposes `.html()`.
 * @returns {string} The extracted text, or '' when the selection is missing or empty.
 */
export function extractTextWithSpaces(elem, $) {
    if (!elem || elem.length === 0) {
        return '';
    }
    const htmlParts = [];
    elem.each((_, el) => {
        // .html() may yield null; treat that as an empty fragment.
        htmlParts.push($(el).html() || '');
    });
    const text = htmlParts
        .join(' ')
        .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, ' ')
        .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, ' ')
        // Drop whole comments before the generic tag strip: that strip stops at
        // the first '>', so commented-out markup such as
        // <!-- <a href="...">x@y.z</a> --> would otherwise leak its text.
        .replace(/<!--[\s\S]*?-->/g, ' ')
        .replace(/<[^>]+>/g, ' ');
    return decodeUnicodeEntities(text);
}
136
153
  /**
137
154
  * Extracts all unique emails from a given HTML string and URL.
138
155
  */
@@ -219,7 +236,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
219
236
  // 3. Header section extraction
220
237
  const headerElem = $('header, [id*="header"], [class*="header"]');
221
238
  if (headerElem.length > 0) {
222
- const headerText = decodeUnicodeEntities(headerElem.text());
239
+ const headerText = extractTextWithSpaces(headerElem, $);
223
240
  let match;
224
241
  EMAIL_REGEX.lastIndex = 0;
225
242
  while ((match = EMAIL_REGEX.exec(headerText)) !== null) {
@@ -229,7 +246,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
229
246
  // 4. Footer section extraction
230
247
  const footerElem = $('footer, [id*="footer"], [class*="footer"]');
231
248
  if (footerElem.length > 0) {
232
- const footerText = decodeUnicodeEntities(footerElem.text());
249
+ const footerText = extractTextWithSpaces(footerElem, $);
233
250
  let match;
234
251
  EMAIL_REGEX.lastIndex = 0;
235
252
  while ((match = EMAIL_REGEX.exec(footerText)) !== null) {
@@ -273,7 +290,7 @@ export function extractEmails(html, url, pageTitle, crawlDurationMs) {
273
290
  }
274
291
  });
275
292
  // 7. Visible Body Text & Obfuscated matches in body
276
- const bodyText = decodeUnicodeEntities($('body').text());
293
+ const bodyText = extractTextWithSpaces($('body'), $);
277
294
  // Standard matches in body text
278
295
  let bodyMatch;
279
296
  EMAIL_REGEX.lastIndex = 0;
@@ -1,7 +1,7 @@
1
1
  import { normalizeDomain } from './normalize.js';
2
2
  import { config } from '../config.js';
3
3
  import dns from 'dns/promises';
4
// Whole-string email shape check. The TLD is bounded at 63 letters (the
// maximum length of a DNS label) so that valid long TLDs such as
// ".photography" or ".international" are not rejected.
const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$/;
// Length limits for a full address and for its local part.
// NOTE(review): presumably enforced by the validator that follows — confirm at the use site.
const MAX_EMAIL_LENGTH = 254;
const MAX_LOCAL_PART_LENGTH = 64;
7
7
  const REJECTED_PREFIXES = [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mailpop",
3
- "version": "1.0.9",
3
+ "version": "1.0.10",
4
4
  "description": "Production-ready public contact email discovery tool from company websites.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",