mailpop 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ import { load } from 'cheerio';
2
+ import { normalizeDomain, normalizeUrl } from './utils/normalize.js';
3
/**
 * Path/query keywords that mark pages the crawler should stay away from
 * (session, checkout and back-office areas).
 */
const AVOID_KEYWORDS = [
    'logout',
    'login',
    'signup',
    'register',
    'checkout',
    'cart',
    'dashboard',
    'account',
    'auth',
    'admin',
    'wp-admin',
];
/**
 * Lower-case file extensions (without the dot) of assets, documents and media
 * that are never worth fetching as pages.
 */
const AVOID_EXTENSIONS = new Set([
    'pdf',
    'jpg',
    'jpeg',
    'png',
    'gif',
    'svg',
    'zip',
    'tar',
    'gz',
    'mp4',
    'mp3',
    'docx',
    'xlsx',
    'pptx',
    'epub',
    'exe',
    'dmg',
]);
/**
 * Matches an avoid keyword (optionally pluralised with "s") only when it is a
 * whole token, i.e. delimited by a non-alphanumeric character or the string edge.
 * Plain substring matching would also reject "/authors", "/accounting",
 * "/administration" or "/cartoons", which are ordinary content pages.
 */
const AVOID_KEYWORD_PATTERN = new RegExp(`(?:^|[^a-z0-9])(?:${AVOID_KEYWORDS.join('|')})s?(?:[^a-z0-9]|$)`);
/**
 * Checks if a URL matches patterns we should avoid.
 *
 * A URL is avoided when it cannot be parsed, is not http(s), points at a file
 * with one of the AVOID_EXTENSIONS, or contains one of the AVOID_KEYWORDS as a
 * whole token in its path, query string or fragment.
 *
 * @param url - Absolute URL to check.
 * @returns `true` when the URL should be skipped, `false` otherwise.
 */
export function shouldAvoidUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch (_e) {
        return true; // Avoid invalid URLs
    }
    // Avoid non-http protocols (mailto:, tel:, javascript:, ...)
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return true;
    }
    // Avoid assets, documents, and media
    const fileExtension = parsed.pathname.split('.').pop()?.toLowerCase();
    if (fileExtension && AVOID_EXTENSIONS.has(fileExtension)) {
        return true;
    }
    const pathAndQuery = (parsed.pathname + parsed.search + parsed.hash).toLowerCase();
    return AVOID_KEYWORD_PATTERN.test(pathAndQuery);
}
56
/**
 * Tells whether a URL belongs to the crawled site.
 *
 * Both the URL's hostname and `targetDomain` are passed through
 * `normalizeDomain` before comparison; the URL is internal when the two are
 * equal or when the URL's host ends with "." followed by the target.
 *
 * @param url - Absolute URL to test.
 * @param targetDomain - Domain being crawled.
 * @returns `true` for the same domain or a subdomain; `false` otherwise,
 *          including when parsing or normalization throws.
 */
export function isInternalUrl(url, targetDomain) {
    try {
        const { hostname } = new URL(url);
        const host = normalizeDomain(hostname);
        const root = normalizeDomain(targetDomain);
        if (host === root) {
            return true;
        }
        return host.endsWith('.' + root);
    }
    catch (_e) {
        return false;
    }
}
70
/**
 * Ranks a URL by how likely its page is to list contact details.
 *
 * Only the lower-cased pathname is inspected (query string and fragment are
 * ignored). Tiers are checked from highest to lowest and the first keyword
 * found anywhere in the path decides the score.
 *
 * @param url - Absolute URL to rank.
 * @returns 2 for contact/about/team/support/help paths, 1 for
 *          legal/commercial paths, 0 for everything else, -1 when the URL
 *          cannot be parsed.
 */
export function getLinkPriority(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname.toLowerCase();
    }
    catch (_e) {
        return -1;
    }
    const tiers = [
        // High priority: contact pages, team pages, about pages
        { score: 2, keywords: ['contact', 'about', 'team', 'support', 'help'] },
        // Medium priority: terms, policies, partnerships, services
        { score: 1, keywords: ['privacy', 'legal', 'terms', 'partnership', 'sales', 'company', 'services'] },
    ];
    for (const { score, keywords } of tiers) {
        for (const keyword of keywords) {
            if (pathname.includes(keyword)) {
                return score;
            }
        }
    }
    return 0;
}
94
/**
 * Collects the crawlable internal links of an HTML document.
 *
 * Every `<a href>` is resolved against `baseUrl`, stripped of its fragment,
 * passed through `normalizeUrl`, and kept only when it is internal to
 * `targetDomain` and not rejected by `shouldAvoidUrl`. Duplicates collapse
 * because results are gathered in a Set. Unparsable hrefs and HTML that
 * cannot be loaded are skipped silently.
 *
 * @param html - Raw HTML of the page.
 * @param baseUrl - URL of the page, used to resolve relative hrefs.
 * @param targetDomain - Domain being crawled.
 * @returns Unique normalized URLs, highest `getLinkPriority` first; links with
 *          equal priority keep their document order.
 */
export function extractAndFilterLinks(html, baseUrl, targetDomain) {
    const unique = new Set();
    // Turns one raw href into a normalized, crawlable URL, or null when it must be dropped.
    const toCrawlableUrl = (rawHref) => {
        try {
            // Resolve relative links against base URL
            const absolute = new URL(rawHref, baseUrl);
            // Drop the fragment so the same page is not queued twice
            absolute.hash = '';
            const canonical = normalizeUrl(absolute.toString());
            const keep = canonical && isInternalUrl(canonical, targetDomain) && !shouldAvoidUrl(canonical);
            return keep ? canonical : null;
        }
        catch (_e) {
            // Ignore parsing errors for individual bad hrefs
            return null;
        }
    };
    try {
        const $ = load(html);
        $('a[href]').each((_, anchor) => {
            const rawHref = $(anchor).attr('href')?.trim();
            if (!rawHref) {
                return;
            }
            const candidate = toCrawlableUrl(rawHref);
            if (candidate) {
                unique.add(candidate);
            }
        });
    }
    catch (_e) {
        // Ignore html parse errors
    }
    // Score each link once, order by score (stable), then drop the scores again
    return Array.from(unique)
        .map((link) => ({ link, priority: getLinkPriority(link) }))
        .sort((left, right) => right.priority - left.priority)
        .map(({ link }) => link);
}
package/dist/logger.js ADDED
@@ -0,0 +1,82 @@
1
+ import fs from 'fs/promises';
2
+ import path from 'path';
3
/** Directory that receives every log file: "logs" resolved against the working directory at import time. */
const LOGS_DIR = path.resolve('logs');
/**
 * Ensures that the logs directory exists on disk.
 *
 * `mkdir` with `recursive: true` already succeeds when the directory exists,
 * so the catch below only hides genuine failures (permissions, read-only
 * disk, ...). They are ignored on purpose: the append that follows in
 * `writeLog` fails for the same reason and reports it.
 */
async function ensureLogsDir() {
    try {
        await fs.mkdir(LOGS_DIR, { recursive: true });
    }
    catch (_e) {
        // Best effort - see the function comment
    }
}
/**
 * Writes a single JSON line to a specified log file.
 *
 * Logging is best effort and never rejects: serialization failures (circular
 * structures, BigInt values) and file-system failures are both reported on
 * stderr instead of being thrown at the caller.
 *
 * @param filename - File name inside the logs directory.
 * @param data - JSON-serializable entry to append.
 */
async function writeLog(filename, data) {
    try {
        // Serialize first so an unserializable payload is reported without touching the disk
        const logLine = `${JSON.stringify(data)}\n`;
        await ensureLogsDir();
        await fs.appendFile(path.join(LOGS_DIR, filename), logLine, 'utf-8');
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Failed to write log to ${filename}: ${errorMsg}\n`);
    }
}
30
/**
 * Static logging facade. Each method builds a timestamped entry, prints a
 * one-line summary to the console, then appends the entry via `writeLog`.
 */
export class Logger {
    /** Renders the optional "[domain] " prefix shared by the INFO and ERROR console lines. */
    static #domainTag(domain) {
        return domain ? `[${domain}] ` : '';
    }
    /**
     * Logs a general information event to stdout and app.log.
     * @param action - Name of the step being logged.
     * @param domain - Domain the event relates to (omitted from the console line when falsy).
     * @param duration - Duration value stored in the entry.
     * @param result - Optional outcome, printed after " -> ".
     * @param message - Optional free text, printed after " | ".
     */
    static async info(action, domain, duration, result, message) {
        const timestamp = new Date().toISOString();
        const entry = { timestamp, level: 'INFO', domain, action, duration, result, message };
        const resultPart = result ? ` -> ${result}` : '';
        const messagePart = message ? ` | ${message}` : '';
        process.stdout.write(`[INFO] ${Logger.#domainTag(domain)}${action}${resultPart}${messagePart}\n`);
        await writeLog('app.log', entry);
    }
    /**
     * Logs an error event to stderr, app.log and errors.log.
     * @param action - Name of the step that failed.
     * @param domain - Domain the event relates to (omitted from the console line when falsy).
     * @param duration - Duration value stored in the entry.
     * @param errorMsg - Error description, printed after ": " when present.
     * @param stack - Stack trace stored in the entry only.
     */
    static async error(action, domain, duration, errorMsg, stack) {
        const timestamp = new Date().toISOString();
        const entry = { timestamp, level: 'ERROR', domain, action, duration, error: errorMsg, stack };
        const detail = errorMsg ? `: ${errorMsg}` : '';
        process.stderr.write(`[ERROR] ${Logger.#domainTag(domain)}${action}${detail}\n`);
        // Same entry goes to the general log first, then to the error-only log
        for (const target of ['app.log', 'errors.log']) {
            await writeLog(target, entry);
        }
    }
    /**
     * Logs a discovered email to stdout and discovered-emails.log.
     * @param domain - Domain the email was found for.
     * @param email - The discovered address.
     * @param source - Where the address was found (stored as `emailSource`).
     * @param confidence - Confidence value (stored as `confidenceScore`).
     * @param method - How the address was discovered (stored as `discoveryMethod`).
     */
    static async email(domain, email, source, confidence, method) {
        const timestamp = new Date().toISOString();
        const entry = {
            timestamp,
            domain,
            email,
            emailSource: source,
            confidenceScore: confidence,
            discoveryMethod: method,
        };
        process.stdout.write(`[EMAIL] [${domain}] Found ${email} (${method}, confidence: ${confidence}) at ${source}\n`);
        await writeLog('discovered-emails.log', entry);
    }
}
package/dist/robots.js ADDED
@@ -0,0 +1,90 @@
1
+ import { Logger } from './logger.js';
2
/**
 * Fetches and parses robots.txt for a website, extracting sitemap links and disallowed paths.
 *
 * The result is cached under the robots.txt URL for 24 hours, including the
 * empty result produced when the file is missing or the fetch fails (a failed
 * fetch is logged through `Logger.info` and never thrown).
 *
 * @param websiteUrl - Base website URL.
 * @param cache - Cache instance to store results; must expose async `get(key)` and `set(key, value, ttlMs)`.
 * @returns Object with `sitemaps` (absolute URLs) and `disallowedPaths`
 *          (Disallow values of the groups addressed to `*` or `mailpop`).
 */
export async function parseRobotsTxt(websiteUrl, cache) {
    const { host: domainHost, robotsUrl } = resolveRobotsLocation(websiteUrl);
    const cached = await cache.get(robotsUrl);
    if (cached) {
        return cached;
    }
    let result = {
        sitemaps: [],
        disallowedPaths: [],
    };
    try {
        const response = await fetch(robotsUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) mailpop/1.0',
            },
            signal: AbortSignal.timeout(10000), // 10s timeout
        });
        if (response.ok) {
            result = parseRobotsBody(await response.text(), robotsUrl);
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.info('robots-fetch-skip', domainHost, undefined, 'Skipped', `Failed to fetch robots.txt: ${errorMsg}`);
    }
    // Cache the result for 24 hours (86400000 ms)
    await cache.set(robotsUrl, result, 86400000);
    return result;
}
/**
 * Derives the log host name and the absolute robots.txt URL for a website.
 * Input that is not a parsable URL is treated as a bare host served over https.
 */
function resolveRobotsLocation(websiteUrl) {
    try {
        const base = new URL(websiteUrl);
        return { host: base.hostname, robotsUrl: `${base.protocol}//${base.host}/robots.txt` };
    }
    catch (_e) {
        return { host: websiteUrl, robotsUrl: `https://${websiteUrl}/robots.txt` };
    }
}
/**
 * Parses the text of a robots.txt file into sitemap URLs and disallowed paths.
 * Sitemap lines are collected regardless of the current user-agent group.
 */
function parseRobotsBody(text, robotsUrl) {
    const result = {
        sitemaps: [],
        disallowedPaths: [],
    };
    let appliesToUs = true; // Rules before any User-agent line are treated as ours
    let inUserAgentRun = false; // True while reading the consecutive User-agent lines that open a group
    for (const rawLine of text.split(/\r?\n/)) {
        // "#" starts a comment anywhere on the line, not only at its beginning
        const hashIdx = rawLine.indexOf('#');
        const cleaned = (hashIdx === -1 ? rawLine : rawLine.slice(0, hashIdx)).trim();
        if (!cleaned) {
            continue;
        }
        const colonIdx = cleaned.indexOf(':');
        if (colonIdx === -1) {
            continue;
        }
        const key = cleaned.substring(0, colonIdx).trim().toLowerCase();
        const value = cleaned.substring(colonIdx + 1).trim();
        if (key === 'user-agent') {
            const ua = value.toLowerCase();
            const matches = ua === '*' || ua === 'mailpop';
            // Several User-agent lines in a row share the rules that follow,
            // so the group is ours as soon as any one of them matches.
            appliesToUs = inUserAgentRun ? appliesToUs || matches : matches;
            inUserAgentRun = true;
        }
        else if (key === 'sitemap') {
            const sitemapUrl = resolveSitemapValue(value, robotsUrl);
            if (sitemapUrl) {
                result.sitemaps.push(sitemapUrl);
            }
        }
        else if (key === 'allow' || key === 'disallow') {
            // The first rule closes the User-agent run; a later User-agent line opens a new group
            inUserAgentRun = false;
            if (key === 'disallow' && appliesToUs && value) {
                result.disallowedPaths.push(value);
            }
        }
    }
    return result;
}
/**
 * Returns the sitemap value unchanged when it is already an absolute URL,
 * resolved against the robots.txt URL when it is relative, or null when it is
 * empty or cannot be resolved.
 */
function resolveSitemapValue(value, robotsUrl) {
    if (!value) {
        return null;
    }
    try {
        // Validate it is a valid absolute URL
        new URL(value);
        return value;
    }
    catch (_e) {
        try {
            return new URL(value, robotsUrl).toString();
        }
        catch (_err) {
            // Ignore invalid sitemap URL
            return null;
        }
    }
}
package/dist/scorer.js ADDED
@@ -0,0 +1,170 @@
1
+ import { isDomainMatch } from './utils/validators.js';
2
+ import { normalizeDomain } from './utils/normalize.js';
3
/**
 * Domains of free, public mailbox providers. An address on one of these is
 * not tied to any particular company website.
 */
const PUBLIC_DOMAINS = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
    'icloud.com', 'mail.com', 'zoho.com', 'protonmail.com', 'yandex.com',
    'gmx.com', 'live.com', 'me.com', 'msn.com',
];
19
/** Role mailboxes grouped by outreach priority, best first. */
const ROLE_SCORES = [
    { prefixes: ['contact'], score: 100 },
    { prefixes: ['info'], score: 95 },
    { prefixes: ['hello'], score: 90 },
    { prefixes: ['support', 'help'], score: 85 },
    { prefixes: ['sales', 'partnership', 'partnerships', 'bizdev', 'business'], score: 80 },
    { prefixes: ['team', 'office', 'admin'], score: 70 },
    { prefixes: ['founder', 'ceo', 'co-founder', 'owner'], score: 65 },
    { prefixes: ['media', 'press', 'marketing', 'pr'], score: 55 },
    { prefixes: ['jobs', 'careers', 'hr', 'recruiting'], score: 45 },
];
/** Local-part prefixes of mailboxes that are machine-operated or not meant to be written to. */
const AUTOMATED_PREFIXES = [
    'noreply',
    'no-reply',
    'donotreply',
    'do-not-reply',
    'mailer-daemon',
    'postmaster',
    'abuse',
    'security',
    'spam',
    'bot',
    'system',
    'notification',
];
/** Characters that may separate a known prefix from the rest of the local part. */
const LOCAL_PART_SEPARATORS = ['.', '-', '_', '+'];
/**
 * True when `localPart` is exactly `prefix` or starts with `prefix` followed
 * by a separator, so "info" matches "info.nyc" but not "information".
 */
function hasPrefixToken(localPart, prefix) {
    if (!localPart.startsWith(prefix)) {
        return false;
    }
    return localPart.length === prefix.length || LOCAL_PART_SEPARATORS.includes(localPart[prefix.length]);
}
/**
 * Returns a priority base score (0 to 100) for the email prefix.
 *
 * The part before the first "@" is lower-cased and classified:
 * role mailboxes (contact, info, sales, ...) get their group score, automated
 * mailboxes (noreply, postmaster, ...) get 0, anything else is assumed to be
 * a personal/employee address and gets 40. Role groups are checked first.
 *
 * Role and automated prefixes accept the same separators, so both
 * "sales-team@" and "sales.team@" are roles and both "noreply-us@" and
 * "noreply.us@" are automated.
 *
 * @param email - Email address to classify.
 * @returns `{ score, type }` where `type` is 'role', 'automated' or 'personal'.
 */
export function getEmailBaseScore(email) {
    const localPart = email.split('@')[0].toLowerCase();
    // Check role-based scores
    for (const group of ROLE_SCORES) {
        if (group.prefixes.some((prefix) => hasPrefixToken(localPart, prefix))) {
            return { score: group.score, type: 'role' };
        }
    }
    // Automated keywords check
    if (AUTOMATED_PREFIXES.some((prefix) => hasPrefixToken(localPart, prefix))) {
        return { score: 0, type: 'automated' };
    }
    // If not role or automated, it's likely a personal/employee email (e.g. john.doe@)
    return { score: 40, type: 'personal' };
}
62
/** Adjustment applied for the kind of page the email was discovered on. */
const DISCOVERY_METHOD_ADJUSTMENTS = new Map([
    ['contact-page', 10],
    ['about-page', 5],
    ['sitemap', 5],
]);
/** Adjustment applied for where in the page the email was found. */
const SOURCE_TYPE_ADJUSTMENTS = new Map([
    ['mailto', 5],
    ['footer', 5],
    ['script', -10], // lower confidence for script elements
    ['obfuscated', -5],
]);
/**
 * Computes the final confidence (10 to 100, rounded) of a discovered email.
 *
 * Starting from the extraction score, the result is adjusted by:
 *  1. the discovery method (contact/about/sitemap pages add points),
 *  2. the source type (mailto/footer add, script/obfuscated subtract),
 *  3. the email's domain: +10 when it is the target domain or a subdomain,
 *     -15 for a public mailbox provider, -50 for any other domain
 *     (skipped when the address does not contain exactly one "@"),
 *  4. how often the address was seen: +2 per occurrence, capped at +10,
 *     applied only when seen more than once.
 *
 * @param discovered - The email discovery object.
 * @param targetDomain - The target domain we are crawling.
 * @param occurrences - Number of times this email was found across different pages of the site.
 * @returns Integer confidence clamped to the range 10..100.
 */
export function scoreDiscoveredEmail(discovered, targetDomain, occurrences = 1) {
    let total = discovered.confidenceScore;
    // 1. Page location
    const method = discovered.discoveryMethod;
    if (DISCOVERY_METHOD_ADJUSTMENTS.has(method)) {
        total += DISCOVERY_METHOD_ADJUSTMENTS.get(method);
    }
    // 2. Email location
    const sourceType = discovered.metadata.sourceType;
    if (SOURCE_TYPE_ADJUSTMENTS.has(sourceType)) {
        total += SOURCE_TYPE_ADJUSTMENTS.get(sourceType);
    }
    // 3. Domain match (crucial for cold outreach safety)
    const pieces = discovered.email.split('@');
    if (pieces.length === 2) {
        const emailDomain = normalizeDomain(pieces[1]);
        const normalizedTarget = normalizeDomain(targetDomain);
        const onTargetDomain = emailDomain === normalizedTarget || emailDomain.endsWith('.' + normalizedTarget);
        if (onTargetDomain) {
            // Direct or subdomain match is excellent
            total += 10;
        }
        else if (PUBLIC_DOMAINS.includes(emailDomain)) {
            // Public mailboxes are common for small businesses but less authoritative
            total -= 15;
        }
        else {
            // Unrelated corporate domain: likely a CDN/analytics/vendor address picked up by accident
            total -= 50;
        }
    }
    // 4. Frequency: boost addresses found on multiple pages
    if (occurrences > 1) {
        total += Math.min(10, occurrences * 2);
    }
    // Keep the result inside 10..100 before rounding
    const clamped = Math.max(10, Math.min(100, total));
    return Math.round(clamped);
}
121
/**
 * Picks the best email out of a list of discoveries.
 *
 * Candidates are ranked by, in order:
 *  1. final confidence from `scoreDiscoveredEmail` (higher first),
 *  2. base score from `getEmailBaseScore` (higher first),
 *  3. addresses matching the target domain before those that do not,
 *  4. addresses whose `emailType` is 'role' before the others.
 * Candidates that tie on every rule keep their input order.
 *
 * The winner's `confidenceScore` is overwritten with its final confidence,
 * i.e. the returned object is the (mutated) element of `emails`.
 *
 * @param emails - Discovered email objects.
 * @param targetDomain - The target domain we are crawling.
 * @param occurrenceCounts - Map from email address to the number of times it was seen (missing or zero counts as 1).
 * @returns The best discovery, or `null` when `emails` is empty.
 */
export function selectBestEmail(emails, targetDomain, occurrenceCounts) {
    if (emails.length === 0) {
        return null;
    }
    // Score every candidate once, up front
    const ranked = [];
    for (const candidate of emails) {
        const seenCount = occurrenceCounts[candidate.email] || 1;
        ranked.push({
            email: candidate,
            confidence: scoreDiscoveredEmail(candidate, targetDomain, seenCount),
            baseScore: getEmailBaseScore(candidate.email).score,
            matchesDomain: isDomainMatch(candidate.email, targetDomain),
        });
    }
    const isRole = (entry) => entry.email.emailType === 'role';
    ranked.sort((first, second) => {
        // 1. Highest confidence score
        if (first.confidence !== second.confidence) {
            return second.confidence - first.confidence;
        }
        // 2. Highest base score (role priority contact > info > hello ...)
        if (first.baseScore !== second.baseScore) {
            return second.baseScore - first.baseScore;
        }
        // 3. Domain-matching addresses first
        if (Boolean(first.matchesDomain) !== Boolean(second.matchesDomain)) {
            return first.matchesDomain ? -1 : 1;
        }
        // 4. Role addresses first
        if (isRole(first) !== isRole(second)) {
            return isRole(first) ? -1 : 1;
        }
        return 0;
    });
    const [winner] = ranked;
    // Publish the calculated final score on the returned discovery
    winner.email.confidenceScore = winner.confidence;
    return winner.email;
}
@@ -0,0 +1,75 @@
1
+ import { load } from 'cheerio';
2
+ import { Logger } from './logger.js';
3
/** Sitemap indexes nested deeper than this below the starting sitemap are not followed. */
const MAX_SITEMAP_DEPTH = 3;
/** Number of sub-sitemaps scanned per sitemap index, to avoid excessive fetches. */
const MAX_SUB_SITEMAPS = 5;
/** Cache lifetime of parsed sitemaps: 24 hours. */
const SITEMAP_CACHE_TTL_MS = 86400000;
/**
 * Fetches and recursively parses a sitemap URL. If it's a sitemap index, it parses
 * sub-sitemaps up to a limit. Returns discovered URLs.
 *
 * Results (including the empty result of a failed fetch, which is logged and
 * never thrown) are cached per sitemap URL for 24 hours. Recursion is bounded:
 * a sitemap already visited during this call, or nested deeper than
 * MAX_SITEMAP_DEPTH, contributes no URLs, so self-referencing or cyclic
 * sitemap indexes cannot loop forever.
 *
 * @param sitemapUrl - Absolute URL to the XML sitemap.
 * @param cache - Cache instance to store results.
 * @param maxUrls - Maximum number of URLs to extract per sitemap to prevent memory overload.
 * @returns Up to `maxUrls` page URLs listed by the sitemap.
 */
export async function parseSitemap(sitemapUrl, cache, maxUrls = 500) {
    return collectSitemapUrls(sitemapUrl, cache, maxUrls, new Set(), 0);
}
/**
 * Recursive worker behind parseSitemap. `visited` holds the sitemap URLs
 * already being processed in this call tree and `depth` is the nesting level
 * of `sitemapUrl` below the starting sitemap.
 */
async function collectSitemapUrls(sitemapUrl, cache, maxUrls, visited, depth) {
    const cached = await cache.get(sitemapUrl);
    if (cached) {
        return cached;
    }
    // Cycle / depth guard. Nothing is cached here: the empty list is an
    // artefact of this traversal, not the real content of that sitemap.
    if (visited.has(sitemapUrl) || depth > MAX_SITEMAP_DEPTH) {
        return [];
    }
    visited.add(sitemapUrl);
    const urls = [];
    let host = '';
    try {
        host = new URL(sitemapUrl).hostname;
    }
    catch (_e) {
        host = sitemapUrl;
    }
    try {
        const response = await fetch(sitemapUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) mailpop/1.0',
                Accept: 'application/xml, text/xml, application/xhtml+xml, */*',
            },
            signal: AbortSignal.timeout(15000), // 15s timeout
        });
        if (response.ok) {
            const xml = await response.text();
            // Use xmlMode: true for Cheerio to parse XML tags correctly
            const $ = load(xml, { xmlMode: true });
            // 1. Check if it's a sitemap index (contains <sitemap> tags)
            const sitemaps = $('sitemap');
            if (sitemaps.length > 0) {
                const nestedUrls = [];
                const subSitemapsLimit = Math.min(sitemaps.length, MAX_SUB_SITEMAPS);
                for (let i = 0; i < subSitemapsLimit; i++) {
                    const loc = $(sitemaps[i]).find('loc').text().trim();
                    if (!loc) {
                        continue;
                    }
                    const nested = await collectSitemapUrls(loc, cache, maxUrls, visited, depth + 1);
                    nestedUrls.push(...nested);
                    if (nestedUrls.length >= maxUrls) {
                        break;
                    }
                }
                const finalNested = nestedUrls.slice(0, maxUrls);
                await cache.set(sitemapUrl, finalNested, SITEMAP_CACHE_TTL_MS);
                return finalNested;
            }
            // 2. Otherwise it's a normal sitemap (contains <url> tags)
            $('url').each((_, element) => {
                if (urls.length >= maxUrls) {
                    return false; // Returning false stops the iteration early
                }
                const loc = $(element).find('loc').text().trim();
                if (loc) {
                    urls.push(loc);
                }
                return undefined;
            });
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.info('sitemap-fetch-skip', host, undefined, 'Skipped', `Failed to parse sitemap: ${errorMsg}`);
    }
    const finalUrls = urls.slice(0, maxUrls);
    await cache.set(sitemapUrl, finalUrls, SITEMAP_CACHE_TTL_MS);
    return finalUrls;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,16 @@
1
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have passed.
 * @param ms - Number of milliseconds to wait.
 */
export function delay(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
8
/**
 * Waits for a random whole number of milliseconds chosen from `min`..`max`
 * (both inclusive for integer bounds).
 * @param min - Minimum delay in milliseconds.
 * @param max - Maximum delay in milliseconds.
 * @returns The promise produced by `delay` for the chosen duration.
 */
export function getRandomDelay(min, max) {
    const span = max - min + 1;
    const offset = Math.floor(Math.random() * span);
    return delay(offset + min);
}
@@ -0,0 +1,30 @@
1
/**
 * Base class of every error raised by mailpop.
 * `name` is set to the concrete subclass name so logs identify the error kind.
 */
export class MailPopError extends Error {
    /**
     * @param message - Human-readable description.
     * @param options - Optional standard Error options, e.g. `{ cause }`.
     */
    constructor(message, options) {
        super(message, options);
        this.name = this.constructor.name;
        // captureStackTrace is a V8 extension; skip it on engines that lack it
        // instead of failing while constructing an error.
        if (typeof Error.captureStackTrace === 'function') {
            Error.captureStackTrace(this, this.constructor);
        }
    }
}
/** A page could not be loaded. */
export class PageLoadError extends MailPopError {
    statusCode;
    url;
    /**
     * @param message - Human-readable description.
     * @param url - URL that failed to load.
     * @param statusCode - HTTP status code, when one was received.
     * @param options - Optional standard Error options, e.g. `{ cause }`.
     */
    constructor(message, url, statusCode, options) {
        super(message, options);
        this.url = url;
        this.statusCode = statusCode;
    }
}
/** A page load was rejected with HTTP 429 (Too Many Requests). */
export class RateLimitError extends PageLoadError {
    /**
     * @param message - Human-readable description.
     * @param url - URL that was rate limited.
     * @param options - Optional standard Error options, e.g. `{ cause }`.
     */
    constructor(message, url, options) {
        super(message, url, 429, options);
    }
}
/** Crawling a domain exceeded its time budget. */
export class CrawlTimeoutError extends MailPopError {
    domain;
    durationMs;
    /**
     * @param message - Human-readable description.
     * @param domain - Domain that was being crawled.
     * @param durationMs - Duration in milliseconds associated with the timeout.
     * @param options - Optional standard Error options, e.g. `{ cause }`.
     */
    constructor(message, domain, durationMs, options) {
        super(message, options);
        this.domain = domain;
        this.durationMs = durationMs;
    }
}