npm - mailpop - Versions diffs - 1.0.0 - Mend

mailpop 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/dist/link-discovery.js ADDED Viewed

@@ -0,0 +1,126 @@
+import { load } from 'cheerio';
+import { normalizeDomain, normalizeUrl } from './utils/normalize.js';
/**
 * Substrings that mark a URL as off-limits when they appear anywhere in its
 * lower-cased path, query string or fragment.
 */
const AVOID_KEYWORDS = [
    'logout',
    'login',
    'signup',
    'register',
    'checkout',
    'cart',
    'dashboard',
    'account',
    'auth',
    'admin',
    'wp-admin',
];
/**
 * Decides whether a URL should be skipped.
 *
 * A URL is skipped when it cannot be parsed, when its protocol is not http(s),
 * when its path ends in one of the listed asset/document/media extensions, or
 * when its path + query + fragment contains one of AVOID_KEYWORDS.
 * Keyword matching is a plain substring test, so "/authors" is skipped because
 * it contains "auth".
 *
 * @param url - URL to inspect (anything accepted by the URL constructor).
 * @returns true when the URL should be avoided, false otherwise.
 */
export function shouldAvoidUrl(url) {
    let target;
    try {
        target = new URL(url);
    } catch (_e) {
        // Unparseable input is never followed
        return true;
    }
    if (!['http:', 'https:'].includes(target.protocol)) {
        return true;
    }
    const skippedExtensions = new Set([
        'pdf',
        'jpg',
        'jpeg',
        'png',
        'gif',
        'svg',
        'zip',
        'tar',
        'gz',
        'mp4',
        'mp3',
        'docx',
        'xlsx',
        'pptx',
        'epub',
        'exe',
        'dmg',
    ]);
    // Text after the last dot of the path; for a dot-less path this is the whole path
    const dotParts = target.pathname.split('.');
    const extension = dotParts[dotParts.length - 1].toLowerCase();
    if (extension && skippedExtensions.has(extension)) {
        return true;
    }
    const haystack = (target.pathname + target.search + target.hash).toLowerCase();
    for (const keyword of AVOID_KEYWORDS) {
        if (haystack.includes(keyword)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether a URL's host is the target domain itself or one of its subdomains.
 * Both hosts are passed through normalizeDomain before being compared.
 *
 * @param url - URL whose hostname is checked.
 * @param targetDomain - Domain the URL must belong to.
 * @returns true for an exact or subdomain match; false otherwise, including
 *          when the URL cannot be parsed or normalization throws.
 */
export function isInternalUrl(url, targetDomain) {
    try {
        const { hostname } = new URL(url);
        const linkHost = normalizeDomain(hostname);
        const siteHost = normalizeDomain(targetDomain);
        if (linkHost === siteHost) {
            return true;
        }
        // The leading dot keeps "notexample.com" from matching "example.com"
        return linkHost.endsWith('.' + siteHost);
    } catch (_e) {
        return false;
    }
}
/**
 * Ranks a URL by how likely its page is to hold contact information,
 * judging only by keywords found in the lower-cased path.
 *
 * @param url - URL to rank.
 * @returns 2 for contact/about/team/support/help paths, 1 for
 *          privacy/legal/terms/partnership/sales/company/services paths,
 *          0 for anything else, and -1 when the URL cannot be parsed.
 */
export function getLinkPriority(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname.toLowerCase();
    } catch (_e) {
        return -1;
    }
    // Tiers are checked from highest score to lowest; the first hit wins
    const tiers = [
        [2, ['contact', 'about', 'team', 'support', 'help']],
        [1, ['privacy', 'legal', 'terms', 'partnership', 'sales', 'company', 'services']],
    ];
    for (const [score, keywords] of tiers) {
        if (keywords.some((keyword) => pathname.includes(keyword))) {
            return score;
        }
    }
    return 0;
}
/**
 * Collects the internal, crawl-worthy links found in an HTML document.
 *
 * Each anchor href is resolved against baseUrl, stripped of its fragment,
 * normalized with normalizeUrl, and kept only if isInternalUrl accepts it and
 * shouldAvoidUrl does not reject it. Duplicates are collapsed.
 *
 * @param html - HTML source to scan.
 * @param baseUrl - URL used to resolve relative hrefs.
 * @param targetDomain - Domain the links must belong to.
 * @returns Unique URLs ordered by getLinkPriority, highest first. HTML that
 *          fails to load yields whatever was collected so far (possibly nothing).
 */
export function extractAndFilterLinks(html, baseUrl, targetDomain) {
    const unique = new Set();
    const consider = (href) => {
        try {
            const absolute = new URL(href, baseUrl);
            // Dropping the fragment makes "#section" variants collapse onto one page
            absolute.hash = '';
            const candidate = normalizeUrl(absolute.toString());
            if (!candidate) {
                return;
            }
            if (!isInternalUrl(candidate, targetDomain)) {
                return;
            }
            if (shouldAvoidUrl(candidate)) {
                return;
            }
            unique.add(candidate);
        } catch (_e) {
            // A single malformed href must not stop the scan
        }
    };
    try {
        const $ = load(html);
        $('a[href]').each((_, anchor) => {
            const href = $(anchor).attr('href')?.trim();
            if (href) {
                consider(href);
            }
        });
    } catch (_e) {
        // Unloadable HTML simply produces no links
    }
    return [...unique].sort((a, b) => getLinkPriority(b) - getLinkPriority(a));
}

package/dist/logger.js ADDED Viewed

@@ -0,0 +1,82 @@
+import fs from 'fs/promises';
+import path from 'path';
/** Log directory: "logs" resolved to an absolute path when this module is loaded. */
const LOGS_DIR = path.resolve('logs');
/**
 * Creates the logs directory (and any missing parents) if needed.
 * Failures are deliberately ignored here; a later write to the directory
 * will surface the problem.
 */
async function ensureLogsDir() {
    await fs.mkdir(LOGS_DIR, { recursive: true }).catch(() => {
        // Best effort only
    });
}
/**
 * Appends one JSON-encoded line to a file inside the logs directory.
 *
 * Logging is best effort: any failure, whether serializing `data` or writing
 * the file, is reported on stderr and never thrown to the caller.
 *
 * @param filename - Name of the log file, relative to the logs directory.
 * @param data - Value to serialize with JSON.stringify.
 */
async function writeLog(filename, data) {
    await ensureLogsDir();
    const filePath = path.join(LOGS_DIR, filename);
    try {
        // Serialize inside the try: JSON.stringify throws on circular references
        // and BigInt values, and a logging call must not reject because of that.
        const logLine = JSON.stringify(data) + '\n';
        await fs.appendFile(filePath, logLine, 'utf-8');
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Failed to write log to ${filename}: ${errorMsg}\n`);
    }
}
/**
 * Static logging facade. Every method prints a one-line summary to the console
 * and appends a JSON entry to a log file via writeLog.
 */
export class Logger {
    /**
     * Records an informational event: summary to stdout, entry to app.log.
     * Domain, result and message are left out of the console line when falsy.
     */
    static async info(action, domain, duration, result, message) {
        const timestamp = new Date().toISOString();
        const entry = { timestamp, level: 'INFO', domain, action, duration, result, message };
        const parts = ['[INFO] '];
        if (domain) {
            parts.push(`[${domain}] `);
        }
        parts.push(`${action}`);
        if (result) {
            parts.push(` -> ${result}`);
        }
        if (message) {
            parts.push(` | ${message}`);
        }
        process.stdout.write(parts.join('') + '\n');
        await writeLog('app.log', entry);
    }
    /**
     * Records an error event: summary to stderr, the same entry to app.log
     * and then to errors.log.
     */
    static async error(action, domain, duration, errorMsg, stack) {
        const timestamp = new Date().toISOString();
        const entry = { timestamp, level: 'ERROR', domain, action, duration, error: errorMsg, stack };
        const scope = domain ? `[${domain}] ` : '';
        const detail = errorMsg ? `: ${errorMsg}` : '';
        process.stderr.write(`[ERROR] ${scope}${action}${detail}` + '\n');
        for (const file of ['app.log', 'errors.log']) {
            await writeLog(file, entry);
        }
    }
    /**
     * Records a discovered email: summary to stdout, entry to discovered-emails.log.
     */
    static async email(domain, email, source, confidence, method) {
        const timestamp = new Date().toISOString();
        const entry = {
            timestamp,
            domain,
            email,
            emailSource: source,
            confidenceScore: confidence,
            discoveryMethod: method,
        };
        const summary = `[EMAIL] [${domain}] Found ${email} (${method}, confidence: ${confidence}) at ${source}`;
        process.stdout.write(summary + '\n');
        await writeLog('discovered-emails.log', entry);
    }
}

package/dist/robots.js ADDED Viewed

@@ -0,0 +1,90 @@
+import { Logger } from './logger.js';
/**
 * Extracts sitemap URLs and the Disallow paths that apply to this crawler
 * from the text of a robots.txt file.
 *
 * A group applies when one of its User-agent lines is "*" or "mailpop"
 * (case-insensitive). Consecutive User-agent lines belong to the same group,
 * so a match on any of them is enough. Disallow lines that appear before the
 * first User-agent line are treated as applying. Sitemap lines are collected
 * regardless of group and do not end a run of User-agent lines.
 *
 * @param text - Raw robots.txt content.
 * @param robotsUrl - Absolute robots.txt URL, used to resolve relative sitemap values.
 * @returns Object with `sitemaps` and `disallowedPaths` string arrays.
 */
function extractRobotsRules(text, robotsUrl) {
    const rules = {
        sitemaps: [],
        disallowedPaths: [],
    };
    let appliesToUs = true; // True unless the current group names only other agents
    let inUserAgentRun = false; // True while reading back-to-back User-agent lines
    for (const rawLine of text.split(/\r?\n/)) {
        // Everything from '#' onward is a comment, whether it starts or trails the line
        const hashIdx = rawLine.indexOf('#');
        const cleaned = (hashIdx === -1 ? rawLine : rawLine.slice(0, hashIdx)).trim();
        if (!cleaned) {
            continue;
        }
        const colonIdx = cleaned.indexOf(':');
        if (colonIdx === -1) {
            continue;
        }
        const key = cleaned.substring(0, colonIdx).trim().toLowerCase();
        const value = cleaned.substring(colonIdx + 1).trim();
        if (key === 'user-agent') {
            const ua = value.toLowerCase();
            const matches = ua === '*' || ua === 'mailpop';
            // A User-agent line directly after another one extends the same group
            appliesToUs = inUserAgentRun ? appliesToUs || matches : matches;
            inUserAgentRun = true;
        }
        else if (key === 'sitemap') {
            try {
                // Keep absolute URLs exactly as written
                new URL(value);
                rules.sitemaps.push(value);
            }
            catch (_e) {
                try {
                    // Otherwise resolve the value relative to the robots.txt location
                    rules.sitemaps.push(new URL(value, robotsUrl).toString());
                }
                catch (_err) {
                    // Ignore invalid sitemap URL
                }
            }
        }
        else if (key === 'allow' || key === 'disallow') {
            // A rule line closes the run of User-agent lines that opened the group
            inUserAgentRun = false;
            // An empty Disallow value carries no path and is skipped
            if (key === 'disallow' && appliesToUs && value) {
                rules.disallowedPaths.push(value);
            }
        }
    }
    return rules;
}
/**
 * Fetches and parses robots.txt for a website, extracting sitemap links and disallowed paths.
 *
 * The result is looked up in, and stored to, `cache` under the robots.txt URL
 * for 24 hours. A failed or non-OK fetch is not thrown: it is logged (on
 * network/parse errors) and an empty result is cached and returned.
 *
 * @param websiteUrl - Base website URL. If it cannot be parsed it is used
 *                     verbatim as the host of an https robots.txt URL.
 * @param cache - Cache instance to store results; must provide `get(key)` and
 *                `set(key, value, ttlMs)`, both of which are awaited.
 * @returns Object with `sitemaps` and `disallowedPaths` string arrays
 *          (or whatever truthy value the cache returned).
 */
export async function parseRobotsTxt(websiteUrl, cache) {
    let domainHost = '';
    let robotsUrl = '';
    try {
        const base = new URL(websiteUrl);
        domainHost = base.hostname;
        robotsUrl = `${base.protocol}//${base.host}/robots.txt`;
    }
    catch (_e) {
        domainHost = websiteUrl;
        robotsUrl = `https://${domainHost}/robots.txt`;
    }
    const cached = await cache.get(robotsUrl);
    if (cached) {
        return cached;
    }
    let result = {
        sitemaps: [],
        disallowedPaths: [],
    };
    try {
        const response = await fetch(robotsUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) mailpop/1.0',
            },
            signal: AbortSignal.timeout(10000), // 10s timeout
        });
        if (response.ok) {
            result = extractRobotsRules(await response.text(), robotsUrl);
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.info('robots-fetch-skip', domainHost, undefined, 'Skipped', `Failed to fetch robots.txt: ${errorMsg}`);
    }
    // Cache the result for 24 hours (86400000 ms)
    await cache.set(robotsUrl, result, 86400000);
    return result;
}

package/dist/scorer.js ADDED Viewed

@@ -0,0 +1,170 @@
+import { isDomainMatch } from './utils/validators.js';
+import { normalizeDomain } from './utils/normalize.js';
/** Free/public mailbox providers; an address on one of these is not tied to a company domain. */
const PUBLIC_DOMAINS = [
    'gmail.com',
    'yahoo.com',
    'hotmail.com',
    'outlook.com',
    'aol.com',
    'icloud.com',
    'mail.com',
    'zoho.com',
    'protonmail.com',
    'yandex.com',
    'gmx.com',
    'live.com',
    'me.com',
    'msn.com',
];
/**
 * Classifies an email address by its local part and gives it a base score.
 *
 * Role addresses match when the local part equals a known role word or starts
 * with that word followed by "." (e.g. "support.eu"). Automated addresses match
 * when the local part equals a known automated word or starts with it followed
 * by "-" (e.g. "bot-alerts"). Role words are checked first.
 *
 * @param email - Email address; only the text before the first "@" is used.
 * @returns `{ score, type }` where type is 'role' (score 45-100),
 *          'automated' (score 0) or 'personal' (score 40).
 */
export function getEmailBaseScore(email) {
    const localPart = email.split('@')[0].toLowerCase();
    const roleScoreByPrefix = new Map([
        ['contact', 100],
        ['info', 95],
        ['hello', 90],
        ['support', 85],
        ['help', 85],
        ['sales', 80],
        ['partnership', 80],
        ['partnerships', 80],
        ['bizdev', 80],
        ['business', 80],
        ['team', 70],
        ['office', 70],
        ['admin', 70],
        ['founder', 65],
        ['ceo', 65],
        ['co-founder', 65],
        ['owner', 65],
        ['media', 55],
        ['press', 55],
        ['marketing', 55],
        ['pr', 55],
        ['jobs', 45],
        ['careers', 45],
        ['hr', 45],
        ['recruiting', 45],
    ]);
    // No role word contains a dot, so "equals the word, or starts with word + '.'"
    // is the same as "the text before the first dot equals the word".
    const [head] = localPart.split('.');
    const roleScore = roleScoreByPrefix.get(head);
    if (roleScore !== undefined) {
        return { score: roleScore, type: 'role' };
    }
    const automatedPrefixes = [
        'noreply',
        'no-reply',
        'donotreply',
        'do-not-reply',
        'mailer-daemon',
        'postmaster',
        'abuse',
        'security',
        'spam',
        'bot',
        'system',
        'notification',
    ];
    for (const prefix of automatedPrefixes) {
        if (localPart === prefix || localPart.startsWith(prefix + '-')) {
            return { score: 0, type: 'automated' };
        }
    }
    // Neither role nor automated: treated as a personal/employee mailbox (e.g. john.doe@)
    return { score: 40, type: 'personal' };
}
/**
 * Computes the final confidence score for a discovered email.
 *
 * Starts from `discovered.confidenceScore` and applies, in order: a bonus for
 * the discovery method, an adjustment for where in the page the address was
 * found, a domain adjustment (+10 for the target domain or a subdomain, -15
 * for a public mailbox provider, -50 for any other domain; skipped when the
 * address does not contain exactly one "@"), and a bonus of up to +10 when
 * the address was seen more than once. The result is clamped to 10..100 and
 * rounded.
 *
 * @param discovered - The email discovery object (reads `confidenceScore`,
 *                     `discoveryMethod`, `metadata.sourceType` and `email`).
 * @param targetDomain - The target domain being crawled.
 * @param occurrences - Number of times this email was found across the site.
 * @returns Integer confidence score between 10 and 100.
 */
export function scoreDiscoveredEmail(discovered, targetDomain, occurrences = 1) {
    const methodBonus = new Map([
        ['contact-page', 10],
        ['about-page', 5],
        ['sitemap', 5],
    ]);
    const sourceBonus = new Map([
        ['mailto', 5],
        ['footer', 5],
        ['script', -10], // script elements are a less trustworthy source
        ['obfuscated', -5],
    ]);
    let total = discovered.confidenceScore;
    const methodDelta = methodBonus.get(discovered.discoveryMethod);
    if (methodDelta !== undefined) {
        total += methodDelta;
    }
    const sourceDelta = sourceBonus.get(discovered.metadata.sourceType);
    if (sourceDelta !== undefined) {
        total += sourceDelta;
    }
    const pieces = discovered.email.split('@');
    if (pieces.length === 2) {
        const emailDomain = normalizeDomain(pieces[1]);
        const siteDomain = normalizeDomain(targetDomain);
        const onTargetDomain = emailDomain === siteDomain || emailDomain.endsWith('.' + siteDomain);
        if (onTargetDomain) {
            total += 10;
        } else if (PUBLIC_DOMAINS.includes(emailDomain)) {
            // Public mailboxes are plausible for small businesses, just less authoritative
            total -= 15;
        } else {
            // An unrelated corporate domain is most likely a third party (CDN, analytics, ...)
            total -= 50;
        }
    }
    if (occurrences > 1) {
        total += Math.min(10, occurrences * 2);
    }
    // Floor of 10 and ceiling of 100, then round to an integer
    return Math.round(Math.max(10, Math.min(100, total)));
}
/**
 * Picks the best email from a list of discoveries.
 *
 * Candidates are ordered by, in turn: final confidence (scoreDiscoveredEmail),
 * base score of the local part (getEmailBaseScore), whether the address
 * matches the target domain (isDomainMatch), and whether `emailType` is 'role'.
 *
 * Side effect: the winning discovery object has its `confidenceScore`
 * overwritten with the computed final confidence.
 *
 * @param emails - Discovered email objects.
 * @param targetDomain - The target domain being crawled.
 * @param occurrenceCounts - Object mapping an email address to how often it
 *                           was seen; missing or zero counts are treated as 1.
 * @returns The winning discovery object, or null when `emails` is empty.
 */
export function selectBestEmail(emails, targetDomain, occurrenceCounts) {
    if (emails.length === 0) {
        return null;
    }
    const candidates = emails.map((email) => ({
        email,
        confidence: scoreDiscoveredEmail(email, targetDomain, occurrenceCounts[email.email] || 1),
        baseScore: getEmailBaseScore(email.email).score,
        matchesDomain: isDomainMatch(email.email, targetDomain),
    }));
    // Negative result means `a` ranks ahead of `b`
    const byRank = (a, b) => {
        if (a.confidence !== b.confidence) {
            return b.confidence - a.confidence;
        }
        if (a.baseScore !== b.baseScore) {
            return b.baseScore - a.baseScore;
        }
        if (Boolean(a.matchesDomain) !== Boolean(b.matchesDomain)) {
            return a.matchesDomain ? -1 : 1;
        }
        const aIsRole = a.email.emailType === 'role';
        const bIsRole = b.email.emailType === 'role';
        if (aIsRole !== bIsRole) {
            return aIsRole ? -1 : 1;
        }
        return 0;
    };
    candidates.sort(byRank);
    const [winner] = candidates;
    // Publish the computed final score on the returned discovery
    winner.email.confidenceScore = winner.confidence;
    return winner.email;
}

package/dist/sitemap.js ADDED Viewed

@@ -0,0 +1,75 @@
+import { load } from 'cheerio';
+import { Logger } from './logger.js';
/** Deepest level of nested sitemap indexes whose children are followed (the entry sitemap is level 0). */
const MAX_SITEMAP_DEPTH = 3;
/** Maximum number of child sitemaps read from a single sitemap index. */
const MAX_SUB_SITEMAPS = 5;
/** How long results stay cached: 24 hours, in milliseconds. */
const SITEMAP_CACHE_TTL_MS = 86400000;
/**
 * Fetches and recursively parses a sitemap URL. If it's a sitemap index, it parses
 * sub-sitemaps up to a limit. Returns discovered URLs.
 *
 * Results, including the empty result of a failed fetch, are cached under the
 * sitemap URL for 24 hours. Fetch and parse errors are logged, not thrown.
 * A sitemap that was already visited during the same call is not fetched
 * again, so an index that lists itself (or a loop of indexes) terminates.
 *
 * @param sitemapUrl - Absolute URL to the XML sitemap.
 * @param cache - Cache instance to store results; must provide `get(key)` and
 *                `set(key, value, ttlMs)`, both of which are awaited.
 * @param maxUrls - Maximum number of URLs to extract per sitemap to prevent memory overload.
 * @returns Array of the `<loc>` values found, at most `maxUrls` long
 *          (or whatever truthy value the cache returned).
 */
export async function parseSitemap(sitemapUrl, cache, maxUrls = 500) {
    return crawlSitemap(sitemapUrl, cache, maxUrls, new Set(), 0);
}
/**
 * Recursive worker behind parseSitemap; `visited` holds every sitemap URL
 * already started in this crawl and `depth` is the current nesting level.
 */
async function crawlSitemap(sitemapUrl, cache, maxUrls, visited, depth) {
    const cached = await cache.get(sitemapUrl);
    if (cached) {
        return cached;
    }
    // Mark before fetching: the cache entry is only written once this level finishes,
    // so without this a self-referencing index would recurse without end.
    visited.add(sitemapUrl);
    const urls = [];
    let host = '';
    try {
        host = new URL(sitemapUrl).hostname;
    }
    catch (_e) {
        host = sitemapUrl;
    }
    try {
        const response = await fetch(sitemapUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) mailpop/1.0',
                Accept: 'application/xml, text/xml, application/xhtml+xml, */*',
            },
            signal: AbortSignal.timeout(15000), // 15s timeout
        });
        if (response.ok) {
            const xml = await response.text();
            // Use xmlMode: true for Cheerio to parse XML tags correctly
            const $ = load(xml, { xmlMode: true });
            // 1. Check if it's a sitemap index (contains <sitemap> tags)
            const sitemaps = $('sitemap');
            if (sitemaps.length > 0) {
                const nestedUrls = [];
                // Follow only the first few children, and none at all past the depth cap
                const subSitemapsLimit = depth < MAX_SITEMAP_DEPTH ? Math.min(sitemaps.length, MAX_SUB_SITEMAPS) : 0;
                for (let i = 0; i < subSitemapsLimit && nestedUrls.length < maxUrls; i++) {
                    const loc = $(sitemaps[i]).find('loc').text().trim();
                    if (!loc || visited.has(loc)) {
                        continue;
                    }
                    const nested = await crawlSitemap(loc, cache, maxUrls, visited, depth + 1);
                    nestedUrls.push(...nested);
                }
                const finalNested = nestedUrls.slice(0, maxUrls);
                await cache.set(sitemapUrl, finalNested, SITEMAP_CACHE_TTL_MS);
                return finalNested;
            }
            // 2. Otherwise it's a normal sitemap (contains <url> tags)
            $('url').each((_, element) => {
                if (urls.length >= maxUrls) {
                    // Returning false stops Cheerio's each() instead of visiting every remaining node
                    return false;
                }
                const loc = $(element).find('loc').text().trim();
                if (loc) {
                    urls.push(loc);
                }
                return undefined;
            });
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.info('sitemap-fetch-skip', host, undefined, 'Skipped', `Failed to parse sitemap: ${errorMsg}`);
    }
    const finalUrls = urls.slice(0, maxUrls);
    await cache.set(sitemapUrl, finalUrls, SITEMAP_CACHE_TTL_MS);
    return finalUrls;
}

package/dist/types/crawler.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export {};

package/dist/types/csv.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export {};

package/dist/types/email.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export {};

package/dist/utils/delay.js ADDED Viewed

@@ -0,0 +1,16 @@
/**
 * Returns a promise that resolves (with no value) after the given time.
 * @param ms - Number of milliseconds to wait.
 */
export function delay(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
/**
 * Waits for a random whole number of milliseconds between min and max (both inclusive).
 * @param min - Minimum delay in milliseconds.
 * @param max - Maximum delay in milliseconds.
 * @returns The promise returned by delay() for the chosen duration.
 */
export function getRandomDelay(min, max) {
    const span = max - min + 1;
    const offset = Math.floor(Math.random() * span);
    return delay(offset + min);
}

package/dist/utils/errors.js ADDED Viewed

@@ -0,0 +1,30 @@
/**
 * Base class for the package's errors. `name` is set to the concrete
 * subclass name so it shows up in stack traces and can be checked by callers.
 */
export class MailPopError extends Error {
    /**
     * @param message - Human-readable error description.
     */
    constructor(message) {
        super(message);
        this.name = this.constructor.name;
        // captureStackTrace is a V8 extension; call it only where it exists so
        // constructing an error never throws on other engines.
        Error.captureStackTrace?.(this, this.constructor);
    }
}
/**
 * Error for a page that could not be loaded; carries the URL and, when
 * provided, the HTTP status code.
 */
export class PageLoadError extends MailPopError {
    statusCode;
    url;
    /**
     * @param message - Human-readable error description.
     * @param url - URL of the page that failed to load.
     * @param statusCode - Status code associated with the failure, if any.
     */
    constructor(message, url, statusCode) {
        super(message);
        this.url = url;
        this.statusCode = statusCode;
    }
}
/**
 * PageLoadError whose status code is always 429.
 */
export class RateLimitError extends PageLoadError {
    /**
     * @param message - Human-readable error description.
     * @param url - URL of the request this error is about.
     */
    constructor(message, url) {
        super(message, url, 429);
    }
}
/**
 * Error for a crawl that ran out of time; carries the domain and a duration
 * in milliseconds.
 */
export class CrawlTimeoutError extends MailPopError {
    domain;
    durationMs;
    /**
     * @param message - Human-readable error description.
     * @param domain - Domain that was being crawled.
     * @param durationMs - Duration in milliseconds associated with the timeout.
     */
    constructor(message, domain, durationMs) {
        super(message);
        this.domain = domain;
        this.durationMs = durationMs;
    }
}