mailpop 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,95 @@
1
/**
 * Normalizes a scraped email address.
 *
 * Steps, in order:
 * - Trims whitespace and converts to lowercase
 * - Removes a leading 'mailto:' prefix if present
 * - Cuts everything from the first '?' (query parameters, e.g. ?subject=...)
 * - Cuts everything from the first '#' (hash/fragment, e.g. #footer)
 * - Percent-decodes the remainder (kept as is when the encoding is malformed)
 * - Lowercases again, because decoding can yield uppercase letters ('%4A' -> 'J')
 * - Strips leading/trailing periods, quotes and whitespace
 *
 * @param {string} email - Raw address, possibly taken from a mailto: link.
 * @returns {string} The cleaned address, or '' when `email` is not a string.
 */
export function normalizeEmail(email) {
    if (typeof email !== 'string') {
        return '';
    }
    let cleaned = email.trim().toLowerCase();
    if (cleaned.startsWith('mailto:')) {
        cleaned = cleaned.substring('mailto:'.length);
    }
    // Remove query parameters
    const qMarkIndex = cleaned.indexOf('?');
    if (qMarkIndex !== -1) {
        cleaned = cleaned.substring(0, qMarkIndex);
    }
    // Remove hash/fragment
    const hashIndex = cleaned.indexOf('#');
    if (hashIndex !== -1) {
        cleaned = cleaned.substring(0, hashIndex);
    }
    try {
        // Decoded escapes may reintroduce uppercase characters, so lowercase the result.
        cleaned = decodeURIComponent(cleaned).toLowerCase();
    }
    catch (_e) {
        // Malformed percent-encoding: keep the undecoded text.
    }
    // Remove leading/trailing periods, quotes, or whitespace that might be captured in scraping
    return cleaned.replace(/^['".\s]+|['".\s]+$/g, '');
}
36
/**
 * Normalizes a URL to ensure it has a protocol and is formatted consistently.
 *
 * The input is trimmed; when it does not start with http:// or https://
 * (case-insensitive), 'https://' is prepended. The result is then passed
 * through the WHATWG URL parser and its serialized `href` is returned.
 *
 * @param {string} url - The input URL string.
 * @returns {string} The serialized URL; the trimmed (and possibly prefixed)
 *   text when it cannot be parsed; '' for blank or non-string input.
 */
export function normalizeUrl(url) {
    // Blank input already maps to ''; treat null/undefined/non-strings the same way.
    if (typeof url !== 'string') {
        return '';
    }
    const trimmed = url.trim();
    if (!trimmed) {
        return '';
    }
    const withProtocol = /^https?:\/\//i.test(trimmed) ? trimmed : 'https://' + trimmed;
    try {
        return new URL(withProtocol).href;
    }
    catch (_e) {
        // Best effort: hand back the unparsed text rather than failing.
        return withProtocol;
    }
}
56
/**
 * Normalizes a domain name from a URL or raw string.
 *
 * The input is trimmed and lowercased. http(s) URLs are reduced to their
 * hostname via the URL parser (falling back to stripping the scheme when
 * parsing fails). A leading 'www.' is removed, anything from the first
 * '/', '?' or '#' is cut, and a trailing ':port' is dropped so that
 * scheme-less input such as 'example.com:8080/path' yields the same result
 * as 'https://example.com:8080/path'.
 *
 * @param {string} domainOrUrl - Input string.
 * @returns {string} The bare host name, or '' for non-string input.
 */
export function normalizeDomain(domainOrUrl) {
    if (typeof domainOrUrl !== 'string') {
        return '';
    }
    let cleaned = domainOrUrl.trim().toLowerCase();
    if (/^https?:\/\//.test(cleaned)) {
        try {
            cleaned = new URL(cleaned).hostname;
        }
        catch (_e) {
            // Unparseable URL: strip the scheme and continue with the manual cleanup below.
            cleaned = cleaned.replace(/^https?:\/\//, '');
        }
    }
    cleaned = cleaned.replace(/^www\./, '');
    // Cut path, query string and fragment.
    const hostEnd = cleaned.search(/[/?#]/);
    if (hostEnd !== -1) {
        cleaned = cleaned.substring(0, hostEnd);
    }
    // Drop a port suffix. Hosts containing other colons or brackets
    // (IPv6 literals) are left untouched.
    return cleaned.replace(/^([^:[\]]+):\d+$/, '$1');
}
79
/**
 * Searches a raw CSV row record for a column representing the target website URL.
 *
 * Column names are lowercased and trimmed, then matched by substring against
 * 'website', 'url', 'domain', 'site' and 'web'. Columns are visited in the
 * row's own key order and the first matching column holding a non-blank
 * string wins. Non-string cell values are skipped.
 *
 * @param {object} row - Record mapping column names to cell values.
 * @returns {string|null} The trimmed cell value, or null when no matching
 *   column holds a non-blank string (or `row` is not an object).
 */
export function findWebsiteInRow(row) {
    if (row === null || typeof row !== 'object') {
        return null;
    }
    const candidateKeys = ['website', 'url', 'domain', 'site', 'web'];
    for (const [key, val] of Object.entries(row)) {
        const normalizedKey = key.toLowerCase().trim();
        if (!candidateKeys.some((candidate) => normalizedKey.includes(candidate))) {
            continue;
        }
        // Only strings can be trimmed; skip numbers, objects, null, etc.
        if (typeof val !== 'string') {
            continue;
        }
        const trimmed = val.trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return null;
}
@@ -0,0 +1,29 @@
1
+ import { delay } from './delay.js';
2
/**
 * Retries an asynchronous operation with exponential backoff.
 *
 * After the n-th failure the wait is `initialDelayMs * 2^(n-1)`, capped at
 * `maxDelayMs`, with +/- 10% random jitter, and is passed to `delay`
 * (imported from './delay.js'). Once more than `maxRetries` failures have
 * occurred the last error is rethrown.
 *
 * @param {Function} operation - The asynchronous function to execute.
 * @param {object} [options] - Configuration for retry count and delays.
 * @param {number} [options.maxRetries=3] - Retries allowed after the first attempt.
 * @param {number} [options.initialDelayMs=1000] - Base delay before the first retry.
 * @param {number} [options.maxDelayMs=30000] - Upper bound for the un-jittered delay.
 * @param {Function} [options.onRetry] - Called as `onRetry(err, attempt)` before each wait.
 * @returns {Promise<*>} Resolves with the first successful result of `operation`.
 * @throws {Error} The last failure; non-Error throwables are wrapped in an
 *   Error whose `cause` is the original value.
 */
export async function retryWithBackoff(operation, options) {
    // Defaults keep a missing options object or field from producing
    // unbounded retries or a NaN delay.
    const { maxRetries = 3, initialDelayMs = 1000, maxDelayMs = 30000 } = options || {};
    let attempt = 0;
    while (true) {
        try {
            return await operation();
        }
        catch (error) {
            attempt++;
            const err = error instanceof Error
                ? error
                : new Error(String(error), { cause: error });
            if (attempt > maxRetries) {
                throw err;
            }
            const backoffDelay = Math.min(initialDelayMs * Math.pow(2, attempt - 1), maxDelayMs);
            // Add a small jitter (+/- 10%) to prevent thundering herd
            const jitter = (Math.random() - 0.5) * 0.2 * backoffDelay;
            const finalDelay = Math.max(0, backoffDelay + jitter);
            if (options && options.onRetry) {
                // Invoked on the options object so a method-style callback keeps its `this`.
                options.onRetry(err, attempt);
            }
            await delay(finalDelay);
        }
    }
}
@@ -0,0 +1,85 @@
1
+ import { normalizeDomain } from './normalize.js';
2
// Coarse shape check: one '@', a dotted domain and an alphabetic TLD of 2+ letters.
const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
// Local parts rejected on exact match.
const REJECTED_PREFIXES = [
    'noreply',
    'no-reply',
    'donotreply',
    'do-not-reply',
    'mailer-daemon',
    'postmaster',
    'abuse',
    'security',
    'spam',
];
// Local parts rejected when they merely start with one of these (e.g. 'noreply-billing').
const REJECTED_LOCAL_STARTS = ['noreply', 'no-reply', 'donotreply', 'do-not-reply'];
// Placeholder and disposable domains; subdomains of these are rejected as well.
const REJECTED_DOMAINS = [
    'example.com',
    'example.org',
    'example.net',
    'test.com',
    'testing.com',
    'domain.com',
    'tempmail.com',
    'yopmail.com',
    'mailinator.com',
    'sharklasers.com',
    'guerrillamail.com',
    'dispostable.com',
    'getairmail.com',
    '10minutemail.com',
];
// Domains containing any of these substrings are treated as disposable.
const REJECTED_DOMAIN_FRAGMENTS = ['tempmail', 'mailinator', 'yopmail'];
// Top-level domains that never belong to a reachable public mailbox.
const REJECTED_TLDS = ['local', 'temp', 'example', 'test', 'invalid', 'localhost'];
/**
 * Validates whether a string has a valid email format and is not in the blacklist.
 *
 * Beyond the regex shape check this rejects:
 * - empty dot-separated labels in either part ('john..doe@', '.john@', 'a@b..com')
 * - domain labels that start or end with a hyphen
 * - blacklisted local parts (exact match, or starting with a no-reply variant)
 * - blacklisted domains and their subdomains, and known disposable-mail fragments
 * - domains starting with 'test.' and reserved/placeholder top-level domains
 *
 * @param {string} email - The email to check.
 * @returns {boolean} true when the address passes every check.
 */
export function isValidEmail(email) {
    if (typeof email !== 'string' || !EMAIL_REGEX.test(email)) {
        return false;
    }
    // The regex character classes exclude '@', so exactly one separator exists here.
    const atIndex = email.indexOf('@');
    const localPart = email.slice(0, atIndex).toLowerCase();
    const domainPart = email.slice(atIndex + 1).toLowerCase();
    const localLabels = localPart.split('.');
    const domainLabels = domainPart.split('.');
    // Leading, trailing or consecutive dots leave an empty label behind.
    if (localLabels.includes('') || domainLabels.includes('')) {
        return false;
    }
    if (domainLabels.some((label) => label.startsWith('-') || label.endsWith('-'))) {
        return false;
    }
    // Reject blacklisted prefixes
    if (REJECTED_PREFIXES.includes(localPart) ||
        REJECTED_LOCAL_STARTS.some((prefix) => localPart.startsWith(prefix))) {
        return false;
    }
    // Reject blacklisted domains, including their subdomains
    if (REJECTED_DOMAINS.some((domain) => domainPart === domain || domainPart.endsWith(`.${domain}`))) {
        return false;
    }
    // Simple heuristics for temporary or obviously fake emails
    if (REJECTED_DOMAIN_FRAGMENTS.some((fragment) => domainPart.includes(fragment)) ||
        domainPart.startsWith('test.')) {
        return false;
    }
    // Exclude obviously invalid top level domains
    const tld = domainLabels[domainLabels.length - 1];
    return !REJECTED_TLDS.includes(tld);
}
72
/**
 * Checks if the email domain matches the target company domain.
 *
 * Both sides are passed through `normalizeDomain` (imported from
 * './normalize.js'). The email matches when its domain equals the target
 * domain or is a subdomain of it.
 *
 * @param {string} email - The email to verify.
 * @param {string} targetDomainOrUrl - The target website/domain.
 * @returns {boolean} true on a match; false when either argument is not a
 *   string, the email does not contain exactly one '@', or either
 *   normalized domain is empty.
 */
export function isDomainMatch(email, targetDomainOrUrl) {
    if (typeof email !== 'string' || typeof targetDomainOrUrl !== 'string') {
        return false;
    }
    const parts = email.split('@');
    if (parts.length !== 2) {
        return false;
    }
    const emailDomain = normalizeDomain(parts[1]);
    const targetDomain = normalizeDomain(targetDomainOrUrl);
    // Two empty domains would otherwise compare equal and report a match.
    if (!emailDomain || !targetDomain) {
        return false;
    }
    return emailDomain === targetDomain || emailDomain.endsWith(`.${targetDomain}`);
}
package/package.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "name": "mailpop",
3
+ "version": "1.0.0",
4
+ "description": "Production-ready public contact email discovery tool from company websites.",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "bin": {
8
+ "mailpop": "dist/index.js"
9
+ },
10
+ "files": [
11
+ "dist"
12
+ ],
13
+ "scripts": {
14
+ "dev": "tsx src/index.ts",
15
+ "build": "tsc",
16
+ "start": "node dist/index.js",
17
+ "lint": "eslint src",
18
+ "format": "prettier --write \"src/**/*.ts\"",
19
+ "typecheck": "tsc --noEmit"
20
+ },
21
+ "keywords": [
22
+ "email-hunter",
23
+ "lead-enrichment",
24
+ "playwright",
25
+ "cheerio",
26
+ "crawler",
27
+ "scraper"
28
+ ],
29
+ "author": "Antigravity",
30
+ "license": "MIT",
31
+ "dependencies": {
32
+ "cheerio": "^1.0.0-rc.12",
33
+ "dotenv": "^16.4.5",
34
+ "fast-csv": "^5.0.1",
35
+ "p-limit": "^6.1.0",
36
+ "playwright": "^1.49.0"
37
+ },
38
+ "devDependencies": {
39
+ "@types/node": "^22.10.1",
40
+ "eslint": "^9.16.0",
41
+ "typescript-eslint": "^8.16.0",
42
+ "prettier": "^3.4.1",
43
+ "tsx": "^4.19.2",
44
+ "typescript": "^5.7.2"
45
+ }
46
+ }