mailpop 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +195 -0
- package/dist/cache.js +81 -0
- package/dist/config.js +34 -0
- package/dist/crawler.js +280 -0
- package/dist/csv.js +115 -0
- package/dist/extractor.js +290 -0
- package/dist/index.js +247 -0
- package/dist/link-discovery.js +126 -0
- package/dist/logger.js +82 -0
- package/dist/robots.js +90 -0
- package/dist/scorer.js +170 -0
- package/dist/sitemap.js +75 -0
- package/dist/types/crawler.js +1 -0
- package/dist/types/csv.js +1 -0
- package/dist/types/email.js +1 -0
- package/dist/utils/delay.js +16 -0
- package/dist/utils/errors.js +30 -0
- package/dist/utils/normalize.js +95 -0
- package/dist/utils/retry.js +29 -0
- package/dist/utils/validators.js +85 -0
- package/package.json +46 -0
package/dist/csv.js
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import * as fs from 'fs';
|
|
2
|
+
import fsPromises from 'fs/promises';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import * as csv from 'fast-csv';
|
|
5
|
+
import { Logger } from './logger.js';
|
|
6
|
+
/**
 * Streams the rows of an input CSV file as an async generator.
 *
 * The file is read and parsed incrementally, so rows are produced one at a
 * time instead of loading the whole file into memory.
 *
 * @param filePath - Path to the input CSV file.
 * @yields Objects of the form `{ row, index }` where `row` is the record
 *         produced by the CSV parser and `index` is a zero-based counter of
 *         the rows yielded so far.
 * @throws Error if the file does not exist, or if reading/parsing fails.
 */
export async function* readCsvGenerator(filePath) {
    if (!fs.existsSync(filePath)) {
        throw new Error(`Input CSV file not found: ${filePath}`);
    }
    const source = fs.createReadStream(filePath);
    const parser = csv.parse({
        headers: true,
        trim: true,
        discardUnmappedColumns: false,
    });
    // pipe() does not forward errors from the source to the destination.
    // Without this handler a read failure (EISDIR, EACCES, ...) would be an
    // unhandled 'error' event; destroying the parser with the error makes the
    // for-await loop below reject instead.
    source.once('error', (err) => parser.destroy(err));
    source.pipe(parser);
    let index = 0;
    try {
        for await (const row of parser) {
            yield { row, index };
            index++;
        }
    }
    finally {
        // Runs on normal completion, on errors, and when the consumer stops
        // iterating early; releases the underlying file descriptor.
        source.destroy();
    }
}
|
|
26
|
+
/**
 * Reads only the header row of a CSV file.
 * Helper to dynamically extract the input schema.
 *
 * @param filePath - Path to the CSV file.
 * @returns Promise resolving to the header names reported by the parser.
 * @throws Error if the file does not exist, cannot be read or parsed, or
 *         ends without producing a header row.
 */
export async function getCsvHeaders(filePath) {
    if (!fs.existsSync(filePath)) {
        throw new Error(`Input CSV file not found: ${filePath}`);
    }
    return new Promise((resolve, reject) => {
        const source = fs.createReadStream(filePath);
        const parser = csv.parse({ headers: true });
        // pipe() does not forward source errors, so handle them explicitly.
        source.on('error', (err) => {
            source.destroy();
            reject(err);
        });
        parser
            .on('headers', (headers) => {
                // The header row is all we need; stop reading the file.
                source.destroy();
                resolve(headers);
            })
            .on('error', (err) => {
                source.destroy();
                reject(err);
            })
            .on('end', () => {
                // Only reached when the input ended without a 'headers'
                // event (e.g. an empty file). If the promise has already
                // been resolved this rejection is a no-op.
                reject(new Error(`Input CSV file has no header row: ${filePath}`));
            });
        source.pipe(parser);
        // Put the parser in flowing mode so it can reach 'end' on inputs
        // that never produce a header row; parsed rows are discarded.
        parser.resume();
    });
}
|
|
48
|
+
/**
 * Appends a single, CSV-escaped row to the output file.
 *
 * The row is serialised to a string first and then written with a single
 * appendFile call. The parent directory is created when missing.
 *
 * @param filePath - Path to the output CSV file.
 * @param row - Output row data.
 * @param headers - Complete ordered headers array for output alignment.
 * @param writeHeader - Whether to prefix the row with a header line.
 * @throws Propagates serialisation errors and appendFile failures.
 */
export async function appendCsvRow(filePath, row, headers, writeHeader = false) {
    try {
        await fsPromises.mkdir(path.dirname(filePath), { recursive: true });
    }
    catch (_e) {
        // Best effort only: if the directory really is unusable, the
        // appendFile call below fails and reports the problem to the caller.
    }
    // writeToString already returns a promise, so it is awaited directly.
    const csvLine = await csv.writeToString([row], {
        headers,
        includeEndRowDelimiter: true,
        writeHeaders: writeHeader,
    });
    await fsPromises.appendFile(filePath, csvLine, 'utf-8');
}
|
|
75
|
+
/**
 * Loads checkpoint progress from disk.
 *
 * A checkpoint is accepted only when it is a JSON object whose
 * `lastProcessedIndex` is an integer >= -1 and whose `completedUrls` is an
 * array. Anything else (missing file, unreadable file, invalid JSON, wrong
 * shape) is treated as "no checkpoint".
 *
 * @param filePath - Path to the checkpoint JSON file.
 * @returns The parsed checkpoint object, or null if missing/corrupt.
 */
export async function loadCheckpoint(filePath) {
    try {
        const content = await fsPromises.readFile(filePath, 'utf-8');
        const data = JSON.parse(content);
        // A fractional or out-of-range index would break row skipping and
        // contiguous-index tracking on resume, so it counts as corrupt.
        const isValid = data !== null &&
            typeof data === 'object' &&
            Number.isInteger(data.lastProcessedIndex) &&
            data.lastProcessedIndex >= -1 &&
            Array.isArray(data.completedUrls);
        return isValid ? data : null;
    }
    catch (_e) {
        // Missing/unreadable file or malformed JSON: start without a checkpoint.
        return null;
    }
}
|
|
91
|
+
/**
 * Saves checkpoint progress to disk.
 *
 * The JSON is written to a uniquely named temporary file next to the target
 * and then renamed over it, so an interrupted write cannot leave a truncated
 * checkpoint behind. Failures are logged rather than thrown, so a checkpoint
 * problem never aborts the caller.
 *
 * @param filePath - Destination path of the checkpoint file.
 * @param data - JSON-serialisable checkpoint payload.
 */
export async function saveCheckpoint(filePath, data) {
    const dir = path.dirname(filePath);
    // Unique per call so overlapping saves never share a temporary file.
    const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
    const tempPath = `${filePath}.${uniqueSuffix}.tmp`;
    try {
        await fsPromises.mkdir(dir, { recursive: true });
        await fsPromises.writeFile(tempPath, JSON.stringify(data, null, 2), 'utf-8');
        await fsPromises.rename(tempPath, filePath);
    }
    catch (err) {
        // Remove a leftover temporary file; it may not exist, so ignore errors.
        await fsPromises.unlink(tempPath).catch(() => { });
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('checkpoint-save-fail', undefined, undefined, `Failed to save checkpoint: ${errorMsg}`);
    }
}
|
|
105
|
+
/**
 * Deletes the checkpoint file (used upon successful completion of run).
 *
 * A missing file is not an error. Any other failure is logged instead of
 * thrown: a stale checkpoint that could not be removed would make the next
 * run resume, so the problem must at least be visible in the logs.
 *
 * @param filePath - Path to the checkpoint file.
 */
export async function clearCheckpoint(filePath) {
    try {
        await fsPromises.unlink(filePath);
    }
    catch (err) {
        if (err && err.code === 'ENOENT') {
            // Nothing to delete.
            return;
        }
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('checkpoint-clear-fail', undefined, undefined, `Failed to clear checkpoint: ${errorMsg}`);
    }
}
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
import { normalizeEmail } from './utils/normalize.js';
|
|
3
|
+
import { isValidEmail } from './utils/validators.js';
|
|
4
|
+
// Plain e-mail addresses embedded in arbitrary text. The pattern carries the
// global flag, so exec() is stateful through lastIndex.
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Textually obfuscated addresses such as "name [at] domain [dot] com",
// "name(at)domain(dot)com" or "name AT domain DOT com" (case-insensitive).
// Capture groups: 1 = local part, 2 = domain, 3 = top-level domain.
const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,})/gi;
// Runs of 12-80 base64 alphabet characters (optionally padded) that may hide
// an encoded address.
const BASE64_CANDIDATE_REGEX = /\b[a-zA-Z0-9+/]{12,80}={0,2}\b/g;
|
|
10
|
+
/**
 * Decodes a Cloudflare email protection hex string.
 *
 * The first byte (two hex digits) is the XOR key; every following byte is
 * XOR-ed with that key to recover one character of the address.
 *
 * @param hex - Hex payload taken from `data-cfemail` or the protection URL.
 * @returns The decoded string, or '' when the payload is not a string, is
 *          not made of complete hex byte pairs, or contains only the key.
 */
export function decodeCloudflareEmail(hex) {
    // Validate up front: parseInt() yields NaN for bad digits and NaN ^ key
    // evaluates to key, which would silently produce garbage characters.
    if (typeof hex !== 'string' || !/^(?:[0-9a-fA-F]{2}){2,}$/.test(hex)) {
        return '';
    }
    const key = parseInt(hex.slice(0, 2), 16);
    let email = '';
    for (let i = 2; i < hex.length; i += 2) {
        email += String.fromCharCode(parseInt(hex.slice(i, i + 2), 16) ^ key);
    }
    return email;
}
|
|
27
|
+
/**
 * Decodes numeric HTML character references (e.g. &#x63; or &#99;).
 * Cheerio generally decodes these, but this serves as a fallback.
 *
 * Code points above U+FFFF are decoded correctly. References that do not
 * denote a valid Unicode code point are left untouched.
 *
 * @param text - Text that may contain numeric character references.
 * @returns The text with valid numeric references replaced by their characters.
 */
export function decodeUnicodeEntities(text) {
    const decode = (entity, digits, radix) => {
        const codePoint = parseInt(digits, radix);
        // String.fromCodePoint throws a RangeError above U+10FFFF.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    };
    return text
        .replace(/&#[xX]([0-9a-fA-F]+);/g, (entity, hex) => decode(entity, hex, 16))
        .replace(/&#([0-9]+);/g, (entity, dec) => decode(entity, dec, 10));
}
|
|
36
|
+
/**
 * Finds base64-looking tokens in the text, decodes each one as UTF-8 and
 * keeps those whose normalized form passes e-mail validation.
 *
 * @param text - Text to scan for base64 candidates.
 * @returns Normalized e-mail addresses, in order of appearance.
 */
export function extractBase64Emails(text) {
    // The candidate regex is global and shared; always scan from the start.
    BASE64_CANDIDATE_REGEX.lastIndex = 0;
    const found = [];
    for (const [candidate] of String(text).matchAll(BASE64_CANDIDATE_REGEX)) {
        try {
            const address = normalizeEmail(Buffer.from(candidate, 'base64').toString('utf-8'));
            if (isValidEmail(address)) {
                found.push(address);
            }
        }
        catch (_e) {
            // Candidate could not be decoded or normalized; skip it.
        }
    }
    return found;
}
|
|
58
|
+
/**
 * Derives the page type (discovery method) from the path of a URL.
 *
 * The lower-cased path is checked for the keywords "contact", "about" and
 * "sitemap", in that order; the first hit wins. URLs that cannot be parsed,
 * or that match no keyword, are reported as 'general-page'.
 */
function getDiscoveryMethod(url) {
    const fallback = 'general-page';
    // Ordered: earlier entries take precedence.
    const keywordToMethod = [
        ['contact', 'contact-page'],
        ['about', 'about-page'],
        ['sitemap', 'sitemap'],
    ];
    let pathname;
    try {
        pathname = new URL(url).pathname.toLowerCase();
    }
    catch (_e) {
        return fallback;
    }
    const hit = keywordToMethod.find(([keyword]) => pathname.includes(keyword));
    return hit ? hit[1] : fallback;
}
|
|
76
|
+
/**
 * Classifies an email as 'role', 'personal', or 'automated'.
 *
 * The local part (text before '@') is lower-cased and compared with two
 * prefix lists. A prefix matches when the local part equals it, or starts
 * with it followed by one of the separators '.', '-', '_' or '+'. Automated
 * prefixes are checked before role prefixes; anything else is 'personal'.
 *
 * @param email - E-mail address to classify.
 * @returns 'automated', 'role' or 'personal'.
 */
export function classifyEmailType(email) {
    const localPart = email.split('@')[0].toLowerCase();
    const rolePrefixes = [
        'contact',
        'info',
        'hello',
        'support',
        'sales',
        'partnerships',
        'partnership',
        'business',
        'team',
        'founder',
        'ceo',
        'media',
        'press',
        'jobs',
        'careers',
        'admin',
        'office',
        'help',
        'inquiries',
        'inquiry',
        'hi',
        'welcome',
        'hr',
        'marketing',
        'privacy',
        'legal',
        'billing',
        'finance',
    ];
    const automatedPrefixes = [
        'noreply',
        'no-reply',
        'donotreply',
        'do-not-reply',
        'mailer-daemon',
        'postmaster',
        'abuse',
        'security',
        'spam',
        'bot',
        'system',
        'notification',
        'notifications',
        'alert',
        'alerts',
    ];
    // Both lists use the same separator set so that "support-team" and
    // "noreply.news" are recognised just like "support.team" and "noreply-news".
    const separators = ['.', '-', '_', '+'];
    const matchesAny = (prefixes) => prefixes.some((prefix) => localPart === prefix ||
        (localPart.startsWith(prefix) && separators.includes(localPart.charAt(prefix.length))));
    if (matchesAny(automatedPrefixes)) {
        return 'automated';
    }
    if (matchesAny(rolePrefixes)) {
        return 'role';
    }
    return 'personal';
}
|
|
136
|
+
/**
 * Collects the unique e-mail addresses found in an HTML document.
 *
 * Sources are scanned in a fixed order: Cloudflare-protected values, mailto
 * links, header sections, footer sections, meta tags, scripts, and finally
 * the body text. Each candidate is normalized and validated. When the same
 * address is seen more than once, the entry with the strictly higher
 * preliminary confidence is kept.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was fetched from.
 * @param pageTitle - Page title, copied into each result's metadata.
 * @param crawlDurationMs - Crawl duration, copied into each result's metadata.
 * @returns One result object per unique address, in first-seen order.
 */
export function extractEmails(html, url, pageTitle, crawlDurationMs) {
    const $ = load(html);
    const pageMethod = getDiscoveryMethod(url);
    const found = new Map();
    // Preliminary confidence per source type; unlisted types start at 60.
    // Final confidence is refined in scorer.ts
    const baseConfidence = new Map([
        ['footer', 85],
        ['header', 85],
        ['mailto', 90],
        ['obfuscated', 50],
        ['script', 40],
    ]);
    // Normalizes, validates and stores one candidate address.
    const record = (rawEmail, sourceType, obfuscatedMethod) => {
        const email = normalizeEmail(rawEmail);
        if (!isValidEmail(email)) {
            return;
        }
        const emailType = classifyEmailType(email);
        const discoveryTimestamp = new Date().toISOString();
        let score = baseConfidence.has(sourceType) ? baseConfidence.get(sourceType) : 60;
        if (obfuscatedMethod) {
            score -= 10; // Obfuscated discovery has slightly lower confidence
        }
        let method = pageMethod;
        if (obfuscatedMethod === 'cloudflare') {
            method = 'obscure-js';
        }
        else if (sourceType === 'mailto') {
            method = 'mailto-link';
        }
        const candidate = {
            email,
            emailSource: url,
            emailType,
            confidenceScore: Math.max(10, Math.min(100, score)),
            discoveryMethod: method,
            metadata: {
                sourceUrl: url,
                sourceType,
                pageTitle,
                discoveryTimestamp,
                crawlDurationMs,
            },
        };
        const previous = found.get(email);
        if (previous && previous.confidenceScore >= candidate.confidenceScore) {
            return;
        }
        found.set(email, candidate);
    };
    // Records every plain address occurring in the text.
    const scanPlain = (text, sourceType) => {
        EMAIL_REGEX.lastIndex = 0;
        for (const [address] of text.matchAll(EMAIL_REGEX)) {
            record(address, sourceType);
        }
    };
    // Records every "name [at] domain [dot] tld" style address in the text.
    const scanObfuscated = (text, sourceType) => {
        OBFUSCATED_REGEX.lastIndex = 0;
        for (const [, user, host, tld] of text.matchAll(OBFUSCATED_REGEX)) {
            record(`${user}@${host}.${tld}`, sourceType, 'text-obfuscation');
        }
    };
    // 1. Cloudflare email protection: data-cfemail attributes...
    $('[data-cfemail]').each((_, el) => {
        const hex = $(el).attr('data-cfemail');
        const decoded = hex ? decodeCloudflareEmail(hex) : '';
        if (decoded) {
            record(decoded, 'obfuscated', 'cloudflare');
        }
    });
    // ...and links pointing at the email-protection endpoint.
    $('a[href*="/cdn-cgi/l/email-protection#"]').each((_, el) => {
        const href = $(el).attr('href') || '';
        const hit = /\/cdn-cgi\/l\/email-protection#([a-fA-F0-9]+)/.exec(href);
        const decoded = hit && hit[1] ? decodeCloudflareEmail(hit[1]) : '';
        if (decoded) {
            record(decoded, 'obfuscated', 'cloudflare');
        }
    });
    // 2. Mailto links (the whole href is handed to the normalizer)
    $('a[href^="mailto:"]').each((_, el) => {
        const href = $(el).attr('href');
        if (href) {
            record(href, 'mailto');
        }
    });
    // 3 + 4. Header sections first, then footer sections
    const sections = [
        ['header, [id*="header"], [class*="header"]', 'header'],
        ['footer, [id*="footer"], [class*="footer"]', 'footer'],
    ];
    for (const [selector, sourceType] of sections) {
        const section = $(selector);
        if (section.length > 0) {
            scanPlain(decodeUnicodeEntities(section.text()), sourceType);
        }
    }
    // 5. Meta tags
    $('meta').each((_, el) => {
        const content = $(el).attr('content');
        if (content) {
            scanPlain(decodeUnicodeEntities(content), 'meta');
        }
    });
    // 6. Scripts (JSON-LD, Inline JavaScript): plain, obfuscated, then base64
    $('script').each((_, el) => {
        const scriptContent = $(el).html();
        if (!scriptContent) {
            return;
        }
        const decodedScript = decodeUnicodeEntities(scriptContent);
        scanPlain(decodedScript, 'script');
        scanObfuscated(decodedScript, 'script');
        for (const b64Email of extractBase64Emails(decodedScript)) {
            record(b64Email, 'script', 'base64');
        }
    });
    // 7. Visible body text: plain matches first, then obfuscated ones
    const bodyText = decodeUnicodeEntities($('body').text());
    scanPlain(bodyText, 'text');
    scanObfuscated(bodyText, 'text');
    return [...found.values()];
}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { config } from './config.js';
|
|
3
|
+
import { readCsvGenerator, appendCsvRow, loadCheckpoint, saveCheckpoint, clearCheckpoint, getCsvHeaders, } from './csv.js';
|
|
4
|
+
import { Crawler } from './crawler.js';
|
|
5
|
+
import { Logger } from './logger.js';
|
|
6
|
+
import pLimit from 'p-limit';
|
|
7
|
+
import fs from 'fs/promises';
|
|
8
|
+
import path from 'path';
|
|
9
|
+
import { normalizeDomain, findWebsiteInRow } from './utils/normalize.js';
|
|
10
|
+
// Largest row index N for which every row 0..N is complete (-1 = none yet).
let highestContiguousIndex = -1;
// Row indices that have finished, possibly out of order.
const completedIndices = new Set();
// Values recorded for finished rows; persisted in the checkpoint.
let completedUrls = [];
// Active crawler, kept at module level so the shutdown handler can close it.
let crawlerInstance = null;
// Set once a termination signal has been received.
let isShuttingDown = false;
|
|
15
|
+
/**
 * Graceful shutdown routine for SIGINT / SIGTERM.
 *
 * Runs at most once: flags the shutdown, closes the crawler (ignoring close
 * errors), saves a checkpoint when at least one row index is complete, and
 * then exits the process with code 0.
 */
async function handleShutdown(signal) {
    if (isShuttingDown) {
        return;
    }
    isShuttingDown = true;
    process.stdout.write(`\n[SHUTDOWN] Received ${signal}. Saving checkpoints and shutting down...\n`);
    if (crawlerInstance) {
        try {
            await crawlerInstance.close();
        }
        catch (_e) {
            /* ignore */
        }
    }
    const hasProgress = highestContiguousIndex >= 0;
    if (hasProgress) {
        const snapshot = {
            lastProcessedIndex: highestContiguousIndex,
            completedUrls,
            timestamp: new Date().toISOString(),
        };
        try {
            await saveCheckpoint(config.checkpointFile, snapshot);
            process.stdout.write(`[SHUTDOWN] Checkpoint persisted at index ${highestContiguousIndex}.\n`);
        }
        catch (failure) {
            const reason = failure instanceof Error ? failure.message : String(failure);
            process.stderr.write(`[SHUTDOWN] Failed to write checkpoint: ${reason}\n`);
        }
    }
    process.exit(0);
}
|
|
48
|
+
// Route termination signals to the graceful shutdown routine; if that routine
// itself fails, exit with a non-zero status.
for (const signal of ['SIGINT', 'SIGTERM']) {
    process.on(signal, () => {
        handleShutdown(signal).catch(() => process.exit(1));
    });
}
|
|
55
|
+
/**
 * Records that a row index has finished and moves the contiguous high-water
 * mark forward over every consecutive index that is now complete.
 *
 * @param index - Row index that finished.
 * @param url - Value stored in the completed-URL history for that row.
 */
function markIndexCompleted(index, url) {
    completedUrls.push(url);
    completedIndices.add(index);
    for (let next = highestContiguousIndex + 1; completedIndices.has(next); next++) {
        highestContiguousIndex = next;
    }
}
|
|
65
|
+
/**
 * Main application runner.
 *
 * Resolves the input/output paths from the command line, reads the input
 * headers, starts the crawler, resumes from a checkpoint when one exists
 * (otherwise writes a fresh output header), crawls every remaining row with
 * bounded concurrency, appends one output row per crawled target and saves
 * a checkpoint every 10 processed targets.
 *
 * CLI resolution: `-i/--input` and `-o/--output` take the next argument as
 * their value. Remaining non-flag arguments fill, in order, whichever of
 * input/output was not set by a flag. Unset paths fall back to the config.
 *
 * @throws Error when `-i/--input` or `-o/--output` is given without a value.
 */
async function main() {
    const startRunTime = Date.now();
    // Parse CLI flags and arguments
    const args = process.argv.slice(2);
    let inputArg;
    let outputArg;
    const positionals = [];
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        const isInputFlag = arg === '-i' || arg === '--input';
        const isOutputFlag = arg === '-o' || arg === '--output';
        if (isInputFlag || isOutputFlag) {
            const value = args[i + 1];
            if (value === undefined) {
                throw new Error(`Missing value for ${arg} option`);
            }
            if (isInputFlag) {
                inputArg = value;
            }
            else {
                outputArg = value;
            }
            // Skip the value so it is not mistaken for a positional argument.
            i++;
        }
        else if (arg === '-h' || arg === '--help') {
            process.stdout.write(`
mailpop - CLI Guide
Usage: npx mailpop [options] [input.csv] [output.csv]

Options:
-i, --input <path> Path to the input CSV file
-o, --output <path> Path to the output CSV file
-h, --help Display this help message
\n`);
            process.exit(0);
        }
        else if (!arg.startsWith('-')) {
            positionals.push(arg);
        }
    }
    // Fallback to positional arguments for whatever the flags did not set
    if (inputArg === undefined) {
        inputArg = positionals.shift();
    }
    if (outputArg === undefined) {
        outputArg = positionals.shift();
    }
    const inputPath = inputArg !== undefined ? path.resolve(inputArg) : config.inputCsv;
    const outputPath = outputArg !== undefined ? path.resolve(outputArg) : config.outputCsv;
    await Logger.info('app-initialize', undefined, undefined, 'Running', `Initializing mailpop (Input: ${path.basename(inputPath)}, Output: ${path.basename(outputPath)})...`);
    // 1. Extract dynamic headers from the input CSV
    let inputHeaders = [];
    try {
        inputHeaders = await getCsvHeaders(inputPath);
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('app-initialize-fail', undefined, undefined, `Failed to read input CSV headers: ${errorMsg}`);
        process.exit(1);
    }
    // Construct combined output headers, preserving original columns and adding new ones
    const outputHeaders = [...inputHeaders];
    const newColumns = [
        'email',
        'email_source',
        'email_type',
        'confidence_score',
        'discovery_method',
    ];
    for (const col of newColumns) {
        if (!outputHeaders.includes(col)) {
            outputHeaders.push(col);
        }
    }
    // 2. Initialize crawler and browser
    crawlerInstance = new Crawler();
    await crawlerInstance.initialize(config.headless);
    // 3. Determine if resume is available
    let checkpointIndex = -1;
    const checkpoint = await loadCheckpoint(config.checkpointFile);
    if (checkpoint) {
        checkpointIndex = checkpoint.lastProcessedIndex;
        highestContiguousIndex = checkpointIndex;
        completedUrls = checkpoint.completedUrls;
        // Prime completed indices from checkpoint history to ensure tracking consistency
        for (let i = 0; i <= checkpointIndex; i++) {
            completedIndices.add(i);
        }
        await Logger.info('runner-resume', undefined, undefined, 'Resume', `Checkpoint found. Resuming crawl from row index ${checkpointIndex + 1}`);
    }
    else {
        // New run. Setup fresh output file with CSV header line
        await Logger.info('runner-fresh-start', undefined, undefined, 'Fresh', 'Starting fresh run. Writing CSV headers...');
        try {
            const outDir = path.dirname(outputPath);
            await fs.mkdir(outDir, { recursive: true }).catch(() => { });
            // Quote header names containing delimiters, quotes or line breaks
            // so the header line stays aligned with the escaped data rows.
            const escapeCsvField = (value) => {
                const text = String(value);
                return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
            };
            const headerLine = outputHeaders.map(escapeCsvField).join(',') + '\n';
            await fs.writeFile(outputPath, headerLine, 'utf-8');
        }
        catch (err) {
            const errorMsg = err instanceof Error ? err.message : String(err);
            await Logger.error('app-initialize-fail', undefined, undefined, `Failed to setup output file: ${errorMsg}`);
            await crawlerInstance.close();
            process.exit(1);
        }
    }
    // 4. Process Input CSV with Concurrency Throttling
    const limit = pLimit(config.concurrency);
    const tasks = [];
    let processedCount = 0;
    try {
        for await (const { row, index } of readCsvGenerator(inputPath)) {
            if (isShuttingDown) {
                break;
            }
            // Skip previously processed indices
            if (index <= checkpointIndex) {
                continue;
            }
            // Detect website URL column dynamically (Website, URL, Domain, Site, Web)
            const websiteUrl = findWebsiteInRow(row);
            if (!websiteUrl) {
                await Logger.error('csv-row-skip', undefined, undefined, `Row ${index} is missing a website/url/domain column. Skipping.`);
                markIndexCompleted(index, 'skipped-missing-url');
                continue;
            }
            // Schedule the crawl target
            const target = {
                name: row.Name || row.name || 'Unknown Company',
                website: websiteUrl,
                domain: row.Domain || row.domain || normalizeDomain(websiteUrl),
            };
            const task = limit(async () => {
                if (isShuttingDown) {
                    return;
                }
                const result = await crawlerInstance.crawlWebsite(target, config);
                if (isShuttingDown) {
                    return;
                }
                // Map crawling result, retaining all original row keys
                const outputRow = {
                    ...row,
                    email: result.selectedEmail ? result.selectedEmail.email : '',
                    email_source: result.selectedEmail ? result.selectedEmail.emailSource : '',
                    email_type: result.selectedEmail ? result.selectedEmail.emailType : '',
                    confidence_score: result.selectedEmail
                        ? String(result.selectedEmail.confidenceScore)
                        : '',
                    discovery_method: result.selectedEmail ? result.selectedEmail.discoveryMethod : '',
                };
                // Append output row incrementally matching the dynamic headers list
                await appendCsvRow(outputPath, outputRow, outputHeaders, false);
                // Mark index completed and log status
                markIndexCompleted(index, target.website);
                processedCount++;
                // Save progress checkpoints every 10 companies processed
                if (processedCount % 10 === 0) {
                    await saveCheckpoint(config.checkpointFile, {
                        lastProcessedIndex: highestContiguousIndex,
                        completedUrls,
                        timestamp: new Date().toISOString(),
                    });
                    await Logger.info('runner-checkpoint', undefined, undefined, 'Progress', `Checkpoint persisted. Processed: ${processedCount} in this run (Last index: ${highestContiguousIndex})`);
                }
            });
            tasks.push(task);
        }
        // Wait for all scheduled crawler tasks to finish
        await Promise.all(tasks);
        // 5. Successful Finish
        if (!isShuttingDown) {
            await clearCheckpoint(config.checkpointFile);
            const totalDuration = Date.now() - startRunTime;
            await Logger.info('runner-complete', undefined, totalDuration, 'Success', `Crawl completed successfully. Processed ${processedCount} targets in ${Math.round(totalDuration / 1000)}s.`);
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('runner-fatal', undefined, Date.now() - startRunTime, errorMsg);
        // A fatal crawl error must not be reported to the shell as success.
        process.exitCode = 1;
    }
    finally {
        if (crawlerInstance) {
            await crawlerInstance.close();
        }
    }
}
|
|
242
|
+
// Start application: any error escaping main() is reported on stderr and the
// process exits with status 1.
const reportFatalError = (err) => {
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`Fatal error during application execution: ${reason}\n`);
    process.exit(1);
};
main().catch(reportFatalError);
|