mailpop 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/csv.js ADDED
@@ -0,0 +1,115 @@
1
+ import * as fs from 'fs';
2
+ import fsPromises from 'fs/promises';
3
+ import path from 'path';
4
+ import * as csv from 'fast-csv';
5
+ import { Logger } from './logger.js';
6
/**
 * Streams the data rows of an input CSV file as an async generator.
 *
 * The file is parsed by fast-csv with `headers: true` and `trim: true`.
 * `index` starts at 0 for the first yielded row and increases by one per row.
 *
 * @param filePath - Path to the input CSV file.
 * @yields `{ row, index }` for every parsed row, in file order.
 * @throws Error if the file does not exist, cannot be read, or fails to parse.
 */
export async function* readCsvGenerator(filePath) {
    if (!fs.existsSync(filePath)) {
        throw new Error(`Input CSV file not found: ${filePath}`);
    }
    const source = fs.createReadStream(filePath);
    const parser = csv.parse({
        headers: true,
        trim: true,
        discardUnmappedColumns: false,
    });
    // pipe() does not forward errors from the source stream. Without this, a
    // read failure would surface as an unhandled 'error' event instead of
    // rejecting the iteration below.
    source.on('error', (err) => parser.destroy(err));
    source.pipe(parser);
    let index = 0;
    try {
        for await (const row of parser) {
            yield { row, index };
            index++;
        }
    }
    finally {
        // Runs on normal completion, on errors, and when the consumer stops
        // early (break/return): always release the underlying file handle.
        source.destroy();
    }
}
26
/**
 * Reads only the header row of a CSV file.
 * Helper to dynamically extract the input schema.
 *
 * @param filePath - Path to the input CSV file.
 * @returns Promise resolving to the header names reported by the parser's
 *          'headers' event.
 * @throws Error if the file does not exist; the promise rejects if the file
 *         cannot be read, fails to parse, or ends without a header row.
 */
export async function getCsvHeaders(filePath) {
    if (!fs.existsSync(filePath)) {
        throw new Error(`Input CSV file not found: ${filePath}`);
    }
    return new Promise((resolve, reject) => {
        const source = fs.createReadStream(filePath);
        const parser = csv.parse({ headers: true });
        // Errors on the file stream are not forwarded through pipe(); handle
        // them here so they reject the promise instead of crashing the process.
        source.on('error', reject);
        parser
            .on('headers', (headers) => {
            // The header row is all we need: stop reading the file.
            source.destroy();
            resolve(headers);
        })
            .on('error', reject)
            .on('end', () => {
            // Reaching the end without a 'headers' event means the input had
            // no header row. If headers were already delivered, the promise is
            // settled and this rejection is a no-op.
            reject(new Error(`Input CSV file has no header row: ${filePath}`));
        });
        source.pipe(parser);
        // Parsed rows are not needed; let them flow so 'end' can fire on
        // empty or header-less input.
        parser.resume();
    });
}
48
/**
 * Appends a single, CSV-escaped row to the output file.
 *
 * The row is formatted by fast-csv's `writeToString` using the supplied
 * header order, then written with one `appendFile` call. No locking is done
 * here; callers that need strict ordering must serialise their calls.
 *
 * @param filePath - Path to the output CSV file.
 * @param row - Output row data.
 * @param headers - Complete ordered headers array for output alignment.
 * @param writeHeader - Whether to prefix the row with a header line.
 * @throws Rejects if formatting the row or appending to the file fails.
 */
export async function appendCsvRow(filePath, row, headers, writeHeader = false) {
    try {
        await fsPromises.mkdir(path.dirname(filePath), { recursive: true });
    }
    catch (_e) {
        // Best effort: a directory that really cannot be created will make
        // the appendFile call below fail with its own error.
    }
    // writeToString already returns a promise, so it is awaited directly
    // rather than wrapped in a new Promise.
    const csvLine = await csv.writeToString([row], {
        headers,
        includeEndRowDelimiter: true,
        writeHeaders: writeHeader,
    });
    await fsPromises.appendFile(filePath, csvLine, 'utf-8');
}
75
/**
 * Reads a checkpoint file and returns its parsed contents.
 *
 * A checkpoint is accepted only when `lastProcessedIndex` is a number and
 * `completedUrls` is an array. Any read error, JSON error, or shape mismatch
 * yields `null`.
 *
 * @param filePath - Path to the checkpoint JSON file.
 * @returns The parsed checkpoint object, or `null` if missing/corrupt.
 */
export async function loadCheckpoint(filePath) {
    try {
        const parsed = JSON.parse(await fsPromises.readFile(filePath, 'utf-8'));
        const hasIndex = typeof parsed.lastProcessedIndex === 'number';
        const looksValid = hasIndex && Array.isArray(parsed.completedUrls);
        return looksValid ? parsed : null;
    }
    catch (_e) {
        return null;
    }
}
91
/**
 * Saves checkpoint progress to disk.
 *
 * The JSON is written to a uniquely named temporary file next to the target
 * and then renamed over it. An interrupted or overlapping save therefore
 * never leaves a half-written checkpoint behind, which matters because a
 * checkpoint that fails to parse is treated as missing by loadCheckpoint.
 *
 * Failures are logged through Logger.error and not rethrown.
 *
 * @param filePath - Destination path of the checkpoint file.
 * @param data - JSON-serialisable checkpoint payload.
 */
export async function saveCheckpoint(filePath, data) {
    const dir = path.dirname(filePath);
    // Unique per call, so concurrent saves never share a temp file.
    const tempPath = `${filePath}.${process.pid}.${Math.random().toString(36).slice(2)}.tmp`;
    try {
        await fsPromises.mkdir(dir, { recursive: true });
        await fsPromises.writeFile(tempPath, JSON.stringify(data, null, 2), 'utf-8');
        await fsPromises.rename(tempPath, filePath);
    }
    catch (err) {
        // Do not leave a stray temp file behind; it may not exist at all.
        await fsPromises.unlink(tempPath).catch(() => { });
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('checkpoint-save-fail', undefined, undefined, `Failed to save checkpoint: ${errorMsg}`);
    }
}
105
/**
 * Removes the checkpoint file once a run has finished successfully.
 * Any failure to unlink (including a missing file) is deliberately ignored.
 *
 * @param filePath - Path to the checkpoint file to delete.
 */
export async function clearCheckpoint(filePath) {
    // Best effort: a rejected unlink is swallowed on purpose.
    await fsPromises.unlink(filePath).catch(() => undefined);
}
@@ -0,0 +1,290 @@
1
+ import { load } from 'cheerio';
2
+ import { normalizeEmail } from './utils/normalize.js';
3
+ import { isValidEmail } from './utils/validators.js';
4
// Standard email regex for searching inside strings.
// Carries the global flag, so exec() is stateful: callers reset `lastIndex`
// to 0 before each scan loop.
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Obfuscated email regex matching "name [at] domain [dot] com", "name(at)domain(dot)com", "name AT domain DOT com".
// Capture groups: 1 = local part, 2 = domain without the TLD, 3 = TLD.
// Case-insensitive and global; the same `lastIndex` caveat applies.
const OBFUSCATED_REGEX = /([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|\s+at\s+)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*([a-zA-Z]{2,})/gi;
// Base64 candidate regex for extracting potential base64 encoded strings:
// runs of 12-80 base64 alphabet characters with up to two '=' padding characters.
const BASE64_CANDIDATE_REGEX = /\b[a-zA-Z0-9+/]{12,80}={0,2}\b/g;
10
/**
 * Decodes a Cloudflare email protection hex string.
 *
 * The first byte is an XOR key; every following byte is XOR-ed with it to
 * recover the payload, which is interpreted as UTF-8.
 *
 * @param hex - Hex string from `data-cfemail` or an email-protection link.
 * @returns The decoded text, or '' when the input is not a string, is not
 *          pure hex, has an odd length, or holds no payload byte.
 */
export function decodeCloudflareEmail(hex) {
    // Reject malformed input up front: parseInt-based decoding would quietly
    // turn non-hex digits into garbage characters instead of failing.
    const isWellFormed = typeof hex === 'string' &&
        hex.length >= 4 &&
        hex.length % 2 === 0 &&
        /^[0-9a-fA-F]+$/.test(hex);
    if (!isWellFormed) {
        return '';
    }
    const bytes = Buffer.from(hex, 'hex');
    const key = bytes[0];
    const payload = Buffer.alloc(bytes.length - 1);
    for (let i = 1; i < bytes.length; i++) {
        payload[i - 1] = bytes[i] ^ key;
    }
    // Decode as UTF-8 so multi-byte characters are not split into separate
    // code units.
    return payload.toString('utf-8');
}
27
/**
 * Decodes numeric HTML character references (e.g. &#x63; or &#99;).
 * Cheerio generally decodes these, but this serves as a fallback.
 *
 * Hexadecimal references are decoded first, then decimal ones. References
 * above U+10FFFF are left untouched. Code points above U+FFFF are decoded
 * correctly rather than truncated to 16 bits.
 *
 * @param text - Text that may contain numeric character references.
 * @returns The text with valid numeric references replaced by their characters.
 */
export function decodeUnicodeEntities(text) {
    const MAX_CODE_POINT = 0x10ffff;
    const toCharacter = (entity, digits, radix) => {
        const codePoint = Number.parseInt(digits, radix);
        // String.fromCodePoint throws on out-of-range values; keep the
        // original entity text for those.
        return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : entity;
    };
    return text
        .replace(/&#x([0-9a-fA-F]+);/g, (entity, hex) => toCharacter(entity, hex, 16))
        .replace(/&#([0-9]+);/g, (entity, dec) => toCharacter(entity, dec, 10));
}
36
/**
 * Scans text for base64-looking tokens and returns those that decode to a
 * valid email address.
 *
 * Each candidate is decoded as base64 to UTF-8, passed through normalizeEmail
 * and kept only if isValidEmail accepts it. Duplicates are not removed.
 *
 * @param text - Text to scan.
 * @returns Array of normalized email addresses, in order of appearance.
 */
export function extractBase64Emails(text) {
    const found = [];
    // The candidate regex is global and therefore stateful; start from the top.
    BASE64_CANDIDATE_REGEX.lastIndex = 0;
    for (let hit = BASE64_CANDIDATE_REGEX.exec(text); hit !== null; hit = BASE64_CANDIDATE_REGEX.exec(text)) {
        try {
            const candidate = normalizeEmail(Buffer.from(hit[0], 'base64').toString('utf-8'));
            if (isValidEmail(candidate)) {
                found.push(candidate);
            }
        }
        catch (_e) {
            // Not a valid base64 string or decode failed
        }
    }
    return found;
}
58
/**
 * Derives a page-type label from the path of a URL.
 *
 * The lower-cased pathname is checked for 'contact', 'about' and 'sitemap',
 * in that order; the first hit wins. Unparseable URLs and paths without a
 * hit are labelled 'general-page'.
 *
 * @param url - Absolute URL of the crawled page.
 * @returns 'contact-page', 'about-page', 'sitemap' or 'general-page'.
 */
function getDiscoveryMethod(url) {
    const fallback = 'general-page';
    let pathname;
    try {
        pathname = new URL(url).pathname.toLowerCase();
    }
    catch (_e) {
        return fallback;
    }
    // Ordered: earlier entries take priority when several keywords appear.
    const rules = [
        ['contact', 'contact-page'],
        ['about', 'about-page'],
        ['sitemap', 'sitemap'],
    ];
    const hit = rules.find(([keyword]) => pathname.includes(keyword));
    return hit ? hit[1] : fallback;
}
76
// Local parts that denote a shared or role mailbox.
const ROLE_PREFIXES = [
    'contact',
    'info',
    'hello',
    'support',
    'sales',
    'partnerships',
    'partnership',
    'business',
    'team',
    'founder',
    'ceo',
    'media',
    'press',
    'jobs',
    'careers',
    'admin',
    'office',
    'help',
    'inquiries',
    'inquiry',
    'hi',
    'welcome',
    'hr',
    'marketing',
    'privacy',
    'legal',
    'billing',
    'finance',
];
// Local parts that denote machine-operated or unattended mailboxes.
const AUTOMATED_PREFIXES = [
    'noreply',
    'no-reply',
    'donotreply',
    'do-not-reply',
    'mailer-daemon',
    'postmaster',
    'abuse',
    'security',
    'spam',
    'bot',
    'system',
    'notification',
    'notifications',
    'alert',
    'alerts',
];
// Characters that may follow a known prefix and still count as that prefix
// (e.g. "info.uk", "info-uk", "support+tickets").
const LOCAL_PART_SEPARATORS = '.-_+';
/**
 * True when `localPart` equals one of `prefixes`, or starts with one that is
 * immediately followed by a separator character.
 */
function matchesKnownPrefix(localPart, prefixes) {
    return prefixes.some((prefix) => {
        if (localPart === prefix) {
            return true;
        }
        return (localPart.startsWith(prefix) &&
            localPart.length > prefix.length &&
            LOCAL_PART_SEPARATORS.includes(localPart[prefix.length]));
    });
}
/**
 * Classifies an email as 'role', 'personal', or 'automated'.
 *
 * Only the part before the first '@' is inspected, case-insensitively.
 * Automated prefixes are checked before role prefixes; anything matching
 * neither list is 'personal'. Both lists accept the same separators after
 * the prefix, so "info-uk" and "noreply.alerts" are classified the same way
 * as "info.uk" and "noreply-alerts".
 *
 * @param email - Email address to classify.
 * @returns 'automated', 'role' or 'personal'.
 */
export function classifyEmailType(email) {
    const localPart = email.split('@')[0].toLowerCase();
    if (matchesKnownPrefix(localPart, AUTOMATED_PREFIXES)) {
        return 'automated';
    }
    if (matchesKnownPrefix(localPart, ROLE_PREFIXES)) {
        return 'role';
    }
    return 'personal';
}
136
/**
 * Splits a `mailto:` href into its recipient addresses.
 * Strips the scheme and any `?query`, percent-decodes the rest when possible,
 * and splits on ',' or ';'.
 */
function parseMailtoAddresses(href) {
    const recipients = href.replace(/^mailto:/i, '').split('?')[0];
    let decoded = recipients;
    try {
        decoded = decodeURIComponent(recipients);
    }
    catch (_e) {
        // Malformed percent-escape: fall back to the raw recipient text.
    }
    return decoded
        .split(/[,;]/)
        .map((address) => address.trim())
        .filter((address) => address.length > 0);
}
/**
 * Runs a global regex over `text` from the start and calls `onMatch` for
 * every match, in order.
 */
function forEachRegexMatch(pattern, text, onMatch) {
    // Global regexes keep state in lastIndex; always scan from the top.
    pattern.lastIndex = 0;
    let match;
    while ((match = pattern.exec(text)) !== null) {
        onMatch(match);
    }
}
/**
 * Extracts all unique emails from a given HTML string and URL.
 *
 * Sources are scanned in a fixed order: Cloudflare-protected attributes and
 * links, `mailto:` links, header elements, footer elements, meta tag
 * content, script contents (plain, text-obfuscated and base64) and finally
 * the body text (plain and text-obfuscated). When an address is found more
 * than once, the candidate with the highest preliminary confidence is kept;
 * ties keep the first one seen.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was fetched from; stored as the email source.
 * @param pageTitle - Page title copied into each result's metadata.
 * @param crawlDurationMs - Crawl duration copied into each result's metadata.
 * @returns Array of discovered email records, one per unique address.
 */
export function extractEmails(html, url, pageTitle, crawlDurationMs) {
    const discovered = new Map();
    const $ = load(html);
    const discoveryMethod = getDiscoveryMethod(url);
    const addEmail = (rawEmail, sourceType, obfuscatedMethod) => {
        const email = normalizeEmail(rawEmail);
        if (!isValidEmail(email)) {
            return;
        }
        const type = classifyEmailType(email);
        const timestamp = new Date().toISOString();
        const metadata = {
            sourceUrl: url,
            sourceType,
            pageTitle,
            discoveryTimestamp: timestamp,
            crawlDurationMs,
        };
        // Calculate preliminary confidence score based on extraction details
        // Final confidence is refined in scorer.ts
        let initialConfidence = 60;
        if (sourceType === 'footer' || sourceType === 'header') {
            initialConfidence = 85;
        }
        else if (sourceType === 'mailto') {
            initialConfidence = 90;
        }
        else if (sourceType === 'obfuscated') {
            initialConfidence = 50;
        }
        else if (sourceType === 'script') {
            initialConfidence = 40;
        }
        if (obfuscatedMethod) {
            initialConfidence -= 10; // Obfuscated discovery has slightly lower confidence
        }
        const item = {
            email,
            emailSource: url,
            emailType: type,
            confidenceScore: Math.max(10, Math.min(100, initialConfidence)),
            discoveryMethod: obfuscatedMethod === 'cloudflare'
                ? 'obscure-js'
                : sourceType === 'mailto'
                    ? 'mailto-link'
                    : discoveryMethod,
            metadata,
        };
        // Keep the best-scoring record per address; ties keep the first one.
        const existing = discovered.get(email);
        if (!existing || existing.confidenceScore < item.confidenceScore) {
            discovered.set(email, item);
        }
    };
    // Shared scanners for the plain and the "name [at] domain [dot] tld" forms.
    const scanPlain = (text, sourceType) => {
        forEachRegexMatch(EMAIL_REGEX, text, (match) => addEmail(match[0], sourceType));
    };
    const scanObfuscated = (text, sourceType) => {
        forEachRegexMatch(OBFUSCATED_REGEX, text, (match) => {
            addEmail(`${match[1]}@${match[2]}.${match[3]}`, sourceType, 'text-obfuscation');
        });
    };
    // 1. Cloudflare email protection decoding
    // Search tags containing cfemail
    $('[data-cfemail]').each((_, el) => {
        const hex = $(el).attr('data-cfemail');
        if (hex) {
            const email = decodeCloudflareEmail(hex);
            if (email) {
                addEmail(email, 'obfuscated', 'cloudflare');
            }
        }
    });
    // Search links containing cloudflare email-protection path
    $('a[href*="/cdn-cgi/l/email-protection#"]').each((_, el) => {
        const href = $(el).attr('href') || '';
        const match = href.match(/\/cdn-cgi\/l\/email-protection#([a-fA-F0-9]+)/);
        if (match && match[1]) {
            const email = decodeCloudflareEmail(match[1]);
            if (email) {
                addEmail(email, 'obfuscated', 'cloudflare');
            }
        }
    });
    // 2. Mailto links
    // The href may carry a query string, several recipients or percent-escapes,
    // so it is reduced to bare addresses before validation.
    // NOTE(review): normalizeEmail is not visible here; confirm it does not
    // depend on receiving the "mailto:" scheme.
    $('a[href^="mailto:"]').each((_, el) => {
        const href = $(el).attr('href');
        if (href) {
            for (const address of parseMailtoAddresses(href)) {
                addEmail(address, 'mailto');
            }
        }
    });
    // 3. Header section extraction
    const headerElem = $('header, [id*="header"], [class*="header"]');
    if (headerElem.length > 0) {
        scanPlain(decodeUnicodeEntities(headerElem.text()), 'header');
    }
    // 4. Footer section extraction
    const footerElem = $('footer, [id*="footer"], [class*="footer"]');
    if (footerElem.length > 0) {
        scanPlain(decodeUnicodeEntities(footerElem.text()), 'footer');
    }
    // 5. Meta tags
    $('meta').each((_, el) => {
        const content = $(el).attr('content');
        if (content) {
            scanPlain(decodeUnicodeEntities(content), 'meta');
        }
    });
    // 6. Scripts (JSON-LD, Inline JavaScript)
    $('script').each((_, el) => {
        const scriptContent = $(el).html();
        if (scriptContent) {
            const decodedScript = decodeUnicodeEntities(scriptContent);
            // Look for standard emails in scripts
            scanPlain(decodedScript, 'script');
            // Look for obfuscated pattern emails in scripts
            scanObfuscated(decodedScript, 'script');
            // Look for base64 encoded emails in scripts
            for (const b64Email of extractBase64Emails(decodedScript)) {
                addEmail(b64Email, 'script', 'base64');
            }
        }
    });
    // 7. Visible Body Text & Obfuscated matches in body
    const bodyText = decodeUnicodeEntities($('body').text());
    // Standard matches in body text
    scanPlain(bodyText, 'text');
    // Obfuscated matches in body text
    scanObfuscated(bodyText, 'text');
    return Array.from(discovered.values());
}
package/dist/index.js ADDED
@@ -0,0 +1,247 @@
1
+ #!/usr/bin/env node
2
+ import { config } from './config.js';
3
+ import { readCsvGenerator, appendCsvRow, loadCheckpoint, saveCheckpoint, clearCheckpoint, getCsvHeaders, } from './csv.js';
4
+ import { Crawler } from './crawler.js';
5
+ import { Logger } from './logger.js';
6
+ import pLimit from 'p-limit';
7
+ import fs from 'fs/promises';
8
+ import path from 'path';
9
+ import { normalizeDomain, findWebsiteInRow } from './utils/normalize.js';
10
// Largest row index N such that every index from 0 to N has been marked
// completed; -1 while index 0 is still outstanding. Advanced by
// markIndexCompleted and restored from the checkpoint in main().
let highestContiguousIndex = -1;
// Row indices marked completed so far; tasks run concurrently, so these can
// arrive out of order.
const completedIndices = new Set();
// One entry per completed row, in completion order: the crawled website, or
// the 'skipped-missing-url' marker for rows without a URL. Persisted in the
// checkpoint.
let completedUrls = [];
// Crawler created by main(); held at module scope so handleShutdown can
// close it.
let crawlerInstance = null;
// Set by handleShutdown on the first signal; main() and its tasks check it
// to stop scheduling and to skip writing results.
let isShuttingDown = false;
15
/**
 * Graceful shutdown routine for SIGINT / SIGTERM.
 *
 * Only the first invocation does any work. It closes the crawler (ignoring
 * close errors), persists a checkpoint when at least one contiguous row has
 * completed, and then exits the process with status 0.
 *
 * @param signal - Name of the signal that triggered the shutdown.
 */
async function handleShutdown(signal) {
    const alreadyStopping = isShuttingDown;
    isShuttingDown = true;
    if (alreadyStopping) {
        return;
    }
    process.stdout.write(`\n[SHUTDOWN] Received ${signal}. Saving checkpoints and shutting down...\n`);
    try {
        if (crawlerInstance) {
            await crawlerInstance.close();
        }
    }
    catch (_e) {
        /* ignore */
    }
    if (highestContiguousIndex >= 0) {
        const snapshot = {
            lastProcessedIndex: highestContiguousIndex,
            completedUrls,
            timestamp: new Date().toISOString(),
        };
        try {
            await saveCheckpoint(config.checkpointFile, snapshot);
            process.stdout.write(`[SHUTDOWN] Checkpoint persisted at index ${highestContiguousIndex}.\n`);
        }
        catch (err) {
            let msg;
            if (err instanceof Error) {
                msg = err.message;
            }
            else {
                msg = String(err);
            }
            process.stderr.write(`[SHUTDOWN] Failed to write checkpoint: ${msg}\n`);
        }
    }
    process.exit(0);
}
48
// Route both termination signals through the same shutdown routine; if the
// routine itself rejects, exit with a failure status.
for (const signalName of ['SIGINT', 'SIGTERM']) {
    process.on(signalName, () => {
        handleShutdown(signalName).catch(() => process.exit(1));
    });
}
55
/**
 * Records that a row has finished and moves the contiguous-progress marker
 * forward across every consecutive completed index.
 *
 * @param index - Row index that just completed.
 * @param url - Value appended to the completed-URL history for this row.
 */
function markIndexCompleted(index, url) {
    completedUrls.push(url);
    completedIndices.add(index);
    // Rows can finish out of order; only advance over an unbroken run.
    let next = highestContiguousIndex + 1;
    while (completedIndices.has(next)) {
        highestContiguousIndex = next;
        next += 1;
    }
}
65
/**
 * Main application runner.
 *
 * Steps:
 *  1. Parse CLI arguments (`-i/--input`, `-o/--output`, `-h/--help`, or up to
 *     two positional paths). A flag wins over a positional for the same
 *     setting; the first positional is the input path and the second the
 *     output path.
 *  2. Read the input CSV headers and derive the output header list.
 *  3. Initialize the crawler.
 *  4. Resume from a checkpoint if one loads, otherwise start a fresh output
 *     file containing only the header line.
 *  5. Crawl every remaining row with bounded concurrency, appending one
 *     output row per input row and checkpointing every 10 processed rows.
 *  6. On success, clear the checkpoint. On a fatal error, log it and mark
 *     the process as failed. The crawler is closed in all cases.
 *
 * @throws Error when a path flag is given without a value.
 */
async function main() {
    const startRunTime = Date.now();
    // Parse CLI flags and arguments
    const args = process.argv.slice(2);
    let inputPath = config.inputCsv;
    let outputPath = config.outputCsv;
    let inputFromFlag = false;
    let outputFromFlag = false;
    // Positionals are collected during the scan so the VALUE of -i/-o is never
    // mistaken for a positional (which used to swap or override the paths).
    const positionals = [];
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        const isInputFlag = arg === '-i' || arg === '--input';
        const isOutputFlag = arg === '-o' || arg === '--output';
        if (isInputFlag || isOutputFlag) {
            const value = args[i + 1];
            if (value === undefined) {
                throw new Error(`Missing value for ${arg}`);
            }
            if (isInputFlag) {
                inputPath = path.resolve(value);
                inputFromFlag = true;
            }
            else {
                outputPath = path.resolve(value);
                outputFromFlag = true;
            }
            i++; // skip the value that was just consumed
        }
        else if (arg === '-h' || arg === '--help') {
            process.stdout.write([
                '',
                'mailpop - CLI Guide',
                'Usage: npx mailpop [options] [input.csv] [output.csv]',
                '',
                'Options:',
                '-i, --input <path> Path to the input CSV file',
                '-o, --output <path> Path to the output CSV file',
                '-h, --help Display this help message',
                '\n',
            ].join('\n'));
            process.exit(0);
        }
        else if (!arg.startsWith('-')) {
            positionals.push(arg);
        }
    }
    // Fallback to positional arguments
    if (!inputFromFlag && positionals.length >= 1) {
        inputPath = path.resolve(positionals[0]);
    }
    if (!outputFromFlag && positionals.length >= 2) {
        outputPath = path.resolve(positionals[1]);
    }
    await Logger.info('app-initialize', undefined, undefined, 'Running', `Initializing mailpop (Input: ${path.basename(inputPath)}, Output: ${path.basename(outputPath)})...`);
    // 1. Extract dynamic headers from the input CSV
    let inputHeaders = [];
    try {
        inputHeaders = await getCsvHeaders(inputPath);
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('app-initialize-fail', undefined, undefined, `Failed to read input CSV headers: ${errorMsg}`);
        process.exit(1);
    }
    // Construct combined output headers, preserving original columns and adding new ones
    const outputHeaders = [...inputHeaders];
    const newColumns = [
        'email',
        'email_source',
        'email_type',
        'confidence_score',
        'discovery_method',
    ];
    for (const col of newColumns) {
        if (!outputHeaders.includes(col)) {
            outputHeaders.push(col);
        }
    }
    // 2. Initialize crawler and browser
    crawlerInstance = new Crawler();
    await crawlerInstance.initialize(config.headless);
    // 3. Determine if resume is available
    let checkpointIndex = -1;
    const checkpoint = await loadCheckpoint(config.checkpointFile);
    if (checkpoint) {
        checkpointIndex = checkpoint.lastProcessedIndex;
        highestContiguousIndex = checkpointIndex;
        completedUrls = checkpoint.completedUrls;
        // Prime completed indices from checkpoint history to ensure tracking consistency
        for (let i = 0; i <= checkpointIndex; i++) {
            completedIndices.add(i);
        }
        await Logger.info('runner-resume', undefined, undefined, 'Resume', `Checkpoint found. Resuming crawl from row index ${checkpointIndex + 1}`);
    }
    else {
        // New run. Setup fresh output file with CSV header line
        await Logger.info('runner-fresh-start', undefined, undefined, 'Fresh', 'Starting fresh run. Writing CSV headers...');
        try {
            const outDir = path.dirname(outputPath);
            await fs.mkdir(outDir, { recursive: true }).catch(() => { });
            // Quote header names containing a comma, quote or line break so the
            // header line stays aligned with the escaped rows appended later.
            const escapeCsvField = (value) => {
                const text = value == null ? '' : String(value);
                return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
            };
            const headerLine = outputHeaders.map(escapeCsvField).join(',') + '\n';
            await fs.writeFile(outputPath, headerLine, 'utf-8');
        }
        catch (err) {
            const errorMsg = err instanceof Error ? err.message : String(err);
            await Logger.error('app-initialize-fail', undefined, undefined, `Failed to setup output file: ${errorMsg}`);
            await crawlerInstance.close();
            process.exit(1);
        }
    }
    // 4. Process Input CSV with Concurrency Throttling
    const limit = pLimit(config.concurrency);
    const tasks = [];
    let processedCount = 0;
    try {
        for await (const { row, index } of readCsvGenerator(inputPath)) {
            if (isShuttingDown) {
                break;
            }
            // Skip previously processed indices
            if (index <= checkpointIndex) {
                continue;
            }
            // Detect website URL column dynamically (Website, URL, Domain, Site, Web)
            const websiteUrl = findWebsiteInRow(row);
            if (!websiteUrl) {
                await Logger.error('csv-row-skip', undefined, undefined, `Row ${index} is missing a website/url/domain column. Skipping.`);
                markIndexCompleted(index, 'skipped-missing-url');
                continue;
            }
            // Schedule the crawl target
            const target = {
                name: row.Name || row.name || 'Unknown Company',
                website: websiteUrl,
                domain: row.Domain || row.domain || normalizeDomain(websiteUrl),
            };
            const task = limit(async () => {
                if (isShuttingDown) {
                    return;
                }
                const result = await crawlerInstance.crawlWebsite(target, config);
                if (isShuttingDown) {
                    return;
                }
                // Map crawling result, retaining all original row keys
                const outputRow = {
                    ...row,
                    email: result.selectedEmail ? result.selectedEmail.email : '',
                    email_source: result.selectedEmail ? result.selectedEmail.emailSource : '',
                    email_type: result.selectedEmail ? result.selectedEmail.emailType : '',
                    confidence_score: result.selectedEmail
                        ? String(result.selectedEmail.confidenceScore)
                        : '',
                    discovery_method: result.selectedEmail ? result.selectedEmail.discoveryMethod : '',
                };
                // Append output row incrementally matching the dynamic headers list
                await appendCsvRow(outputPath, outputRow, outputHeaders, false);
                // Mark index completed and log status
                markIndexCompleted(index, target.website);
                processedCount++;
                // Save progress checkpoints every 10 companies processed
                if (processedCount % 10 === 0) {
                    await saveCheckpoint(config.checkpointFile, {
                        lastProcessedIndex: highestContiguousIndex,
                        completedUrls,
                        timestamp: new Date().toISOString(),
                    });
                    await Logger.info('runner-checkpoint', undefined, undefined, 'Progress', `Checkpoint persisted. Processed: ${processedCount} in this run (Last index: ${highestContiguousIndex})`);
                }
            });
            // A task may reject while rows are still being read, before
            // Promise.all below is attached. Mark it as handled so Node does
            // not treat it as an unhandled rejection; Promise.all still
            // observes the failure.
            task.catch(() => { });
            tasks.push(task);
        }
        // Wait for all scheduled crawler tasks to finish
        await Promise.all(tasks);
        // 5. Successful Finish
        if (!isShuttingDown) {
            await clearCheckpoint(config.checkpointFile);
            const totalDuration = Date.now() - startRunTime;
            await Logger.info('runner-complete', undefined, totalDuration, 'Success', `Crawl completed successfully. Processed ${processedCount} targets in ${Math.round(totalDuration / 1000)}s.`);
        }
    }
    catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        await Logger.error('runner-fatal', undefined, Date.now() - startRunTime, errorMsg);
        // A run that aborted must not report success to the calling shell.
        // During a signal-driven shutdown the exit status is decided by
        // handleShutdown instead.
        if (!isShuttingDown) {
            process.exitCode = 1;
        }
    }
    finally {
        if (crawlerInstance) {
            await crawlerInstance.close();
        }
    }
}
242
// Entry point: run the application and turn any rejection that escapes
// main() into a stderr message plus a non-zero exit status.
const reportFatalError = (err) => {
    let errorMsg;
    if (err instanceof Error) {
        errorMsg = err.message;
    }
    else {
        errorMsg = String(err);
    }
    process.stderr.write(`Fatal error during application execution: ${errorMsg}\n`);
    process.exit(1);
};
main().catch(reportFatalError);