mailpop 1.0.0 → 1.0.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/crawler.js +89 -57
- package/package.json +1 -1
package/dist/crawler.js
CHANGED
|
@@ -68,8 +68,8 @@ export class Crawler {
|
|
|
68
68
|
throw new PageLoadError(`Server error (${status})`, pageUrl, status);
|
|
69
69
|
}
|
|
70
70
|
// Wait for load states to let JavaScript load components (React, Angular, Vue, Next.js, etc.)
|
|
71
|
-
await page.waitForLoadState('load', { timeout:
|
|
72
|
-
await page.waitForLoadState('networkidle', { timeout:
|
|
71
|
+
await page.waitForLoadState('load', { timeout: 1500 }).catch(() => { });
|
|
72
|
+
await page.waitForLoadState('networkidle', { timeout: 1000 }).catch(() => { });
|
|
73
73
|
const html = await page.content();
|
|
74
74
|
const title = await page.title().catch(() => '');
|
|
75
75
|
return { html, title };
|
|
@@ -127,11 +127,26 @@ export class Crawler {
|
|
|
127
127
|
bypassCSP: true,
|
|
128
128
|
ignoreHTTPSErrors: true,
|
|
129
129
|
});
|
|
130
|
-
// Bandwidth optimization: block assets
|
|
131
|
-
//
|
|
130
|
+
// Bandwidth & CPU optimization: block assets (images, fonts, stylesheets)
|
|
131
|
+
// and heavy marketing/analytics scripts that hang connection states.
|
|
132
132
|
await context.route('**/*', (route) => {
|
|
133
|
-
const
|
|
134
|
-
|
|
133
|
+
const req = route.request();
|
|
134
|
+
const type = req.resourceType();
|
|
135
|
+
const url = req.url().toLowerCase();
|
|
136
|
+
const isAsset = ['image', 'media', 'font', 'stylesheet'].includes(type);
|
|
137
|
+
const isTracking = [
|
|
138
|
+
'google-analytics',
|
|
139
|
+
'googletagmanager',
|
|
140
|
+
'doubleclick',
|
|
141
|
+
'facebook.net',
|
|
142
|
+
'hotjar',
|
|
143
|
+
'segment.io',
|
|
144
|
+
'mixpanel',
|
|
145
|
+
'sentry.io',
|
|
146
|
+
'amplitude',
|
|
147
|
+
'hubspot',
|
|
148
|
+
].some((term) => url.includes(term));
|
|
149
|
+
if (isAsset || isTracking) {
|
|
135
150
|
route.abort().catch(() => { });
|
|
136
151
|
}
|
|
137
152
|
else {
|
|
@@ -180,70 +195,87 @@ export class Crawler {
|
|
|
180
195
|
}
|
|
181
196
|
await Logger.info('crawl-start', domain, undefined, 'Active', `Queue size: ${queue.length} pages, sitemaps parsed: ${sitemapLinks.length}`);
|
|
182
197
|
// 3. Traversal (BFS) Loop
|
|
183
|
-
|
|
198
|
+
let earlyExitTriggered = false;
|
|
199
|
+
while (queue.length > 0 &&
|
|
200
|
+
pagesCrawledCount < config.maxPagesPerSite &&
|
|
201
|
+
!earlyExitTriggered) {
|
|
184
202
|
const elapsed = Date.now() - startTime;
|
|
185
203
|
if (elapsed > config.maxCrawlTimePerSiteMs) {
|
|
186
204
|
await Logger.info('crawl-time-limit', domain, elapsed, 'Timeout', `Reached budget limit of ${config.maxCrawlTimePerSiteMs}ms`);
|
|
187
205
|
break;
|
|
188
206
|
}
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
207
|
+
// Dequeue batch of up to 3 pages to load concurrently
|
|
208
|
+
const batchSize = Math.min(3, queue.length, config.maxPagesPerSite - pagesCrawledCount);
|
|
209
|
+
const batch = [];
|
|
210
|
+
for (let i = 0; i < batchSize; i++) {
|
|
211
|
+
const item = queue.shift();
|
|
212
|
+
if (item) {
|
|
213
|
+
batch.push(item);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
if (batch.length === 0) {
|
|
217
|
+
break;
|
|
192
218
|
}
|
|
193
|
-
|
|
194
|
-
|
|
219
|
+
pagesCrawledCount += batch.length;
|
|
220
|
+
// Apply throttling delay between page batches (if we've already crawled pages)
|
|
221
|
+
if (pagesCrawledCount > batch.length) {
|
|
195
222
|
await getRandomDelay(config.minDelayMs, config.maxDelayMs);
|
|
196
223
|
}
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
const
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
224
|
+
// Crawl current batch pages concurrently
|
|
225
|
+
await Promise.all(batch.map(async (current) => {
|
|
226
|
+
if (earlyExitTriggered) {
|
|
227
|
+
return;
|
|
228
|
+
}
|
|
229
|
+
const pageStart = Date.now();
|
|
230
|
+
try {
|
|
231
|
+
const { html, title } = await this.loadPage(current.url, context, config);
|
|
232
|
+
const pageDuration = Date.now() - pageStart;
|
|
233
|
+
// Extract emails
|
|
234
|
+
const extracted = extractEmails(html, current.url, title, pageDuration);
|
|
235
|
+
for (const item of extracted) {
|
|
236
|
+
// Keep occurrences tracker updated
|
|
237
|
+
occurrenceCounts[item.email] = (occurrenceCounts[item.email] || 0) + 1;
|
|
238
|
+
// Deduplicate: If already found, update with higher confidence if applicable
|
|
239
|
+
const existingIdx = discoveredEmails.findIndex((e) => e.email === item.email);
|
|
240
|
+
if (existingIdx === -1) {
|
|
241
|
+
discoveredEmails.push(item);
|
|
242
|
+
await Logger.email(domain, item.email, item.emailSource, item.confidenceScore, item.discoveryMethod);
|
|
243
|
+
}
|
|
244
|
+
else {
|
|
245
|
+
if (item.confidenceScore > discoveredEmails[existingIdx].confidenceScore) {
|
|
246
|
+
discoveredEmails[existingIdx] = item;
|
|
247
|
+
}
|
|
216
248
|
}
|
|
217
249
|
}
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
}
|
|
250
|
+
// Check if we can trigger an early stop
|
|
251
|
+
const currentBest = selectBestEmail(discoveredEmails, domain, occurrenceCounts);
|
|
252
|
+
if (currentBest &&
|
|
253
|
+
currentBest.confidenceScore >= 95 &&
|
|
254
|
+
isDomainMatch(currentBest.email, domain)) {
|
|
255
|
+
earlyExitTriggered = true;
|
|
256
|
+
await Logger.info('crawl-early-stop', domain, Date.now() - startTime, 'Success', `Early exit triggered by: ${currentBest.email} (${currentBest.confidenceScore} score)`);
|
|
257
|
+
}
|
|
258
|
+
// Discover and enqueue internal links if depth is within bounds and early exit hasn't fired
|
|
259
|
+
if (!earlyExitTriggered && current.depth < config.maxDepth) {
|
|
260
|
+
const childLinks = extractAndFilterLinks(html, current.url, domain);
|
|
261
|
+
for (const link of childLinks) {
|
|
262
|
+
if (!visited.has(link) && visited.size < 100) {
|
|
263
|
+
// Safety ceiling to prevent massive Set sizes
|
|
264
|
+
visited.add(link);
|
|
265
|
+
queue.push({
|
|
266
|
+
url: link,
|
|
267
|
+
depth: current.depth + 1,
|
|
268
|
+
referrer: current.url,
|
|
269
|
+
});
|
|
270
|
+
}
|
|
239
271
|
}
|
|
240
272
|
}
|
|
241
273
|
}
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
}
|
|
274
|
+
catch (err) {
|
|
275
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
276
|
+
await Logger.error('page-crawl-error', domain, Date.now() - pageStart, `Failed ${current.url}: ${errorMsg}`);
|
|
277
|
+
}
|
|
278
|
+
}));
|
|
247
279
|
}
|
|
248
280
|
// 4. Select Final Email
|
|
249
281
|
const selectedEmail = selectBestEmail(discoveredEmails, domain, occurrenceCounts);
|