mailpop 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/crawler.js +89 -57
  2. package/package.json +1 -1
package/dist/crawler.js CHANGED
@@ -68,8 +68,8 @@ export class Crawler {
68
68
  throw new PageLoadError(`Server error (${status})`, pageUrl, status);
69
69
  }
70
70
  // Wait for load states to let JavaScript load components (React, Angular, Vue, Next.js, etc.)
71
- await page.waitForLoadState('load', { timeout: 3000 }).catch(() => { });
72
- await page.waitForLoadState('networkidle', { timeout: 3000 }).catch(() => { });
71
+ await page.waitForLoadState('load', { timeout: 1500 }).catch(() => { });
72
+ await page.waitForLoadState('networkidle', { timeout: 1000 }).catch(() => { });
73
73
  const html = await page.content();
74
74
  const title = await page.title().catch(() => '');
75
75
  return { html, title };
@@ -127,11 +127,26 @@ export class Crawler {
127
127
  bypassCSP: true,
128
128
  ignoreHTTPSErrors: true,
129
129
  });
130
- // Bandwidth optimization: block assets like images, videos, fonts, and CSS.
131
- // This is crucial for performance and avoids loading unnecessary assets.
130
+ // Bandwidth & CPU optimization: block assets (images, fonts, stylesheets)
131
+ // and heavy marketing/analytics scripts that hang connection states.
132
132
  await context.route('**/*', (route) => {
133
- const type = route.request().resourceType();
134
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
133
+ const req = route.request();
134
+ const type = req.resourceType();
135
+ const url = req.url().toLowerCase();
136
+ const isAsset = ['image', 'media', 'font', 'stylesheet'].includes(type);
137
+ const isTracking = [
138
+ 'google-analytics',
139
+ 'googletagmanager',
140
+ 'doubleclick',
141
+ 'facebook.net',
142
+ 'hotjar',
143
+ 'segment.io',
144
+ 'mixpanel',
145
+ 'sentry.io',
146
+ 'amplitude',
147
+ 'hubspot',
148
+ ].some((term) => url.includes(term));
149
+ if (isAsset || isTracking) {
135
150
  route.abort().catch(() => { });
136
151
  }
137
152
  else {
@@ -180,70 +195,87 @@ export class Crawler {
180
195
  }
181
196
  await Logger.info('crawl-start', domain, undefined, 'Active', `Queue size: ${queue.length} pages, sitemaps parsed: ${sitemapLinks.length}`);
182
197
  // 3. Traversal (BFS) Loop
183
- while (queue.length > 0 && pagesCrawledCount < config.maxPagesPerSite) {
198
+ let earlyExitTriggered = false;
199
+ while (queue.length > 0 &&
200
+ pagesCrawledCount < config.maxPagesPerSite &&
201
+ !earlyExitTriggered) {
184
202
  const elapsed = Date.now() - startTime;
185
203
  if (elapsed > config.maxCrawlTimePerSiteMs) {
186
204
  await Logger.info('crawl-time-limit', domain, elapsed, 'Timeout', `Reached budget limit of ${config.maxCrawlTimePerSiteMs}ms`);
187
205
  break;
188
206
  }
189
- const current = queue.shift();
190
- if (!current) {
191
- continue;
207
+ // Dequeue batch of up to 3 pages to load concurrently
208
+ const batchSize = Math.min(3, queue.length, config.maxPagesPerSite - pagesCrawledCount);
209
+ const batch = [];
210
+ for (let i = 0; i < batchSize; i++) {
211
+ const item = queue.shift();
212
+ if (item) {
213
+ batch.push(item);
214
+ }
215
+ }
216
+ if (batch.length === 0) {
217
+ break;
192
218
  }
193
- // Apply throttling delay between pages of the SAME website to avoid rate limits
194
- if (pagesCrawledCount > 0) {
219
+ pagesCrawledCount += batch.length;
220
+ // Apply throttling delay between page batches (if we've already crawled pages)
221
+ if (pagesCrawledCount > batch.length) {
195
222
  await getRandomDelay(config.minDelayMs, config.maxDelayMs);
196
223
  }
197
- pagesCrawledCount++;
198
- const pageStart = Date.now();
199
- try {
200
- const { html, title } = await this.loadPage(current.url, context, config);
201
- const pageDuration = Date.now() - pageStart;
202
- // Extract emails
203
- const extracted = extractEmails(html, current.url, title, pageDuration);
204
- for (const item of extracted) {
205
- // Keep occurrences tracker updated
206
- occurrenceCounts[item.email] = (occurrenceCounts[item.email] || 0) + 1;
207
- // Deduplicate: If already found, update with higher confidence if applicable
208
- const existingIdx = discoveredEmails.findIndex((e) => e.email === item.email);
209
- if (existingIdx === -1) {
210
- discoveredEmails.push(item);
211
- await Logger.email(domain, item.email, item.emailSource, item.confidenceScore, item.discoveryMethod);
212
- }
213
- else {
214
- if (item.confidenceScore > discoveredEmails[existingIdx].confidenceScore) {
215
- discoveredEmails[existingIdx] = item;
224
+ // Crawl current batch pages concurrently
225
+ await Promise.all(batch.map(async (current) => {
226
+ if (earlyExitTriggered) {
227
+ return;
228
+ }
229
+ const pageStart = Date.now();
230
+ try {
231
+ const { html, title } = await this.loadPage(current.url, context, config);
232
+ const pageDuration = Date.now() - pageStart;
233
+ // Extract emails
234
+ const extracted = extractEmails(html, current.url, title, pageDuration);
235
+ for (const item of extracted) {
236
+ // Keep occurrences tracker updated
237
+ occurrenceCounts[item.email] = (occurrenceCounts[item.email] || 0) + 1;
238
+ // Deduplicate: If already found, update with higher confidence if applicable
239
+ const existingIdx = discoveredEmails.findIndex((e) => e.email === item.email);
240
+ if (existingIdx === -1) {
241
+ discoveredEmails.push(item);
242
+ await Logger.email(domain, item.email, item.emailSource, item.confidenceScore, item.discoveryMethod);
243
+ }
244
+ else {
245
+ if (item.confidenceScore > discoveredEmails[existingIdx].confidenceScore) {
246
+ discoveredEmails[existingIdx] = item;
247
+ }
216
248
  }
217
249
  }
218
- }
219
- // EARLY STOP OPTIMIZATION: If we found a high confidence, domain-matching contact email, stop crawling
220
- const currentBest = selectBestEmail(discoveredEmails, domain, occurrenceCounts);
221
- if (currentBest &&
222
- currentBest.confidenceScore >= 95 &&
223
- isDomainMatch(currentBest.email, domain)) {
224
- await Logger.info('crawl-early-stop', domain, Date.now() - startTime, 'Success', `Early exit triggered by: ${currentBest.email} (${currentBest.confidenceScore} score)`);
225
- break;
226
- }
227
- // Discover and enqueue internal links if depth is within bounds
228
- if (current.depth < config.maxDepth) {
229
- const childLinks = extractAndFilterLinks(html, current.url, domain);
230
- for (const link of childLinks) {
231
- if (!visited.has(link) && visited.size < 100) {
232
- // Safety ceiling to prevent massive Set sizes
233
- visited.add(link);
234
- queue.push({
235
- url: link,
236
- depth: current.depth + 1,
237
- referrer: current.url,
238
- });
250
+ // Check if we can trigger an early stop
251
+ const currentBest = selectBestEmail(discoveredEmails, domain, occurrenceCounts);
252
+ if (currentBest &&
253
+ currentBest.confidenceScore >= 95 &&
254
+ isDomainMatch(currentBest.email, domain)) {
255
+ earlyExitTriggered = true;
256
+ await Logger.info('crawl-early-stop', domain, Date.now() - startTime, 'Success', `Early exit triggered by: ${currentBest.email} (${currentBest.confidenceScore} score)`);
257
+ }
258
+ // Discover and enqueue internal links if depth is within bounds and early exit hasn't fired
259
+ if (!earlyExitTriggered && current.depth < config.maxDepth) {
260
+ const childLinks = extractAndFilterLinks(html, current.url, domain);
261
+ for (const link of childLinks) {
262
+ if (!visited.has(link) && visited.size < 100) {
263
+ // Safety ceiling to prevent massive Set sizes
264
+ visited.add(link);
265
+ queue.push({
266
+ url: link,
267
+ depth: current.depth + 1,
268
+ referrer: current.url,
269
+ });
270
+ }
239
271
  }
240
272
  }
241
273
  }
242
- }
243
- catch (err) {
244
- const errorMsg = err instanceof Error ? err.message : String(err);
245
- await Logger.error('page-crawl-error', domain, Date.now() - pageStart, `Failed ${current.url}: ${errorMsg}`);
246
- }
274
+ catch (err) {
275
+ const errorMsg = err instanceof Error ? err.message : String(err);
276
+ await Logger.error('page-crawl-error', domain, Date.now() - pageStart, `Failed ${current.url}: ${errorMsg}`);
277
+ }
278
+ }));
247
279
  }
248
280
  // 4. Select Final Email
249
281
  const selectedEmail = selectBestEmail(discoveredEmails, domain, occurrenceCounts);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mailpop",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "Production-ready public contact email discovery tool from company websites.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",