guidelinescraper 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/crawl.mjs +30 -4
  2. package/package.json +2 -2
package/crawl.mjs CHANGED
@@ -103,8 +103,12 @@ const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
103
103
  const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
104
104
 
105
105
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
106
+ const totalPages = pages.length;
107
+ let completed = 0;
108
+ let failed = 0;
109
+ const crawlStart = Date.now();
106
110
 
107
- console.log(`Discovered ${pages.length} pages to crawl.\n`);
111
+ console.log(`Discovered ${totalPages} pages to crawl.\n`);
108
112
  for (const { url, pdfPath } of pages) {
109
113
  console.log(` ${pdfPath} ← ${url}`);
110
114
  }
@@ -403,14 +407,36 @@ const crawler = new PlaywrightCrawler({
403
407
  const rawHtml = await page.content();
404
408
  fs.writeFileSync(htmlPath, purge(rawHtml));
405
409
 
406
- log.info(`Saved ${pdfPath} + ${htmlPath}`);
410
+ completed++;
411
+ const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
412
+ const pct = Math.round((completed / totalPages) * 100);
413
+ const avgSec = (Date.now() - crawlStart) / 1000 / completed;
414
+ const remaining = Math.round(avgSec * (totalPages - completed));
415
+ const eta = remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
416
+ log.info(`[${completed}/${totalPages}] ${pct}% · ${elapsed}s elapsed ${eta} · ${path.basename(pdfPath)}`);
407
417
  },
408
418
 
409
419
  async failedRequestHandler({ request, log }) {
410
- log.error(`Failed to crawl ${request.url} after retries.`);
420
+ failed++;
421
+ completed++;
422
+ log.error(`[${completed}/${totalPages}] Failed: ${request.url}`);
411
423
  },
412
424
  });
413
425
 
426
+ function formatDuration(ms) {
427
+ const totalSec = Math.round(ms / 1000);
428
+ const m = Math.floor(totalSec / 60);
429
+ const s = totalSec % 60;
430
+ return m > 0 ? `${m}m ${s}s` : `${s}s`;
431
+ }
432
+
433
+ const discoverEnd = Date.now();
434
+ console.log(`\nCrawling ${totalPages} pages…\n`);
435
+
414
436
  await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
415
437
 
416
- console.log("\nDone. Output saved under:", domainDir);
438
+ const totalMs = Date.now() - crawlStart;
439
+ console.log(`\n${"─".repeat(50)}`);
440
+ console.log(`Done in ${formatDuration(totalMs)}`);
441
+ console.log(` ${completed - failed} saved, ${failed} failed, ${totalPages} total`);
442
+ console.log(` Output: ${domainDir}`);
package/package.json CHANGED
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {
7
- "guidelinescraper": "./crawl.mjs"
7
+ "guidelinescraper": "crawl.mjs"
8
8
  },
9
9
  "scripts": {
10
10
  "start": "node crawl.mjs"