guidelinescraper 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/crawl.mjs +30 -4
- package/package.json +1 -1
package/crawl.mjs
CHANGED
|
@@ -103,8 +103,12 @@ const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
|
|
|
103
103
|
const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
|
|
104
104
|
|
|
105
105
|
const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
|
|
106
|
+
const totalPages = pages.length;
|
|
107
|
+
let completed = 0;
|
|
108
|
+
let failed = 0;
|
|
109
|
+
const crawlStart = Date.now();
|
|
106
110
|
|
|
107
|
-
console.log(`Discovered ${
|
|
111
|
+
console.log(`Discovered ${totalPages} pages to crawl.\n`);
|
|
108
112
|
for (const { url, pdfPath } of pages) {
|
|
109
113
|
console.log(` ${pdfPath} ← ${url}`);
|
|
110
114
|
}
|
|
@@ -403,14 +407,36 @@ const crawler = new PlaywrightCrawler({
|
|
|
403
407
|
const rawHtml = await page.content();
|
|
404
408
|
fs.writeFileSync(htmlPath, purge(rawHtml));
|
|
405
409
|
|
|
406
|
-
|
|
410
|
+
completed++;
|
|
411
|
+
const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
|
|
412
|
+
const pct = Math.round((completed / totalPages) * 100);
|
|
413
|
+
const avgSec = (Date.now() - crawlStart) / 1000 / completed;
|
|
414
|
+
const remaining = Math.round(avgSec * (totalPages - completed));
|
|
415
|
+
const eta = remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
|
|
416
|
+
log.info(`[${completed}/${totalPages}] ${pct}% · ${elapsed}s elapsed ${eta} · ${path.basename(pdfPath)}`);
|
|
407
417
|
},
|
|
408
418
|
|
|
409
419
|
async failedRequestHandler({ request, log }) {
|
|
410
|
-
|
|
420
|
+
failed++;
|
|
421
|
+
completed++;
|
|
422
|
+
log.error(`[${completed}/${totalPages}] Failed: ${request.url}`);
|
|
411
423
|
},
|
|
412
424
|
});
|
|
413
425
|
|
|
426
|
/**
 * Render a millisecond duration as a compact human-readable string:
 * "Xm Ys" when the duration is a minute or more, otherwise just "Ys".
 * The millisecond value is rounded to the nearest whole second first.
 *
 * @param {number} ms - duration in milliseconds
 * @returns {string} e.g. "5s", "1m 5s"
 */
function formatDuration(ms) {
  const seconds = Math.round(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const leftover = seconds % 60;
  if (minutes > 0) {
    return `${minutes}m ${leftover}s`;
  }
  return `${leftover}s`;
}
|
|
432
|
+
|
|
433
|
+
const discoverEnd = Date.now();
|
|
434
|
+
console.log(`\nCrawling ${totalPages} pages…\n`);
|
|
435
|
+
|
|
414
436
|
await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
|
|
415
437
|
|
|
416
|
-
|
|
438
|
+
const totalMs = Date.now() - crawlStart;
|
|
439
|
+
console.log(`\n${"─".repeat(50)}`);
|
|
440
|
+
console.log(`Done in ${formatDuration(totalMs)}`);
|
|
441
|
+
console.log(` ${completed - failed} saved, ${failed} failed, ${totalPages} total`);
|
|
442
|
+
console.log(` Output: ${domainDir}`);
|