guidelinescraper 1.0.4 → 1.0.5
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/crawl.mjs +9 -5
- package/package.json +1 -1
package/crawl.mjs
CHANGED
|
@@ -125,7 +125,7 @@ const crawler = new PlaywrightCrawler({
|
|
|
125
125
|
launchContext: {
|
|
126
126
|
launchOptions: { args: ["--disable-dev-shm-usage"] },
|
|
127
127
|
},
|
|
128
|
-
maxConcurrency:
|
|
128
|
+
maxConcurrency: 8,
|
|
129
129
|
maxRequestRetries: 2,
|
|
130
130
|
navigationTimeoutSecs: 120,
|
|
131
131
|
|
|
@@ -165,7 +165,6 @@ const crawler = new PlaywrightCrawler({
|
|
|
165
165
|
return;
|
|
166
166
|
}
|
|
167
167
|
|
|
168
|
-
|
|
169
168
|
await page
|
|
170
169
|
.waitForLoadState("networkidle", { timeout: 30_000 })
|
|
171
170
|
.catch(() => {});
|
|
@@ -407,9 +406,12 @@ const crawler = new PlaywrightCrawler({
|
|
|
407
406
|
const pct = Math.round((completed / totalPages) * 100);
|
|
408
407
|
const avgSec = (Date.now() - crawlStart) / 1000 / completed;
|
|
409
408
|
const remaining = Math.round(avgSec * (totalPages - completed));
|
|
410
|
-
const eta =
|
|
409
|
+
const eta =
|
|
410
|
+
remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
|
|
411
411
|
const shortPath = pdfPath.replace(/\.pdf$/, "").replace(/^output\//, "");
|
|
412
|
-
log.info(
|
|
412
|
+
log.info(
|
|
413
|
+
`[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`,
|
|
414
|
+
);
|
|
413
415
|
},
|
|
414
416
|
|
|
415
417
|
async failedRequestHandler({ request, log }) {
|
|
@@ -434,5 +436,7 @@ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
|
|
|
434
436
|
const totalMs = Date.now() - crawlStart;
|
|
435
437
|
console.log(`\n${"─".repeat(50)}`);
|
|
436
438
|
console.log(`Done in ${formatDuration(totalMs)}`);
|
|
437
|
-
console.log(
|
|
439
|
+
console.log(
|
|
440
|
+
` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
|
|
441
|
+
);
|
|
438
442
|
console.log(` Output: ${domainDir}`);
|