guidelinescraper 1.0.3 → 1.0.5
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/crawl.mjs +9 -9
- package/package.json +1 -1
package/crawl.mjs
CHANGED
|
@@ -109,10 +109,6 @@ let failed = 0;
|
|
|
109
109
|
const crawlStart = Date.now();
|
|
110
110
|
|
|
111
111
|
console.log(`Discovered ${totalPages} pages to crawl.\n`);
|
|
112
|
-
for (const { url, pdfPath } of pages) {
|
|
113
|
-
console.log(` ${pdfPath} ← ${url}`);
|
|
114
|
-
}
|
|
115
|
-
console.log();
|
|
116
112
|
|
|
117
113
|
for (const { pdfPath } of pages) {
|
|
118
114
|
fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
|
|
@@ -129,7 +125,7 @@ const crawler = new PlaywrightCrawler({
|
|
|
129
125
|
launchContext: {
|
|
130
126
|
launchOptions: { args: ["--disable-dev-shm-usage"] },
|
|
131
127
|
},
|
|
132
|
-
maxConcurrency:
|
|
128
|
+
maxConcurrency: 8,
|
|
133
129
|
maxRequestRetries: 2,
|
|
134
130
|
navigationTimeoutSecs: 120,
|
|
135
131
|
|
|
@@ -169,7 +165,6 @@ const crawler = new PlaywrightCrawler({
|
|
|
169
165
|
return;
|
|
170
166
|
}
|
|
171
167
|
|
|
172
|
-
|
|
173
168
|
await page
|
|
174
169
|
.waitForLoadState("networkidle", { timeout: 30_000 })
|
|
175
170
|
.catch(() => {});
|
|
@@ -411,9 +406,12 @@ const crawler = new PlaywrightCrawler({
|
|
|
411
406
|
const pct = Math.round((completed / totalPages) * 100);
|
|
412
407
|
const avgSec = (Date.now() - crawlStart) / 1000 / completed;
|
|
413
408
|
const remaining = Math.round(avgSec * (totalPages - completed));
|
|
414
|
-
const eta =
|
|
409
|
+
const eta =
|
|
410
|
+
remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
|
|
415
411
|
const shortPath = pdfPath.replace(/\.pdf$/, "").replace(/^output\//, "");
|
|
416
|
-
log.info(
|
|
412
|
+
log.info(
|
|
413
|
+
`[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`,
|
|
414
|
+
);
|
|
417
415
|
},
|
|
418
416
|
|
|
419
417
|
async failedRequestHandler({ request, log }) {
|
|
@@ -438,5 +436,7 @@ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
|
|
|
438
436
|
const totalMs = Date.now() - crawlStart;
|
|
439
437
|
console.log(`\n${"─".repeat(50)}`);
|
|
440
438
|
console.log(`Done in ${formatDuration(totalMs)}`);
|
|
441
|
-
console.log(
|
|
439
|
+
console.log(
|
|
440
|
+
` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
|
|
441
|
+
);
|
|
442
442
|
console.log(` Output: ${domainDir}`);
|