guidelinescraper 1.0.3 → 1.0.5

Files changed (2)
  1. package/crawl.mjs +9 -9
  2. package/package.json +1 -1
package/crawl.mjs CHANGED
@@ -109,10 +109,6 @@ let failed = 0;
 const crawlStart = Date.now();
 
 console.log(`Discovered ${totalPages} pages to crawl.\n`);
-for (const { url, pdfPath } of pages) {
-  console.log(` ${pdfPath} ← ${url}`);
-}
-console.log();
 
 for (const { pdfPath } of pages) {
   fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
@@ -129,7 +125,7 @@ const crawler = new PlaywrightCrawler({
   launchContext: {
     launchOptions: { args: ["--disable-dev-shm-usage"] },
   },
-  maxConcurrency: 16,
+  maxConcurrency: 8,
   maxRequestRetries: 2,
   navigationTimeoutSecs: 120,
 
@@ -169,7 +165,6 @@ const crawler = new PlaywrightCrawler({
       return;
     }
 
-
     await page
       .waitForLoadState("networkidle", { timeout: 30_000 })
       .catch(() => {});
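
For context: the options changed two hunks up are Crawlee PlaywrightCrawler settings, and the networkidle wait in this hunk runs inside the crawler's request handler. A minimal sketch of how these pieces fit together, assuming Crawlee's standard API; the requestHandler body below is a placeholder, not the package's actual handler (which renders each page to PDF and HTML):

import { PlaywrightCrawler } from "crawlee";

// Sketch only: the options mirror those visible in this diff; the
// requestHandler body is a stand-in for the package's real logic.
const crawler = new PlaywrightCrawler({
  launchContext: {
    launchOptions: { args: ["--disable-dev-shm-usage"] },
  },
  maxConcurrency: 8, // 1.0.5 halves this from 16
  maxRequestRetries: 2,
  navigationTimeoutSecs: 120,
  async requestHandler({ page, request, log }) {
    // Best-effort settle; the timeout is swallowed, as in the real handler.
    await page
      .waitForLoadState("networkidle", { timeout: 30_000 })
      .catch(() => {});
    log.info(`Rendered ${request.url}`);
  },
});
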
@@ -411,9 +406,12 @@ const crawler = new PlaywrightCrawler({
     const pct = Math.round((completed / totalPages) * 100);
     const avgSec = (Date.now() - crawlStart) / 1000 / completed;
     const remaining = Math.round(avgSec * (totalPages - completed));
-    const eta = remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
+    const eta =
+      remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
     const shortPath = pdfPath.replace(/\.pdf$/, "").replace(/^output\//, "");
-    log.info(`[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`);
+    log.info(
+      `[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`,
+    );
   },
 
   async failedRequestHandler({ request, log }) {
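
The ETA here is simply average seconds per completed page multiplied by the pages remaining. Both the progress line and the final summary call a formatDuration helper defined elsewhere in crawl.mjs; it does not appear in this diff. Purely as an illustration, a helper with the shape the call sites imply (hypothetical, not the package's implementation):

// Hypothetical stand-in: the real formatDuration in crawl.mjs is not shown in this diff.
function formatDuration(ms) {
  const totalSec = Math.round(ms / 1000);
  const min = Math.floor(totalSec / 60);
  return min > 0 ? `${min}m ${totalSec % 60}s` : `${totalSec}s`;
}
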
@@ -438,5 +436,7 @@ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
 const totalMs = Date.now() - crawlStart;
 console.log(`\n${"─".repeat(50)}`);
 console.log(`Done in ${formatDuration(totalMs)}`);
-console.log(` ${completed - failed} saved, ${failed} failed, ${totalPages} total`);
+console.log(
+  ` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
+);
 console.log(` Output: ${domainDir}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "guidelinescraper",
-  "version": "1.0.3",
+  "version": "1.0.5",
   "type": "module",
   "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
   "bin": {