guidelinescraper 1.0.4 → 1.0.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/crawl.mjs +22 -7
- package/package.json +1 -1
- package/purge-html.mjs +2 -2
package/crawl.mjs
CHANGED
|
@@ -20,6 +20,7 @@ const { values, positionals } = parseArgs({
|
|
|
20
20
|
url: { type: "string", short: "u" },
|
|
21
21
|
hub: { type: "string", short: "h" },
|
|
22
22
|
cookie: { type: "string", short: "c" },
|
|
23
|
+
limit: { type: "string", short: "l" },
|
|
23
24
|
help: { type: "boolean" },
|
|
24
25
|
},
|
|
25
26
|
allowPositionals: true,
|
|
@@ -33,6 +34,7 @@ Options:
|
|
|
33
34
|
-u, --url <url> Portal domain or URL
|
|
34
35
|
-h, --hub <id> Hub ID (auto-detected if omitted)
|
|
35
36
|
-c, --cookie <str> Cookie header for authenticated requests
|
|
37
|
+
-l, --limit <n> Only crawl the first N pages
|
|
36
38
|
--help Show this help
|
|
37
39
|
|
|
38
40
|
Environment variables (or .env file):
|
|
@@ -45,6 +47,7 @@ Environment variables (or .env file):
|
|
|
45
47
|
const inputUrl = values.url || positionals[0] || process.env.URL;
|
|
46
48
|
const hubId = values.hub || process.env.HUB_ID;
|
|
47
49
|
const cookie = values.cookie || process.env.COOKIE || "";
|
|
50
|
+
const limit = values.limit ? Number(values.limit) : Infinity;
|
|
48
51
|
|
|
49
52
|
const OUTPUT_DIR = "output";
|
|
50
53
|
|
|
@@ -100,7 +103,8 @@ function collectPages(nodes, parentDir) {
|
|
|
100
103
|
}
|
|
101
104
|
|
|
102
105
|
const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
|
|
103
|
-
const
|
|
106
|
+
const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
|
|
107
|
+
const pages = allPages.slice(0, limit);
|
|
104
108
|
|
|
105
109
|
const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
|
|
106
110
|
const totalPages = pages.length;
|
|
@@ -125,7 +129,7 @@ const crawler = new PlaywrightCrawler({
|
|
|
125
129
|
launchContext: {
|
|
126
130
|
launchOptions: { args: ["--disable-dev-shm-usage"] },
|
|
127
131
|
},
|
|
128
|
-
maxConcurrency:
|
|
132
|
+
maxConcurrency: 8,
|
|
129
133
|
maxRequestRetries: 2,
|
|
130
134
|
navigationTimeoutSecs: 120,
|
|
131
135
|
|
|
@@ -165,7 +169,6 @@ const crawler = new PlaywrightCrawler({
|
|
|
165
169
|
return;
|
|
166
170
|
}
|
|
167
171
|
|
|
168
|
-
|
|
169
172
|
await page
|
|
170
173
|
.waitForLoadState("networkidle", { timeout: 30_000 })
|
|
171
174
|
.catch(() => {});
|
|
@@ -395,11 +398,18 @@ const crawler = new PlaywrightCrawler({
|
|
|
395
398
|
|
|
396
399
|
fs.writeFileSync(pdfPath, pdfBuffer);
|
|
397
400
|
|
|
401
|
+
const rawHtml = await page.content();
|
|
402
|
+
|
|
403
|
+
const rawPath = pdfPath
|
|
404
|
+
.replace(/\/pdf\//, "/raw/")
|
|
405
|
+
.replace(/\.pdf$/, ".html");
|
|
406
|
+
fs.mkdirSync(path.dirname(rawPath), { recursive: true });
|
|
407
|
+
fs.writeFileSync(rawPath, rawHtml);
|
|
408
|
+
|
|
398
409
|
const htmlPath = pdfPath
|
|
399
410
|
.replace(/\/pdf\//, "/html/")
|
|
400
411
|
.replace(/\.pdf$/, ".html");
|
|
401
412
|
fs.mkdirSync(path.dirname(htmlPath), { recursive: true });
|
|
402
|
-
const rawHtml = await page.content();
|
|
403
413
|
fs.writeFileSync(htmlPath, purge(rawHtml));
|
|
404
414
|
|
|
405
415
|
completed++;
|
|
@@ -407,9 +417,12 @@ const crawler = new PlaywrightCrawler({
|
|
|
407
417
|
const pct = Math.round((completed / totalPages) * 100);
|
|
408
418
|
const avgSec = (Date.now() - crawlStart) / 1000 / completed;
|
|
409
419
|
const remaining = Math.round(avgSec * (totalPages - completed));
|
|
410
|
-
const eta =
|
|
420
|
+
const eta =
|
|
421
|
+
remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
|
|
411
422
|
const shortPath = pdfPath.replace(/\.pdf$/, "").replace(/^output\//, "");
|
|
412
|
-
log.info(
|
|
423
|
+
log.info(
|
|
424
|
+
`[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`,
|
|
425
|
+
);
|
|
413
426
|
},
|
|
414
427
|
|
|
415
428
|
async failedRequestHandler({ request, log }) {
|
|
@@ -434,5 +447,7 @@ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
|
|
|
434
447
|
const totalMs = Date.now() - crawlStart;
|
|
435
448
|
console.log(`\n${"─".repeat(50)}`);
|
|
436
449
|
console.log(`Done in ${formatDuration(totalMs)}`);
|
|
437
|
-
console.log(
|
|
450
|
+
console.log(
|
|
451
|
+
` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
|
|
452
|
+
);
|
|
438
453
|
console.log(` Output: ${domainDir}`);
|
package/package.json
CHANGED
package/purge-html.mjs
CHANGED
|
@@ -5,7 +5,7 @@ import { parseHTML } from "linkedom";
|
|
|
5
5
|
const REMOVE_TAGS = new Set([
|
|
6
6
|
"script", "style", "link", "meta", "noscript", "iframe", "svg", "canvas",
|
|
7
7
|
"video", "audio", "source", "track", "object", "embed", "applet",
|
|
8
|
-
"form", "input", "textarea", "select", "
|
|
8
|
+
"form", "input", "textarea", "select", "template",
|
|
9
9
|
]);
|
|
10
10
|
|
|
11
11
|
const REMOVE_SELECTORS = [
|
|
@@ -81,7 +81,7 @@ export function purge(html) {
|
|
|
81
81
|
document.querySelector("#classic-theme") ||
|
|
82
82
|
document.body;
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
for (const child of [...main.childNodes]) cleanNode(child);
|
|
85
85
|
|
|
86
86
|
const title = document.querySelector("title")?.textContent?.trim() || "";
|
|
87
87
|
const cleanHtml =
|