@dancrumb/web-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(pnpm --version)"
5
+ ]
6
+ }
7
+ }
package/package.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "name": "@dancrumb/web-crawler",
3
+ "version": "1.0.0",
4
+ "description": "",
5
+ "exports": {
6
+ ".": "./dist/WebCrawler.js"
7
+ },
8
+ "scripts": {
9
+ "build": "tsc",
10
+ "test": "vitest --run"
11
+ },
12
+ "keywords": [],
13
+ "author": "Dan Rumney<dancrumb@gmail.com>",
14
+ "license": "MIT",
15
+ "type": "module",
16
+ "devDependencies": {
17
+ "@types/debug": "^4.1.13",
18
+ "@types/node": "^25.9.1",
19
+ "git-cz": "^4.9.0",
20
+ "supports-color": "^10.2.2",
21
+ "tsx": "^4.22.3",
22
+ "typescript": "^6.0.3",
23
+ "vitest": "^4.1.7"
24
+ },
25
+ "dependencies": {
26
+ "@dancrumb/fpish": "^8.0.0",
27
+ "axios": "^1.16.1",
28
+ "cheerio": "^1.2.0",
29
+ "debug": "^4.4.3",
30
+ "fpish": "^3.4.3"
31
+ },
32
+ "config": {
33
+ "commitizen": {
34
+ "path": "./node_modules/git-cz"
35
+ }
36
+ }
37
+ }
@@ -0,0 +1,281 @@
1
+ import type { AxiosResponse } from "axios";
2
+ import * as cheerio from "cheerio";
3
+ import debug from "debug";
4
+ import assert from "node:assert";
5
+ import EventEmitter from "node:events";
6
+ import { httpGet, httpHead } from "./http-operations.js";
7
+ import { popMap } from "./pop-map.js";
8
+
9
+ const logDebug = debug("crawler:main");
10
+
11
/**
 * A resource fetched by the crawler, as held in its document cache.
 */
type Document = {
  /** URL the content was requested from. */
  url: string;
  /** `href` of the page's `<link rel="canonical">`, or `url` when none is found. */
  canonicalUrl: string;
  /** Response body exactly as returned by `httpGet`. */
  content: string | Buffer;
  /** Content-Type response header, or "application/octet-stream" when absent. */
  contentType: string;
  /** Always `null` at present; nothing in the crawler fills it in. */
  title: string | null;
};

/**
 * Crawls HTML pages reachable from one or more root URLs, following only
 * links that start with a registered root and stopping at a maximum depth.
 *
 * Emits a `progress` event whenever the queue changes: a percentage string
 * with one decimal place while URLs are still queued, or `null` once the
 * queue is empty.
 */
export class WebCrawler extends EventEmitter<{
  progress: [progress: string | null];
}> {
  /** Crawled URLs mapped to the depth they were last crawled at. */
  private crawled = new Map<string, number>();
  /** Queued URLs mapped to the depth they will be crawled at. */
  private uncrawled = new Map<string, number>();
  /** Cache of fetched documents, keyed by canonical URL. */
  private documents = new Map<string, Document>();
  /** Root URLs; a link is in scope only if it starts with one of these. */
  private roots = new Set<string>();
  /** Depth limit used by `crawlLink` when the caller does not pass one. */
  private maxDepth: number = 0;

  /**
   * Emits the current `progress` event.
   *
   * With an empty queue there is no meaningful ratio to report, so `null`
   * is emitted and nothing else; otherwise the share of crawled URLs among
   * all known URLs is emitted as a percentage.
   */
  private reportProgress() {
    const pending = this.uncrawled.size;
    if (pending === 0) {
      this.emit("progress", null);
      // Stop here so listeners get exactly one event per state change.
      return;
    }
    const done = this.crawled.size;
    this.emit("progress", ((done / (pending + done)) * 100).toFixed(1));
  }

  /** Queues `url` to be crawled at `depth` and reports progress. */
  private addUncrawled(url: string, depth: number) {
    this.uncrawled.set(url, depth);
    this.reportProgress();
  }

  /**
   * Moves `url` from the queue to the crawled set and reports progress.
   *
   * @param url URL that has been (or is about to be) crawled.
   * @param urlDepth Depth to record; defaults to the depth stored in the
   *   queue. If neither is available the call does nothing.
   */
  private markCrawled(url: string, urlDepth?: number) {
    const depth = urlDepth ?? this.uncrawled.get(url);
    if (depth === undefined) {
      return;
    }
    this.crawled.set(url, depth);
    this.uncrawled.delete(url);
    this.reportProgress();
  }

  /**
   * Downloads `url`, caches it under its canonical URL and returns the body.
   *
   * @throws Whatever `httpGet` rejects with.
   */
  private async loadDocument(url: string): Promise<string | Buffer> {
    const { data, headers } = await httpGet(url);
    const contentType =
      this.getContentType(headers) ?? "application/octet-stream";
    const cacheEntry = (canonicalUrl: string): Document => ({
      url,
      canonicalUrl,
      content: data,
      title: null,
      contentType,
    });

    // Cache under the request URL first: getCanonical() reads the body back
    // through getDocument(), and without this entry it would call
    // loadDocument() again and recurse without end.
    this.documents.set(url, cacheEntry(url));

    const canonicalUrl = await this.getCanonical(url);
    if (canonicalUrl !== url) {
      this.documents.delete(url);
      this.documents.set(canonicalUrl, cacheEntry(canonicalUrl));
    }
    return data;
  }

  /** Returns the cached body for `url`, downloading it on a cache miss. */
  private async getDocument(url: string): Promise<string | Buffer> {
    const cached = this.documents.get(url);
    if (cached !== undefined) {
      return cached.content;
    }
    return this.loadDocument(url);
  }

  /** True when `url` starts with at least one registered root. */
  private isUnderRoot(url: string) {
    for (const root of this.roots) {
      if (url.startsWith(root)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Collects the `href` of every `<a>` element on the page at `url`.
   *
   * Anchors without an `href` contribute an empty string. Any failure is
   * logged and yields an empty list.
   */
  private async getAllLinks(url: string): Promise<string[]> {
    try {
      logDebug(`Getting links from ${url}`);

      // Go through the cache so a page already fetched while resolving its
      // canonical URL is not downloaded a second time.
      const data = await this.getDocument(url);
      const $ = cheerio.load(data);

      const anchorLinks: string[] = [];
      for (const anchor of $("a")) {
        anchorLinks.push(anchor.attribs["href"] ?? "");
      }
      return anchorLinks;
    } catch (e) {
      logDebug(`Got "${(e as Error).message}" while trying to get links`);
      return [];
    }
  }

  /**
   * Returns the `href` of the first `<link rel="canonical">` on the page at
   * `url` that actually has an `href`, or `url` itself when there is none
   * or the page cannot be loaded.
   */
  private async getCanonical(url: string): Promise<string> {
    try {
      const data = await this.getDocument(url);
      const $ = cheerio.load(data);
      for (const link of $("link")) {
        if (link.attribs["rel"] !== "canonical") {
          continue;
        }
        const href = link.attribs["href"];
        // A canonical link without a usable href carries no information.
        if (href) {
          return href;
        }
      }
      return url;
    } catch (e) {
      logDebug(
        `Got "${(e as Error).message}" while trying to get canonical url`,
      );
      return url;
    }
  }

  /**
   * Turns an `href` found on the page `root` into a canonical absolute URL.
   *
   * Links starting with "http" are used as-is; anything else is resolved
   * against `root` and has a single trailing slash removed. A link that
   * cannot be resolved is returned unchanged.
   */
  private async resolveLink(link: string, root: string) {
    let absolute: string;
    if (link.startsWith("http")) {
      absolute = link;
    } else {
      try {
        absolute = new URL(link, root).href.replace(/\/$/, "");
      } catch {
        // One malformed href must not abort the whole crawl.
        logDebug(`Could not resolve ${link} against ${root}, so skipping`);
        return link;
      }
    }
    return await this.getCanonical(absolute);
  }

  /**
   * Issues a HEAD request for `link` and returns the response headers.
   *
   * @returns The headers on a 200 response; `null` when the link is not a
   *   valid http(s) URL, the request fails, or the status is not 200.
   */
  private async fetchHead(
    link: string,
  ): Promise<AxiosResponse["headers"] | null> {
    let linkUrl: URL;
    try {
      linkUrl = new URL(link);
    } catch {
      logDebug(`${link} is invalid, so skipping`);
      return null;
    }

    if (!linkUrl.protocol.startsWith("http")) {
      logDebug(`Protocol: '${linkUrl.protocol}' not supported, so skipping`);
      return null;
    }

    logDebug(`HEAD ${link}`);
    try {
      const pageHead = await httpHead(link);

      if (pageHead.status === 200) {
        return pageHead.headers;
      }
      logDebug(
        `Got ${pageHead.status}: ${pageHead.statusText} when requesting ${link}`,
      );
    } catch (e) {
      logDebug(`Got ${(e as Error).message} when requesting ${link}`);
    }

    return null;
  }

  /**
   * Reads the Content-Type header, trying the lower-case name first.
   *
   * @returns The header value, or `null` when it is missing or not a string.
   */
  private getContentType(headers: AxiosResponse["headers"]): string | null {
    const contentType =
      headers["content-type"] ?? headers["Content-Type"] ?? null;
    return typeof contentType === "string" ? contentType : null;
  }

  /**
   * Crawls from `initialLink`, registering its canonical URL as a root.
   *
   * Each page at depth `<= maxDepth` has its links resolved; in-scope links
   * that answer a HEAD request with a `text/html` content type are queued
   * one level deeper. The loop runs until the queue is empty.
   *
   * @param initialLink URL to start from.
   * @param maxDepth Depth limit for this crawl; it also becomes the stored
   *   default. Defaults to the value set through `setMaxDepth`.
   * @returns A two-element array: the crawled URLs and the URLs still queued.
   */
  async crawlLink(initialLink: string, maxDepth: number = this.maxDepth) {
    const root = await this.getCanonical(initialLink);
    // Wait for the root to be registered before any scope check can run.
    await this.addRoot(root);
    this.maxDepth = maxDepth;

    let [url, depth]: [string, number] | [undefined, undefined] = [root, 0];

    while (url !== undefined) {
      assert(depth !== undefined);
      logDebug("\n*********\n");
      logDebug(
        `Crawled: ${this.crawled.size}\nUncrawled: ${this.uncrawled.size}`,
      );
      logDebug(`URL: ${url}\nDepth:${depth}\n`);
      if (depth <= maxDepth) {
        this.markCrawled(url, depth);
        logDebug(`Crawling ${url}...`);
        const links = await this.getAllLinks(url);
        for (const link of links) {
          const resolved = await this.resolveLink(link, url);
          if (!this.isUnderRoot(resolved)) {
            logDebug(`Out of scope: ${resolved} (${root})`);
            continue;
          }
          logDebug(`${resolved} is under one of ${[...this.roots].join(",")}`);
          // Only pages crawled at a strictly shallower depth are skipped; a
          // page crawled at this depth or deeper is queued again.
          if ((this.crawled.get(resolved) ?? maxDepth) < depth) {
            logDebug(
              `Already crawled ${resolved} at depth ${this.crawled.get(resolved)}`,
            );
            continue;
          }
          if (!this.uncrawled.has(resolved)) {
            const head = await this.fetchHead(resolved);
            if (head === null) {
              continue;
            }
            const contentType = this.getContentType(head);
            if (contentType === null || !contentType.startsWith("text/html")) {
              logDebug(
                `Page at ${resolved} has MIME type ${contentType}, so skipping`,
              );
              continue;
            }
            logDebug(`Adding ${resolved} to list of uncrawled`);
            this.addUncrawled(resolved, depth + 1);
          }
        }
      }
      [url, depth] = popMap(this.uncrawled);
    }

    return [[...this.crawled.keys()], [...this.uncrawled.keys()]];
  }

  /** Forgets all crawled/queued URLs, cached documents and roots. */
  clear() {
    this.crawled.clear();
    this.uncrawled.clear();
    this.documents.clear();
    this.roots.clear();
  }

  /**
   * Registers the canonical form of `root` as an in-scope prefix.
   *
   * @returns The canonical URL that was registered.
   */
  async addRoot(root: string): Promise<string> {
    const canonical = await this.getCanonical(root);
    this.roots.add(canonical);
    return canonical;
  }

  /**
   * Sets the maximum crawl depth.
   *
   * Negative, infinite and NaN values are stored as 0; any other value is
   * truncated to an integer.
   *
   * @returns The depth that was stored.
   */
  setMaxDepth(depth: number) {
    this.maxDepth =
      Number.isFinite(depth) && depth > 0 ? Math.trunc(depth) : 0;
    return this.maxDepth;
  }

  /** Returns the URLs crawled so far, in the order they were first crawled. */
  getCrawledUrls() {
    return [...this.crawled.keys()];
  }

  /** Returns the cached document for every crawled URL that has one. */
  getDocuments(): Document[] {
    const found: Document[] = [];
    for (const url of this.crawled.keys()) {
      const doc = this.documents.get(url);
      if (doc !== undefined) {
        found.push(doc);
      }
    }
    return found;
  }
}
@@ -0,0 +1,33 @@
1
+ import { default as _axios } from "axios";
2
+ import debug from "debug";
3
+
4
+ const logDebug = debug("crawler:http");
5
+
6
/**
 * Issues an HTTP HEAD request for `link`, aborting it after five seconds.
 *
 * @param link Absolute URL to request.
 * @returns The axios response for the HEAD request.
 * @throws Whatever axios rejects with, including the cancellation error
 *   raised when the five-second timer aborts the request.
 */
export async function httpHead(link: string) {
  const aborter = new AbortController();

  const abortTimeout = setTimeout(() => {
    logDebug(`HEAD ${link} Timed Out`);
    aborter.abort();
  }, 5000);

  logDebug(`HEAD ${link}`);
  try {
    return await _axios.head(link, { signal: aborter.signal });
  } finally {
    // Clear the timer on failure too; otherwise it lingers for up to five
    // seconds and then logs a spurious "Timed Out" for a finished request.
    clearTimeout(abortTimeout);
  }
}
18
+
19
/**
 * Issues an HTTP GET request for `link`, aborting it after five seconds.
 *
 * @param link Absolute URL to request.
 * @returns The axios response; its `data` is typed as `string | Buffer`.
 * @throws Whatever axios rejects with, including the cancellation error
 *   raised when the five-second timer aborts the request.
 */
export async function httpGet(link: string) {
  const aborter = new AbortController();

  const abortTimeout = setTimeout(() => {
    logDebug(`GET ${link} Timed Out`);
    aborter.abort();
  }, 5000);

  logDebug(`GET ${link}`);
  try {
    return await _axios.get<string | Buffer>(link, {
      signal: aborter.signal,
    });
  } finally {
    // Clear the timer on failure too; otherwise it lingers for up to five
    // seconds and then logs a spurious "Timed Out" for a finished request.
    clearTimeout(abortTimeout);
  }
}
package/src/ndnu.ts ADDED
@@ -0,0 +1,14 @@
1
+ import { inspect } from "node:util";
2
+ import { WebCrawler } from "./WebCralwer.js";
3
+
4
// Demo script: crawl one site to a depth of 1, print progress as it goes,
// then print the metadata (everything except the body) of each document.
const NDNU = "https://ndnu.edu/financial-aid/";

const crawler = new WebCrawler();
crawler.setMaxDepth(1);

crawler.on("progress", (percent) => {
  // The crawler reports null when there is no percentage to show.
  const shown = percent ?? "-";
  console.log(`Progress: ${shown}%`);
});

await crawler.crawlLink(NDNU);
console.log("Crawled");

const summaries = crawler.getDocuments().map((doc) => {
  // Drop the (potentially large) body before pretty-printing.
  const { content: _body, ...metadata } = doc;
  return inspect(metadata);
});
console.log(summaries);
package/src/pop-map.ts ADDED
@@ -0,0 +1,30 @@
1
/**
 * "Pops" an entry from a Map: removes it and returns it.
 *
 * The entry taken is the first one in the Map's iteration order, which for
 * a native Map is the oldest entry still present (insertion order).
 *
 * It always returns a two-value tuple: either the key and value, or
 * `[undefined, undefined]` when the Map was empty. This makes it easy for
 * calling code to destructure the result and test what was returned.
 *
 * @example
 * ```
 * const [key, value] = popMap(someMap);
 * if (key === undefined) {
 *   // The Map was empty (unless the Map allows `undefined` as a key)
 * }
 * ```
 *
 * @param map The Map to remove an entry from; it is mutated.
 * @returns The removed `[key, value]` pair, or `[undefined, undefined]`.
 */
export function popMap<K, V>(map: Map<K, V>): [K, V] | [undefined, undefined] {
  const first = map.entries().next();
  // Decide emptiness from the iterator, not from the key's truthiness:
  // keys such as "" or 0 are valid and must still be removed.
  if (first.done) {
    return [undefined, undefined];
  }

  const [key, value] = first.value;
  map.delete(key);
  return [key, value];
}
@@ -0,0 +1,39 @@
1
+ import { beforeEach, describe, expect, it } from "vitest";
2
+ import { popMap } from "./pop-map.js";
3
+
4
// Behavioural tests for popMap, run against a three-entry string -> number map.
describe("popMap", () => {
  const testMap = new Map<string, number>();

  // Every test starts from the same three entries.
  beforeEach(() => {
    testMap.clear();
    const seed: [string, number][] = [
      ["a", 1],
      ["b", 2],
      ["c", 3],
    ];
    for (const [key, value] of seed) {
      testMap.set(key, value);
    }
  });

  it("pops a value from a map", () => {
    const popped = popMap(testMap);
    expect(popped).toBeDefined();
    expect(testMap.size).toBe(2);
    expect(popped[0]).toBeOneOf(["a", "b", "c"]);
  });

  it("handles an empty map", () => {
    // Expected size after each successive pop; the last pop hits an empty map.
    for (const remaining of [2, 1, 0, 0]) {
      popMap(testMap);
      expect(testMap.size).toBe(remaining);
    }
  });

  it("handles additions to the map", () => {
    popMap(testMap);
    expect(testMap.size).toBe(2);

    // One entry added, one popped: the size is unchanged.
    testMap.set("q", 42);
    popMap(testMap);
    expect(testMap.size).toBe(2);
  });
});
package/tsconfig.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "exclude": [
3
+ "./src/ndnu.ts",
4
+ "./src/*.test.*",
5
+ "./dist"
6
+ ],
7
+ // Visit https://aka.ms/tsconfig to read more about this file
8
+ "compilerOptions": {
9
+ // File Layout
10
+ "rootDir": "./src",
11
+ "outDir": "./dist",
12
+ // Environment Settings
13
+ // See also https://aka.ms/tsconfig/module
14
+ "module": "nodenext",
15
+ "target": "esnext",
16
+ "types": [
17
+ "node"
18
+ ],
19
+ // For nodejs:
20
+ // "lib": ["esnext"],
21
+ // "types": ["node"],
22
+ // and npm install -D @types/node
23
+ // Other Outputs
24
+ "sourceMap": true,
25
+ "declaration": true,
26
+ "declarationMap": true,
27
+ // Stricter Typechecking Options
28
+ "noUncheckedIndexedAccess": true,
29
+ "exactOptionalPropertyTypes": true,
30
+ "strict": true,
31
+ "verbatimModuleSyntax": true,
32
+ "isolatedModules": true,
33
+ "noUncheckedSideEffectImports": true,
34
+ "moduleDetection": "force",
35
+ "skipLibCheck": true,
36
+ }
37
+ }