@dancrumb/web-crawler 1.0.3 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/WebCrawler.d.ts +44 -0
- package/dist/WebCrawler.d.ts.map +1 -0
- package/dist/WebCrawler.js +230 -0
- package/dist/WebCrawler.js.map +1 -0
- package/dist/http-operations.d.ts +3 -0
- package/dist/http-operations.d.ts.map +1 -0
- package/dist/http-operations.js +28 -0
- package/dist/http-operations.js.map +1 -0
- package/dist/pop-map.d.ts +24 -0
- package/dist/pop-map.d.ts.map +1 -0
- package/{src/pop-map.ts → dist/pop-map.js} +7 -7
- package/dist/pop-map.js.map +1 -0
- package/package.json +6 -3
- package/.claude/settings.local.json +0 -7
- package/src/WebCralwer.ts +0 -281
- package/src/http-operations.ts +0 -33
- package/src/ndnu.ts +0 -14
- package/src/popMap.test.ts +0 -39
- package/tsconfig.json +0 -37
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import EventEmitter from "node:events";
|
|
2
|
+
type Document = {
|
|
3
|
+
url: string;
|
|
4
|
+
canonicalUrl: string;
|
|
5
|
+
content: string | Buffer;
|
|
6
|
+
contentType: string;
|
|
7
|
+
title: string | null;
|
|
8
|
+
};
|
|
9
|
+
export declare class WebCrawler extends EventEmitter<{
|
|
10
|
+
progress: [progress: string | null];
|
|
11
|
+
}> {
|
|
12
|
+
private crawled;
|
|
13
|
+
private uncrawled;
|
|
14
|
+
private documents;
|
|
15
|
+
private roots;
|
|
16
|
+
private maxDepth;
|
|
17
|
+
private reportProgress;
|
|
18
|
+
private addUncrawled;
|
|
19
|
+
private markCrawled;
|
|
20
|
+
private loadDocument;
|
|
21
|
+
private getDocument;
|
|
22
|
+
private isUnderRoot;
|
|
23
|
+
private getAllLinks;
|
|
24
|
+
private getCanonical;
|
|
25
|
+
private resolveLink;
|
|
26
|
+
private fetchHead;
|
|
27
|
+
private getContentType;
|
|
28
|
+
crawlLink(initialLink: string, maxDepth?: number): Promise<string[][]>;
|
|
29
|
+
clear(): void;
|
|
30
|
+
addRoot(root: string): Promise<string>;
|
|
31
|
+
/**
|
|
32
|
+
* Sets the maximum crawl depth.
|
|
33
|
+
*
|
|
34
|
+
* If the number is negative, infinite, or NaN, it sets the maxDepth to 0.
|
|
35
|
+
* Otherwise, it truncates the number to an integer (if necessary) and sets the maxDepth to that number
|
|
36
|
+
*
|
|
37
|
+
* Returns the number that was saved
|
|
38
|
+
*/
|
|
39
|
+
setMaxDepth(depth: number): number;
|
|
40
|
+
getCrawledUrls(): string[];
|
|
41
|
+
getDocuments(): Document[];
|
|
42
|
+
}
|
|
43
|
+
export {};
|
|
44
|
+
//# sourceMappingURL=WebCrawler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"WebCrawler.d.ts","sourceRoot":"","sources":["../src/WebCrawler.ts"],"names":[],"mappings":"AAIA,OAAO,YAAY,MAAM,aAAa,CAAC;AAMvC,KAAK,QAAQ,GAAG;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,YAAY,EAAE,MAAM,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB,CAAC;AAEF,qBAAa,UAAW,SAAQ,YAAY,CAAC;IAC3C,QAAQ,EAAE,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;CACrC,CAAC;IACA,OAAO,CAAC,OAAO,CAA6B;IAC5C,OAAO,CAAC,SAAS,CAA6B;IAC9C,OAAO,CAAC,SAAS,CAA+B;IAChD,OAAO,CAAC,KAAK,CAAqB;IAClC,OAAO,CAAC,QAAQ,CAAa;IAE7B,OAAO,CAAC,cAAc;IAWtB,OAAO,CAAC,YAAY;IAKpB,OAAO,CAAC,WAAW;YAUL,YAAY;YA0BZ,WAAW;IAOzB,OAAO,CAAC,WAAW;YAIL,WAAW;YAoBX,YAAY;YAkBZ,WAAW;YAUX,SAAS;IAiCvB,OAAO,CAAC,cAAc;IAYhB,SAAS,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,GAAE,MAAsB;IAwDrE,KAAK;IAOC,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAM5C;;;;;;;OAOG;IACH,WAAW,CAAC,KAAK,EAAE,MAAM;IAWzB,cAAc;IAId,YAAY,IAAI,QAAQ,EAAE;CAK3B"}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import debug from "debug";
|
|
3
|
+
import assert from "node:assert";
|
|
4
|
+
import EventEmitter from "node:events";
|
|
5
|
+
import { httpGet, httpHead } from "./http-operations.js";
|
|
6
|
+
import { popMap } from "./pop-map.js";
|
|
7
|
+
const logDebug = debug("crawler:main");
|
|
8
|
+
export class WebCrawler extends EventEmitter {
|
|
9
|
+
crawled = new Map();
|
|
10
|
+
uncrawled = new Map();
|
|
11
|
+
documents = new Map();
|
|
12
|
+
roots = new Set();
|
|
13
|
+
maxDepth = 0;
|
|
14
|
+
reportProgress() {
|
|
15
|
+
if (this.uncrawled.size === 0) {
|
|
16
|
+
this.emit("progress", null);
|
|
17
|
+
}
|
|
18
|
+
const progress = ((this.crawled.size / (this.uncrawled.size + this.crawled.size)) *
|
|
19
|
+
100).toFixed(1);
|
|
20
|
+
this.emit("progress", progress);
|
|
21
|
+
}
|
|
22
|
+
addUncrawled(url, depth) {
|
|
23
|
+
this.uncrawled.set(url, depth);
|
|
24
|
+
this.reportProgress();
|
|
25
|
+
}
|
|
26
|
+
markCrawled(url, urlDepth) {
|
|
27
|
+
let depth = urlDepth ?? this.uncrawled.get(url);
|
|
28
|
+
if (depth === undefined) {
|
|
29
|
+
return;
|
|
30
|
+
}
|
|
31
|
+
this.crawled.set(url, depth);
|
|
32
|
+
this.uncrawled.delete(url);
|
|
33
|
+
this.reportProgress();
|
|
34
|
+
}
|
|
35
|
+
async loadDocument(url) {
|
|
36
|
+
const { data, headers } = await httpGet(url);
|
|
37
|
+
const contentType = this.getContentType(headers) ?? "application/octet-stream";
|
|
38
|
+
const content = data;
|
|
39
|
+
this.documents.set(url, {
|
|
40
|
+
url,
|
|
41
|
+
canonicalUrl: "",
|
|
42
|
+
content,
|
|
43
|
+
title: null,
|
|
44
|
+
contentType,
|
|
45
|
+
});
|
|
46
|
+
const canonicalUrl = await this.getCanonical(url);
|
|
47
|
+
if (canonicalUrl !== url) {
|
|
48
|
+
this.documents.delete(url);
|
|
49
|
+
this.documents.set(canonicalUrl, {
|
|
50
|
+
url,
|
|
51
|
+
canonicalUrl,
|
|
52
|
+
content,
|
|
53
|
+
title: null,
|
|
54
|
+
contentType,
|
|
55
|
+
});
|
|
56
|
+
}
|
|
57
|
+
return data;
|
|
58
|
+
}
|
|
59
|
+
async getDocument(url) {
|
|
60
|
+
if (this.documents.has(url)) {
|
|
61
|
+
return this.documents.get(url).content;
|
|
62
|
+
}
|
|
63
|
+
return this.loadDocument(url);
|
|
64
|
+
}
|
|
65
|
+
isUnderRoot(url) {
|
|
66
|
+
return [...this.roots.values()].some((root) => url.startsWith(root));
|
|
67
|
+
}
|
|
68
|
+
async getAllLinks(url) {
|
|
69
|
+
try {
|
|
70
|
+
logDebug(`Getting links from ${url}`);
|
|
71
|
+
const { data } = await httpGet(url);
|
|
72
|
+
const $ = cheerio.load(data);
|
|
73
|
+
const anchorLinks = [];
|
|
74
|
+
for (const anchor of $("a")) {
|
|
75
|
+
anchorLinks.push(anchor.attribs["href"] ?? "");
|
|
76
|
+
}
|
|
77
|
+
return anchorLinks;
|
|
78
|
+
}
|
|
79
|
+
catch (e) {
|
|
80
|
+
logDebug(`Got "${e.message}" while trying to get links`);
|
|
81
|
+
return [];
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
async getCanonical(url) {
|
|
85
|
+
try {
|
|
86
|
+
const data = await this.getDocument(url);
|
|
87
|
+
const $ = cheerio.load(data);
|
|
88
|
+
for (const link of $("link")) {
|
|
89
|
+
if (link.attribs["rel"] === "canonical") {
|
|
90
|
+
return link.attribs["href"];
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return url;
|
|
94
|
+
}
|
|
95
|
+
catch (e) {
|
|
96
|
+
logDebug(`Got "${e.message}" while trying to get canonical url`);
|
|
97
|
+
return url;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
async resolveLink(link, root) {
|
|
101
|
+
let absolute = link;
|
|
102
|
+
if (link.startsWith("http")) {
|
|
103
|
+
absolute = link;
|
|
104
|
+
}
|
|
105
|
+
else {
|
|
106
|
+
absolute = new URL(link, root).href.replace(/\/$/, "");
|
|
107
|
+
}
|
|
108
|
+
return await this.getCanonical(absolute);
|
|
109
|
+
}
|
|
110
|
+
async fetchHead(link) {
|
|
111
|
+
let linkUrl;
|
|
112
|
+
try {
|
|
113
|
+
linkUrl = new URL(link);
|
|
114
|
+
}
|
|
115
|
+
catch (e) {
|
|
116
|
+
logDebug(`${link} is invalid, so skipping`);
|
|
117
|
+
return null;
|
|
118
|
+
}
|
|
119
|
+
if (!linkUrl.protocol.startsWith("http")) {
|
|
120
|
+
logDebug(`Protocol: '${linkUrl.protocol}' not supported, so skipping`);
|
|
121
|
+
return null;
|
|
122
|
+
}
|
|
123
|
+
logDebug(`HEAD ${link}`);
|
|
124
|
+
try {
|
|
125
|
+
const pageHead = await httpHead(link);
|
|
126
|
+
if (pageHead.status === 200) {
|
|
127
|
+
return pageHead.headers;
|
|
128
|
+
}
|
|
129
|
+
logDebug(`Got ${pageHead.status}: ${pageHead.statusText} when requesting ${link}`);
|
|
130
|
+
}
|
|
131
|
+
catch (e) {
|
|
132
|
+
logDebug(`Got ${e.message} when requesting ${link}`);
|
|
133
|
+
}
|
|
134
|
+
return null;
|
|
135
|
+
}
|
|
136
|
+
getContentType(headers) {
|
|
137
|
+
const contentType = headers["content-type"] ?? headers["Content-Type"] ?? null;
|
|
138
|
+
if (contentType === null) {
|
|
139
|
+
return contentType;
|
|
140
|
+
}
|
|
141
|
+
if (typeof contentType !== "string") {
|
|
142
|
+
return null;
|
|
143
|
+
}
|
|
144
|
+
return contentType;
|
|
145
|
+
}
|
|
146
|
+
async crawlLink(initialLink, maxDepth = this.maxDepth) {
|
|
147
|
+
const root = await this.getCanonical(initialLink);
|
|
148
|
+
this.addRoot(root);
|
|
149
|
+
this.maxDepth = maxDepth;
|
|
150
|
+
let [url, depth] = [root, 0];
|
|
151
|
+
while (url !== undefined) {
|
|
152
|
+
assert(depth !== undefined);
|
|
153
|
+
logDebug("\n*********\n");
|
|
154
|
+
logDebug(`Crawled: ${this.crawled.size}\nUncrawled: ${this.uncrawled.size}`);
|
|
155
|
+
logDebug(`URL: ${url}\nDepth:${depth}\n`);
|
|
156
|
+
if (depth <= maxDepth) {
|
|
157
|
+
this.markCrawled(url, depth);
|
|
158
|
+
logDebug(`Crawling ${url}...`);
|
|
159
|
+
const links = await this.getAllLinks(url);
|
|
160
|
+
for (const link of links) {
|
|
161
|
+
const resolved = await this.resolveLink(link, url);
|
|
162
|
+
if (!this.isUnderRoot(resolved)) {
|
|
163
|
+
logDebug(`Out of scope: ${resolved} (${root})`);
|
|
164
|
+
continue;
|
|
165
|
+
}
|
|
166
|
+
logDebug(`${resolved} is under one of ${[...this.roots.entries()].join(",")}`);
|
|
167
|
+
if ((this.crawled.get(resolved) ?? maxDepth) < depth) {
|
|
168
|
+
logDebug(`Already crawled ${resolved} at depth ${this.crawled.get(resolved)}`);
|
|
169
|
+
continue;
|
|
170
|
+
}
|
|
171
|
+
if (!this.uncrawled.has(resolved)) {
|
|
172
|
+
const head = await this.fetchHead(resolved);
|
|
173
|
+
if (head === null) {
|
|
174
|
+
continue;
|
|
175
|
+
}
|
|
176
|
+
const contentType = this.getContentType(head);
|
|
177
|
+
if (contentType === null || !contentType.startsWith("text/html")) {
|
|
178
|
+
logDebug(`Page at ${resolved} has MIME type ${contentType}, so skipping`);
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
logDebug(`Adding ${resolved} to list of uncrawled`);
|
|
182
|
+
this.addUncrawled(resolved, depth + 1);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
[url, depth] = popMap(this.uncrawled);
|
|
187
|
+
}
|
|
188
|
+
return [[...this.crawled.keys()], [...this.uncrawled.keys()]];
|
|
189
|
+
}
|
|
190
|
+
clear() {
|
|
191
|
+
this.crawled.clear();
|
|
192
|
+
this.uncrawled.clear();
|
|
193
|
+
this.documents.clear();
|
|
194
|
+
this.roots.clear();
|
|
195
|
+
}
|
|
196
|
+
async addRoot(root) {
|
|
197
|
+
const canonical = await this.getCanonical(root);
|
|
198
|
+
this.roots.add(canonical);
|
|
199
|
+
return canonical;
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Sets the maximum crawl depth.
|
|
203
|
+
*
|
|
204
|
+
* If the number is negative, infinite, or NaN, it sets the maxDepth to 0.
|
|
205
|
+
* Otherwise, it truncates the number to an integer (if necessary) and sets the maxDepth to that number
|
|
206
|
+
*
|
|
207
|
+
* Returns the number that was saved
|
|
208
|
+
*/
|
|
209
|
+
setMaxDepth(depth) {
|
|
210
|
+
if (depth < 0) {
|
|
211
|
+
this.maxDepth = 0;
|
|
212
|
+
}
|
|
213
|
+
else if (Number.isFinite(depth)) {
|
|
214
|
+
this.maxDepth = Math.trunc(depth);
|
|
215
|
+
}
|
|
216
|
+
else {
|
|
217
|
+
this.maxDepth = 0;
|
|
218
|
+
}
|
|
219
|
+
return this.maxDepth;
|
|
220
|
+
}
|
|
221
|
+
getCrawledUrls() {
|
|
222
|
+
return [...this.crawled.keys()];
|
|
223
|
+
}
|
|
224
|
+
getDocuments() {
|
|
225
|
+
return this.getCrawledUrls()
|
|
226
|
+
.map((url) => this.documents.get(url))
|
|
227
|
+
.filter((d) => d !== undefined);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
//# sourceMappingURL=WebCrawler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"WebCrawler.js","sourceRoot":"","sources":["../src/WebCrawler.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,YAAY,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,EAAE,MAAM,EAAE,MAAM,cAAc,CAAC;AAEtC,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC;AAUvC,MAAM,OAAO,UAAW,SAAQ,YAE9B;IACQ,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;IACpC,SAAS,GAAG,IAAI,GAAG,EAAkB,CAAC;IACtC,SAAS,GAAG,IAAI,GAAG,EAAoB,CAAC;IACxC,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAC1B,QAAQ,GAAW,CAAC,CAAC;IAErB,cAAc;QACpB,IAAI,IAAI,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC9B,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;QAC9B,CAAC;QACD,MAAM,QAAQ,GAAG,CACf,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC/D,GAAG,CACJ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QACb,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAClC,CAAC;IAEO,YAAY,CAAC,GAAW,EAAE,KAAa;QAC7C,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAC/B,IAAI,CAAC,cAAc,EAAE,CAAC;IACxB,CAAC;IAEO,WAAW,CAAC,GAAW,EAAE,QAAiB;QAChD,IAAI,KAAK,GAAG,QAAQ,IAAI,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAChD,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACxB,OAAO;QACT,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAC7B,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC3B,IAAI,CAAC,cAAc,EAAE,CAAC;IACxB,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,MAAM,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC;QAC7C,MAAM,WAAW,GACf,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,IAAI,0BAA0B,CAAC;QAC7D,MAAM,OAAO,GAAG,IAAI,CAAC;QACrB,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE;YACtB,GAAG;YACH,YAAY,EAAE,EAAE;YAChB,OAAO;YACP,KAAK,EAAE,IAAI;YACX,WAAW;SACZ,CAAC,CAAC;QACH,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QAClD,IAAI,YAAY,KAAK,GAAG,EAAE,CAAC;YACzB,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC3B,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,YAAY,EAAE;gBAC/B,GAAG;gBACH,YAAY;gBACZ,OAAO;gBACP,KAAK,EAAE,IAAI;gBACX,WAAW;aACZ,CAAC,
CAAC;QACL,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,KAAK,CAAC,WAAW,CAAC,GAAW;QACnC,IAAI,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC,OAAO,CAAC;QAC1C,CAAC;QACD,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAEO,WAAW,CAAC,GAAW;QAC7B,OAAO,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;IACvE,CAAC;IAEO,KAAK,CAAC,WAAW,CAAC,GAAW;QACnC,IAAI,CAAC;YACH,QAAQ,CAAC,sBAAsB,GAAG,EAAE,CAAC,CAAC;YAEtC,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC;YACpC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,MAAM,WAAW,GAAa,EAAE,CAAC;YAEjC,KAAK,MAAM,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC5B,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;YACjD,CAAC;YAED,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,QAAQ,CAAC,QAAS,CAAW,CAAC,OAAO,6BAA6B,CAAC,CAAC;YACpE,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;YACzC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC7B,IAAI,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,WAAW,EAAE,CAAC;oBACxC,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,CAAE,CAAC;gBAC/B,CAAC;YACH,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,QAAQ,CACN,QAAS,CAAW,CAAC,OAAO,qCAAqC,CAClE,CAAC;YACF,OAAO,GAAG,CAAC;QACb,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAY;QAClD,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,QAAQ,GAAG,IAAI,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACzD,CAAC;QACD,OAAO,MAAM,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IAC3C,CAAC;IAEO,KAAK,CAAC,SAAS,CACrB,IAAY;QAEZ,IAAI,OAAY,CAAC;QACjB,IAAI,CAAC;YACH,OAAO,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,QAAQ,CAAC,GAAG,IAAI,0BAA0B,CAAC,CAAC;YAC5C,O
AAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YACzC,QAAQ,CAAC,cAAc,OAAO,CAAC,QAAQ,8BAA8B,CAAC,CAAC;YACvE,OAAO,IAAI,CAAC;QACd,CAAC;QAED,QAAQ,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QACzB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,CAAC;YAEtC,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;gBAC5B,OAAO,QAAQ,CAAC,OAAO,CAAC;YAC1B,CAAC;YACD,QAAQ,CACN,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,oBAAoB,IAAI,EAAE,CACzE,CAAC;QACJ,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,QAAQ,CAAC,OAAQ,CAAW,CAAC,OAAO,oBAAoB,IAAI,EAAE,CAAC,CAAC;QAClE,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,cAAc,CAAC,OAAiC;QACtD,MAAM,WAAW,GACf,OAAO,CAAC,cAAc,CAAC,IAAI,OAAO,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC;QAC7D,IAAI,WAAW,KAAK,IAAI,EAAE,CAAC;YACzB,OAAO,WAAW,CAAC;QACrB,CAAC;QACD,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;YACpC,OAAO,IAAI,CAAC;QACd,CAAC;QACD,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,WAAmB,EAAE,WAAmB,IAAI,CAAC,QAAQ;QACnE,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC;QAClD,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACnB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QAEzB,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,GAA8C,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAExE,OAAO,GAAG,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,CAAC,KAAK,KAAK,SAAS,CAAC,CAAC;YAC5B,QAAQ,CAAC,eAAe,CAAC,CAAC;YAC1B,QAAQ,CACN,YAAY,IAAI,CAAC,OAAO,CAAC,IAAI,gBAAgB,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CACnE,CAAC;YACF,QAAQ,CAAC,QAAQ,GAAG,WAAW,KAAK,IAAI,CAAC,CAAC;YAC1C,IAAI,KAAK,IAAI,QAAQ,EAAE,CAAC;gBACtB,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;gBAC7B,QAAQ,CAAC,YAAY,GAAG,KAAK,CAAC,CAAC;gBAC/B,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;gBAC1C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBACzB,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;oBACnD,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAChC,QAAQ,CAAC,iBAAiB,QAAQ,KAAK,IAAI,GAAG,CAAC,CAAC;wBAChD,SAAS;oBACX,CAAC;oBACD,QAAQ,CACN,GAAG,QAAQ,oBAAoB,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CACrE,CAAC;oBACF,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,
QAAQ,CAAC,GAAG,KAAK,EAAE,CAAC;wBACrD,QAAQ,CACN,mBAAmB,QAAQ,aAAa,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CACrE,CAAC;wBACF,SAAS;oBACX,CAAC;oBACD,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAClC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;wBAC5C,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;4BAClB,SAAS;wBACX,CAAC;wBACD,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;wBAC9C,IAAI,WAAW,KAAK,IAAI,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;4BACjE,QAAQ,CACN,WAAW,QAAQ,kBAAkB,WAAW,eAAe,CAChE,CAAC;4BACF,SAAS;wBACX,CAAC;wBACD,QAAQ,CAAC,UAAU,QAAQ,uBAAuB,CAAC,CAAC;wBACpD,IAAI,CAAC,YAAY,CAAC,QAAQ,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;oBACzC,CAAC;gBACH,CAAC;YACH,CAAC;YACD,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACxC,CAAC;QAED,OAAO,CAAC,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IAChE,CAAC;IAED,KAAK;QACH,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QACrB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QACvB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QACvB,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,IAAY;QACxB,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QAChD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC1B,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;;;;;;OAOG;IACH,WAAW,CAAC,KAAa;QACvB,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;QACpB,CAAC;aAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAClC,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;QACpB,CAAC;QACD,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED,cAAc;QACZ,OAAO,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,YAAY;QACV,OAAO,IAAI,CAAC,cAAc,EAAE;aACzB,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;aACrC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC;IACpC,CAAC;CACF"}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export declare function httpHead(link: string): Promise<import("axios").AxiosResponse<any, any, {}>>;
|
|
2
|
+
export declare function httpGet(link: string): Promise<import("axios").AxiosResponse<string | Buffer<ArrayBufferLike>, any, {}>>;
|
|
3
|
+
//# sourceMappingURL=http-operations.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http-operations.d.ts","sourceRoot":"","sources":["../src/http-operations.ts"],"names":[],"mappings":"AAKA,wBAAsB,QAAQ,CAAC,IAAI,EAAE,MAAM,wDAW1C;AAED,wBAAsB,OAAO,CAAC,IAAI,EAAE,MAAM,qFAczC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { default as _axios } from "axios";
|
|
2
|
+
import debug from "debug";
|
|
3
|
+
const logDebug = debug("crawler:http");
|
|
4
|
+
export async function httpHead(link) {
|
|
5
|
+
const aborter = new AbortController();
|
|
6
|
+
const abortTimeout = setTimeout(() => {
|
|
7
|
+
logDebug(`HEAD ${link} Timed Out`);
|
|
8
|
+
aborter.abort();
|
|
9
|
+
}, 5000);
|
|
10
|
+
logDebug(`HEAD ${link}`);
|
|
11
|
+
const head = await _axios.head(link, { signal: aborter.signal });
|
|
12
|
+
clearTimeout(abortTimeout);
|
|
13
|
+
return head;
|
|
14
|
+
}
|
|
15
|
+
export async function httpGet(link) {
|
|
16
|
+
const aborter = new AbortController();
|
|
17
|
+
const abortTimeout = setTimeout(() => {
|
|
18
|
+
logDebug(`GET ${link} Timed Out`);
|
|
19
|
+
aborter.abort();
|
|
20
|
+
}, 5000);
|
|
21
|
+
logDebug(`GET ${link}`);
|
|
22
|
+
const got = await _axios.get(link, {
|
|
23
|
+
signal: aborter.signal,
|
|
24
|
+
});
|
|
25
|
+
clearTimeout(abortTimeout);
|
|
26
|
+
return got;
|
|
27
|
+
}
|
|
28
|
+
//# sourceMappingURL=http-operations.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http-operations.js","sourceRoot":"","sources":["../src/http-operations.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,IAAI,MAAM,EAAE,MAAM,OAAO,CAAC;AAC1C,OAAO,KAAK,MAAM,OAAO,CAAC;AAE1B,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,eAAe,EAAE,CAAC;IAEtC,MAAM,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;QACnC,QAAQ,CAAC,QAAQ,IAAI,YAAY,CAAC,CAAC;QACnC,OAAO,CAAC,KAAK,EAAE,CAAC;IAClB,CAAC,EAAE,IAAI,CAAC,CAAC;IACT,QAAQ,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IACzB,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IACjE,YAAY,CAAC,YAAY,CAAC,CAAC;IAC3B,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAY;IACxC,MAAM,OAAO,GAAG,IAAI,eAAe,EAAE,CAAC;IAEtC,MAAM,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;QACnC,QAAQ,CAAC,OAAO,IAAI,YAAY,CAAC,CAAC;QAClC,OAAO,CAAC,KAAK,EAAE,CAAC;IAClB,CAAC,EAAE,IAAI,CAAC,CAAC;IAET,QAAQ,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC;IACxB,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,GAAG,CAAkB,IAAI,EAAE;QAClD,MAAM,EAAE,OAAO,CAAC,MAAM;KACvB,CAAC,CAAC;IACH,YAAY,CAAC,YAAY,CAAC,CAAC;IAC3B,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* "Pops" a value from a Map
|
|
3
|
+
*
|
|
4
|
+
* This relies on the native ordering of the Map implementation, so it does not guarantee
|
|
5
|
+
* that this is the least- or most-recent-value.
|
|
6
|
+
*
|
|
7
|
+
* It always returns a two-value tuple; either the key and value or undefined, undefined.
|
|
8
|
+
*
|
|
9
|
+
* This makes it easier for calling code to determine what was returned
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```
|
|
13
|
+
* const [key,value] = popMap(someMap);
|
|
14
|
+
* if(key === undefined) {
|
|
15
|
+
* // The Map was Empty
|
|
16
|
+
* }
|
|
17
|
+
*
|
|
18
|
+
* // value !== undefined (unless the definition of the map's keys allow for this)
|
|
19
|
+
*
|
|
20
|
+
* @param map
|
|
21
|
+
* @returns
|
|
22
|
+
*/
|
|
23
|
+
export declare function popMap<K, V>(map: Map<K, V>): [K, V] | [undefined, undefined];
|
|
24
|
+
//# sourceMappingURL=pop-map.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pop-map.d.ts","sourceRoot":"","sources":["../src/pop-map.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,SAAS,EAAE,SAAS,CAAC,CAO5E"}
|
|
@@ -20,11 +20,11 @@
|
|
|
20
20
|
* @param map
|
|
21
21
|
* @returns
|
|
22
22
|
*/
|
|
23
|
-
export function popMap
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
return entry;
|
|
23
|
+
export function popMap(map) {
|
|
24
|
+
const entry = map.entries().next().value ?? [undefined, undefined];
|
|
25
|
+
if (entry[0]) {
|
|
26
|
+
map.delete(entry[0]);
|
|
27
|
+
}
|
|
28
|
+
return entry;
|
|
30
29
|
}
|
|
30
|
+
//# sourceMappingURL=pop-map.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pop-map.js","sourceRoot":"","sources":["../src/pop-map.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,UAAU,MAAM,CAAO,GAAc;IACzC,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IACnE,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QACb,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dancrumb/web-crawler",
|
|
3
|
-
"version": "1.0
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "",
|
|
5
|
-
"
|
|
6
|
-
"
|
|
5
|
+
"files": [
|
|
6
|
+
"dist"
|
|
7
|
+
],
|
|
8
|
+
"exports": {
|
|
9
|
+
"/": "./dist/WebCrawler.js"
|
|
7
10
|
},
|
|
8
11
|
"scripts": {
|
|
9
12
|
"build": "tsc",
|
package/src/WebCralwer.ts
DELETED
|
@@ -1,281 +0,0 @@
|
|
|
1
|
-
import type { AxiosResponse } from "axios";
|
|
2
|
-
import * as cheerio from "cheerio";
|
|
3
|
-
import debug from "debug";
|
|
4
|
-
import assert from "node:assert";
|
|
5
|
-
import EventEmitter from "node:events";
|
|
6
|
-
import { httpGet, httpHead } from "./http-operations.js";
|
|
7
|
-
import { popMap } from "./pop-map.js";
|
|
8
|
-
|
|
9
|
-
const logDebug = debug("crawler:main");
|
|
10
|
-
|
|
11
|
-
type Document = {
|
|
12
|
-
url: string;
|
|
13
|
-
canonicalUrl: string;
|
|
14
|
-
content: string | Buffer;
|
|
15
|
-
contentType: string;
|
|
16
|
-
title: string | null;
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
export class WebCrawler extends EventEmitter<{
|
|
20
|
-
progress: [progress: string | null];
|
|
21
|
-
}> {
|
|
22
|
-
private crawled = new Map<string, number>();
|
|
23
|
-
private uncrawled = new Map<string, number>();
|
|
24
|
-
private documents = new Map<string, Document>();
|
|
25
|
-
private roots = new Set<string>();
|
|
26
|
-
private maxDepth: number = 0;
|
|
27
|
-
|
|
28
|
-
private reportProgress() {
|
|
29
|
-
if (this.uncrawled.size === 0) {
|
|
30
|
-
this.emit("progress", null);
|
|
31
|
-
}
|
|
32
|
-
const progress = (
|
|
33
|
-
(this.crawled.size / (this.uncrawled.size + this.crawled.size)) *
|
|
34
|
-
100
|
|
35
|
-
).toFixed(1);
|
|
36
|
-
this.emit("progress", progress);
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
private addUncrawled(url: string, depth: number) {
|
|
40
|
-
this.uncrawled.set(url, depth);
|
|
41
|
-
this.reportProgress();
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
private markCrawled(url: string, urlDepth?: number) {
|
|
45
|
-
let depth = urlDepth ?? this.uncrawled.get(url);
|
|
46
|
-
if (depth === undefined) {
|
|
47
|
-
return;
|
|
48
|
-
}
|
|
49
|
-
this.crawled.set(url, depth);
|
|
50
|
-
this.uncrawled.delete(url);
|
|
51
|
-
this.reportProgress();
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
private async loadDocument(url: string): Promise<string | Buffer> {
|
|
55
|
-
const { data, headers } = await httpGet(url);
|
|
56
|
-
const contentType =
|
|
57
|
-
this.getContentType(headers) ?? "application/octet-stream";
|
|
58
|
-
const content = data;
|
|
59
|
-
this.documents.set(url, {
|
|
60
|
-
url,
|
|
61
|
-
canonicalUrl: "",
|
|
62
|
-
content,
|
|
63
|
-
title: null,
|
|
64
|
-
contentType,
|
|
65
|
-
});
|
|
66
|
-
const canonicalUrl = await this.getCanonical(url);
|
|
67
|
-
if (canonicalUrl !== url) {
|
|
68
|
-
this.documents.delete(url);
|
|
69
|
-
this.documents.set(canonicalUrl, {
|
|
70
|
-
url,
|
|
71
|
-
canonicalUrl,
|
|
72
|
-
content,
|
|
73
|
-
title: null,
|
|
74
|
-
contentType,
|
|
75
|
-
});
|
|
76
|
-
}
|
|
77
|
-
return data;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
private async getDocument(url: string): Promise<string | Buffer> {
|
|
81
|
-
if (this.documents.has(url)) {
|
|
82
|
-
return this.documents.get(url)!.content;
|
|
83
|
-
}
|
|
84
|
-
return this.loadDocument(url);
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
private isUnderRoot(url: string) {
|
|
88
|
-
return [...this.roots.values()].some((root) => url.startsWith(root));
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
private async getAllLinks(url: string): Promise<string[]> {
|
|
92
|
-
try {
|
|
93
|
-
logDebug(`Getting links from ${url}`);
|
|
94
|
-
|
|
95
|
-
const { data } = await httpGet(url);
|
|
96
|
-
const $ = cheerio.load(data);
|
|
97
|
-
|
|
98
|
-
const anchorLinks: string[] = [];
|
|
99
|
-
|
|
100
|
-
for (const anchor of $("a")) {
|
|
101
|
-
anchorLinks.push(anchor.attribs["href"] ?? "");
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
return anchorLinks;
|
|
105
|
-
} catch (e) {
|
|
106
|
-
logDebug(`Got "${(e as Error).message}" while trying to get links`);
|
|
107
|
-
return [];
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
private async getCanonical(url: string): Promise<string> {
|
|
112
|
-
try {
|
|
113
|
-
const data = await this.getDocument(url);
|
|
114
|
-
const $ = cheerio.load(data);
|
|
115
|
-
for (const link of $("link")) {
|
|
116
|
-
if (link.attribs["rel"] === "canonical") {
|
|
117
|
-
return link.attribs["href"]!;
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
return url;
|
|
121
|
-
} catch (e) {
|
|
122
|
-
logDebug(
|
|
123
|
-
`Got "${(e as Error).message}" while trying to get canonical url`,
|
|
124
|
-
);
|
|
125
|
-
return url;
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
private async resolveLink(link: string, root: string) {
|
|
130
|
-
let absolute = link;
|
|
131
|
-
if (link.startsWith("http")) {
|
|
132
|
-
absolute = link;
|
|
133
|
-
} else {
|
|
134
|
-
absolute = new URL(link, root).href.replace(/\/$/, "");
|
|
135
|
-
}
|
|
136
|
-
return await this.getCanonical(absolute);
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
private async fetchHead(
|
|
140
|
-
link: string,
|
|
141
|
-
): Promise<AxiosResponse["headers"] | null> {
|
|
142
|
-
let linkUrl: URL;
|
|
143
|
-
try {
|
|
144
|
-
linkUrl = new URL(link);
|
|
145
|
-
} catch (e) {
|
|
146
|
-
logDebug(`${link} is invalid, so skipping`);
|
|
147
|
-
return null;
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
if (!linkUrl.protocol.startsWith("http")) {
|
|
151
|
-
logDebug(`Protocol: '${linkUrl.protocol}' not supported, so skipping`);
|
|
152
|
-
return null;
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
logDebug(`HEAD ${link}`);
|
|
156
|
-
try {
|
|
157
|
-
const pageHead = await httpHead(link);
|
|
158
|
-
|
|
159
|
-
if (pageHead.status === 200) {
|
|
160
|
-
return pageHead.headers;
|
|
161
|
-
}
|
|
162
|
-
logDebug(
|
|
163
|
-
`Got ${pageHead.status}: ${pageHead.statusText} when requesting ${link}`,
|
|
164
|
-
);
|
|
165
|
-
} catch (e) {
|
|
166
|
-
logDebug(`Got ${(e as Error).message} when requesting ${link}`);
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
return null;
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
private getContentType(headers: AxiosResponse["headers"]): string | null {
|
|
173
|
-
const contentType =
|
|
174
|
-
headers["content-type"] ?? headers["Content-Type"] ?? null;
|
|
175
|
-
if (contentType === null) {
|
|
176
|
-
return contentType;
|
|
177
|
-
}
|
|
178
|
-
if (typeof contentType !== "string") {
|
|
179
|
-
return null;
|
|
180
|
-
}
|
|
181
|
-
return contentType;
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
/**
 * Crawls outward from `initialLink`, following links that fall under a
 * registered root.
 *
 * The canonical form of `initialLink` is registered as a root and crawled at
 * depth 0. Every link found on a crawled page is resolved against that page
 * and queued at `depth + 1` when it is under a root, has not already been
 * crawled at a shallower depth, is not already queued, and its HEAD request
 * yields a `text/html` content type. Queued URLs whose depth exceeds
 * `maxDepth` are popped from the queue but not crawled.
 *
 * @param initialLink - URL at which crawling starts.
 * @param maxDepth - Deepest level to crawl. Defaults to the instance's current
 *   max depth; the value used is also stored on the instance.
 * @returns A pair of URL lists: the crawled URLs first, then the URLs still
 *   in the uncrawled queue.
 */
async crawlLink(initialLink: string, maxDepth: number = this.maxDepth) {
  const root = await this.getCanonical(initialLink);
  // addRoot is async. Awaiting it guarantees the root is registered before
  // isUnderRoot is consulted below, and makes a failure reject this call
  // instead of surfacing as an unhandled promise rejection.
  await this.addRoot(root);
  this.maxDepth = maxDepth;

  let [url, depth]: [string, number] | [undefined, undefined] = [root, 0];

  while (url !== undefined) {
    assert(depth !== undefined);
    logDebug("\n*********\n");
    logDebug(
      `Crawled: ${this.crawled.size}\nUncrawled: ${this.uncrawled.size}`,
    );
    logDebug(`URL: ${url}\nDepth:${depth}\n`);
    if (depth <= maxDepth) {
      this.markCrawled(url, depth);
      logDebug(`Crawling ${url}...`);
      const links = await this.getAllLinks(url);
      for (const link of links) {
        const resolved = await this.resolveLink(link, url);
        if (!this.isUnderRoot(resolved)) {
          logDebug(`Out of scope: ${resolved} (${root})`);
          continue;
        }
        // Spread the roots themselves: entries() yields [value, value] pairs,
        // which would print every root twice.
        logDebug(`${resolved} is under one of ${[...this.roots].join(",")}`);
        // Skip links that were already crawled at a shallower depth than the
        // page currently being processed.
        if ((this.crawled.get(resolved) ?? maxDepth) < depth) {
          logDebug(
            `Already crawled ${resolved} at depth ${this.crawled.get(resolved)}`,
          );
          continue;
        }
        if (!this.uncrawled.has(resolved)) {
          // Only queue pages whose HEAD response reports an HTML document.
          const head = await this.fetchHead(resolved);
          if (head === null) {
            continue;
          }
          const contentType = this.getContentType(head);
          if (contentType === null || !contentType.startsWith("text/html")) {
            logDebug(
              `Page at ${resolved} has MIME type ${contentType}, so skipping`,
            );
            continue;
          }
          logDebug(`Adding ${resolved} to list of uncrawled`);
          this.addUncrawled(resolved, depth + 1);
        }
      }
    }
    // Take the next queued URL; the loop ends once popMap yields undefined.
    [url, depth] = popMap(this.uncrawled);
  }

  return [[...this.crawled.keys()], [...this.uncrawled.keys()]];
}
|
|
239
|
-
|
|
240
|
-
/**
 * Empties the crawler's collections: crawled URLs, uncrawled URLs,
 * documents and roots, in that order.
 */
clear() {
  const stores = [this.crawled, this.uncrawled, this.documents, this.roots];
  for (const store of stores) {
    store.clear();
  }
}
|
|
246
|
-
|
|
247
|
-
/**
 * Registers a root for the crawler.
 *
 * @param root - URL to register; its canonical form is what gets stored.
 * @returns The canonical URL that was added to the set of roots.
 */
async addRoot(root: string): Promise<string> {
  const canonicalRoot = await this.getCanonical(root);
  this.roots.add(canonicalRoot);
  return canonicalRoot;
}
|
|
252
|
-
|
|
253
|
-
/**
 * Updates the maximum crawl depth.
 *
 * A finite, non-negative number is truncated to an integer and stored.
 * Anything else (a negative number, an infinity or NaN) stores 0 instead.
 *
 * @param depth - Requested maximum depth.
 * @returns The depth that was actually stored.
 */
setMaxDepth(depth: number) {
  const usable = Number.isFinite(depth) && depth >= 0;
  this.maxDepth = usable ? Math.trunc(depth) : 0;
  return this.maxDepth;
}
|
|
271
|
-
|
|
272
|
-
/**
 * Lists the URLs recorded as crawled.
 *
 * @returns A new array holding the keys of the crawled map.
 */
getCrawledUrls() {
  return Array.from(this.crawled.keys());
}
|
|
275
|
-
|
|
276
|
-
/**
 * Collects the stored documents for the crawled URLs.
 *
 * @returns The documents found for each crawled URL, in crawled-URL order;
 *   URLs with no stored document are left out.
 */
getDocuments(): Document[] {
  const found: Document[] = [];
  for (const url of this.getCrawledUrls()) {
    const doc = this.documents.get(url);
    if (doc !== undefined) {
      found.push(doc);
    }
  }
  return found;
}
|
|
281
|
-
}
|
package/src/http-operations.ts
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import { default as _axios } from "axios";
|
|
2
|
-
import debug from "debug";
|
|
3
|
-
|
|
4
|
-
const logDebug = debug("crawler:http");
|
|
5
|
-
|
|
6
|
-
/**
 * Issues a HEAD request for `link`, aborting it if it has not completed
 * within 5 seconds.
 *
 * @param link - URL to request.
 * @returns The axios response for the HEAD request.
 * @throws Rejects with whatever the underlying axios call rejects with
 *   (including when the request is aborted by the timeout).
 */
export async function httpHead(link: string) {
  const aborter = new AbortController();

  const abortTimeout = setTimeout(() => {
    logDebug(`HEAD ${link} Timed Out`);
    aborter.abort();
  }, 5000);

  logDebug(`HEAD ${link}`);
  try {
    return await _axios.head(link, { signal: aborter.signal });
  } finally {
    // Cancel the timer on failure as well as success; otherwise a rejected
    // request leaves it armed, producing a bogus "Timed Out" log and keeping
    // the event loop alive for up to 5 seconds.
    clearTimeout(abortTimeout);
  }
}
|
|
18
|
-
|
|
19
|
-
/**
 * Issues a GET request for `link`, aborting it if it has not completed
 * within 5 seconds.
 *
 * @param link - URL to request.
 * @returns The axios response, whose body is typed as `string | Buffer`.
 * @throws Rejects with whatever the underlying axios call rejects with
 *   (including when the request is aborted by the timeout).
 */
export async function httpGet(link: string) {
  const aborter = new AbortController();

  const abortTimeout = setTimeout(() => {
    logDebug(`GET ${link} Timed Out`);
    aborter.abort();
  }, 5000);

  logDebug(`GET ${link}`);
  try {
    return await _axios.get<string | Buffer>(link, {
      signal: aborter.signal,
    });
  } finally {
    // Cancel the timer on failure as well as success; otherwise a rejected
    // request leaves it armed, producing a bogus "Timed Out" log and keeping
    // the event loop alive for up to 5 seconds.
    clearTimeout(abortTimeout);
  }
}
|
package/src/ndnu.ts
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import { inspect } from "node:util";
|
|
2
|
-
import { WebCrawler } from "./WebCralwer.js";
|
|
3
|
-
|
|
4
|
-
// Demo script: crawl one level deep from a fixed start page, report progress
// events as they arrive, then print each crawled document without its content.
const NDNU = "https://ndnu.edu/financial-aid/";

const crawler = new WebCrawler();
crawler.setMaxDepth(1);
crawler.on("progress", (progress) => {
  console.log(`Progress: ${progress ?? "-"}%`);
});

await crawler.crawlLink(NDNU);
console.log("Crawled");

// Drop the (potentially large) content field before inspecting each document.
const summaries = crawler.getDocuments().map((doc) => {
  const { content: _content, ...metadata } = doc;
  return inspect(metadata);
});
console.log(summaries);
|
package/src/popMap.test.ts
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import { beforeEach, describe, expect, it } from "vitest";
|
|
2
|
-
import { popMap } from "./pop-map.js";
|
|
3
|
-
|
|
4
|
-
// Exercises popMap against a small three-entry map that is rebuilt before each test.
describe("popMap", () => {
  const testMap = new Map<string, number>();

  beforeEach(() => {
    testMap.clear();
    const seed = [
      ["a", 1],
      ["b", 2],
      ["c", 3],
    ] as const;
    for (const [key, value] of seed) {
      testMap.set(key, value);
    }
  });

  it("pops a value from a map", () => {
    const popped = popMap(testMap);
    expect(popped).toBeDefined();
    expect(testMap.size).toBe(2);
    expect(popped[0]).toBeOneOf(["a", "b", "c"]);
  });

  it("handles an empty map", () => {
    // Three pops drain the map; the fourth pop runs against an empty map.
    for (const expectedSize of [2, 1, 0, 0]) {
      popMap(testMap);
      expect(testMap.size).toBe(expectedSize);
    }
  });

  it("handles additions to the map", () => {
    popMap(testMap);
    expect(testMap.size).toBe(2);
    // An entry added after a pop is counted, and the next pop removes one entry again.
    testMap.set("q", 42);
    popMap(testMap);
    expect(testMap.size).toBe(2);
  });
});
|
package/tsconfig.json
DELETED
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"exclude": [
|
|
3
|
-
"./src/ndnu.ts",
|
|
4
|
-
"./src/*.test.*",
|
|
5
|
-
"./dist"
|
|
6
|
-
],
|
|
7
|
-
// Visit https://aka.ms/tsconfig to read more about this file
|
|
8
|
-
"compilerOptions": {
|
|
9
|
-
// File Layout
|
|
10
|
-
"rootDir": "./src",
|
|
11
|
-
"outDir": "./dist",
|
|
12
|
-
// Environment Settings
|
|
13
|
-
// See also https://aka.ms/tsconfig/module
|
|
14
|
-
"module": "nodenext",
|
|
15
|
-
"target": "esnext",
|
|
16
|
-
"types": [
|
|
17
|
-
"node"
|
|
18
|
-
],
|
|
19
|
-
// For nodejs:
|
|
20
|
-
// "lib": ["esnext"],
|
|
21
|
-
// "types": ["node"],
|
|
22
|
-
// and npm install -D @types/node
|
|
23
|
-
// Other Outputs
|
|
24
|
-
"sourceMap": true,
|
|
25
|
-
"declaration": true,
|
|
26
|
-
"declarationMap": true,
|
|
27
|
-
// Stricter Typechecking Options
|
|
28
|
-
"noUncheckedIndexedAccess": true,
|
|
29
|
-
"exactOptionalPropertyTypes": true,
|
|
30
|
-
"strict": true,
|
|
31
|
-
"verbatimModuleSyntax": true,
|
|
32
|
-
"isolatedModules": true,
|
|
33
|
-
"noUncheckedSideEffectImports": true,
|
|
34
|
-
"moduleDetection": "force",
|
|
35
|
-
"skipLibCheck": true,
|
|
36
|
-
}
|
|
37
|
-
}
|