@weborigami/origami 0.0.68 → 0.0.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/exports/exports.js +6 -0
- package/package.json +4 -4
- package/src/builtins/@changes.js +24 -14
- package/src/builtins/@code.js +37 -0
- package/src/builtins/@crawl.js +31 -455
- package/src/builtins/@indent.js +115 -0
- package/src/builtins/@keysTree.js +1 -1
- package/src/builtins/@siteAudit.js +19 -0
- package/src/builtins/@slug.js +3 -0
- package/src/builtins/@treeHttp.js +1 -1
- package/src/builtins/@treeHttps.js +1 -1
- package/src/common/processUnpackedContent.js +5 -1
- package/src/common/utilities.js +7 -4
- package/src/crawler/crawlResources.js +180 -0
- package/src/crawler/findPaths.js +259 -0
- package/src/crawler/utilities.js +38 -0
package/src/builtins/@crawl.js
CHANGED
@@ -6,14 +6,10 @@ import {
   keysFromPath,
   trailingSlash,
 } from "@weborigami/async-tree";
-import { InvokeFunctionsTransform …
-import …
-import …
-import …
-
-// A fake base URL used to handle cases where an href is relative and must be
-// treated relative to some base URL.
-const fakeBaseUrl = new URL("https://fake");
+import { InvokeFunctionsTransform } from "@weborigami/language";
+import crawlResources from "../crawler/crawlResources.js";
+import { normalizeKeys } from "../crawler/utilities.js";
+import getTreeArgument from "../misc/getTreeArgument.js";
 
 /**
  * Crawl a tree, starting its root index.html page, and following links to
@@ -24,23 +20,19 @@ const fakeBaseUrl = new URL("https://fake");
  * that obtain the requested value from the original site.
  *
  * @typedef {import("@weborigami/types").AsyncTree} AsyncTree
- * @typedef {import("@weborigami/async-tree").Treelike…
+ * @typedef {import("@weborigami/async-tree").Treelike} Treelike
  * @this {AsyncTree|null}
  * @param {Treelike} treelike
  * @param {string} [baseHref]
  * @returns {Promise<AsyncTree>}
  */
-export default async function crawl(treelike, baseHref) {
-…
-  const tree =
-    typeof treelike === "string"
-      ? treeHttps.call(this, treelike)
-      : Tree.from(treelike, { parent: this });
+export default async function crawlBuiltin(treelike, baseHref) {
+  const tree = await getTreeArgument(this, arguments, treelike, "@crawl");
 
   if (baseHref === undefined) {
     // Ask tree or original treelike if it has an `href` property we can use as
     // the base href to determine whether a link is local within the tree or
-    // not. If not, use a fake `local:/` href.
+    // not. If not, use a fake `local:/` base href.
     baseHref =
       /** @type {any} */ (tree).href ??
       /** @type {any} */ (treelike).href ??
@@ -48,7 +40,21 @@ export default async function crawl(treelike, baseHref) {
     if (!baseHref?.endsWith("/")) {
       baseHref += "/";
     }
+  } else {
+    // Is the href already valid?
+    let isHrefValid = false;
+    try {
+      new URL(baseHref);
+      isHrefValid = true;
+    } catch (e) {
+      // Ignore
+    }
+    if (!isHrefValid) {
+      // Use a fake base href.
+      baseHref = `local:/${baseHref}`;
+    }
   }
+
   // @ts-ignore
   const baseUrl = new URL(baseHref);
 
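
The new `else` branch probes whether an explicit `baseHref` argument already parses as an absolute URL; if not, it prefixes the fake `local:` scheme so that the `new URL(baseHref)` call below cannot throw. A minimal standalone sketch of that probe (the helper name is ours, for illustration only):

```js
// Hypothetical helper mirroring the new else branch above.
function ensureParseableBaseHref(baseHref) {
  try {
    new URL(baseHref); // throws for relative or malformed hrefs
    return baseHref; // already a valid absolute URL
  } catch (e) {
    // Not parseable on its own; prefix a fake scheme.
    return `local:/${baseHref}`;
  }
}

ensureParseableBaseHref("https://example.com/"); // => "https://example.com/"
ensureParseableBaseHref("site/"); // => "local:/site/"
```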
@@ -57,25 +63,18 @@
   const errors = [];
 
   // We iterate until there are no more promises to wait for.
-  for await (const result of …
-    const { …
+  for await (const result of crawlResources(tree, baseUrl)) {
+    const { normalizedKeys, resourcePaths, value } = result;
 
     // Cache the value
     if (value) {
-      addValueToObject(cache, …
+      addValueToObject(cache, normalizedKeys, value);
     }
-    // else if (keys) {
-    //   // A missing robots.txt isn't an error; anything else missing is.
-    //   const path = keys.join("/");
-    //   if (path !== "robots.txt") {
-    //     errors.push(path);
-    //   }
-    // }
 
     // Add indirect resource functions to the resource tree. When requested,
     // these functions will obtain the resource from the original site.
     for (const resourcePath of resourcePaths) {
-      const resourceKeys = …
+      const resourceKeys = normalizeKeys(keysFromPath(resourcePath));
       const fn = () => {
         return Tree.traverse(tree, ...resourceKeys);
       };
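
Each result from the new `crawlResources` generator is destructured as `{ normalizedKeys, resourcePaths, value }`: the keys say where to cache the fetched value, and the resource paths become lazy getters against the original site. A hypothetical result for one crawled page (field contents are illustrative only):

```js
const result = {
  normalizedKeys: ["about", "index.html"], // where the value is cached
  resourcePaths: ["/images/logo.png"], // fetched lazily on request
  value: "<html>…</html>", // the page text, or null if missing
};
```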
@@ -101,17 +100,6 @@ export default async function crawl(treelike, baseHref) {
   return result;
 }
 
-// For indexing and storage purposes, treat a path that ends in a trailing slash
-// as if it ends in index.html.
-function adjustKeys(keys) {
-  if (keys.length > 0 && !trailingSlash.has(keys.at(-1))) {
-    return keys;
-  }
-  const adjustedKeys = keys.slice();
-  adjustedKeys.push("index.html");
-  return adjustedKeys;
-}
-
 function addValueToObject(object, keys, value) {
   for (let i = 0, current = object; i < keys.length; i++) {
     const key = trailingSlash.remove(keys[i]);
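
The deleted `adjustKeys` helper normalized directory-style paths: a key list ending in a trailing slash, or an empty list for the root path `/`, gained an explicit `index.html` key. Judging by the new `normalizeKeys` import, that job now lives in `src/crawler/utilities.js`. Examples of the removed helper's behavior:

```js
adjustKeys([]); // => ["index.html"] (the root path "/")
adjustKeys(["posts/"]); // => ["posts/", "index.html"]
adjustKeys(["posts", "a.html"]); // => ["posts", "a.html"] (unchanged)
```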
@@ -128,10 +116,9 @@ function addValueToObject(object, keys, value) {
     if (!current[key]) {
       current[key] = {};
     } else if (!isPlainObject(current[key])) {
-      // Already have a value at this point. The site has a page …
-      // …
-      // …
-      // current value to "index.html".
+      // Already have a value at this point. The site has a page at a route
+      // like /foo, and the site also has resources within that at routes like
+      // /foo/bar.jpg. We move the current value to "index.html".
       current[key] = { "index.html": current[key] };
     }
     current = current[key];
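
The rewritten comment is easiest to see in action. When `addValueToObject` has already cached a page value at a key and later needs to descend through that key for a child resource, it demotes the page to an `index.html` entry. A sketch, assuming the unshown tail of the loop assigns the value at the final key:

```js
const cache = {};
addValueToObject(cache, ["foo"], "<html>page</html>");
addValueToObject(cache, ["foo", "bar.jpg"], "(image data)");
// cache === {
//   foo: {
//     "index.html": "<html>page</html>",
//     "bar.jpg": "(image data)",
//   },
// }
```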
@@ -139,416 +126,5 @@ function addValueToObject(object, keys, value) {
   }
 }
 
-…
-…
-// retrieved at a path, along with the paths to other resources found in that
-// text.
-async function* crawlPaths(tree, baseUrl) {
-  // We want to kick off requests for new paths as quickly as we find them, then
-  // yield whichever result finishes first. Unfortunately, Promise.any() only
-  // tells us the result of the first promise to resolve, not which promise that
-  // was. So we keep track of a dictionary mapping paths to a promise for the
-  // value at that path. When a promise resolves, we mark it as resolved by
-  // setting its entry in the dictionary to null.
-  const promisesForPaths = {};
-
-  // Keep track of which resources refer to which paths.
-  const mapResourceToPaths = {};
-
-  let errorPaths = [];
-
-  // Seed the promise dictionary with robots.txt and the root path.
-  const initialPaths = ["/robots.txt", "/"];
-  initialPaths.forEach((path) => {
-    promisesForPaths[path] = processPath(tree, path, baseUrl);
-  });
-
-  while (true) {
-    // Get the latest array of promises that haven't been resolved yet.
-    const promises = Object.values(promisesForPaths).filter(
-      (promise) => promise !== null
-    );
-
-    if (promises.length === 0) {
-      // No unresolved promises; we're done.
-      break;
-    }
-
-    // Wait for the first promise to resolve.
-    const result = await Promise.any(promises);
-
-    // Mark the promise for that result as resolved.
-    promisesForPaths[result.path] = null;
-
-    // Add the crawlable paths to the map.
-    mapResourceToPaths[result.path] = result.crawlablePaths;
-
-    // Add promises for crawlable paths in the result.
-    result.crawlablePaths.forEach((path) => {
-      // Only add a promise for this path if we don't already have one.
-      if (promisesForPaths[path] === undefined) {
-        promisesForPaths[path] = processPath(tree, path, baseUrl);
-      }
-    });
-
-    // If there was no value, add this to the errors.
-    // A missing robots.txt isn't an error; anything else missing is.
-    if (result.value === null && result.path !== "/robots.txt") {
-      errorPaths.push(result.path);
-    }
-
-    yield result;
-  }
-
-  if (errorPaths.length > 0) {
-    // Create a map of the resources that refer to each error.
-    const errorsMap = {};
-    for (const resource in mapResourceToPaths) {
-      const paths = mapResourceToPaths[resource];
-      for (const path of paths) {
-        if (errorPaths.includes(path)) {
-          errorsMap[resource] ??= [];
-          errorsMap[resource].push(path);
-        }
-      }
-    }
-    const errorsJson = JSON.stringify(errorsMap, null, 2);
-    yield {
-      keys: ["crawl-errors.json"],
-      path: "crawl-errors.json",
-      resourcePaths: [],
-      value: errorsJson,
-    };
-  }
-}
-
-// Filter the paths to those that are local to the site.
-function filterPaths(paths, baseUrl, localPath) {
-  // Convert paths to absolute URLs.
-  const localUrl = new URL(localPath, baseUrl);
-  const basePathname = baseUrl.pathname;
-  // @ts-ignore
-  const absoluteUrls = paths.map((path) => new URL(path, localUrl));
-
-  // Convert the absolute URLs to paths relative to the baseHref. If the URL
-  // points outside the tree rooted at the baseHref, the relative path will be
-  // null. We ignore the protocol in this test, because in practice sites often
-  // fumble the use of http and https, treating them interchangeably.
-  const relativePaths = absoluteUrls.map((url) => {
-    if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
-      return url.pathname.slice(basePathname.length);
-    } else {
-      return null;
-    }
-  });
-
-  // Filter out the null paths.
-  /** @type {string[]} */
-  // @ts-ignore
-  const filteredPaths = relativePaths.filter((path) => path);
-  return filteredPaths;
-}
-
-function findPaths(value, key, baseUrl, localPath) {
-  const text = utilities.toString(value);
-
-  // We guess the value is HTML is if its key has an .html extension or
-  // doesn't have an extension, or the value starts with `<`.
-  const ext = key ? extname(key).toLowerCase() : "";
-  const maybeHtml = ext === "" || text?.trim().startsWith("<");
-  let foundPaths;
-  if (ext === ".html" || ext === ".htm") {
-    foundPaths = findPathsInHtml(text);
-  } else if (ext === ".css") {
-    foundPaths = findPathsInCss(text);
-  } else if (ext === ".js") {
-    foundPaths = findPathsInJs(text);
-  } else if (ext === ".map") {
-    foundPaths = findPathsInImageMap(text);
-  } else if (key === "robots.txt") {
-    foundPaths = findPathsInRobotsTxt(text);
-  } else if (key === "sitemap.xml") {
-    foundPaths = findPathsInSitemapXml(text);
-  } else if (maybeHtml) {
-    foundPaths = findPathsInHtml(text);
-  } else {
-    // Doesn't have an extension we want to process
-    return {
-      crawlablePaths: [],
-      resourcePaths: [],
-    };
-  }
-
-  const crawlablePaths = filterPaths(
-    foundPaths.crawlablePaths,
-    baseUrl,
-    localPath
-  );
-  const resourcePaths = filterPaths(
-    foundPaths.resourcePaths,
-    baseUrl,
-    localPath
-  );
-  return {
-    crawlablePaths,
-    resourcePaths,
-  };
-}
-
-function findPathsInCss(css) {
-  const resourcePaths = [];
-  let match;
-
-  // Find `url()` functions.
-  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-  while ((match = urlRegex.exec(css))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths: [],
-    resourcePaths,
-  };
-}
-
-// These are ancient server-side image maps. They're so old that it's hard to
-// find documentation on them, but they're used on the reference Space Jam
-// website we use for testing the crawler. Example:
-// https://www.spacejam.com/1996/bin/bball.map
-function findPathsInImageMap(imageMap) {
-  const resourcePaths = [];
-  let match;
-
-  // Find hrefs as the second column in each line.
-  const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
-  while ((match = hrefRegex.exec(imageMap))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths: [],
-    resourcePaths,
-  };
-}
-
-function findPathsInJs(js) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `import` statements.
-  const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;
-  while ((match = importRegex.exec(js))) {
-    const href = normalizeHref(match.groups?.import);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
-
-function findPathsInHtml(html) {
-  const crawlablePaths = [];
-  const resourcePaths = [];
-  let match;
-
-  // Find `href` attributes in anchor and link tags.
-  const linkRegex =
-    /<(?:a|A|link|LINK) [^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
-  while ((match = linkRegex.exec(html))) {
-    // Links can point to be other crawlable paths and resource paths.
-    // We guess the type based on the extension.
-    const href = normalizeHref(match.groups?.link);
-    if (href) {
-      if (isCrawlableHref(href)) {
-        crawlablePaths.push(href);
-      } else {
-        resourcePaths.push(href);
-      }
-    }
-  }
-
-  // Find `src` attributes in img and script tags.
-  const srcRegex =
-    /<(?<tag>img|IMG|script|SCRIPT) [^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g;
-  while ((match = srcRegex.exec(html))) {
-    const tag = match.groups?.tag;
-    const src = normalizeHref(match.groups?.src);
-    if (src) {
-      if (tag === "script" || tag === "SCRIPT") {
-        crawlablePaths.push(src);
-      } else {
-        resourcePaths.push(src);
-      }
-    }
-  }
-
-  // Find `url()` functions in CSS.
-  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-  while ((match = urlRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  // Find `src` attribute on frame tags.
-  const frameRegex =
-    /<(?:frame|FRAME) [^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = frameRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  // Find ancient `background` attribute on body tag.
-  const backgroundRegex =
-    /<(?:body|BODY) [^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = backgroundRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  // Find `href` attribute on area tags.
-  const areaRegex =
-    /<(?:area|AREA) [^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = areaRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return { crawlablePaths, resourcePaths };
-}
-
-function findPathsInRobotsTxt(txt) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `Sitemap` directives.
-  const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
-  while ((match = sitemapRegex.exec(txt))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
-
-function findPathsInSitemapXml(xml) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `loc` elements.
-  const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
-  while ((match = locRegex.exec(xml))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
-
-function isCrawlableHref(href) {
-  // Use a fake base URL to cover the case where the href is relative.
-  const url = new URL(href, fakeBaseUrl);
-  const pathname = url.pathname;
-  const lastKey = pathname.split("/").pop() ?? "";
-  if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
-    return true;
-  }
-  const ext = extname(lastKey);
-  // We assume an empty extension is HTML.
-  const crawlableExtensions = [".html", ".css", ".js", ".map", ""];
-  return crawlableExtensions.includes(ext);
-}
-
-// Remove any search parameters or hash from the href. Preserve absolute or
-// relative nature of URL. If the URL only has a search or hash, return null.
-function normalizeHref(href) {
-  // Remove everything after a `#` or `?` character.
-  const normalized = href.split(/[?#]/)[0];
-  return normalized === "" ? null : normalized;
-}
-
-async function processPath(tree, path, baseUrl) {
-  if (path === undefined) {
-    return {
-      crawlablePaths: [],
-      keys: null,
-      path,
-      resourcePaths: [],
-      value: null,
-    };
-  }
-
-  // Convert path to keys
-  let keys = keysFromPath(path);
-
-  // Paths (including those created by the filterPaths function above) will have
-  // spaces, etc., escaped. In general, these need to be unescaped so we can
-  // find them in the tree.
-  keys = keys.map(decodeURIComponent);
-
-  // Traverse tree to get value.
-  let value = await Tree.traverse(tree, ...keys);
-  if (Tree.isAsyncTree(value)) {
-    // Path is actually a directory; see if it has an index.html
-    value = await Tree.traverse(value, "index.html");
-  }
-
-  const adjustedKeys = adjustKeys(keys);
-
-  if (value === undefined) {
-    return {
-      crawlablePaths: [],
-      keys: adjustedKeys,
-      path,
-      resourcePaths: [],
-      value: null,
-    };
-  }
-
-  // Find paths in the value
-  const key = adjustedKeys.at(-1);
-  const { crawlablePaths, resourcePaths } = await findPaths(
-    value,
-    key,
-    baseUrl,
-    path
-  );
-
-  return {
-    crawlablePaths,
-    keys: adjustedKeys,
-    path,
-    resourcePaths,
-    value,
-  };
-}
-
-crawl.usage = `@crawl <tree>\tCrawl a tree`;
-crawl.documentation = "https://weborigami.org/language/@crawl.html";
+crawlBuiltin.usage = `@crawl <tree>\tCrawl a tree`;
+crawlBuiltin.documentation = "https://weborigami.org/language/@crawl.html";
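
The deleted `crawlPaths` generator, whose role passes to the new `src/crawler/crawlResources.js`, was built around a workaround its own comments describe: `Promise.any()` reports the first fulfilled value but not which promise produced it, so the code kept a dictionary from path to promise and had every promise resolve to a result carrying its own path. Stripped of the crawling specifics, the pattern reduces to this sketch (names are ours):

```js
// Race a set of keyed tasks, yielding results as each one finishes. Each
// promise resolves to an object naming its own key, so after Promise.any()
// we know which pending entry to remove.
async function* raceKeyed(keys, startTask) {
  const pending = new Map();
  for (const key of keys) {
    pending.set(key, startTask(key).then((value) => ({ key, value })));
  }
  while (pending.size > 0) {
    const result = await Promise.any(pending.values());
    pending.delete(result.key); // like setting the dictionary entry to null
    yield result;
  }
}
```

The real generator also added promises for newly discovered crawlable paths on each iteration, so the pending set grew as the crawl fanned out.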
package/src/builtins/@indent.js
ADDED

@@ -0,0 +1,115 @@
+const lastLineWhitespaceRegex = /\n(?<indent>[ \t]*)$/;
+
+const mapStringsToModifications = new Map();
+
+/**
+ * Normalize indentation in a tagged template string.
+ *
+ * @param {TemplateStringsArray} strings
+ * @param {...any} values
+ * @returns {string}
+ */
+export default function indent(strings, ...values) {
+  let modified = mapStringsToModifications.get(strings);
+  if (!modified) {
+    modified = modifyStrings(strings);
+    mapStringsToModifications.set(strings, modified);
+  }
+  const { blockIndentations, strings: modifiedStrings } = modified;
+  return joinBlocks(modifiedStrings, values, blockIndentations);
+}
+
+// Join strings and values, applying the given block indentation to the lines of
+// values for block placholders.
+function joinBlocks(strings, values, blockIndentations) {
+  let result = strings[0];
+  for (let i = 0; i < values.length; i++) {
+    let text = values[i];
+    if (text) {
+      const blockIndentation = blockIndentations[i];
+      if (blockIndentation) {
+        const lines = text.split("\n");
+        text = "";
+        if (lines.at(-1) === "") {
+          // Drop empty last line
+          lines.pop();
+        }
+        for (let line of lines) {
+          text += blockIndentation + line + "\n";
+        }
+      }
+      result += text;
+    }
+    result += strings[i + 1];
+  }
+  return result;
+}
+
+// Given an array of template boilerplate strings, return an object { modified,
+// blockIndentations } where `strings` is the array of strings with indentation
+// removed, and `blockIndentations` is an array of indentation strings for each
+// block placeholder.
+function modifyStrings(strings) {
+  // Phase one: Identify the indentation based on the first real line of the
+  // first string (skipping the initial newline), and remove this indentation
+  // from all lines of all strings.
+  let indent;
+  if (strings.length > 0 && strings[0].startsWith("\n")) {
+    // Look for indenttation
+    const firstLineWhitespaceRegex = /^\n(?<indent>[ \t]*)/;
+    const match = strings[0].match(firstLineWhitespaceRegex);
+    indent = match?.groups.indent;
+  }
+
+  // Determine the modified strings. If this invoked as a JS tagged template
+  // literal, the `strings` argument will be an odd array-ish object that we'll
+  // want to convert to a real array.
+  let modified;
+  if (indent) {
+    // De-indent the strings.
+    const indentationRegex = new RegExp(`\n${indent}`, "g");
+    // The `replaceAll` also converts strings to a real array.
+    modified = strings.map((string) =>
+      string.replaceAll(indentationRegex, "\n")
+    );
+    // Remove indentation from last line of last string
+    modified[modified.length - 1] = modified
+      .at(-1)
+      .replace(lastLineWhitespaceRegex, "\n");
+  } else {
+    // No indentation; just copy the strings so we have a real array
+    modified = strings.slice();
+  }
+
+  // Phase two: Identify any block placholders, identify and remove their
+  // preceding indentation, and remove the following newline. Work backward from
+  // the end towards the start because we're modifying the strings in place and
+  // our pattern matching won't work going forward from start to end.
+  let blockIndentations = [];
+  for (let i = modified.length - 2; i >= 0; i--) {
+    // Get the modified before and after substitution with index `i`
+    const beforeString = modified[i];
+    const afterString = modified[i + 1];
+    const match = beforeString.match(lastLineWhitespaceRegex);
+    if (match && afterString.startsWith("\n")) {
+      // The substitution between these strings is a block substitution
+      let blockIndentation = match.groups.indent;
+      blockIndentations[i] = blockIndentation;
+      // Trim the before and after strings
+      if (blockIndentation) {
+        modified[i] = beforeString.slice(0, -blockIndentation.length);
+      }
+      modified[i + 1] = afterString.slice(1);
+    }
+  }
+
+  // Remove newline from start of first string *after* removing indentation.
+  if (modified[0].startsWith("\n")) {
+    modified[0] = modified[0].slice(1);
+  }
+
+  return {
+    blockIndentations,
+    strings: modified,
+  };
+}
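
To make the new builtin concrete, here is a usage sketch of the `indent` tag (the template text and import specifier are ours). The leading newline and the two-space base indentation are stripped, and because `${text}` sits alone on its own line it is treated as a block placeholder: each line of the substituted value is re-indented to match.

```js
import indent from "@weborigami/origami/src/builtins/@indent.js";

const text = "one\ntwo\n";
const html = indent`
  <ul>
    ${text}
  </ul>
`;
// html === "<ul>\n  one\n  two\n</ul>\n"
```

The `mapStringsToModifications` cache keys off the `strings` array itself, which JavaScript reuses across calls from the same tagged template site, so the de-indentation analysis runs only once per template.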
package/src/builtins/@keysTree.js
CHANGED

@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
  *
  * @this {AsyncTree|null}
  * @param {string} host
- * @param {...string…
+ * @param {...string} keys
  */
 export default function keysTree(host, ...keys) {
   assertTreeIsDefined(this, "keysTree");
package/src/builtins/@siteAudit.js
ADDED

@@ -0,0 +1,19 @@
+import { Tree } from "@weborigami/async-tree";
+import getTreeArgument from "../misc/getTreeArgument.js";
+import crawl from "./@crawl.js";
+
+/**
+ * @this {import("@weborigami/types").AsyncTree|null}
+ * @param {import("@weborigami/async-tree").Treelike} treelike
+ */
+export default async function siteAudit(treelike) {
+  const tree = await getTreeArgument(this, arguments, treelike, "@siteAudit");
+  const crawled = await crawl.call(this, tree);
+  let crawlErrorsJson = await crawled.get("crawl-errors.json");
+  if (!crawlErrorsJson) {
+    return undefined;
+  }
+  const errors = Tree.from(JSON.parse(crawlErrorsJson), { deep: true });
+  errors.parent = this;
+  return errors;
+}
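
`@siteAudit` crawls the given site and surfaces only the `crawl-errors.json` report as a tree, or `undefined` when the crawl found no problems. Based on the error map the previous crawler built (each crawled resource mapped to the local paths it references that failed to load), the parsed report plausibly has a shape like this (hypothetical contents):

```js
const crawlErrors = {
  "/": ["/styles/missing.css"],
  "/about/": ["/images/team.jpg"],
};
```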
package/src/builtins/@slug.js
CHANGED