@weborigami/origami 0.0.67-beta.2 → 0.0.69

@@ -86,6 +86,7 @@ export { default as serve } from "../src/builtins/@serve.js";
  export { default as setDeep } from "../src/builtins/@setDeep.js";
  export { default as shell } from "../src/builtins/@shell.js";
  export { default as shuffle } from "../src/builtins/@shuffle.js";
+ export { default as siteAudit } from "../src/builtins/@siteAudit.js";
  export { default as sitemap } from "../src/builtins/@sitemap.js";
  export * from "../src/builtins/@slash.js";
  export { default as slug } from "../src/builtins/@slug.js";
@@ -136,6 +137,9 @@ export { default as processUnpackedContent } from "../src/common/processUnpacked
  export * from "../src/common/serialize.js";
  export { default as ShuffleTransform } from "../src/common/ShuffleTransform.js";
  export * from "../src/common/utilities.js";
+ export { default as crawlResources } from "../src/crawler/crawlResources.js";
+ export { default as findPaths } from "../src/crawler/findPaths.js";
+ export * from "../src/crawler/utilities.js";
  export { default as assertTreeIsDefined } from "../src/misc/assertTreeIsDefined.js";
  export { default as getTreeArgument } from "../src/misc/getTreeArgument.js";
  export { default as OriCommandTransform } from "../src/misc/OriCommandTransform.js";
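
Note: assuming the file above is the package's public export barrel (its filename is not shown in this diff), the newly exported crawler helpers become importable by consumers; a hypothetical usage:

// Hypothetical consumer import of the newly exported names, assuming the
// package entry point re-exports them as the statements above suggest.
import { siteAudit, crawlResources, findPaths } from "@weborigami/origami";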
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@weborigami/origami",
-   "version": "0.0.67-beta.2",
+   "version": "0.0.69",
    "description": "Web Origami language, CLI, framework, and server",
    "type": "module",
    "repository": {
@@ -17,9 +17,9 @@
      "typescript": "5.6.2"
    },
    "dependencies": {
-     "@weborigami/async-tree": "0.0.67-beta.2",
-     "@weborigami/language": "0.0.67-beta.2",
-     "@weborigami/types": "0.0.67-beta.2",
+     "@weborigami/async-tree": "0.0.69",
+     "@weborigami/language": "0.0.69",
+     "@weborigami/types": "0.0.69",
      "exif-parser": "0.1.12",
      "graphviz-wasm": "3.0.2",
      "highlight.js": "11.10.0",
@@ -1,4 +1,4 @@
- import { Tree } from "@weborigami/async-tree";
+ import { trailingSlash, Tree } from "@weborigami/async-tree";

  /**
   * Given an old tree and a new tree, return a tree of changes indicated
@@ -17,36 +17,46 @@ export default async function changes(oldTreelike, newTreelike) {
    const oldKeys = Array.from(await oldTree.keys());
    const newKeys = Array.from(await newTree.keys());

-   const result = {};
+   const oldKeysNormalized = oldKeys.map(trailingSlash.remove);
+   const newKeysNormalized = newKeys.map(trailingSlash.remove);

-   for (const key of oldKeys) {
-     if (!newKeys.includes(key)) {
-       result[key] = "deleted";
+   let result;
+
+   for (const oldKey of oldKeys) {
+     const oldNormalized = trailingSlash.remove(oldKey);
+     if (!newKeysNormalized.includes(oldNormalized)) {
+       result ??= {};
+       result[oldKey] = "deleted";
        continue;
      }

-     const oldValue = await oldTree.get(key);
-     const newValue = await newTree.get(key);
+     const oldValue = await oldTree.get(oldKey);
+     const newValue = await newTree.get(oldKey);

      if (Tree.isAsyncTree(oldValue) && Tree.isAsyncTree(newValue)) {
        const treeChanges = await changes.call(this, oldValue, newValue);
-       if (Object.keys(treeChanges).length > 0) {
-         result[key] = treeChanges;
+       if (treeChanges && Object.keys(treeChanges).length > 0) {
+         result ??= {};
+         result[oldKey] = treeChanges;
        }
      } else if (oldValue?.toString && newValue?.toString) {
        const oldText = oldValue.toString();
        const newText = newValue.toString();
        if (oldText !== newText) {
-         result[key] = "changed";
+         result ??= {};
+         result[oldKey] = "changed";
        }
      } else {
-       result[key] = "changed";
+       result ??= {};
+       result[oldKey] = "changed";
      }
    }

-   for (const key of newKeys) {
-     if (!oldKeys.includes(key)) {
-       result[key] = "added";
+   for (const newKey of newKeys) {
+     const newNormalized = trailingSlash.remove(newKey);
+     if (!oldKeysNormalized.includes(newNormalized)) {
+       result ??= {};
+       result[newKey] = "added";
      }
    }

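Note: a minimal sketch of why the keys are normalized before comparison, assuming trailingSlash.remove strips at most one trailing "/" (the stand-in below mimics it):

// A subtree may be keyed "posts/" on one side and "posts" on the other;
// comparing raw keys would report a spurious "deleted" plus "added" pair.
const strip = (key) => key.replace(/\/$/, ""); // stand-in for trailingSlash.remove
const newKeysNormalized = ["about.html", "posts"].map(strip);
newKeysNormalized.includes(strip("posts/")); // -> true, so not "deleted"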
@@ -6,14 +6,10 @@ import {
    keysFromPath,
    trailingSlash,
  } from "@weborigami/async-tree";
- import { InvokeFunctionsTransform, extname } from "@weborigami/language";
- import * as utilities from "../common/utilities.js";
- import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
- import treeHttps from "./@treeHttps.js";
-
- // A fake base URL used to handle cases where an href is relative and must be
- // treated relative to some base URL.
- const fakeBaseUrl = new URL("https://fake");
+ import { InvokeFunctionsTransform } from "@weborigami/language";
+ import crawlResources from "../crawler/crawlResources.js";
+ import { normalizeKeys } from "../crawler/utilities.js";
+ import getTreeArgument from "../misc/getTreeArgument.js";

  /**
   * Crawl a tree, starting its root index.html page, and following links to
@@ -24,23 +20,19 @@ const fakeBaseUrl = new URL("https://fake");
   * that obtain the requested value from the original site.
   *
   * @typedef {import("@weborigami/types").AsyncTree} AsyncTree
- * @typedef {import("@weborigami/async-tree").Treelike|string} Treelike
+ * @typedef {import("@weborigami/async-tree").Treelike} Treelike
   * @this {AsyncTree|null}
   * @param {Treelike} treelike
   * @param {string} [baseHref]
   * @returns {Promise<AsyncTree>}
   */
- export default async function crawl(treelike, baseHref) {
-   assertTreeIsDefined(this, "crawl");
-   const tree =
-     typeof treelike === "string"
-       ? treeHttps.call(this, treelike)
-       : Tree.from(treelike, { parent: this });
+ export default async function crawlBuiltin(treelike, baseHref) {
+   const tree = await getTreeArgument(this, arguments, treelike, "@crawl");

    if (baseHref === undefined) {
      // Ask tree or original treelike if it has an `href` property we can use as
      // the base href to determine whether a link is local within the tree or
-     // not. If not, use a fake `local:/` href.
+     // not. If not, use a fake `local:/` base href.
      baseHref =
        /** @type {any} */ (tree).href ??
        /** @type {any} */ (treelike).href ??
@@ -48,7 +40,21 @@ export default async function crawl(treelike, baseHref) {
      if (!baseHref?.endsWith("/")) {
        baseHref += "/";
      }
+   } else {
+     // Is the href already valid?
+     let isHrefValid = false;
+     try {
+       new URL(baseHref);
+       isHrefValid = true;
+     } catch (e) {
+       // Ignore
+     }
+     if (!isHrefValid) {
+       // Use a fake base href.
+       baseHref = `local:/${baseHref}`;
+     }
    }
+
    // @ts-ignore
    const baseUrl = new URL(baseHref);

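Note: a quick sketch of the new base-href fallback above; an href that new URL() cannot parse on its own is given the fake local: scheme so it can still serve as a base URL:

let baseHref = "site/sub"; // relative, so `new URL(baseHref)` throws
try {
  new URL(baseHref);
} catch (e) {
  baseHref = `local:/${baseHref}`; // now parseable
}
new URL(baseHref).href; // -> "local:/site/sub"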
@@ -57,24 +63,18 @@ export default async function crawl(treelike, baseHref) {
    const errors = [];

    // We iterate until there are no more promises to wait for.
-   for await (const result of crawlPaths(tree, baseUrl)) {
-     const { keys, resourcePaths, value } = result;
+   for await (const result of crawlResources(tree, baseUrl)) {
+     const { normalizedKeys, resourcePaths, value } = result;

      // Cache the value
      if (value) {
-       addValueToObject(cache, keys, value);
-     } else if (keys) {
-       // A missing robots.txt isn't an error; anything else missing is.
-       const path = keys.join("/");
-       if (path !== "robots.txt") {
-         errors.push(path);
-       }
+       addValueToObject(cache, normalizedKeys, value);
      }

      // Add indirect resource functions to the resource tree. When requested,
      // these functions will obtain the resource from the original site.
      for (const resourcePath of resourcePaths) {
-       const resourceKeys = adjustKeys(keysFromPath(resourcePath));
+       const resourceKeys = normalizeKeys(keysFromPath(resourcePath));
        const fn = () => {
          return Tree.traverse(tree, ...resourceKeys);
        };
@@ -100,17 +100,6 @@ export default async function crawl(treelike, baseHref) {
    return result;
  }

- // For indexing and storage purposes, treat a path that ends in a trailing slash
- // as if it ends in index.html.
- function adjustKeys(keys) {
-   if (keys.length > 0 && !trailingSlash.has(keys.at(-1))) {
-     return keys;
-   }
-   const adjustedKeys = keys.slice();
-   adjustedKeys.push("index.html");
-   return adjustedKeys;
- }
-
  function addValueToObject(object, keys, value) {
    for (let i = 0, current = object; i < keys.length; i++) {
      const key = trailingSlash.remove(keys[i]);
@@ -127,10 +116,9 @@ function addValueToObject(object, keys, value) {
      if (!current[key]) {
        current[key] = {};
      } else if (!isPlainObject(current[key])) {
-       // Already have a value at this point. The site has a page
-       // at a route like /foo, and the site also has resources
-       // within that at routes like /foo/bar.jpg. We move the
-       // current value to "index.html".
+       // Already have a value at this point. The site has a page at a route
+       // like /foo, and the site also has resources within that at routes like
+       // /foo/bar.jpg. We move the current value to "index.html".
        current[key] = { "index.html": current[key] };
      }
      current = current[key];
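
Note: a small sketch of the cache fixup the comment above describes (values here are hypothetical):

// If the crawl first caches a page at /foo, then later finds /foo/bar.jpg,
// the cached page value is moved under an "index.html" key:
const cache = { foo: "<html>...</html>" };
// after addValueToObject(cache, ["foo/", "bar.jpg"], imageData):
// { foo: { "index.html": "<html>...</html>", "bar.jpg": imageData } }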
@@ -138,376 +126,5 @@ function addValueToObject(object, keys, value) {
    }
  }

- // Crawl the paths for the given tree, starting at the given base URL, and
- // yield the results. The results will include the HTML/script/stylesheet value
- // retrieved at a path, along with the paths to other resources found in that
- // text.
- async function* crawlPaths(tree, baseUrl) {
-   // We want to kick off requests for new paths as quickly as we find them, then
-   // yield whichever result finishes first. Unfortunately, Promise.any() only
-   // tells us the result of the first promise to resolve, not which promise that
-   // was. So we keep track of a dictionary mapping paths to a promise for the
-   // value at that path. When a promise resolves, we mark it as resolved by
-   // setting its entry in the dictionary to null.
-   const promisesForPaths = {};
-
-   // Seed the promise dictionary with robots.txt and the root path.
-   const initialPaths = ["/robots.txt", "/"];
-   initialPaths.forEach((path) => {
-     promisesForPaths[path] = processPath(tree, path, baseUrl);
-   });
-
-   while (true) {
-     // Get the latest array of promises that haven't been resolved yet.
-     const promises = Object.values(promisesForPaths).filter(
-       (promise) => promise !== null
-     );
-
-     if (promises.length === 0) {
-       // No unresolved promises; we're done.
-       break;
-     }
-
-     // Wait for the first promise to resolve.
-     const result = await Promise.any(promises);
-
-     // Mark the promise for that result as resolved.
-     promisesForPaths[result.path] = null;
-
-     // Add promises for crawlable paths in the result.
-     result.crawlablePaths.forEach((path) => {
-       // Only add a promise for this path if we don't already have one.
-       if (promisesForPaths[path] === undefined) {
-         promisesForPaths[path] = processPath(tree, path, baseUrl);
-       }
-     });
-
-     yield result;
-   }
- }
-
- // Filter the paths to those that are local to the site.
- function filterPaths(paths, baseUrl, localPath) {
-   // Convert paths to absolute URLs.
-   const localUrl = new URL(localPath, baseUrl);
-   const basePathname = baseUrl.pathname;
-   // @ts-ignore
-   const absoluteUrls = paths.map((path) => new URL(path, localUrl));
-
-   // Convert the absolute URLs to paths relative to the baseHref. If the URL
-   // points outside the tree rooted at the baseHref, the relative path will be
-   // null. We ignore the protocol in this test, because in practice sites often
-   // fumble the use of http and https, treating them interchangeably.
-   const relativePaths = absoluteUrls.map((url) => {
-     if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
-       return url.pathname.slice(basePathname.length);
-     } else {
-       return null;
-     }
-   });
-
-   // Filter out the null paths.
-   /** @type {string[]} */
-   // @ts-ignore
-   const filteredPaths = relativePaths.filter((path) => path);
-   return filteredPaths;
- }
-
- function findPaths(value, key, baseUrl, localPath) {
-   const text = utilities.toString(value);
-
-   // We guess the value is HTML is if its key has an .html extension or
-   // doesn't have an extension, or the value starts with `<`.
-   const ext = key ? extname(key).toLowerCase() : "";
-   const maybeHtml = ext === "" || text?.trim().startsWith("<");
-   let foundPaths;
-   if (ext === ".html" || ext === ".htm") {
-     foundPaths = findPathsInHtml(text);
-   } else if (ext === ".css") {
-     foundPaths = findPathsInCss(text);
-   } else if (ext === ".js") {
-     foundPaths = findPathsInJs(text);
-   } else if (ext === ".map") {
-     foundPaths = findPathsInImageMap(text);
-   } else if (key === "robots.txt") {
-     foundPaths = findPathsInRobotsTxt(text);
-   } else if (key === "sitemap.xml") {
-     foundPaths = findPathsInSitemapXml(text);
-   } else if (maybeHtml) {
-     foundPaths = findPathsInHtml(text);
-   } else {
-     // Doesn't have an extension we want to process
-     return {
-       crawlablePaths: [],
-       resourcePaths: [],
-     };
-   }
-
-   const crawlablePaths = filterPaths(
-     foundPaths.crawlablePaths,
-     baseUrl,
-     localPath
-   );
-   const resourcePaths = filterPaths(
-     foundPaths.resourcePaths,
-     baseUrl,
-     localPath
-   );
-   return {
-     crawlablePaths,
-     resourcePaths,
-   };
- }
-
- function findPathsInCss(css) {
-   const resourcePaths = [];
-   let match;
-
-   // Find `url()` functions.
-   const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-   while ((match = urlRegex.exec(css))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       resourcePaths.push();
-     }
-   }
-
-   return {
-     crawlablePaths: [],
-     resourcePaths,
-   };
- }
-
- // These are ancient server-side image maps. They're so old that it's hard to
- // find documentation on them, but they're used on the reference Space Jam
- // website we use for testing the crawler. Example:
- // https://www.spacejam.com/1996/bin/bball.map
- function findPathsInImageMap(imageMap) {
-   const resourcePaths = [];
-   let match;
-
-   // Find hrefs as the second column in each line.
-   const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
-   while ((match = hrefRegex.exec(imageMap))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       resourcePaths.push(href);
-     }
-   }
-
-   return {
-     crawlablePaths: [],
-     resourcePaths,
-   };
- }
-
- function findPathsInJs(js) {
-   const crawlablePaths = [];
-   let match;
-
-   // Find `import` statements.
-   const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;
-   while ((match = importRegex.exec(js))) {
-     const href = normalizeHref(match.groups?.import);
-     if (href) {
-       crawlablePaths.push(href);
-     }
-   }
-
-   return {
-     crawlablePaths,
-     resourcePaths: [],
-   };
- }
-
- function findPathsInHtml(html) {
-   const crawlablePaths = [];
-   const resourcePaths = [];
-   let match;
-
-   // Find `href` attributes in anchor and link tags.
-   const linkRegex =
-     /<(?:a|A|link|LINK) [^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
-   while ((match = linkRegex.exec(html))) {
-     // Links can point to be other crawlable paths and resource paths.
-     // We guess the type based on the extension.
-     const href = normalizeHref(match.groups?.link);
-     if (href) {
-       if (isCrawlableHref(href)) {
-         crawlablePaths.push(href);
-       } else {
-         resourcePaths.push(href);
-       }
-     }
-   }
-
-   // Find `src` attributes in img and script tags.
-   const srcRegex =
-     /<(?<tag>img|IMG|script|SCRIPT) [^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g;
-   while ((match = srcRegex.exec(html))) {
-     const tag = match.groups?.tag;
-     const src = normalizeHref(match.groups?.src);
-     if (src) {
-       if (tag === "script" || tag === "SCRIPT") {
-         crawlablePaths.push(src);
-       } else {
-         resourcePaths.push(src);
-       }
-     }
-   }
-
-   // Find `url()` functions in CSS.
-   const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-   while ((match = urlRegex.exec(html))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       resourcePaths.push(href);
-     }
-   }
-
-   // Find `src` attribute on frame tags.
-   const frameRegex =
-     /<(?:frame|FRAME) [^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g;
-   while ((match = frameRegex.exec(html))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       crawlablePaths.push(href);
-     }
-   }
-
-   // Find ancient `background` attribute on body tag.
-   const backgroundRegex =
-     /<(?:body|BODY) [^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g;
-   while ((match = backgroundRegex.exec(html))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       resourcePaths.push(href);
-     }
-   }
-
-   // Find `href` attribute on area tags.
-   const areaRegex =
-     /<(?:area|AREA) [^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g;
-   while ((match = areaRegex.exec(html))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       crawlablePaths.push(href);
-     }
-   }
-
-   return { crawlablePaths, resourcePaths };
- }
-
- function findPathsInRobotsTxt(txt) {
-   const crawlablePaths = [];
-   let match;
-
-   // Find `Sitemap` directives.
-   const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
-   while ((match = sitemapRegex.exec(txt))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       crawlablePaths.push(href);
-     }
-   }
-
-   return {
-     crawlablePaths,
-     resourcePaths: [],
-   };
- }
-
- function findPathsInSitemapXml(xml) {
-   const crawlablePaths = [];
-   let match;
-
-   // Find `loc` elements.
-   const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
-   while ((match = locRegex.exec(xml))) {
-     const href = normalizeHref(match.groups?.href);
-     if (href) {
-       crawlablePaths.push(href);
-     }
-   }
-
-   return {
-     crawlablePaths,
-     resourcePaths: [],
-   };
- }
-
- function isCrawlableHref(href) {
-   // Use a fake base URL to cover the case where the href is relative.
-   const url = new URL(href, fakeBaseUrl);
-   const pathname = url.pathname;
-   const lastKey = pathname.split("/").pop() ?? "";
-   if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
-     return true;
-   }
-   const ext = extname(lastKey);
-   // We assume an empty extension is HTML.
-   const crawlableExtensions = [".html", ".css", ".js", ".map", ""];
-   return crawlableExtensions.includes(ext);
- }
-
- // Remove any search parameters or hash from the href. Preserve absolute or
- // relative nature of URL. If the URL only has a search or hash, return null.
- function normalizeHref(href) {
-   // Remove everything after a `#` or `?` character.
-   const normalized = href.split(/[?#]/)[0];
-   return normalized === "" ? null : normalized;
- }
-
- async function processPath(tree, path, baseUrl) {
-   if (path === undefined) {
-     return {
-       crawlablePaths: [],
-       keys: null,
-       path,
-       resourcePaths: [],
-       value: null,
-     };
-   }
-
-   // Convert path to keys
-   const keys = keysFromPath(path);
-
-   // Traverse tree to get value.
-   let value = await Tree.traverse(tree, ...keys);
-   if (Tree.isAsyncTree(value)) {
-     // Path is actually a directory; see if it has an index.html
-     value = await Tree.traverse(value, "index.html");
-   }
-
-   const adjustedKeys = adjustKeys(keys);
-
-   if (value === undefined) {
-     return {
-       crawlablePaths: [],
-       keys: adjustedKeys,
-       path,
-       resourcePaths: [],
-       value: null,
-     };
-   }
-
-   // Find paths in the value
-   const key = adjustedKeys.at(-1);
-   const { crawlablePaths, resourcePaths } = await findPaths(
-     value,
-     key,
-     baseUrl,
-     path
-   );
-
-   return {
-     crawlablePaths,
-     keys: adjustedKeys,
-     path,
-     resourcePaths,
-     value,
-   };
- }
-
- crawl.usage = `@crawl <tree>\tCrawl a tree`;
- crawl.documentation = "https://weborigami.org/language/@crawl.html";
+ crawlBuiltin.usage = `@crawl <tree>\tCrawl a tree`;
+ crawlBuiltin.documentation = "https://weborigami.org/language/@crawl.html";
@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
   *
   * @this {AsyncTree|null}
   * @param {string} host
- * @param {...string|Symbol} keys
+ * @param {...string} keys
   */
  export default function keysTree(host, ...keys) {
    assertTreeIsDefined(this, "keysTree");
@@ -1,4 +1,7 @@
- import ShuffleTransform from "../common/ShuffleTransform.js";
+ import {
+   default as ShuffleTransform,
+   shuffle,
+ } from "../common/ShuffleTransform.js";
  import { transformObject } from "../common/utilities.js";
  import getTreeArgument from "../misc/getTreeArgument.js";

@@ -11,10 +14,21 @@ import getTreeArgument from "../misc/getTreeArgument.js";
   * @this {AsyncTree|null}
   * @param {Treelike} [treelike]
   */
- export default async function shuffle(treelike) {
+ export default async function shuffleTree(treelike) {
+   // Special case: If the treelike is an array, shuffle it directly. Otherwise
+   // we'll end up shuffling the array's indexes, and if this is directly
+   // displayed by the ori CLI, this will end up creating a plain object. Even
+   // though this object will be created with the keys in the correct shuffled
+   // order, a JS object will always return numeric keys in numeric order --
+   // undoing the shuffle.
+   if (Array.isArray(treelike)) {
+     const array = treelike.slice();
+     shuffle(array);
+     return array;
+   }
    const tree = await getTreeArgument(this, arguments, treelike, "@shuffle");
    return transformObject(ShuffleTransform, tree);
  }

- shuffle.usage = `@shuffle <tree>\tReturn a new tree with the original's keys shuffled`;
- shuffle.documentation = "https://weborigami.org/cli/builtins.html#shuffle";
+ shuffleTree.usage = `@shuffle <tree>\tReturn a new tree with the original's keys shuffled`;
+ shuffleTree.documentation = "https://weborigami.org/cli/builtins.html#shuffle";
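
Note: the array special case exists because JavaScript objects enumerate integer-like keys in ascending numeric order regardless of insertion order; a minimal demonstration:

const looksShuffled = { 2: "c", 0: "a", 1: "b" };
Object.keys(looksShuffled); // -> ["0", "1", "2"], so the shuffle would be undone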
@@ -0,0 +1,19 @@
+ import { Tree } from "@weborigami/async-tree";
+ import getTreeArgument from "../misc/getTreeArgument.js";
+ import crawl from "./@crawl.js";
+
+ /**
+  * @this {import("@weborigami/types").AsyncTree|null}
+  * @param {import("@weborigami/async-tree").Treelike} treelike
+  */
+ export default async function siteAudit(treelike) {
+   const tree = await getTreeArgument(this, arguments, treelike, "@siteAudit");
+   const crawled = await crawl.call(this, tree);
+   let crawlErrorsJson = await crawled.get("crawl-errors.json");
+   if (!crawlErrorsJson) {
+     return undefined;
+   }
+   const errors = Tree.from(JSON.parse(crawlErrorsJson), { deep: true });
+   errors.parent = this;
+   return errors;
+ }
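
Note: a minimal usage sketch of the new builtin, assuming it is invoked with a tree context (the site URL here is hypothetical):

// @siteAudit crawls a site and returns a tree of broken links keyed by the
// resource that references each missing path, e.g. { "index.html": ["missing.css"] };
// it returns undefined when the crawl finds no errors.
const errors = await siteAudit.call(tree, "https://example.com/");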
@@ -11,5 +11,8 @@ export default function slug(filename) {
    slug = slug.replace(/^-+/, "");
    slug = slug.replace(/-+$/, "");

+   // Collapse consecutive dashes to a single dash.
+   slug = slug.replace(/-+/g, "-");
+
    return slug;
  }
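
Note: a quick check of the new dash-collapsing step (the sample input is hypothetical):

"my--draft---post".replace(/-+/g, "-"); // -> "my-draft-post"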
@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
   *
   * @this {AsyncTree|null}
   * @param {string} host
- * @param {...string|Symbol} keys
+ * @param {...string} keys
   */
  export default function treeHttp(host, ...keys) {
    assertTreeIsDefined(this, "treeHttp");
@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
   *
   * @this {AsyncTree|null}
   * @param {string} host
- * @param {...string|Symbol} keys
+ * @param {...string} keys
   */
  export default function treeHttps(host, ...keys) {
    assertTreeIsDefined(this, "treeHttps");
@@ -18,7 +18,7 @@ export default function ShuffleTransform(Base) {
   *
   * Performs a Fisher-Yates shuffle. From http://sedition.com/perl/javascript-fy.html
   */
- function shuffle(array) {
+ export function shuffle(array) {
    let i = array.length;
    while (--i >= 0) {
      const j = Math.floor(Math.random() * (i + 1));
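
Note: the rest of the loop body falls outside this hunk; a self-contained Fisher-Yates sketch with the same loop shape, assuming the standard swap step (not the package's verbatim code):

function fisherYatesSketch(array) {
  let i = array.length;
  while (--i >= 0) {
    const j = Math.floor(Math.random() * (i + 1)); // random index in [0, i]
    [array[i], array[j]] = [array[j], array[i]]; // swap in place
  }
  return array;
}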
@@ -0,0 +1,179 @@
+ import {
+   keysFromPath,
+   pathFromKeys,
+   trailingSlash,
+   Tree,
+ } from "@weborigami/async-tree";
+ import findPaths from "./findPaths.js";
+ import { normalizeKeys } from "./utilities.js";
+
+ /**
+  * Crawl the paths for the given tree, starting at the given base URL, and yield
+  * the crawled resources.
+  *
+  * Each result will include the HTML/script/stylesheet value retrieved at a
+  * given path.
+  */
+ export default async function* crawlResources(tree, baseUrl) {
+   // We want to kick off requests for new paths as quickly as we find them, then
+   // yield whichever result finishes first. Unfortunately, Promise.any() only
+   // tells us the result of the first promise to resolve, not which promise that
+   // was. So we keep track of a dictionary mapping paths to a promise for the
+   // value at that path. When a promise resolves, we mark it as resolved by
+   // setting its entry in the dictionary to null.
+   const promisesForPaths = {};
+
+   // Keep track of which resources make which outbound links.
+   const resourceOutboundReferences = {};
+
+   let errorPaths = [];
+
+   // Seed the promise dictionary with robots.txt at the root and an empty path
+   // indicating the current directory (relative to the baseUrl).
+   const initialPaths = ["/robots.txt", ""];
+   initialPaths.forEach((path) => {
+     promisesForPaths[path] = processPath(tree, path, baseUrl);
+   });
+
+   while (true) {
+     // Get the latest array of promises that haven't been resolved yet.
+     const promises = Object.values(promisesForPaths).filter(
+       (promise) => promise !== null
+     );
+
+     if (promises.length === 0) {
+       // No unresolved promises; we're done.
+       break;
+     }
+
+     // Wait for the first promise to resolve.
+     const result = await Promise.any(promises);
+
+     // Mark the promise for that result as resolved.
+     promisesForPaths[result.path] = null;
+
+     if (result.value === null) {
+       // Expected resource doesn't exist; add this to the errors. Exception: a
+       // path in the set of initialPaths that doesn't exist is not an error.
+       if (!initialPaths.includes(result.path)) {
+         errorPaths.push(result.path);
+       }
+       continue;
+     }
+
+     // Add the crawlable paths to the map. Use the normalized keys (will include
+     // "index.html" if the path ends in a trailing slash).
+     const normalizedPath = pathFromKeys(result.normalizedKeys);
+     resourceOutboundReferences[normalizedPath] = result.crawlablePaths;
+
+     // Add promises for crawlable paths in the result.
+     result.crawlablePaths.forEach((path) => {
+       // Only add a promise for this path if we don't already have one.
+       if (promisesForPaths[path] === undefined) {
+         promisesForPaths[path] = processPath(tree, path, baseUrl);
+       }
+     });
+
+     yield result;
+   }
+
+   if (errorPaths.length > 0) {
+     // Create a map of the resources that refer to each missing resource.
+     const errorsMap = {};
+     for (const sourcePath in resourceOutboundReferences) {
+       // Does this resource refer to any of the error paths?
+       const targetPaths = resourceOutboundReferences[sourcePath];
+       for (const targetPath of targetPaths) {
+         if (errorPaths.includes(targetPath)) {
+           errorsMap[sourcePath] ??= [];
+           errorsMap[sourcePath].push(targetPath);
+         }
+       }
+     }
+
+     // Review the errors map to find any paths that could not be traced back to
+     // a referring resource. These are internal crawler errors. We log them so
+     // that the user can report them and we can investigate them.
+     for (const errorPath of errorPaths) {
+       if (!Object.values(errorsMap).flat().includes(errorPath)) {
+         errorsMap["(unknown)"] ??= [];
+         errorsMap["(unknown)"].push(errorPath);
+       }
+     }
+
+     const errorsJson = JSON.stringify(errorsMap, null, 2);
+     yield {
+       normalizedKeys: ["crawl-errors.json"],
+       path: "crawl-errors.json",
+       resourcePaths: [],
+       value: errorsJson,
+     };
+   }
+ }
+
+ async function processPath(tree, path, baseUrl) {
+   // Don't process any path outside the baseUrl.
+   const url = new URL(path, baseUrl);
+   if (!url.pathname.startsWith(baseUrl.pathname)) {
+     return {
+       path,
+       value: null,
+     };
+   }
+
+   // Convert path to keys
+   let keys = keysFromPath(path);
+
+   // Paths (including those created by the filterPaths function above) will have
+   // spaces, etc., escaped. In general, these need to be unescaped so we can
+   // find them in the tree.
+   keys = keys.map(decodeURIComponent);
+
+   // Traverse tree to get value.
+   let value = await Tree.traverse(tree, ...keys);
+   const normalizedKeys = normalizeKeys(keys);
+   let normalizedPath = path;
+   if (Tree.isTreelike(value)) {
+     // Path is actually a directory; see if it has an index.html
+     value = await Tree.traverse(value, "index.html");
+     if (value !== undefined) {
+       if (path.length > 0) {
+         // Mark the path as ending in a slash
+         normalizedPath = trailingSlash.add(path);
+       }
+
+       // Add index.html to keys if it's not already there
+       if (normalizedKeys.at(-1) !== "index.html") {
+         normalizedKeys.push("index.html");
+       }
+     }
+   }
+
+   if (value === undefined) {
+     return {
+       crawlablePaths: [],
+       keys,
+       normalizedKeys,
+       path,
+       resourcePaths: [],
+       value: null,
+     };
+   }
+
+   // Find paths in the value
+   const key = normalizedKeys.at(-1);
+   const { crawlablePaths, resourcePaths } = await findPaths(
+     value,
+     key,
+     baseUrl,
+     normalizedPath
+   );
+
+   return {
+     crawlablePaths,
+     keys,
+     normalizedKeys,
+     path,
+     resourcePaths,
+     value,
+   };
+ }
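
Note: the promise-scheduling pattern above, reduced to a standalone sketch (fetchValue is a hypothetical stand-in for processPath):

async function* raceAll(paths, fetchValue) {
  const pending = {};
  paths.forEach((path) => {
    pending[path] = fetchValue(path).then((value) => ({ path, value }));
  });
  while (true) {
    const promises = Object.values(pending).filter((p) => p !== null);
    if (promises.length === 0) break; // nothing left to wait for
    const result = await Promise.any(promises);
    pending[result.path] = null; // mark settled; the key stays so the path is never re-queued
    yield result;
  }
}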
@@ -0,0 +1,259 @@
+ import { toString } from "@weborigami/async-tree";
+ import { extname } from "@weborigami/language";
+ import { isCrawlableHref, normalizeHref } from "./utilities.js";
+
+ // Filter the paths to those that are local to the site.
+ function filterPaths(paths, baseUrl, localPath) {
+   // Convert paths to absolute URLs.
+   const localUrl = new URL(localPath, baseUrl);
+   const basePathname = baseUrl.pathname;
+   // @ts-ignore
+   const absoluteUrls = paths.map((path) => new URL(path, localUrl));
+
+   // Convert the absolute URLs to paths relative to the baseHref. If the URL
+   // points outside the tree rooted at the baseHref, the relative path will be
+   // null. We ignore the protocol in this test, because in practice sites often
+   // fumble the use of http and https, treating them interchangeably.
+   const relativePaths = absoluteUrls.map((url) => {
+     if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
+       return url.pathname.slice(basePathname.length);
+     } else {
+       return null;
+     }
+   });
+
+   // Filter out the null paths.
+   /** @type {string[]} */
+   // @ts-ignore
+   const filteredPaths = relativePaths.filter((path) => path);
+   return filteredPaths;
+ }
+
+ /**
+  * Given a value retrieved from a site using a given key (name), determine what
+  * kind of file it is and, based on that, find the paths it references.
+  */
+ export default function findPaths(value, key, baseUrl, localPath) {
+   const text = toString(value);
+
+   // We guess the value is HTML if its key has an .html extension or
+   // doesn't have an extension, or the value starts with `<`.
+   const ext = key ? extname(key).toLowerCase() : "";
+   const maybeHtml = ext === "" || text?.trim().startsWith("<");
+   let foundPaths;
+   if (ext === ".html" || ext === ".htm" || ext === ".xhtml") {
+     foundPaths = findPathsInHtml(text);
+   } else if (ext === ".css") {
+     foundPaths = findPathsInCss(text);
+   } else if (ext === ".js") {
+     foundPaths = findPathsInJs(text);
+   } else if (ext === ".map") {
+     foundPaths = findPathsInImageMap(text);
+   } else if (key === "robots.txt") {
+     foundPaths = findPathsInRobotsTxt(text);
+   } else if (key === "sitemap.xml") {
+     foundPaths = findPathsInSitemapXml(text);
+   } else if (maybeHtml) {
+     foundPaths = findPathsInHtml(text);
+   } else {
+     // Doesn't have an extension we want to process
+     return {
+       crawlablePaths: [],
+       resourcePaths: [],
+     };
+   }
+
+   const crawlablePaths = filterPaths(
+     foundPaths.crawlablePaths,
+     baseUrl,
+     localPath
+   );
+
+   const resourcePaths = filterPaths(
+     foundPaths.resourcePaths,
+     baseUrl,
+     localPath
+   );
+
+   return {
+     crawlablePaths,
+     resourcePaths,
+   };
+ }
+
+ function findPathsInCss(css) {
+   const resourcePaths = [];
+   let match;
+
+   // Find `url()` functions.
+   const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
+   while ((match = urlRegex.exec(css))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       resourcePaths.push(href);
+     }
+   }
+
+   return {
+     crawlablePaths: [],
+     resourcePaths,
+   };
+ }
+
+ // These are ancient server-side image maps. They're so old that it's hard to
+ // find documentation on them, but they're used on the reference Space Jam
+ // website we use for testing the crawler. Example:
+ // https://www.spacejam.com/1996/bin/bball.map
+ function findPathsInImageMap(imageMap) {
+   const resourcePaths = [];
+   let match;
+
+   // Find hrefs as the second column in each line.
+   const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
+   while ((match = hrefRegex.exec(imageMap))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       resourcePaths.push(href);
+     }
+   }
+
+   return {
+     crawlablePaths: [],
+     resourcePaths,
+   };
+ }
+
+ function findPathsInJs(js) {
+   const crawlablePaths = [];
+   let match;
+
+   // Find `import` statements.
+   const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;
+   while ((match = importRegex.exec(js))) {
+     const href = normalizeHref(match.groups?.import);
+     if (href) {
+       crawlablePaths.push(href);
+     }
+   }
+
+   return {
+     crawlablePaths,
+     resourcePaths: [],
+   };
+ }
+
+ function findPathsInHtml(html) {
+   const crawlablePaths = [];
+   const resourcePaths = [];
+   let match;
+
+   // Find `href` attributes in anchor and link tags.
+   const linkRegex =
+     /<(?:a|A|link|LINK)[\s][^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
+   while ((match = linkRegex.exec(html))) {
+     // Links can point to other crawlable paths and resource paths.
+     // We guess the type based on the extension.
+     const href = normalizeHref(match.groups?.link);
+     if (href) {
+       if (isCrawlableHref(href)) {
+         crawlablePaths.push(href);
+       } else {
+         resourcePaths.push(href);
+       }
+     }
+   }
+
+   // Find `src` attributes in img and script tags.
+   const srcRegex =
+     /<(?<tag>img|IMG|script|SCRIPT)[\s][^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g;
+   while ((match = srcRegex.exec(html))) {
+     const tag = match.groups?.tag;
+     const src = normalizeHref(match.groups?.src);
+     if (src) {
+       if (tag === "script" || tag === "SCRIPT") {
+         crawlablePaths.push(src);
+       } else {
+         resourcePaths.push(src);
+       }
+     }
+   }
+
+   // Find `url()` functions in CSS.
+   const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
+   while ((match = urlRegex.exec(html))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       resourcePaths.push(href);
+     }
+   }
+
+   // Find `src` attribute on frame tags.
+   const frameRegex =
+     /<(?:frame|FRAME)[\s][^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g;
+   while ((match = frameRegex.exec(html))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       crawlablePaths.push(href);
+     }
+   }
+
+   // Find ancient `background` attribute on body tag.
+   const backgroundRegex =
+     /<(?:body|BODY)[\s][^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g;
+   while ((match = backgroundRegex.exec(html))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       resourcePaths.push(href);
+     }
+   }
+
+   // Find `href` attribute on area tags.
+   const areaRegex =
+     /<(?:area|AREA)[\s][^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g;
+   while ((match = areaRegex.exec(html))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       crawlablePaths.push(href);
+     }
+   }
+
+   return { crawlablePaths, resourcePaths };
+ }
+
+ function findPathsInRobotsTxt(txt) {
+   const crawlablePaths = [];
+   let match;
+
+   // Find `Sitemap` directives.
+   const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
+   while ((match = sitemapRegex.exec(txt))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       crawlablePaths.push(href);
+     }
+   }
+
+   return {
+     crawlablePaths,
+     resourcePaths: [],
+   };
+ }
+
+ function findPathsInSitemapXml(xml) {
+   const crawlablePaths = [];
+   let match;
+
+   // Find `loc` elements.
+   const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
+   while ((match = locRegex.exec(xml))) {
+     const href = normalizeHref(match.groups?.href);
+     if (href) {
+       crawlablePaths.push(href);
+     }
+   }
+
+   return {
+     crawlablePaths,
+     resourcePaths: [],
+   };
+ }
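
Note: a quick sanity check of the anchor/link regex above against a small HTML sample (the markup is hypothetical):

const html = `<a href="/about">About</a> <link rel="stylesheet" href="site.css">`;
const linkRegex =
  /<(?:a|A|link|LINK)[\s][^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
[...html.matchAll(linkRegex)].map((m) => m.groups?.link); // -> ["/about", "site.css"]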
@@ -0,0 +1,38 @@
+ import { trailingSlash } from "@weborigami/async-tree";
+ import { extname } from "@weborigami/language";
+
+ // A fake base URL used to handle cases where an href is relative and must be
+ // treated relative to some base URL.
+ const fakeBaseUrl = new URL("https://fake");
+
+ export function isCrawlableHref(href) {
+   // Use a fake base URL to cover the case where the href is relative.
+   const url = new URL(href, fakeBaseUrl);
+   const pathname = url.pathname;
+   const lastKey = pathname.split("/").pop() ?? "";
+   if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
+     return true;
+   }
+   const ext = extname(lastKey);
+   // We assume an empty extension is HTML.
+   const crawlableExtensions = [".html", ".css", ".js", ".map", ".xhtml", ""];
+   return crawlableExtensions.includes(ext);
+ }
+
+ // Remove any search parameters or hash from the href. Preserve the absolute or
+ // relative nature of the URL. If the URL only has a search or hash, return null.
+ export function normalizeHref(href) {
+   // Remove everything after a `#` or `?` character.
+   const normalized = href.split(/[?#]/)[0];
+   return normalized === "" ? null : normalized;
+ }
+
+ // For indexing and storage purposes, treat a path that ends in a trailing slash
+ // as if it ends in index.html.
+ export function normalizeKeys(keys) {
+   const normalized = keys.slice();
+   if (normalized.length === 0 || trailingSlash.has(normalized.at(-1))) {
+     normalized.push("index.html");
+   }
+   return normalized;
+ }
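
Note: expected behavior of the three helpers above on sample inputs (the inputs are hypothetical):

normalizeHref("/styles.css?v=2"); // -> "/styles.css"
normalizeHref("#top"); // -> null: only a hash
normalizeKeys(["blog/"]); // -> ["blog/", "index.html"]
normalizeKeys([]); // -> ["index.html"]
isCrawlableHref("page"); // -> true: an empty extension is assumed to be HTML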