@weborigami/origami 0.3.1 → 0.3.2
- package/package.json +6 -4
- package/src/dev/crawler/audit.js +85 -0
- package/src/{site → dev}/crawler/crawl.js +3 -66
- package/src/{site → dev}/crawler/crawlResources.js +44 -18
- package/src/dev/crawler/findPaths.js +90 -0
- package/src/dev/crawler/pathsInCss.js +51 -0
- package/src/dev/crawler/pathsInHtml.js +161 -0
- package/src/dev/crawler/pathsInImageMap.js +25 -0
- package/src/dev/crawler/pathsInJs.js +140 -0
- package/src/dev/crawler/pathsInRobotsTxt.js +20 -0
- package/src/dev/crawler/pathsInSitemap.js +20 -0
- package/src/dev/crawler/utilities.js +125 -0
- package/src/dev/dev.js +2 -0
- package/src/handlers/handlers.js +2 -0
- package/src/handlers/ts.handler.js +1 -0
- package/src/help/help.yaml +6 -6
- package/src/site/site.js +0 -2
- package/src/text/htmlDom.js +6 -0
- package/src/text/text.js +1 -0
- package/src/site/audit.js +0 -19
- package/src/site/crawler/findPaths.js +0 -266
- package/src/site/crawler/utilities.js +0 -37
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@weborigami/origami",
-  "version": "0.3.1",
+  "version": "0.3.2",
   "description": "Web Origami language, CLI, framework, and server",
   "type": "module",
   "repository": {
@@ -17,13 +17,15 @@
     "typescript": "5.8.2"
   },
   "dependencies": {
-    "@weborigami/async-tree": "0.3.1",
-    "@weborigami/language": "0.3.1",
+    "@weborigami/async-tree": "0.3.2",
     "@weborigami/json-feed-to-rss": "1.0.0",
-    "@weborigami/types": "0.3.1",
+    "@weborigami/language": "0.3.2",
+    "@weborigami/types": "0.3.2",
+    "css-tree": "3.1.0",
     "exif-parser": "0.1.12",
     "graphviz-wasm": "3.0.2",
     "highlight.js": "11.11.1",
+    "jsdom": "26.1.0",
     "marked": "15.0.7",
     "marked-gfm-heading-id": "4.1.1",
     "marked-highlight": "2.2.1",
package/src/dev/crawler/audit.js
ADDED

@@ -0,0 +1,85 @@
+import { pathFromKeys, symbols, Tree } from "@weborigami/async-tree";
+import getTreeArgument from "../../common/getTreeArgument.js";
+import crawlResources from "./crawlResources.js";
+import { getBaseUrl } from "./utilities.js";
+
+/**
+ * Crawl the indicated tree and return an audit of any broken links to internal
+ * pages or other resources.
+ *
+ * @typedef {import("@weborigami/types").AsyncTree} AsyncTree
+ * @typedef {import("@weborigami/async-tree").Treelike} Treelike
+ *
+ * @this {AsyncTree|null}
+ * @param {Treelike} treelike
+ * @param {string} [baseHref]
+ */
+export default async function audit(treelike, baseHref) {
+  const tree = await getTreeArgument(this, arguments, treelike, "site:audit");
+  const baseUrl = getBaseUrl(baseHref, treelike);
+
+  let errors = {};
+  let report;
+  const resourceReferences = {};
+  const resourcePromises = {};
+
+  // Iterate through all the resources to crawl the whole tree.
+  for await (const result of crawlResources(tree, baseUrl)) {
+    const { normalizedKeys, resourcePaths, value: resource } = result;
+    const normalizedPath = pathFromKeys(normalizedKeys);
+    if (normalizedPath === "crawl-errors.json") {
+      // Final error report; add missing pages to the errors
+      report = JSON.parse(resource);
+      for (const [path, pagePaths] of Object.entries(report)) {
+        if (!errors[path]) {
+          errors[path] = [];
+        }
+        errors[path].push(...pagePaths);
+      }
+    } else {
+      // Record which resources this path references
+      resourceReferences[normalizedPath] = resourcePaths;
+
+      // Add all resources to the set that should be verified
+      for (const resourcePath of resourcePaths) {
+        // Start request, don't wait for it to complete yet
+        resourcePromises[resourcePath] ??= Tree.traversePath(
+          tree,
+          resourcePath
+        ).then(
+          // Just return true or false to indicate if value is defined
+          (value) => value !== undefined
+        );
+      }
+    }
+  }
+
+  // Add any references to missing resources to the errors
+  for (const [refererPath, resourcePaths] of Object.entries(
+    resourceReferences
+  )) {
+    for (const resourcePath of resourcePaths) {
+      const found = await resourcePromises[resourcePath];
+      if (!found) {
+        if (!errors[refererPath]) {
+          errors[refererPath] = [];
+        }
+        errors[refererPath].push(resourcePath);
+      }
+    }
+  }
+
+  if (Object.keys(errors).length === 0) {
+    return undefined;
+  }
+
+  Object.defineProperty(errors, symbols.parent, {
+    enumerable: false,
+    value: this,
+  });
+  Object.defineProperty(errors, symbols.deep, {
+    enumerable: false,
+    value: true,
+  });
+  return errors;
+}
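For orientation, here is a hedged sketch of calling the new builtin directly from JavaScript. The in-memory site and the deep import path are assumptions for illustration; what's grounded in the code above is the result shape: a map from each referring page to the internal paths that failed to resolve, or undefined when nothing is broken.

    // Hypothetical: the deep import path may differ depending on package exports
    import audit from "@weborigami/origami/src/dev/crawler/audit.js";

    // A one-page site whose index links to two targets that don't exist
    const site = {
      "index.html": '<a href="about.html">About</a> <img src="logo.png">',
    };
    const errors = await audit.call(null, site);
    // Expected shape per the code above:
    // { "index.html": ["about.html", "logo.png"] }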
package/src/{site → dev}/crawler/crawl.js
CHANGED

@@ -2,13 +2,12 @@ import {
   DeepObjectTree,
   Tree,
   deepMerge,
-  isPlainObject,
   keysFromPath,
-  trailingSlash,
 } from "@weborigami/async-tree";
 import { InvokeFunctionsTransform } from "@weborigami/language";
 import getTreeArgument from "../../common/getTreeArgument.js";
 import crawlResources from "./crawlResources.js";
+import { addValueToObject, getBaseUrl } from "./utilities.js";
 
 /**
  * Crawl a tree, starting its root index.html page, and following links to
@@ -20,6 +19,7 @@ import crawlResources from "./crawlResources.js";
  *
  * @typedef {import("@weborigami/types").AsyncTree} AsyncTree
  * @typedef {import("@weborigami/async-tree").Treelike} Treelike
+ *
  * @this {AsyncTree|null}
  * @param {Treelike} treelike
  * @param {string} [baseHref]
@@ -27,39 +27,10 @@ import crawlResources from "./crawlResources.js";
  */
 export default async function crawlBuiltin(treelike, baseHref) {
   const tree = await getTreeArgument(this, arguments, treelike, "site:crawl");
-
-  if (baseHref === undefined) {
-    // Ask tree or original treelike if it has an `href` property we can use as
-    // the base href to determine whether a link is local within the tree or
-    // not. If not, use a fake `local:/` base href.
-    baseHref =
-      /** @type {any} */ (tree).href ??
-      /** @type {any} */ (treelike).href ??
-      "local:/";
-    if (!baseHref?.endsWith("/")) {
-      baseHref += "/";
-    }
-  } else {
-    // Is the href already valid?
-    let isHrefValid = false;
-    try {
-      new URL(baseHref);
-      isHrefValid = true;
-    } catch (e) {
-      // Ignore
-    }
-    if (!isHrefValid) {
-      // Use a fake base href.
-      baseHref = `local:/${baseHref}`;
-    }
-  }
-
-  // @ts-ignore
-  const baseUrl = new URL(baseHref);
+  const baseUrl = getBaseUrl(baseHref, treelike);
 
   const cache = {};
   const resources = {};
-  const errors = [];
 
   // We iterate until there are no more promises to wait for.
   for await (const result of crawlResources(tree, baseUrl)) {
@@ -81,14 +52,6 @@ export default async function crawlBuiltin(treelike, baseHref) {
     }
   }
 
-  if (errors.length) {
-    addValueToObject(
-      cache,
-      ["crawl-errors.json"],
-      JSON.stringify(errors, null, 2)
-    );
-  }
-
   // Merge the cache on top of the resources tree. If we have an actual value
   // for something already, that's better than a function that will get that
   // value.
@@ -98,29 +61,3 @@ export default async function crawlBuiltin(treelike, baseHref) {
   );
   return result;
 }
-
-function addValueToObject(object, keys, value) {
-  for (let i = 0, current = object; i < keys.length; i++) {
-    const key = trailingSlash.remove(keys[i]);
-    if (i === keys.length - 1) {
-      // Write out value
-      if (isPlainObject(current[key])) {
-        // Route with existing values; treat the new value as an index.html
-        current[key]["index.html"] = value;
-      } else {
-        current[key] = value;
-      }
-    } else {
-      // Traverse further
-      if (!current[key]) {
-        current[key] = {};
-      } else if (!isPlainObject(current[key])) {
-        // Already have a value at this point. The site has a page at a route
-        // like /foo, and the site also has resources within that at routes like
-        // /foo/bar.jpg. We move the current value to "index.html".
-        current[key] = { "index.html": current[key] };
-      }
-      current = current[key];
-    }
-  }
-}
package/src/{site → dev}/crawler/crawlResources.js
CHANGED

@@ -129,28 +129,54 @@ async function processPath(tree, path, baseUrl) {
   keys = keys.map(decodeURIComponent);
 
   // Traverse tree to get value.
-  let value
-  let normalizedPath
-  … (further deleted lines were not preserved in the diff view)
+  let value;
+  let normalizedKeys;
+  let normalizedPath;
+  try {
+    value = await Tree.traverse(tree, ...keys);
+    normalizedKeys = keys.slice();
+    normalizedPath = path;
+    if (Tree.isTreelike(value)) {
+      // Path is actually a directory. See if we can get the empty string or
+      // "index.html".
+      value =
+        (await Tree.traverse(value, "")) ??
+        (await Tree.traverse(value, "index.html"));
+      if (value !== undefined) {
+        if (path.length > 0) {
+          // Mark the path as ending in a slash
+          normalizedPath = trailingSlash.add(path);
+          const key = normalizedKeys.pop();
+          normalizedKeys.push(trailingSlash.add(key));
+        }
+
+        // Add index.html to keys if it's not already there
+        if (normalizedKeys.at(-1) !== "index.html") {
+          normalizedKeys.push("index.html");
+        }
       }
+    }
 
-  … (deleted lines not preserved in the diff view)
+    if (value === undefined && path.length > 0) {
+      // The path may be a URL like `foo` or `foo/` that points to `foo.html`, so
+      // we'll try looking adding `.html` to the end. We don't want to check every
+      // path twice, so we only do this if the last key does *not* include an
+      // extension.
+      const lastKey = keys.at(-1);
+      if (lastKey !== "" && !lastKey?.includes(".")) {
+        const adjustedLastKey = `${trailingSlash.remove(lastKey)}.html`;
+        const adjustedKeys = [...keys.slice(0, -1), adjustedLastKey];
+        value = await Tree.traverse(tree, ...adjustedKeys);
+        if (value !== undefined) {
+          // Page exists at foo.html
+          normalizedPath = pathFromKeys(adjustedKeys);
+          normalizedKeys = adjustedKeys;
+        }
       }
     }
+  } catch (error) {
+    // Ignore errors, return empty paths below
+    value = undefined;
   }
 
   if (value === undefined) {
package/src/dev/crawler/findPaths.js
ADDED

@@ -0,0 +1,90 @@
+import { extension, toString } from "@weborigami/async-tree";
+import pathsInCss from "./pathsInCss.js";
+import pathsInHtml from "./pathsInHtml.js";
+import pathsInImageMap from "./pathsInImageMap.js";
+import pathsInJs from "./pathsInJs.js";
+import pathsInRobotsTxt from "./pathsInRobotsTxt.js";
+import pathsInSitemap from "./pathsInSitemap.js";
+
+// Filter the paths to those that are local to the site.
+function filterPaths(paths, baseUrl, localPath) {
+  // Convert paths to absolute URLs.
+  const localUrl = new URL(localPath, baseUrl);
+  const basePathname = baseUrl.pathname;
+  // @ts-ignore
+  const absoluteUrls = paths.map((path) => new URL(path, localUrl));
+
+  // Convert the absolute URLs to paths relative to the baseHref. If the URL
+  // points outside the tree rooted at the baseHref, the relative path will be
+  // null. We ignore the protocol in this test, because in practice sites often
+  // fumble the use of http and https, treating them interchangeably.
+  const relativePaths = absoluteUrls.map((url) => {
+    if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
+      const path = url.pathname.slice(basePathname.length);
+      // The process of creating the URLs will have escaped characters. We
+      // remove them. This has the side-effect of removing them if they existed
+      // in the original path; it would be better if we avoided that.
+      return decodeURIComponent(path);
+    } else {
+      return null;
+    }
+  });
+
+  // Filter out the null paths.
+  /** @type {string[]} */
+  // @ts-ignore
+  const filteredPaths = relativePaths.filter((path) => path);
+  return filteredPaths;
+}
+
+/**
+ * Given a value retrieved from a site using a given key (name), determine what
+ * kind of file it is and, based on that, find the paths it references.
+ */
+export default function findPaths(value, key, baseUrl, localPath) {
+  const text = toString(value);
+
+  // We guess the value is HTML is if its key has an .html extension or
+  // doesn't have an extension, or the value starts with `<`.
+  const ext = key ? extension.extname(key).toLowerCase() : "";
+  let foundPaths;
+  if (ext === ".html" || ext === ".htm" || ext === ".xhtml") {
+    foundPaths = pathsInHtml(text);
+  } else if (ext === ".css") {
+    foundPaths = pathsInCss(text);
+  } else if (ext === ".js") {
+    foundPaths = pathsInJs(text);
+  } else if (ext === ".map") {
+    foundPaths = pathsInImageMap(text);
+  } else if (key === "robots.txt") {
+    foundPaths = pathsInRobotsTxt(text);
+  } else if (key === "sitemap.xml") {
+    foundPaths = pathsInSitemap(text);
+  } else if (ext === "" && text?.trim().startsWith("<")) {
+    // Probably HTML
+    foundPaths = pathsInHtml(text);
+  } else {
+    // Doesn't have an extension we want to process
+    return {
+      crawlablePaths: [],
+      resourcePaths: [],
+    };
+  }
+
+  const crawlablePaths = filterPaths(
+    foundPaths.crawlablePaths,
+    baseUrl,
+    localPath
+  );
+
+  const resourcePaths = filterPaths(
+    foundPaths.resourcePaths,
+    baseUrl,
+    localPath
+  );
+
+  return {
+    crawlablePaths,
+    resourcePaths,
+  };
+}
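The filtering in filterPaths above hinges on standard WHATWG URL resolution: each found path is resolved against the URL of the referencing page, then kept only if it stays on the base URL's host and under its pathname. A minimal, runnable illustration of that resolution (the URLs here are invented):

    const baseUrl = new URL("https://example.com/docs/");
    const localUrl = new URL("guide/intro.html", baseUrl);
    new URL("../images/logo.png", localUrl).pathname; // "/docs/images/logo.png" — kept
    new URL("https://other.com/x", localUrl).host;    // "other.com" — filtered out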
package/src/dev/crawler/pathsInCss.js
ADDED

@@ -0,0 +1,51 @@
+import { parse, walk } from "css-tree";
+import { addHref } from "./utilities.js";
+
+const imageFunctions = ["cross-fade", "image", "image-set"];
+
+export default function pathsInCss(css, context = "stylesheet") {
+  const paths = {
+    crawlablePaths: [],
+    resourcePaths: [],
+  };
+
+  let ast;
+  try {
+    ast = parse(css, { context });
+  } catch (e) {
+    // If the CSS is invalid, we can't parse it, so we can't extract paths. For
+    // now we just return no paths.
+    return paths;
+  }
+
+  if (!ast) {
+    // Unclear why parser sometimes returns an undefined AST
+    return paths;
+  }
+
+  walk(
+    ast,
+    /** @this {any} */
+    function (node) {
+      const { type, value } = node;
+      if (
+        this.atrule?.name === "import" &&
+        (type === "String" || type === "Url")
+      ) {
+        // A plain string or url() in an @import
+        addHref(paths, value, true);
+      } else if (
+        type === "String" &&
+        imageFunctions.includes(this.function?.name)
+      ) {
+        // A plain string in an cross-fade(), image(), or image-set()
+        addHref(paths, value, false);
+      } else if (type === "Url") {
+        // A url() anywhere else
+        addHref(paths, value, false);
+      }
+    }
+  );
+
+  return paths;
+}
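A small usage sketch for the module above; the stylesheet text is invented, but the classification follows the walk logic (and css-tree's convention that String and Url node values carry the bare text): @import targets are crawlable, everything else referenced by url() is a plain resource.

    const css = `
      @import "theme.css";
      body { background: url(images/bg.png); }
    `;
    pathsInCss(css);
    // → { crawlablePaths: ["theme.css"], resourcePaths: ["images/bg.png"] }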
package/src/dev/crawler/pathsInHtml.js
ADDED

@@ -0,0 +1,161 @@
+import { JSDOM, VirtualConsole } from "jsdom";
+import pathsInCss from "./pathsInCss.js";
+import pathsInJs from "./pathsInJs.js";
+import { addHref } from "./utilities.js";
+
+export default function pathsInHtml(html) {
+  const paths = {
+    crawlablePaths: [],
+    resourcePaths: [],
+  };
+
+  // Create a virtual console to avoid logging errors to the console
+  const virtualConsole = new VirtualConsole();
+  const document = new JSDOM(html, { virtualConsole }).window.document;
+
+  // Find `href` attributes in anchor, area, link, SVG tags.
+  //
+  // NOTE: As of April 2024, jsdom querySelectorAll does not appear to find
+  // elements with mixed-case tag names.
+  const hrefTags = document.querySelectorAll(
+    "a[href], area[href], image[href], feImage[href], filter[href], linearGradient[href], link[href], mpath[href], pattern[href], radialGradient[href], textPath[href], use[href]"
+  );
+  for (const hrefTag of hrefTags) {
+    const crawlable = ["A", "AREA"].includes(hrefTag.tagName)
+      ? true
+      : undefined;
+    addHref(paths, hrefTag.getAttribute("href"), crawlable);
+  }
+
+  // Find `src` attributes in input, frame, media, and script tags.
+  const srcTags = document.querySelectorAll(
+    "audio[src], embed[src], frame[src], iframe[src], img[src], input[src], script[src], source[src], track[src], video[src]"
+  );
+  for (const srcTag of srcTags) {
+    const crawlable = ["FRAME", "IFRAME"].includes(srcTag.tagName)
+      ? true
+      : srcTag.tagName === "SCRIPT"
+      ? srcTag.type === "module" // Only crawl modules
+      : undefined;
+    addHref(paths, srcTag.getAttribute("src"), crawlable);
+  }
+
+  // Find `srcset` attributes in image and source tags.
+  const srcsetTags = document.querySelectorAll("img[srcset], source[srcset]");
+  for (const srcsetTag of srcsetTags) {
+    const srcset = srcsetTag.getAttribute("srcset");
+    const srcRegex = /(?<url>[^\s,]+)(?=\s+\d+(?:\.\d+)?[wxh])/g;
+    let match;
+    while ((match = srcRegex.exec(srcset))) {
+      if (match.groups?.url) {
+        addHref(paths, match.groups.url, false);
+      }
+    }
+  }
+
+  // Find `poster` attributes in <video> tags.
+  const posterTags = document.querySelectorAll("video[poster]");
+  for (const posterTag of posterTags) {
+    addHref(paths, posterTag.getAttribute("poster"), false);
+  }
+
+  // Find `data` attributes in <object> tags.
+  const objectTags = document.querySelectorAll("object[data]");
+  for (const objectTag of objectTags) {
+    addHref(paths, objectTag.getAttribute("data"), false);
+  }
+
+  // Find deprecated `background` attribute on body and table tags.
+  const backgroundTags = document.querySelectorAll(
+    "body[background], table[background], td[background], th[background]"
+  );
+  for (const backgroundTag of backgroundTags) {
+    addHref(paths, backgroundTag.getAttribute("background"), false);
+  }
+
+  // Find deprecated `longdesc` attributes on <img> tags.
+  const longdescTags = document.querySelectorAll("img[longdesc]");
+  for (const longdescTag of longdescTags) {
+    addHref(paths, longdescTag.getAttribute("longdesc"), false);
+  }
+
+  // Find paths in <meta> image tags.
+  const imageMetaTags = document.querySelectorAll('meta[property$=":image"]');
+  for (const imageMetaTag of imageMetaTags) {
+    const content = imageMetaTag.getAttribute("content");
+    if (content) {
+      addHref(paths, content, false);
+    }
+  }
+
+  // Find paths in CSS in <style> tags.
+  const styleTags = document.querySelectorAll("style");
+  for (const styleAttribute of styleTags) {
+    const cssPaths = pathsInCss(styleAttribute.textContent);
+    paths.crawlablePaths.push(...cssPaths.crawlablePaths);
+    paths.resourcePaths.push(...cssPaths.resourcePaths);
+  }
+
+  // Find URLs in CSS in `style` attributes.
+  const styleAttributeTags = document.querySelectorAll("[style]");
+  for (const tag of styleAttributeTags) {
+    const style = tag.getAttribute("style");
+    const stylePaths = pathsInCss(style, "declarationList");
+    stylePaths.resourcePaths.forEach((href) => {
+      addHref(paths, href, false);
+    });
+  }
+
+  // Find URLs in SVG attributes.
+  const svgAttributeNames = [
+    "clip-path",
+    "fill",
+    "filter",
+    "marker-end",
+    "marker-start",
+    "mask",
+    "stroke",
+  ];
+  const svgTags = document.querySelectorAll(
+    svgAttributeNames.map((name) => `[${name}]`).join(", ")
+  );
+  for (const svgTag of svgTags) {
+    for (const name of svgAttributeNames) {
+      const attributeValue = svgTag.getAttribute(name);
+      if (!attributeValue) {
+        continue;
+      }
+      const urlRegex = /url\((['"]?)(?<href>.*?)\1\)/g;
+      const attributeValueMatch = urlRegex.exec(attributeValue);
+      if (attributeValueMatch) {
+        const href = attributeValueMatch.groups?.href;
+        if (href) {
+          addHref(paths, href, false);
+        }
+      }
+    }
+  }
+
+  // Also look for JS `import` statements that might be in <script type="module"> tags.
+  const scriptTags = document.querySelectorAll("script[type='module']");
+  for (const scriptTag of scriptTags) {
+    const jsPaths = pathsInJs(scriptTag.textContent);
+    paths.crawlablePaths.push(...jsPaths.crawlablePaths);
+  }
+
+  // Special handling for <noframes> in framesets. We need to use a regex for
+  // this because the jsdom parser supports frames, so it will treat a
+  // <noframes> tag as a text node.
+  const noframesRegex = /<noframes>(?<html>[\s\S]*?)<\/noframes>/g;
+  let match;
+  while ((match = noframesRegex.exec(html))) {
+    const noframesHtml = match.groups?.html;
+    if (noframesHtml) {
+      const noframesPaths = pathsInHtml(noframesHtml);
+      paths.crawlablePaths.push(...noframesPaths.crawlablePaths);
+      paths.resourcePaths.push(...noframesPaths.resourcePaths);
+    }
+  }
+
+  return paths;
+}
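One detail worth seeing in isolation from the code above is the srcset handling: rather than a full srcset parser, a regex picks out each candidate URL by looking ahead for a width or density descriptor. A runnable check of that regex (the image names are invented):

    const srcRegex = /(?<url>[^\s,]+)(?=\s+\d+(?:\.\d+)?[wxh])/g;
    const srcset = "logo-1x.png 1x, logo-2x.png 2x, banner-640.jpg 640w";
    [...srcset.matchAll(srcRegex)].map((m) => m.groups.url);
    // → ["logo-1x.png", "logo-2x.png", "banner-640.jpg"]

One consequence of the lookahead: a bare candidate with no descriptor ("logo.png" alone) won't match, so such entries are skipped.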
package/src/dev/crawler/pathsInImageMap.js
ADDED

@@ -0,0 +1,25 @@
+import { normalizeHref } from "./utilities.js";
+
+// These are ancient server-side image maps. They're so old that it's hard to
+// find documentation on them, but they're used on the reference Space Jam
+// website we use for testing the crawler.
+//
+// Example: https://www.spacejam.com/1996/bin/bball.map
+export default function pathsInImageMap(imageMap) {
+  const resourcePaths = [];
+  let match;
+
+  // Find hrefs as the second column in each line.
+  const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
+  while ((match = hrefRegex.exec(imageMap))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      resourcePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths: [],
+    resourcePaths,
+  };
+}
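For reference, a sketch of the input this expects. NCSA-style map files put a shape keyword first, then the target URL, then coordinates; the sample below is invented but follows the "type url coordinates" layout the regex assumes.

    const imageMap = [
      "default index.html",
      "rect jump.html 0,0 119,34",
      "circle stats.html 200,60 240,90",
    ].join("\n");
    pathsInImageMap(imageMap).resourcePaths;
    // → ["index.html", "jump.html", "stats.html"]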
package/src/dev/crawler/pathsInJs.js
ADDED

@@ -0,0 +1,140 @@
+/**
+ * Find static module references in JavaScript code.
+ *
+ * Matches:
+ *
+ * * `import … from "x"`
+ * * `import "x"`
+ * * `export … from "x"`
+ * * `export { … } from "x"`
+ *
+ * This does simple lexical analysis to avoid matching paths inside comments or
+ * string literals.
+ *
+ * @param {string} js
+ */
+export default function pathsInJs(js) {
+  return {
+    crawlablePaths: modulePaths(js),
+    resourcePaths: [],
+  };
+}
+
+function modulePaths(src) {
+  const tokens = Array.from(tokenize(src));
+  const paths = new Set();
+
+  for (let i = 0; i < tokens.length; i++) {
+    const t = tokens[i];
+
+    // static import
+    if (t.type === "Identifier" && t.value === "import") {
+      // look ahead for either:
+      //   import "mod"
+      //   import … from "mod"
+      let j = i + 1;
+      // skip any punctuation or identifiers until we hit 'from' or a StringLiteral
+      while (
+        j < tokens.length &&
+        tokens[j].type !== "StringLiteral" &&
+        !(tokens[j].type === "Identifier" && tokens[j].value === "from")
+      ) {
+        j++;
+      }
+      // import "mod"
+      if (tokens[j]?.type === "StringLiteral") {
+        paths.add(tokens[j].value);
+      } else if (
+        // import … from "mod"
+        tokens[j]?.value === "from" &&
+        tokens[j + 1]?.type === "StringLiteral"
+      ) {
+        paths.add(tokens[j + 1].value);
+      }
+    } else if (t.type === "Identifier" && t.value === "export") {
+      // re-export or export‐from
+
+      // find a 'from' token on the same statement
+      let j = i + 1;
+      while (
+        j < tokens.length &&
+        !(tokens[j].type === "Identifier" && tokens[j].value === "from")
+      ) {
+        // stop at semicolon so we don't run past the statement
+        if (tokens[j].type === "Punctuator" && tokens[j].value === ";") {
+          break;
+        }
+        j++;
+      }
+
+      if (
+        tokens[j]?.value === "from" &&
+        tokens[j + 1]?.type === "StringLiteral"
+      ) {
+        paths.add(tokens[j + 1].value);
+      }
+    }
+  }
+
+  return [...paths];
+}
+
+// Lexer emits Identifiers, StringLiterals, and Punctuators
+function* tokenize(src) {
+  let i = 0;
+  while (i < src.length) {
+    const c = src[i];
+
+    // Skip single‐line comments
+    if (c === "/" && src[i + 1] === "/") {
+      i += 2;
+      while (i < src.length && src[i] !== "\n") {
+        i++;
+      }
+    } else if (c === "/" && src[i + 1] === "*") {
+      // Skip multi‐line comments
+      i += 2;
+      while (i < src.length && !(src[i] === "*" && src[i + 1] === "/")) {
+        i++;
+      }
+      i += 2;
+      continue;
+    } else if (c === '"' || c === "'" || c === "`") {
+      // Skip string literals (but capture them)
+      const quote = c;
+      let start = i + 1;
+      i++;
+      while (i < src.length) {
+        if (src[i] === "\\") {
+          i += 2;
+          continue;
+        }
+        if (src[i] === quote) {
+          break;
+        }
+        i++;
+      }
+      const str = src.slice(start, i);
+      i++;
+      yield { type: "StringLiteral", value: str };
+      continue;
+    } else if (/[A-Za-z_$]/.test(c)) {
+      // Identifier
+      let start = i;
+      i++;
+      while (i < src.length && /[\w$]/.test(src[i])) {
+        i++;
+      }
+      yield { type: "Identifier", value: src.slice(start, i) };
+      continue;
+    } else if (/[{}();,]/.test(c)) {
+      // Punctuator (we still keep braces/semis for possible future use)
+      yield { type: "Punctuator", value: c };
+      i++;
+      continue;
+    } else {
+      // Skip everything else (whitespace, operators, etc.)
+      i++;
+    }
+  }
+}
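Assuming the module above, a quick demonstration of what the lexer-based scan does and doesn't pick up; the source string is invented:

    const js = [
      'import config from "./config.js";',
      '// import "./commented-out.js";',
      'export { helper } from "./utilities.js";',
    ].join("\n");
    pathsInJs(js).crawlablePaths;
    // → ["./config.js", "./utilities.js"]

Comments and string literals are tokenized away first, so the commented-out import is ignored. Because the lookahead skips punctuation until it hits a string literal, a dynamic import() with a literal specifier would also be captured.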
package/src/dev/crawler/pathsInRobotsTxt.js
ADDED

@@ -0,0 +1,20 @@
+import { normalizeHref } from "./utilities.js";
+
+export default function pathsInRobotsTxt(txt) {
+  const crawlablePaths = [];
+  let match;
+
+  // Find `Sitemap` directives.
+  const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
+  while ((match = sitemapRegex.exec(txt))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths,
+    resourcePaths: [],
+  };
+}
package/src/dev/crawler/pathsInSitemap.js
ADDED

@@ -0,0 +1,20 @@
+import { normalizeHref } from "./utilities.js";
+
+export default function pathsInSitemap(xml) {
+  const crawlablePaths = [];
+  let match;
+
+  // Find `loc` elements.
+  const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
+  while ((match = locRegex.exec(xml))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths,
+    resourcePaths: [],
+  };
+}
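Both of these scanners are simple enough to exercise directly; the inputs below are invented:

    pathsInRobotsTxt("User-agent: *\nSitemap: https://example.com/sitemap.xml");
    // → { crawlablePaths: ["https://example.com/sitemap.xml"], resourcePaths: [] }

    pathsInSitemap("<urlset><url><loc>https://example.com/about.html</loc></url></urlset>");
    // → { crawlablePaths: ["https://example.com/about.html"], resourcePaths: [] }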
package/src/dev/crawler/utilities.js
ADDED

@@ -0,0 +1,125 @@
+import {
+  extension,
+  isPlainObject,
+  trailingSlash,
+} from "@weborigami/async-tree";
+
+// A fake base URL used to handle cases where an href is relative and must be
+// treated relative to some base URL.
+const fakeBaseUrl = new URL("fake:/");
+
+/**
+ * Destructively add a path to the paths object
+ */
+export function addHref(paths, href, isCrawlable) {
+  href = normalizeHref(href);
+  if (href === null) {
+    // Normalized href is null, was just an anchor or search; skip
+    return;
+  }
+  isCrawlable ??= isCrawlableHref(href);
+  if (isCrawlable) {
+    paths.crawlablePaths.push(href);
+  } else {
+    paths.resourcePaths.push(href);
+  }
+}
+
+/**
+ * Add the value to the object at the path given by the keys
+ *
+ * @param {any} object
+ * @param {string[]} keys
+ * @param {any} value
+ */
+export function addValueToObject(object, keys, value) {
+  for (let i = 0, current = object; i < keys.length; i++) {
+    const key = trailingSlash.remove(keys[i]);
+    if (i === keys.length - 1) {
+      // Write out value
+      if (isPlainObject(current[key])) {
+        // Route with existing values; treat the new value as an index.html
+        current[key]["index.html"] = value;
+      } else {
+        current[key] = value;
+      }
+    } else {
+      // Traverse further
+      if (!current[key]) {
+        current[key] = {};
+      } else if (!isPlainObject(current[key])) {
+        // Already have a value at this point. The site has a page at a route
+        // like /foo, and the site also has resources within that at routes like
+        // /foo/bar.jpg. We move the current value to "index.html".
+        current[key] = { "index.html": current[key] };
+      }
+      current = current[key];
+    }
+  }
+}
+
+/**
+ * Determine a URL we can use to determine whether a link is local within the
+ * tree or not.
+ *
+ * If a baseHref is supplied, convert that to a URL. If it's a relative path,
+ * use a fake base URL. If no baseHref is supplied, see if the `object`
+ * parameter defines an `href` property and use that to construct a URL.
+ *
+ * @param {string|undefined} baseHref
+ * @param {any} object
+ */
+export function getBaseUrl(baseHref, object) {
+  let url;
+  if (baseHref !== undefined) {
+    // See if the href is valid
+    try {
+      url = new URL(baseHref);
+    } catch (e) {
+      // Invalid, probably a path; use a fake protocol
+      url = new URL(baseHref, fakeBaseUrl);
+    }
+  } else if (object.href) {
+    // Use href property on object
+    let href = object.href;
+    if (!href?.endsWith("/")) {
+      href += "/";
+    }
+    url = new URL(href);
+  } else {
+    url = fakeBaseUrl;
+  }
+  return url;
+}
+
+export function isCrawlableHref(href) {
+  // Use a fake base URL to cover the case where the href is relative.
+  const url = new URL(href, fakeBaseUrl);
+  const pathname = url.pathname;
+  const lastKey = pathname.split("/").pop() ?? "";
+  if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
+    return true;
+  }
+  const ext = extension.extname(lastKey);
+  // We assume an empty extension is HTML.
+  const crawlableExtensions = [".html", ".css", ".js", ".map", ".xhtml", ""];
+  return crawlableExtensions.includes(ext);
+}
+
+// Remove any search parameters or hash from the href. Preserve absolute or
+// relative nature of URL. If the URL only has a search or hash, return null.
+export function normalizeHref(href) {
+  // Remove everything after a `#` or `?` character.
+  const normalized = href.split(/[?#]/)[0];
+  return normalized === "" ? null : normalized;
+}
+
+// For indexing and storage purposes, treat a path that ends in a trailing slash
+// as if it ends in index.html.
+export function normalizeKeys(keys) {
+  const normalized = keys.slice();
+  if (normalized.length === 0 || trailingSlash.has(normalized.at(-1))) {
+    normalized.push("index.html");
+  }
+  return normalized;
+}
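Two behaviors in this module are worth seeing concretely. addValueToObject builds nested plain objects from path keys, demoting an existing page value to "index.html" when a route later turns out to also be a directory; normalizeHref strips queries and fragments, returning null when nothing is left. A short sketch (the paths are invented):

    const cache = {};
    addValueToObject(cache, ["posts/", "hello"], "<h1>Hello</h1>");
    addValueToObject(cache, ["posts"], "<h1>Posts</h1>");
    // cache → { posts: { hello: "<h1>Hello</h1>", "index.html": "<h1>Posts</h1>" } }

    normalizeHref("styles.css?v=2"); // "styles.css"
    normalizeHref("#top");           // null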
package/src/dev/dev.js
CHANGED
@@ -1,6 +1,8 @@
 export { default as breakpoint } from "./breakpoint.js";
 export { default as changes } from "./changes.js";
 export { default as code } from "./code.js";
+export { default as audit } from "./crawler/audit.js";
+export { default as crawl } from "./crawler/crawl.js";
 export { default as debug } from "./debug.js";
 export { default as explore } from "./explore.js";
 export { default as log } from "./log.js";
package/src/handlers/handlers.js
CHANGED
@@ -14,6 +14,7 @@ import jpgHandler from "./jpg.handler.js";
 import jsonHandler from "./json.handler.js";
 import mdHandler from "./md.handler.js";
 import mjsHandler from "./mjs.handler.js";
+import tsHandler from "./ts.handler.js";
 import txtHandler from "./txt.handler.js";
 import xhtmlHandler from "./xhtml.handler.js";
 import ymlHandler from "./yml.handler.js";
@@ -31,6 +32,7 @@ export default {
   "mjs.handler": mjsHandler,
   "ori.handler": oriHandler,
   "oridocument.handler": oridocumentHandler,
+  "ts.handler": tsHandler,
   "txt.handler": txtHandler,
   "wasm.handler": wasmHandler,
   "xhtml.handler": xhtmlHandler,
package/src/handlers/ts.handler.js
ADDED

@@ -0,0 +1 @@
+export { default as default } from "./js.handler.js";
package/src/help/help.yaml
CHANGED
@@ -1,12 +1,18 @@
 dev:
   description: Develop and debug Origami projects
   commands:
+    audit:
+      args: (tree)
+      description: Identify broken internal links and references
     breakpoint:
       args: (a)
       description: Break into the JavaScript debugger, then return a
     changes:
       args: (old, new)
       description: Return a tree of changes
+    crawl:
+      args: (tree, base)
+      description: A tree of a site's discoverable resources
     debug:
       args: (tree)
       description: Add debug features to the tree
@@ -213,12 +219,6 @@ scope:
 site:
   description: Add common website features
   commands:
-    audit:
-      args: (tree)
-      description: Identify broken internal links and references
-    crawl:
-      args: (tree, base)
-      description: A tree of a site's discoverable resources
     index:
       args: (tree)
       description: A default index.html page for the tree
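With these entries relocated, audit and crawl are documented under the dev: namespace rather than site:. Assuming the usual ori expression syntax for invoking namespaced builtins (the path and URL below are invented, and exact quoting depends on your shell), invocations would look something like:

    ori "dev:audit ./build"
    ori "dev:crawl https://example.com/"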
package/src/site/site.js
CHANGED
package/src/text/text.js
CHANGED
@@ -1,4 +1,5 @@
 export { taggedTemplateIndent as indent } from "@weborigami/language";
 export { default as document } from "./document.js";
+export { default as htmlDom } from "./htmlDom.js";
 export { default as inline } from "./inline.js";
 export { default as mdHtml } from "./mdHtml.js";
package/src/site/audit.js
DELETED
@@ -1,19 +0,0 @@
-import { Tree } from "@weborigami/async-tree";
-import getTreeArgument from "../common/getTreeArgument.js";
-import crawl from "./crawler/crawl.js";
-
-/**
- * @this {import("@weborigami/types").AsyncTree|null}
- * @param {import("@weborigami/async-tree").Treelike} treelike
- */
-export default async function audit(treelike) {
-  const tree = await getTreeArgument(this, arguments, treelike, "site:audit");
-  const crawled = await crawl.call(this, tree);
-  let crawlErrorsJson = await crawled.get("crawl-errors.json");
-  if (!crawlErrorsJson) {
-    return undefined;
-  }
-  const errors = Tree.from(JSON.parse(crawlErrorsJson), { deep: true });
-  errors.parent = this;
-  return errors;
-}
package/src/site/crawler/findPaths.js
DELETED

@@ -1,266 +0,0 @@
-import { extension, toString } from "@weborigami/async-tree";
-import { isCrawlableHref, normalizeHref } from "./utilities.js";
-
-// Filter the paths to those that are local to the site.
-function filterPaths(paths, baseUrl, localPath) {
-  // Convert paths to absolute URLs.
-  const localUrl = new URL(localPath, baseUrl);
-  const basePathname = baseUrl.pathname;
-  // @ts-ignore
-  const absoluteUrls = paths.map((path) => new URL(path, localUrl));
-
-  // Convert the absolute URLs to paths relative to the baseHref. If the URL
-  // points outside the tree rooted at the baseHref, the relative path will be
-  // null. We ignore the protocol in this test, because in practice sites often
-  // fumble the use of http and https, treating them interchangeably.
-  const relativePaths = absoluteUrls.map((url) => {
-    if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
-      const path = url.pathname.slice(basePathname.length);
-      // The process of creating the URLs will have escaped characters. We
-      // remove them. This has the side-effect of removing them if they existed
-      // in the original path; it would be better if we avoided that.
-      return decodeURIComponent(path);
-    } else {
-      return null;
-    }
-  });
-
-  // Filter out the null paths.
-  /** @type {string[]} */
-  // @ts-ignore
-  const filteredPaths = relativePaths.filter((path) => path);
-  return filteredPaths;
-}
-
-/**
- * Given a value retrieved from a site using a given key (name), determine what
- * kind of file it is and, based on that, find the paths it references.
- */
-export default function findPaths(value, key, baseUrl, localPath) {
-  const text = toString(value);
-
-  // We guess the value is HTML is if its key has an .html extension or
-  // doesn't have an extension, or the value starts with `<`.
-  const ext = key ? extension.extname(key).toLowerCase() : "";
-  let foundPaths;
-  if (ext === ".html" || ext === ".htm" || ext === ".xhtml") {
-    foundPaths = findPathsInHtml(text);
-  } else if (ext === ".css") {
-    foundPaths = findPathsInCss(text);
-  } else if (ext === ".js") {
-    foundPaths = findPathsInJs(text);
-  } else if (ext === ".map") {
-    foundPaths = findPathsInImageMap(text);
-  } else if (key === "robots.txt") {
-    foundPaths = findPathsInRobotsTxt(text);
-  } else if (key === "sitemap.xml") {
-    foundPaths = findPathsInSitemapXml(text);
-  } else if (ext === "" && text?.trim().startsWith("<")) {
-    // Probably HTML
-    foundPaths = findPathsInHtml(text);
-  } else {
-    // Doesn't have an extension we want to process
-    return {
-      crawlablePaths: [],
-      resourcePaths: [],
-    };
-  }
-
-  const crawlablePaths = filterPaths(
-    foundPaths.crawlablePaths,
-    baseUrl,
-    localPath
-  );
-
-  const resourcePaths = filterPaths(
-    foundPaths.resourcePaths,
-    baseUrl,
-    localPath
-  );
-
-  return {
-    crawlablePaths,
-    resourcePaths,
-  };
-}
-
-function findPathsInCss(css) {
-  const resourcePaths = [];
-  let match;
-
-  // Find `url()` functions.
-  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-  while ((match = urlRegex.exec(css))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths: [],
-    resourcePaths,
-  };
-}
-
-// These are ancient server-side image maps. They're so old that it's hard to
-// find documentation on them, but they're used on the reference Space Jam
-// website we use for testing the crawler. Example:
-// https://www.spacejam.com/1996/bin/bball.map
-function findPathsInImageMap(imageMap) {
-  const resourcePaths = [];
-  let match;
-
-  // Find hrefs as the second column in each line.
-  const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
-  while ((match = hrefRegex.exec(imageMap))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths: [],
-    resourcePaths,
-  };
-}
-
-function findPathsInJs(js) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `import` statements.
-  const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;
-  while ((match = importRegex.exec(js))) {
-    const href = normalizeHref(match.groups?.import);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
-
-function findPathsInHtml(html) {
-  const crawlablePaths = [];
-  const resourcePaths = [];
-  let match;
-
-  // Find `href` attributes in anchor and link tags.
-  const linkRegex =
-    /<(?:a|A|link|LINK)[\s][^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
-  while ((match = linkRegex.exec(html))) {
-    // Links can point to be other crawlable paths and resource paths.
-    // We guess the type based on the extension.
-    const href = normalizeHref(match.groups?.link);
-    if (href) {
-      if (isCrawlableHref(href)) {
-        crawlablePaths.push(href);
-      } else {
-        resourcePaths.push(href);
-      }
-    }
-  }
-
-  // Find `src` attributes in img and script tags.
-  const srcRegex =
-    /<(?<tag>img|IMG|script|SCRIPT)[\s][^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g;
-  while ((match = srcRegex.exec(html))) {
-    const tag = match.groups?.tag;
-    const src = normalizeHref(match.groups?.src);
-    if (src) {
-      if (tag === "script" || tag === "SCRIPT") {
-        crawlablePaths.push(src);
-      } else {
-        resourcePaths.push(src);
-      }
-    }
-  }
-
-  // Find `url()` functions in CSS.
-  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
-  while ((match = urlRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  // Find `src` attribute on frame tags.
-  const frameRegex =
-    /<(?:frame|FRAME)[\s][^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = frameRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  // Find ancient `background` attribute on body tag.
-  const backgroundRegex =
-    /<(?:body|BODY)[\s][^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = backgroundRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      resourcePaths.push(href);
-    }
-  }
-
-  // Find `href` attribute on area tags.
-  const areaRegex =
-    /<(?:area|AREA)[\s][^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g;
-  while ((match = areaRegex.exec(html))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  // Also look for JS `import` statements that might be in <script type="module"> tags.
-  const jsResults = findPathsInJs(html);
-  crawlablePaths.push(...jsResults.crawlablePaths);
-
-  return { crawlablePaths, resourcePaths };
-}
-
-function findPathsInRobotsTxt(txt) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `Sitemap` directives.
-  const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
-  while ((match = sitemapRegex.exec(txt))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
-
-function findPathsInSitemapXml(xml) {
-  const crawlablePaths = [];
-  let match;
-
-  // Find `loc` elements.
-  const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
-  while ((match = locRegex.exec(xml))) {
-    const href = normalizeHref(match.groups?.href);
-    if (href) {
-      crawlablePaths.push(href);
-    }
-  }
-
-  return {
-    crawlablePaths,
-    resourcePaths: [],
-  };
-}
package/src/site/crawler/utilities.js
DELETED

@@ -1,37 +0,0 @@
-import { extension, trailingSlash } from "@weborigami/async-tree";
-
-// A fake base URL used to handle cases where an href is relative and must be
-// treated relative to some base URL.
-const fakeBaseUrl = new URL("https://fake");
-
-export function isCrawlableHref(href) {
-  // Use a fake base URL to cover the case where the href is relative.
-  const url = new URL(href, fakeBaseUrl);
-  const pathname = url.pathname;
-  const lastKey = pathname.split("/").pop() ?? "";
-  if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
-    return true;
-  }
-  const ext = extension.extname(lastKey);
-  // We assume an empty extension is HTML.
-  const crawlableExtensions = [".html", ".css", ".js", ".map", ".xhtml", ""];
-  return crawlableExtensions.includes(ext);
-}
-
-// Remove any search parameters or hash from the href. Preserve absolute or
-// relative nature of URL. If the URL only has a search or hash, return null.
-export function normalizeHref(href) {
-  // Remove everything after a `#` or `?` character.
-  const normalized = href.split(/[?#]/)[0];
-  return normalized === "" ? null : normalized;
-}
-
-// For indexing and storage purposes, treat a path that ends in a trailing slash
-// as if it ends in index.html.
-export function normalizeKeys(keys) {
-  const normalized = keys.slice();
-  if (normalized.length === 0 || trailingSlash.has(normalized.at(-1))) {
-    normalized.push("index.html");
-  }
-  return normalized;
-}