@weborigami/origami 0.0.68 → 0.0.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/exports/exports.js +6 -0
- package/package.json +4 -4
- package/src/builtins/@changes.js +24 -14
- package/src/builtins/@code.js +37 -0
- package/src/builtins/@crawl.js +31 -455
- package/src/builtins/@indent.js +115 -0
- package/src/builtins/@keysTree.js +1 -1
- package/src/builtins/@siteAudit.js +19 -0
- package/src/builtins/@slug.js +3 -0
- package/src/builtins/@treeHttp.js +1 -1
- package/src/builtins/@treeHttps.js +1 -1
- package/src/common/processUnpackedContent.js +5 -1
- package/src/common/utilities.js +7 -4
- package/src/crawler/crawlResources.js +180 -0
- package/src/crawler/findPaths.js +259 -0
- package/src/crawler/utilities.js +38 -0
package/src/builtins/@treeHttp.js
CHANGED
@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
  *
  * @this {AsyncTree|null}
  * @param {string} host
- * @param {...string
+ * @param {...string} keys
  */
 export default function treeHttp(host, ...keys) {
   assertTreeIsDefined(this, "treeHttp");
package/src/builtins/@treeHttps.js
CHANGED
@@ -8,7 +8,7 @@ import assertTreeIsDefined from "../misc/assertTreeIsDefined.js";
  *
  * @this {AsyncTree|null}
  * @param {string} host
- * @param {...string
+ * @param {...string} keys
  */
 export default function treeHttps(host, ...keys) {
   assertTreeIsDefined(this, "treeHttps");
package/src/common/processUnpackedContent.js
CHANGED
@@ -24,7 +24,11 @@ export default function processUnpackedContent(content, parent, attachedData) {
     } else {
       target = base;
     }
-    return content.bind(target);
+    const result = content.bind(target);
+    if (content.code) {
+      result.code = content.code;
+    }
+    return result;
   } else if (Tree.isAsyncTree(content) && !content.parent) {
     const result = Object.create(content);
     result.parent = parent;
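A note on the processUnpackedContent change: Function.prototype.bind returns a fresh function object that does not carry over custom own properties of the original, so any `code` metadata attached to the unpacked function would silently disappear without the explicit copy added here. A minimal sketch of the behavior (the `code` value is a made-up placeholder):

function original() {}
original.code = "(example metadata)"; // hypothetical property, standing in for content.code

const bound = original.bind(null);
console.log(bound.code); // undefined — bind() drops own properties

bound.code = original.code; // re-attach manually, as the new lines above do
console.log(bound.code); // "(example metadata)"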
package/src/common/utilities.js
CHANGED
@@ -64,13 +64,16 @@ export function toFunction(obj) {
     return obj;
   } else if (isUnpackable(obj)) {
     // Extract the contents of the object and convert that to a function.
-    let fn;
+    let fnPromise;
     /** @this {any} */
     return async function (...args) {
-      if (!fn) {
-        const content = await obj.unpack();
-        fn = toFunction(content);
+      if (!fnPromise) {
+        // unpack() may return a function or a promise for a function; normalize
+        // to a promise for a function
+        const unpackPromise = Promise.resolve(obj.unpack());
+        fnPromise = unpackPromise.then((content) => toFunction(content));
       }
+      const fn = await fnPromise;
       return fn.call(this, ...args);
     };
   } else if (Tree.isTreelike(obj)) {
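The toFunction change above caches a promise for the function rather than the function itself. Because the promise is assigned synchronously, before any await, concurrent first calls share a single unpack() invocation, and Promise.resolve() smooths over unpack() returning either a function or a promise for one. A standalone sketch of the same pattern (lazyAsync and produce are illustrative names, not package APIs):

function lazyAsync(produce) {
  let fnPromise;
  return async function (...args) {
    if (!fnPromise) {
      // Assign synchronously, before any await, so racing first calls
      // reuse one promise instead of each invoking produce().
      fnPromise = Promise.resolve(produce());
    }
    const fn = await fnPromise;
    return fn.call(this, ...args);
  };
}

// produce() may return a function or a promise for a function.
const greet = lazyAsync(async () => (name) => `Hello, ${name}!`);
greet("Alice").then(console.log); // "Hello, Alice!"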
package/src/crawler/crawlResources.js
ADDED
@@ -0,0 +1,180 @@
+import {
+  keysFromPath,
+  pathFromKeys,
+  trailingSlash,
+  Tree,
+} from "@weborigami/async-tree";
+import findPaths from "./findPaths.js";
+import { normalizeKeys } from "./utilities.js";
+
+/**
+ * Crawl the paths for the given tree, starting at the given base URL, and yield
+ * the crawled resources.
+ *
+ * Each result will include the HTML/script/stylesheet value retrieved at a given path.
+ */
+export default async function* crawlResources(tree, baseUrl) {
+  // We want to kick off requests for new paths as quickly as we find them, then
+  // yield whichever result finishes first. Unfortunately, Promise.any() only
+  // tells us the result of the first promise to resolve, not which promise that
+  // was. So we keep track of a dictionary mapping paths to a promise for the
+  // value at that path. When a promise resolves, we mark it as resolved by
+  // setting its entry in the dictionary to null.
+  const promisesForPaths = {};
+
+  // Keep track of which resources make which outbound links.
+  const resourceOutboundReferences = {};
+
+  let errorPaths = [];
+
+  // Seed the promise dictionary with robots.txt at the root, a sitemap.xml at
+  // the root, and an empty path indicating the current directory (relative to
+  // the baseUrl).
+  const initialPaths = ["/robots.txt", "/sitemap.xml", ""];
+  initialPaths.forEach((path) => {
+    promisesForPaths[path] = processPath(tree, path, baseUrl);
+  });
+
+  while (true) {
+    // Get the latest array of promises that haven't been resolved yet.
+    const promises = Object.values(promisesForPaths).filter(
+      (promise) => promise !== null
+    );
+
+    if (promises.length === 0) {
+      // No unresolved promises; we're done.
+      break;
+    }
+
+    // Wait for the first promise to resolve.
+    const result = await Promise.any(promises);
+
+    // Mark the promise for that result as resolved.
+    promisesForPaths[result.path] = null;
+
+    if (result.value === null) {
+      // Expected resource doesn't exist; add this to the errors. Exception: a
+      // path in the set of initialPaths that doesn't exist is not an error.
+      if (!initialPaths.includes(result.path)) {
+        errorPaths.push(result.path);
+      }
+      continue;
+    }
+
+    // Add the crawlable paths to the map. Use the normalized keys (will include
+    // "index.html" if the path ends in a trailing slash).
+    const normalizedPath = pathFromKeys(result.normalizedKeys);
+    resourceOutboundReferences[normalizedPath] = result.crawlablePaths;
+
+    // Add promises for crawlable paths in the result.
+    result.crawlablePaths.forEach((path) => {
+      // Only add a promise for this path if we don't already have one.
+      if (promisesForPaths[path] === undefined) {
+        promisesForPaths[path] = processPath(tree, path, baseUrl);
+      }
+    });
+
+    yield result;
+  }
+
+  if (errorPaths.length > 0) {
+    // Create a map of the resources that refer to each missing resource.
+    const errorsMap = {};
+    for (const sourcePath in resourceOutboundReferences) {
+      // Does this resource refer to any of the error paths?
+      const targetPaths = resourceOutboundReferences[sourcePath];
+      for (const targetPath of targetPaths) {
+        if (errorPaths.includes(targetPath)) {
+          errorsMap[sourcePath] ??= [];
+          errorsMap[sourcePath].push(targetPath);
+        }
+      }
+    }
+
+    // Review the errors map to find any paths that could not be traced back to
+    // a referring resource. These are internal crawler errors. We log them so
+    // that the user can report them and we can investigate them.
+    for (const errorPath of errorPaths) {
+      if (!Object.values(errorsMap).flat().includes(errorPath)) {
+        errorsMap["(unknown)"] ??= [];
+        errorsMap["(unknown)"].push(errorPath);
+      }
+    }
+
+    const errorsJson = JSON.stringify(errorsMap, null, 2);
+    yield {
+      normalizedKeys: ["crawl-errors.json"],
+      path: "crawl-errors.json",
+      resourcePaths: [],
+      value: errorsJson,
+    };
+  }
+}
+
+async function processPath(tree, path, baseUrl) {
+  // Don't process any path outside the baseUrl.
+  const url = new URL(path, baseUrl);
+  if (!url.pathname.startsWith(baseUrl.pathname)) {
+    return {
+      path,
+      value: null,
+    };
+  }
+
+  // Convert path to keys
+  let keys = keysFromPath(path);
+
+  // Paths (including those returned by filterPaths in findPaths.js) will have
+  // spaces, etc., escaped. In general, these need to be unescaped so we can
+  // find them in the tree.
+  keys = keys.map(decodeURIComponent);
+
+  // Traverse tree to get value.
+  let value = await Tree.traverse(tree, ...keys);
+  const normalizedKeys = normalizeKeys(keys);
+  let normalizedPath = path;
+  if (Tree.isTreelike(value)) {
+    // Path is actually a directory; see if it has an index.html
+    value = await Tree.traverse(value, "index.html");
+    if (value !== undefined) {
+      if (path.length > 0) {
+        // Mark the path as ending in a slash
+        normalizedPath = trailingSlash.add(path);
+      }
+
+      // Add index.html to keys if it's not already there
+      if (normalizedKeys.at(-1) !== "index.html") {
+        normalizedKeys.push("index.html");
+      }
+    }
+  }
+
+  if (value === undefined) {
+    return {
+      crawlablePaths: [],
+      keys,
+      normalizedKeys,
+      path,
+      resourcePaths: [],
+      value: null,
+    };
+  }
+
+  // Find paths in the value
+  const key = normalizedKeys.at(-1);
+  const { crawlablePaths, resourcePaths } = await findPaths(
+    value,
+    key,
+    baseUrl,
+    normalizedPath
+  );
+
+  return {
+    crawlablePaths,
+    keys,
+    normalizedKeys,
+    path,
+    resourcePaths,
+    value,
+  };
+}
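The dictionary-of-promises trick in crawlResources works around a real gap in Promise.any(): it reports the winning value but not which promise produced it. Having each promise resolve to an object that carries its own key makes the winner self-identifying. A minimal standalone sketch of the same pattern (raceAll, delay, and the task names are illustrative, not part of the package):

async function* raceAll(tasks) {
  const pending = {};
  for (const [key, task] of Object.entries(tasks)) {
    // Each promise resolves to { key, value } so the winner can be identified.
    pending[key] = task().then((value) => ({ key, value }));
  }
  while (true) {
    const promises = Object.values(pending).filter((p) => p !== null);
    if (promises.length === 0) break;
    const result = await Promise.any(promises);
    pending[result.key] = null; // mark this promise as resolved
    yield result;
  }
}

const delay = (ms, value) =>
  new Promise((resolve) => setTimeout(() => resolve(value), ms));

(async () => {
  const tasks = { fast: () => delay(10, "a"), slow: () => delay(50, "b") };
  for await (const { key, value } of raceAll(tasks)) {
    console.log(key, value); // "fast a", then "slow b"
  }
})();

Unlike this static sketch, crawlResources also adds new entries to the dictionary as it discovers paths, which is why it re-reads Object.values() on every loop iteration.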
package/src/crawler/findPaths.js
ADDED
@@ -0,0 +1,259 @@
+import { toString } from "@weborigami/async-tree";
+import { extname } from "@weborigami/language";
+import { isCrawlableHref, normalizeHref } from "./utilities.js";
+
+// Filter the paths to those that are local to the site.
+function filterPaths(paths, baseUrl, localPath) {
+  // Convert paths to absolute URLs.
+  const localUrl = new URL(localPath, baseUrl);
+  const basePathname = baseUrl.pathname;
+  // @ts-ignore
+  const absoluteUrls = paths.map((path) => new URL(path, localUrl));
+
+  // Convert the absolute URLs to paths relative to the baseHref. If the URL
+  // points outside the tree rooted at the baseHref, the relative path will be
+  // null. We ignore the protocol in this test, because in practice sites often
+  // fumble the use of http and https, treating them interchangeably.
+  const relativePaths = absoluteUrls.map((url) => {
+    if (url.host === baseUrl.host && url.pathname.startsWith(basePathname)) {
+      return url.pathname.slice(basePathname.length);
+    } else {
+      return null;
+    }
+  });
+
+  // Filter out the null paths.
+  /** @type {string[]} */
+  // @ts-ignore
+  const filteredPaths = relativePaths.filter((path) => path);
+  return filteredPaths;
+}
+
+/**
+ * Given a value retrieved from a site using a given key (name), determine what
+ * kind of file it is and, based on that, find the paths it references.
+ */
+export default function findPaths(value, key, baseUrl, localPath) {
+  const text = toString(value);
+
+  // We guess the value is HTML if its key has an .html extension or no
+  // extension, or if the value starts with `<`.
+  const ext = key ? extname(key).toLowerCase() : "";
+  const maybeHtml = ext === "" || text?.trim().startsWith("<");
+  let foundPaths;
+  if (ext === ".html" || ext === ".htm" || ext === ".xhtml") {
+    foundPaths = findPathsInHtml(text);
+  } else if (ext === ".css") {
+    foundPaths = findPathsInCss(text);
+  } else if (ext === ".js") {
+    foundPaths = findPathsInJs(text);
+  } else if (ext === ".map") {
+    foundPaths = findPathsInImageMap(text);
+  } else if (key === "robots.txt") {
+    foundPaths = findPathsInRobotsTxt(text);
+  } else if (key === "sitemap.xml") {
+    foundPaths = findPathsInSitemapXml(text);
+  } else if (maybeHtml) {
+    foundPaths = findPathsInHtml(text);
+  } else {
+    // Doesn't have an extension we want to process
+    return {
+      crawlablePaths: [],
+      resourcePaths: [],
+    };
+  }
+
+  const crawlablePaths = filterPaths(
+    foundPaths.crawlablePaths,
+    baseUrl,
+    localPath
+  );
+
+  const resourcePaths = filterPaths(
+    foundPaths.resourcePaths,
+    baseUrl,
+    localPath
+  );
+
+  return {
+    crawlablePaths,
+    resourcePaths,
+  };
+}
+
+function findPathsInCss(css) {
+  const resourcePaths = [];
+  let match;
+
+  // Find `url()` functions.
+  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
+  while ((match = urlRegex.exec(css))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      resourcePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths: [],
+    resourcePaths,
+  };
+}
+
+// These are ancient server-side image maps. They're so old that it's hard to
+// find documentation on them, but they're used on the reference Space Jam
+// website we use for testing the crawler. Example:
+// https://www.spacejam.com/1996/bin/bball.map
+function findPathsInImageMap(imageMap) {
+  const resourcePaths = [];
+  let match;
+
+  // Find hrefs as the second column in each line.
+  const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;
+  while ((match = hrefRegex.exec(imageMap))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      resourcePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths: [],
+    resourcePaths,
+  };
+}
+
+function findPathsInJs(js) {
+  const crawlablePaths = [];
+  let match;
+
+  // Find `import` statements.
+  const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;
+  while ((match = importRegex.exec(js))) {
+    const href = normalizeHref(match.groups?.import);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths,
+    resourcePaths: [],
+  };
+}
+
+function findPathsInHtml(html) {
+  const crawlablePaths = [];
+  const resourcePaths = [];
+  let match;
+
+  // Find `href` attributes in anchor and link tags.
+  const linkRegex =
+    /<(?:a|A|link|LINK)[\s][^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g;
+  while ((match = linkRegex.exec(html))) {
+    // Links can point to other crawlable paths or to resource paths. We
+    // guess the type based on the extension.
+    const href = normalizeHref(match.groups?.link);
+    if (href) {
+      if (isCrawlableHref(href)) {
+        crawlablePaths.push(href);
+      } else {
+        resourcePaths.push(href);
+      }
+    }
+  }
+
+  // Find `src` attributes in img and script tags.
+  const srcRegex =
+    /<(?<tag>img|IMG|script|SCRIPT)[\s][^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g;
+  while ((match = srcRegex.exec(html))) {
+    const tag = match.groups?.tag;
+    const src = normalizeHref(match.groups?.src);
+    if (src) {
+      if (tag === "script" || tag === "SCRIPT") {
+        crawlablePaths.push(src);
+      } else {
+        resourcePaths.push(src);
+      }
+    }
+  }
+
+  // Find `url()` functions in CSS.
+  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
+  while ((match = urlRegex.exec(html))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      resourcePaths.push(href);
+    }
+  }
+
+  // Find `src` attribute on frame tags.
+  const frameRegex =
+    /<(?:frame|FRAME)[\s][^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g;
+  while ((match = frameRegex.exec(html))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  // Find ancient `background` attribute on body tag.
+  const backgroundRegex =
+    /<(?:body|BODY)[\s][^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g;
+  while ((match = backgroundRegex.exec(html))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      resourcePaths.push(href);
+    }
+  }
+
+  // Find `href` attribute on area tags.
+  const areaRegex =
+    /<(?:area|AREA)[\s][^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g;
+  while ((match = areaRegex.exec(html))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return { crawlablePaths, resourcePaths };
+}
+
+function findPathsInRobotsTxt(txt) {
+  const crawlablePaths = [];
+  let match;
+
+  // Find `Sitemap` directives.
+  const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;
+  while ((match = sitemapRegex.exec(txt))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths,
+    resourcePaths: [],
+  };
+}
+
+function findPathsInSitemapXml(xml) {
+  const crawlablePaths = [];
+  let match;
+
+  // Find `loc` elements.
+  const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;
+  while ((match = locRegex.exec(xml))) {
+    const href = normalizeHref(match.groups?.href);
+    if (href) {
+      crawlablePaths.push(href);
+    }
+  }
+
+  return {
+    crawlablePaths,
+    resourcePaths: [],
+  };
+}
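The filterPaths helper above leans on the WHATWG URL constructor to resolve relative references against the page being scanned before testing whether they fall inside the crawl root. A small standalone demonstration of that resolution and scoping logic (the example.com URLs and candidate paths are made up):

const baseUrl = new URL("https://example.com/docs/");
const localUrl = new URL("guide/intro.html", baseUrl); // page being scanned

const candidates = ["../api/", "style.css", "/other/", "https://elsewhere.com/x"];
for (const path of candidates) {
  const url = new URL(path, localUrl); // resolve relative to the current page
  const inside =
    url.host === baseUrl.host && url.pathname.startsWith(baseUrl.pathname);
  console.log(path, "->", inside ? url.pathname.slice(baseUrl.pathname.length) : null);
}
// "../api/" -> "api/", "style.css" -> "guide/style.css",
// "/other/" -> null, "https://elsewhere.com/x" -> null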
package/src/crawler/utilities.js
ADDED
@@ -0,0 +1,38 @@
+import { trailingSlash } from "@weborigami/async-tree";
+import { extname } from "@weborigami/language";
+
+// A fake base URL used to handle cases where an href is relative and must be
+// treated relative to some base URL.
+const fakeBaseUrl = new URL("https://fake");
+
+export function isCrawlableHref(href) {
+  // Use a fake base URL to cover the case where the href is relative.
+  const url = new URL(href, fakeBaseUrl);
+  const pathname = url.pathname;
+  const lastKey = pathname.split("/").pop() ?? "";
+  if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
+    return true;
+  }
+  const ext = extname(lastKey);
+  // We assume an empty extension is HTML.
+  const crawlableExtensions = [".html", ".css", ".js", ".map", ".xhtml", ""];
+  return crawlableExtensions.includes(ext);
+}
+
+// Remove any search parameters or hash from the href. Preserve absolute or
+// relative nature of URL. If the URL only has a search or hash, return null.
+export function normalizeHref(href) {
+  // Remove everything after a `#` or `?` character.
+  const normalized = href.split(/[?#]/)[0];
+  return normalized === "" ? null : normalized;
+}
+
+// For indexing and storage purposes, treat a path that ends in a trailing slash
+// as if it ends in index.html.
+export function normalizeKeys(keys) {
+  const normalized = keys.slice();
+  if (normalized.length === 0 || trailingSlash.has(normalized.at(-1))) {
+    normalized.push("index.html");
+  }
+  return normalized;
+}
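One subtlety in isCrawlableHref: new URL(href) throws a TypeError for a relative href unless a base is supplied, so every candidate is routed through the throwaway https://fake base purely to get a parsed pathname. A quick illustration of the resulting behavior (the sample hrefs are made up):

const fakeBaseUrl = new URL("https://fake");

// Without a base, a relative href is not parseable:
// new URL("docs/intro") -> TypeError: Invalid URL

for (const href of ["docs/intro", "logo.png", "/sitemap.xml", "app.js?v=3"]) {
  const url = new URL(href, fakeBaseUrl); // would throw without the base
  const lastKey = url.pathname.split("/").pop() ?? "";
  console.log(href, "->", lastKey);
}
// "docs/intro" -> "intro", "logo.png" -> "logo.png",
// "/sitemap.xml" -> "sitemap.xml", "app.js?v=3" -> "app.js"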