@weborigami/origami 0.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/ReadMe.md +3 -0
- package/exports/PathTransform.d.ts +5 -0
- package/exports/PathTransform.js +18 -0
- package/exports/buildExports.js +109 -0
- package/exports/exports.js +121 -0
- package/index.ts +25 -0
- package/package.json +40 -0
- package/src/builtins/!.js +21 -0
- package/src/builtins/@apply.js +6 -0
- package/src/builtins/@arrows.js +34 -0
- package/src/builtins/@builtins.js +18 -0
- package/src/builtins/@cache.js +36 -0
- package/src/builtins/@config.js +25 -0
- package/src/builtins/@copy.js +71 -0
- package/src/builtins/@crawl.js +507 -0
- package/src/builtins/@debug.js +89 -0
- package/src/builtins/@document.js +18 -0
- package/src/builtins/@equals.js +6 -0
- package/src/builtins/@explore.js +68 -0
- package/src/builtins/@false.js +1 -0
- package/src/builtins/@files.js +22 -0
- package/src/builtins/@filter.js +23 -0
- package/src/builtins/@globs.js +23 -0
- package/src/builtins/@help.js +49 -0
- package/src/builtins/@http.js +19 -0
- package/src/builtins/@https.js +19 -0
- package/src/builtins/@if.js +27 -0
- package/src/builtins/@image/format.js +5 -0
- package/src/builtins/@image/resize.js +5 -0
- package/src/builtins/@index.js +72 -0
- package/src/builtins/@inherited.js +17 -0
- package/src/builtins/@inline.js +29 -0
- package/src/builtins/@invoke.js +30 -0
- package/src/builtins/@js.js +33 -0
- package/src/builtins/@json.js +22 -0
- package/src/builtins/@loaders/css.js +4 -0
- package/src/builtins/@loaders/htm.js +4 -0
- package/src/builtins/@loaders/html.js +4 -0
- package/src/builtins/@loaders/js.js +14 -0
- package/src/builtins/@loaders/json.js +8 -0
- package/src/builtins/@loaders/md.js +4 -0
- package/src/builtins/@loaders/mjs.js +4 -0
- package/src/builtins/@loaders/ori.js +21 -0
- package/src/builtins/@loaders/orit.js +48 -0
- package/src/builtins/@loaders/txt.js +33 -0
- package/src/builtins/@loaders/xhtml.js +4 -0
- package/src/builtins/@loaders/yaml.js +18 -0
- package/src/builtins/@loaders/yml.js +4 -0
- package/src/builtins/@map.js +182 -0
- package/src/builtins/@match.js +92 -0
- package/src/builtins/@mdHtml.js +45 -0
- package/src/builtins/@new.js +6 -0
- package/src/builtins/@node.js +15 -0
- package/src/builtins/@not.js +6 -0
- package/src/builtins/@or.js +6 -0
- package/src/builtins/@ori.js +83 -0
- package/src/builtins/@pack.js +13 -0
- package/src/builtins/@parse/json.js +7 -0
- package/src/builtins/@parse/yaml.js +9 -0
- package/src/builtins/@project.js +71 -0
- package/src/builtins/@repeat.js +8 -0
- package/src/builtins/@rss.js +49 -0
- package/src/builtins/@scope/extend.js +22 -0
- package/src/builtins/@scope/get.js +25 -0
- package/src/builtins/@scope/invoke.js +22 -0
- package/src/builtins/@scope/set.js +25 -0
- package/src/builtins/@serve.js +74 -0
- package/src/builtins/@shell.js +16 -0
- package/src/builtins/@stdin.js +26 -0
- package/src/builtins/@svg.js +42 -0
- package/src/builtins/@tree/concat.js +21 -0
- package/src/builtins/@tree/count.js +24 -0
- package/src/builtins/@tree/defineds.js +37 -0
- package/src/builtins/@tree/dot.js +201 -0
- package/src/builtins/@tree/exceptions.js +50 -0
- package/src/builtins/@tree/first.js +28 -0
- package/src/builtins/@tree/flowSvg.js +55 -0
- package/src/builtins/@tree/fn.js +34 -0
- package/src/builtins/@tree/from.js +27 -0
- package/src/builtins/@tree/fromJson.js +6 -0
- package/src/builtins/@tree/fromYaml.js +24 -0
- package/src/builtins/@tree/groupBy.js +39 -0
- package/src/builtins/@tree/inners.js +44 -0
- package/src/builtins/@tree/isAsyncTree.js +17 -0
- package/src/builtins/@tree/keys.js +24 -0
- package/src/builtins/@tree/keysJson.js +44 -0
- package/src/builtins/@tree/map.d.ts +19 -0
- package/src/builtins/@tree/merge.js +47 -0
- package/src/builtins/@tree/mergeDeep.js +44 -0
- package/src/builtins/@tree/nextKey.js +29 -0
- package/src/builtins/@tree/parent.js +24 -0
- package/src/builtins/@tree/paths.js +35 -0
- package/src/builtins/@tree/plain.js +22 -0
- package/src/builtins/@tree/previousKey.js +29 -0
- package/src/builtins/@tree/reverse.js +51 -0
- package/src/builtins/@tree/setDeep.js +45 -0
- package/src/builtins/@tree/shuffle.js +31 -0
- package/src/builtins/@tree/sitemap.js +59 -0
- package/src/builtins/@tree/sort.js +25 -0
- package/src/builtins/@tree/sortBy.js +40 -0
- package/src/builtins/@tree/static.js +51 -0
- package/src/builtins/@tree/table.js +74 -0
- package/src/builtins/@tree/take.js +40 -0
- package/src/builtins/@tree/values.js +23 -0
- package/src/builtins/@tree/valuesDeep.js +23 -0
- package/src/builtins/@treeHttp.js +19 -0
- package/src/builtins/@treeHttps.js +19 -0
- package/src/builtins/@true.js +1 -0
- package/src/builtins/@unpack.js +13 -0
- package/src/builtins/@watch.js +108 -0
- package/src/builtins/@with.js +22 -0
- package/src/builtins/@yaml.js +23 -0
- package/src/builtins/~.js +9 -0
- package/src/cli/cli.js +86 -0
- package/src/cli/defaultModuleExport.js +16 -0
- package/src/cli/showUsage.js +86 -0
- package/src/common/CommandModulesTransform.d.ts +5 -0
- package/src/common/CommandModulesTransform.js +37 -0
- package/src/common/ConstantTree.js +17 -0
- package/src/common/ExplorableSiteTransform.d.ts +5 -0
- package/src/common/ExplorableSiteTransform.js +77 -0
- package/src/common/FilterTree.js +60 -0
- package/src/common/GlobTree.js +67 -0
- package/src/common/ShuffleTransform.js +29 -0
- package/src/common/TextDocument.js +57 -0
- package/src/common/addValueKeyToScope.js +30 -0
- package/src/common/arrowFunctionsMap.js +35 -0
- package/src/common/processUnpackedContent.js +39 -0
- package/src/common/serialize.d.ts +8 -0
- package/src/common/serialize.js +138 -0
- package/src/common/utilities.d.ts +7 -0
- package/src/common/utilities.js +132 -0
- package/src/misc/OriCommandTransform.d.ts +5 -0
- package/src/misc/OriCommandTransform.js +54 -0
- package/src/misc/assertScopeIsDefined.js +7 -0
- package/src/misc/explore.orit +241 -0
- package/src/misc/yamlOrigamiTag.js +17 -0
- package/src/server/mediaTypes.js +97 -0
- package/src/server/server.js +258 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ObjectTree,
|
|
3
|
+
Tree,
|
|
4
|
+
isPlainObject,
|
|
5
|
+
keysFromPath,
|
|
6
|
+
mergeDeep,
|
|
7
|
+
} from "@weborigami/async-tree";
|
|
8
|
+
import { InvokeFunctionsTransform, Scope, extname } from "@weborigami/language";
|
|
9
|
+
import * as utilities from "../common/utilities.js";
|
|
10
|
+
import assertScopeIsDefined from "../misc/assertScopeIsDefined.js";
|
|
11
|
+
import treeHttps from "./@treeHttps.js";
|
|
12
|
+
|
|
13
|
+
// A fake base URL used to handle cases where an href is relative and must be
|
|
14
|
+
// treated relative to some base URL.
|
|
15
|
+
const fakeBaseUrl = new URL("https://fake");
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Crawl a tree, starting at its root index.html page, and following links to
|
|
19
|
+
* crawlable pages, scripts, and stylesheets.
|
|
20
|
+
*
|
|
21
|
+
* Returns a new tree of the crawled content. The crawled content will be
|
|
22
|
+
* in-memory. Referenced resources like images will be represented as functions
|
|
23
|
+
* that obtain the requested value from the original site.
|
|
24
|
+
*
|
|
25
|
+
* @typedef {import("@weborigami/types").AsyncTree} AsyncTree
|
|
26
|
+
* @typedef {import("@weborigami/async-tree").Treelike|string} Treelike
|
|
27
|
+
* @this {AsyncTree|null}
|
|
28
|
+
* @param {Treelike} treelike
|
|
29
|
+
* @param {string} [baseHref]
|
|
30
|
+
* @returns {Promise<AsyncTree>}
|
|
31
|
+
*/
|
|
32
|
+
export default async function crawl(treelike, baseHref) {
  assertScopeIsDefined(this);

  // A string argument is handed to the HTTPS tree builtin; anything else is
  // converted to a tree directly.
  let tree;
  if (typeof treelike === "string") {
    tree = treeHttps.call(this, treelike);
  } else {
    tree = Tree.from(treelike);
  }

  if (baseHref === undefined) {
    // No base href was supplied. Prefer an `href` exposed by the tree, then
    // one on the original treelike, and finally fall back to a fake
    // `local:/` href. The inferred href always ends up with a trailing slash.
    const inferredHref =
      /** @type {any} */ (tree).href ??
      /** @type {any} */ (treelike).href ??
      "local:/";
    baseHref = inferredHref?.endsWith("/") ? inferredHref : inferredHref + "/";
  }
  // @ts-ignore
  const baseUrl = new URL(baseHref);

  // `cache` holds values actually retrieved during the crawl; `resources`
  // holds functions that fetch a referenced resource on demand.
  const cache = {};
  const resources = {};

  // The generator finishes once no crawl requests remain outstanding.
  for await (const { keys, resourcePaths, value } of crawlPaths(
    tree,
    baseUrl
  )) {
    if (value) {
      addValueToObject(cache, keys, value);
    }

    // Register a deferred lookup for every resource the page referenced.
    for (const resourcePath of resourcePaths) {
      const resourceKeys = adjustKeys(keysFromPath(resourcePath));
      addValueToObject(resources, resourceKeys, () =>
        traverse(tree, ...resourceKeys)
      );
    }
  }

  // Combine both trees: values already retrieved live in the cache tree,
  // and the resources tree invokes its functions to obtain values on request.
  /** @type {AsyncTree} */
  const merged = mergeDeep(
    new ObjectTree(cache),
    new (InvokeFunctionsTransform(ObjectTree))(resources)
  );
  return Scope.treeWithScope(merged, this);
}
|
|
89
|
+
|
|
90
|
+
// Return a copy of the keys in which an empty final key (the result of a
// path with a trailing slash, or of the empty root path) is replaced by
// "index.html". The input array is left untouched.
function adjustKeys(keys) {
  const lastIndex = keys.length - 1;
  return keys.map((key, index) =>
    index === lastIndex && key === "" ? "index.html" : key
  );
}
|
|
99
|
+
|
|
100
|
+
// Store `value` inside the nested plain object `object` at the location
// described by `keys`, creating intermediate objects as needed. An empty key
// list is a no-op.
function addValueToObject(object, keys, value) {
  if (keys.length === 0) {
    return;
  }

  const parentKeys = keys.slice(0, -1);
  const finalKey = keys[keys.length - 1];

  // Walk (and build) the chain of containers above the final key.
  let container = object;
  for (const key of parentKeys) {
    const existing = container[key];
    if (!existing) {
      container[key] = {};
    } else if (!isPlainObject(existing)) {
      // A page already lives at this route (e.g. /foo) and we now need to
      // store something beneath it (e.g. /foo/bar.jpg). Turn the route into
      // a container and keep the page as its "index.html".
      container[key] = { "index.html": existing };
    }
    container = container[key];
  }

  const target = container[finalKey];
  if (isPlainObject(target)) {
    // The route already has children; the new value becomes its index page.
    target["index.html"] = value;
  } else {
    container[finalKey] = value;
  }
}
|
|
126
|
+
|
|
127
|
+
// Crawl the paths for the given tree, starting at the given base URL, and
// yield the results. Each result is whatever processPath() resolves to for a
// path: the value retrieved at that path along with the crawlable paths and
// resource paths found in it.
async function* crawlPaths(tree, baseUrl) {
  // We want to kick off requests for new paths as quickly as we find them,
  // then yield whichever result finishes first. Promise.any() only tells us
  // the result of the first promise to fulfill, not which promise that was,
  // so each result carries its own `path` and we track the state per path:
  // a pending promise while the request is in flight, `null` once its result
  // has been yielded.
  //
  // A Map is used rather than a plain object so that paths which happen to
  // match inherited Object.prototype members (e.g. "constructor",
  // "toString") are not mistaken for paths that have already been seen.
  const promisesForPaths = new Map();

  const startPath = (path) => {
    promisesForPaths.set(path, processPath(tree, path, baseUrl));
  };

  // Seed the crawl with robots.txt and the root path.
  for (const path of ["/robots.txt", ""]) {
    startPath(path);
  }

  while (true) {
    // Collect the promises whose results haven't been yielded yet.
    const pending = [...promisesForPaths.values()].filter(
      (promise) => promise !== null
    );
    if (pending.length === 0) {
      // Nothing outstanding; the crawl is complete.
      break;
    }

    // Wait for the first promise to fulfill and mark its path as done.
    const result = await Promise.any(pending);
    promisesForPaths.set(result.path, null);

    // Start requests for crawlable paths we haven't encountered before.
    for (const path of result.crawlablePaths) {
      if (!promisesForPaths.has(path)) {
        startPath(path);
      }
    }

    yield result;
  }
}
|
|
174
|
+
|
|
175
|
+
// Keep only the paths that point inside the site rooted at `baseUrl`, and
// return them as paths relative to that root. Each path is first resolved
// against the page it was found on (`localPath`, itself relative to the base).
function filterPaths(paths, baseUrl, localPath) {
  const pageUrl = new URL(localPath, baseUrl);
  const rootPathname = baseUrl.pathname;

  return paths.flatMap((path) => {
    // @ts-ignore
    const target = new URL(path, pageUrl);

    // Only host and pathname are compared. The protocol is deliberately
    // ignored because sites often mix http and https interchangeably.
    const isLocal =
      target.host === baseUrl.host && target.pathname.startsWith(rootPathname);
    if (!isLocal) {
      return [];
    }

    // Drop paths that resolve to the root itself (an empty relative path).
    const relativePath = target.pathname.slice(rootPathname.length);
    return relativePath ? [relativePath] : [];
  });
}
|
|
201
|
+
|
|
202
|
+
// Find the crawlable paths and resource paths referenced by a value, picking
// a parser based on the value's key. Only paths local to the site are
// returned, expressed relative to the base URL.
function findPaths(value, key, baseUrl, localPath) {
  const text = utilities.toString(value);
  const ext = key ? extname(key).toLowerCase() : "";

  // As a fallback, we guess that the value is HTML if its key has no
  // extension or if the text starts with `<`.
  const looksLikeHtml = ext === "" || text?.trim().startsWith("<");

  let parse;
  switch (ext) {
    case ".html":
    case ".htm":
      parse = findPathsInHtml;
      break;
    case ".css":
      parse = findPathsInCss;
      break;
    case ".js":
      parse = findPathsInJs;
      break;
    case ".map":
      parse = findPathsInImageMap;
      break;
    default:
      if (key === "robots.txt") {
        parse = findPathsInRobotsTxt;
      } else if (key === "sitemap.xml") {
        parse = findPathsInSitemapXml;
      } else if (looksLikeHtml) {
        parse = findPathsInHtml;
      }
  }

  if (!parse) {
    // Not a kind of content we know how to scan for paths.
    return {
      crawlablePaths: [],
      resourcePaths: [],
    };
  }

  const foundPaths = parse(text);
  const crawlablePaths = filterPaths(
    foundPaths.crawlablePaths,
    baseUrl,
    localPath
  );
  const resourcePaths = filterPaths(
    foundPaths.resourcePaths,
    baseUrl,
    localPath
  );
  return { crawlablePaths, resourcePaths };
}
|
|
247
|
+
|
|
248
|
+
// Find the resources referenced by `url()` functions in the given CSS text.
// Every referenced href (with any search or hash removed) is reported as a
// resource path; CSS never contributes crawlable paths.
function findPathsInCss(css) {
  const resourcePaths = [];
  let match;

  // Find `url()` functions, with or without quotes around the href.
  const urlRegex = /url\(["']?(?<href>[^"')]*?)["']?\)/g;
  while ((match = urlRegex.exec(css))) {
    const href = normalizeHref(match.groups?.href);
    if (href) {
      // Record the href itself. (A bare `push()` adds nothing, which left
      // resources referenced from stylesheets out of the crawl result.)
      resourcePaths.push(href);
    }
  }

  return {
    crawlablePaths: [],
    resourcePaths,
  };
}
|
|
266
|
+
|
|
267
|
+
// Extract hrefs from an ancient server-side image map file. Documentation for
// the format is hard to come by, but such maps appear on the reference Space
// Jam website used for testing the crawler. Example:
// https://www.spacejam.com/1996/bin/bball.map
//
// The href is taken from the second column of each line; every href found is
// reported as a resource path.
function findPathsInImageMap(imageMap) {
  const hrefRegex = /^\w+ (?<href>\S+)(\s*$| [\d, ]+$)/gm;

  const resourcePaths = Array.from(
    String(imageMap).matchAll(hrefRegex),
    (match) => normalizeHref(match.groups?.href)
  ).filter((href) => href);

  return { crawlablePaths: [], resourcePaths };
}
|
|
289
|
+
|
|
290
|
+
// Find the modules referenced by `import ... from "...";` statements in the
// given JavaScript text. Imported modules are reported as crawlable paths.
function findPathsInJs(js) {
  const importRegex = /import [\s\S]+?from\s+["'](?<import>[^"']*)["'];/g;

  const crawlablePaths = [];
  for (const match of String(js).matchAll(importRegex)) {
    const href = normalizeHref(match.groups?.import);
    if (href) {
      crawlablePaths.push(href);
    }
  }

  return { crawlablePaths, resourcePaths: [] };
}
|
|
308
|
+
|
|
309
|
+
// Scan HTML text for references to other pages and resources. Returns the
// hrefs found, split into crawlable paths (content we will scan in turn) and
// resource paths (content we only link to).
function findPathsInHtml(html) {
  const crawlablePaths = [];
  const resourcePaths = [];

  // Run one regex over the HTML. For each match, normalize the href captured
  // in the named group and file it under crawlable or resource paths
  // according to `isCrawlable(href, match)`.
  const scan = (regex, group, isCrawlable) => {
    let match;
    while ((match = regex.exec(html))) {
      const href = normalizeHref(match.groups?.[group]);
      if (href) {
        const bucket = isCrawlable(href, match) ? crawlablePaths : resourcePaths;
        bucket.push(href);
      }
    }
  };
  const always = () => true;
  const never = () => false;

  // `href` attributes in anchor and link tags. A link can point to either a
  // crawlable path or a resource, so we guess based on the href itself.
  scan(
    /<(?:a|A|link|LINK) [^>]*?(?:href|HREF)=["'](?<link>[^>]*?)["'][^>]*>/g,
    "link",
    (href) => isCrawlableHref(href)
  );

  // `src` attributes in img and script tags: scripts are crawlable, images
  // are resources.
  scan(
    /<(?<tag>img|IMG|script|SCRIPT) [^>]*?(?:src|SRC)=["'](?<src>[^>]*?)["'][^>]*>/g,
    "src",
    (_href, match) => {
      const tag = match.groups?.tag;
      return tag === "script" || tag === "SCRIPT";
    }
  );

  // `url()` functions in CSS embedded in the page.
  scan(/url\(["']?(?<href>[^"')]*?)["']?\)/g, "href", never);

  // `src` attribute on frame tags.
  scan(
    /<(?:frame|FRAME) [^>]*?(?:src|SRC)=["'](?<href>[^>]*?)["'][^>]*>/g,
    "href",
    always
  );

  // Ancient `background` attribute on the body tag.
  scan(
    /<(?:body|BODY) [^>]*?(?:background|BACKGROUND)=["'](?<href>[^>]*?)["'][^>]*>/g,
    "href",
    never
  );

  // `href` attribute on area tags.
  scan(
    /<(?:area|AREA) [^>]*?(?:href|HREF)=["'](?<href>[^>]*?)["'][^>]*>/g,
    "href",
    always
  );

  return { crawlablePaths, resourcePaths };
}
|
|
386
|
+
|
|
387
|
+
// Find the sitemaps named by `Sitemap:` directives in robots.txt text. Each
// sitemap href is reported as a crawlable path.
function findPathsInRobotsTxt(txt) {
  const sitemapRegex = /Sitemap:\s*(?<href>[^\s]*)/g;

  const crawlablePaths = Array.from(
    String(txt).matchAll(sitemapRegex),
    (match) => normalizeHref(match.groups?.href)
  ).filter((href) => href);

  return { crawlablePaths, resourcePaths: [] };
}
|
|
405
|
+
|
|
406
|
+
// Find the locations listed in `<loc>` elements of sitemap XML text. Each
// location is reported as a crawlable path.
function findPathsInSitemapXml(xml) {
  const locRegex = /<loc>(?<href>[^<]*)<\/loc>/g;

  const crawlablePaths = [];
  for (const match of String(xml).matchAll(locRegex)) {
    const href = normalizeHref(match.groups?.href);
    if (href) {
      crawlablePaths.push(href);
    }
  }

  return { crawlablePaths, resourcePaths: [] };
}
|
|
424
|
+
|
|
425
|
+
// Return true if the href looks like it points to content we should crawl
// (scan for further links), as opposed to a resource we merely reference.
// The decision is made from the last segment of the href's pathname.
function isCrawlableHref(href) {
  // Use a fake base URL to cover the case where the href is relative.
  const url = new URL(href, fakeBaseUrl);
  const lastKey = url.pathname.split("/").pop() ?? "";
  if (lastKey === "robots.txt" || lastKey === "sitemap.xml") {
    return true;
  }

  // Compare extensions case-insensitively and accept `.htm` as well as
  // `.html`, matching how findPaths() decides which parser to apply to a
  // retrieved value. We assume an empty extension is HTML.
  const ext = extname(lastKey).toLowerCase();
  const crawlableExtensions = [".html", ".htm", ".css", ".js", ".map", ""];
  return crawlableExtensions.includes(ext);
}
|
|
438
|
+
|
|
439
|
+
// Remove any search parameters or hash from the href, preserving the absolute
// or relative nature of the URL. Returns null if nothing is left (the href
// consisted only of a search or hash) or if no href was supplied at all.
function normalizeHref(href) {
  // Callers pass `match.groups?.name`, which can be undefined; treat a
  // missing href like an empty one instead of throwing.
  if (href == null) {
    return null;
  }

  // Keep only what comes before the first `#` or `?` character.
  const normalized = href.split(/[?#]/)[0];
  return normalized === "" ? null : normalized;
}
|
|
446
|
+
|
|
447
|
+
// Retrieve the value at the given path in the tree and find the paths that
// value refers to. Resolves to an object holding the path, the keys under
// which the value should be stored, the value itself, and the crawlable and
// resource paths found in it. If there is no path or no value, the result has
// a null value and no found paths.
async function processPath(tree, path, baseUrl) {
  const emptyResult = (keys) => ({
    crawlablePaths: [],
    keys,
    path,
    resourcePaths: [],
    value: null,
  });

  if (path === undefined) {
    return emptyResult(null);
  }

  // The empty path designates the root, represented by a single empty key.
  /** @type {any[]} */
  const keys = path === "" ? [""] : keysFromPath(path);

  let value = await traverse(tree, ...keys);
  if (Tree.isTreelike(value)) {
    // The path names a directory; use its index.html, if it has one.
    value = await traverse(value, "index.html");
  }
  if (value === undefined) {
    return emptyResult(keys);
  }

  // Scan the value for paths, using the last adjusted key to guess its type.
  const adjustedKeys = adjustKeys(keys);
  const { crawlablePaths, resourcePaths } = await findPaths(
    value,
    adjustedKeys.at(-1),
    baseUrl,
    path
  );

  return {
    crawlablePaths,
    keys: adjustedKeys,
    path,
    resourcePaths,
    value,
  };
}
|
|
492
|
+
|
|
493
|
+
// Get the value at the given keys. If the tree offers a `resolve()` method
// and more than one key is given, resolve the path formed by all keys but the
// last and ask the resolved tree for the last key; otherwise fall back to a
// regular Tree.traverse().
async function traverse(tree, ...keys) {
  const canResolve = tree.resolve && keys.length > 1;
  if (!canResolve) {
    // Regular async tree
    return Tree.traverse(tree, ...keys);
  }

  // Tree like SiteTree that supports resolve() method
  const parentPath = keys.slice(0, -1).join("/");
  const finalKey = keys[keys.length - 1];
  return tree.resolve(parentPath).get(finalKey);
}
|
|
505
|
+
|
|
506
|
+
crawl.usage = `@crawl <tree>\tCrawl a tree`;
|
|
507
|
+
crawl.documentation = "https://graphorigami.org/language/@crawl.html";
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { Tree, isPlainObject } from "@weborigami/async-tree";
|
|
2
|
+
import ExplorableSiteTransform from "../common/ExplorableSiteTransform.js";
|
|
3
|
+
import { isTransformApplied, transformObject } from "../common/utilities.js";
|
|
4
|
+
import OriCommandTransform from "../misc/OriCommandTransform.js";
|
|
5
|
+
import assertScopeIsDefined from "../misc/assertScopeIsDefined.js";
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Add debugging features to the indicated tree.
|
|
9
|
+
*
|
|
10
|
+
* @typedef {import("@weborigami/types").AsyncTree} AsyncTree
|
|
11
|
+
* @typedef {import("@weborigami/async-tree").Treelike} Treelike
|
|
12
|
+
*
|
|
13
|
+
* @this {AsyncTree|null}
|
|
14
|
+
* @param {Treelike} [treelike]
|
|
15
|
+
*/
|
|
16
|
+
export default async function debug(treelike) {
  assertScopeIsDefined(this);

  // With no argument, fall back to the `@current` value in scope.
  const target = treelike ?? (await this?.get("@current"));
  if (target === undefined) {
    return;
  }

  // Note that the tree keeps whatever scope it already has; this command
  // does not impose its own scope on it.
  const tree = Tree.from(target);

  // Make the tree explorable (unless it already is), then layer the debug
  // transform on top.
  const explorable = isTransformApplied(ExplorableSiteTransform, tree)
    ? tree
    : transformObject(ExplorableSiteTransform, tree);
  return transformObject(DebugTransform, explorable);
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* @typedef {import("../../index.ts").Constructor<AsyncTree>} AsyncTreeConstructor
|
|
38
|
+
* @param {AsyncTreeConstructor} Base
|
|
39
|
+
*/
|
|
40
|
+
function DebugTransform(Base) {
  // Apply the explorable-site and debug transforms to a tree, skipping
  // whichever of the two has already been applied.
  const withDebugFeatures = (tree) => {
    let result = tree;
    if (!isTransformApplied(ExplorableSiteTransform, result)) {
      result = transformObject(ExplorableSiteTransform, result);
    }
    if (!isTransformApplied(DebugTransform, result)) {
      result = transformObject(DebugTransform, result);
    }
    return result;
  };

  return class Debug extends OriCommandTransform(Base) {
    async get(key) {
      let value = await super.get(key);

      // This transform exists for diagnostics, so arrays and plain objects
      // are cast to trees to make them debuggable as well.
      if (value instanceof Array || isPlainObject(value)) {
        value = Tree.from(value);
      }

      // Tree results get the same debug features as this tree.
      if (Tree.isAsyncTree(value)) {
        value = withDebugFeatures(value);
      }

      if (value?.unpack) {
        // The value has an `unpack` method that may produce a tree. Wrap
        // that method so treelike unpacked content gets debug features too.
        const original = value.unpack.bind(value);
        value.unpack = async () => {
          const content = await original();
          if (!Tree.isTreelike(content)) {
            return content;
          }
          /** @type {any} */
          const tree = Tree.from(content);
          return withDebugFeatures(tree);
        };
      }

      return value;
    }
  };
}
|
|
87
|
+
|
|
88
|
+
debug.usage = `@debug <tree>\tAdd debug features to a tree`;
|
|
89
|
+
debug.documentation = "https://graphorigami.org/language/@debug.html";
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import TextDocument from "../common/TextDocument.js";
|
|
2
|
+
import assertScopeIsDefined from "../misc/assertScopeIsDefined.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @typedef {import("@weborigami/types").AsyncTree} AsyncTree
|
|
6
|
+
* @typedef {import("@weborigami/async-tree").StringLike} StringLike
|
|
7
|
+
*
|
|
8
|
+
* @this {AsyncTree|null}
|
|
9
|
+
* @param {StringLike} text
|
|
10
|
+
* @param {any} [data]
|
|
11
|
+
* @param {AsyncTree|null} [parent]
|
|
12
|
+
* @returns
|
|
13
|
+
*/
|
|
14
|
+
export default function document(text, data, parent) {
  assertScopeIsDefined(this);

  // The document's parent defaults to the scope this builtin was called in.
  const documentParent = parent ?? this;

  // Copy the data and add the text under "@text", overriding any "@text"
  // the data may already contain.
  const contents = Object.assign({}, data, { "@text": text });

  return new TextDocument(contents, documentParent);
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/** @typedef {import("@weborigami/types").AsyncTree} AsyncTree */
|
|
2
|
+
import { ObjectTree } from "@weborigami/async-tree";
|
|
3
|
+
import { OrigamiFiles, Scope } from "@weborigami/language";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
import builtins from "../builtins/@builtins.js";
|
|
7
|
+
import { keySymbol } from "../common/utilities.js";
|
|
8
|
+
import debug from "./@debug.js";
|
|
9
|
+
|
|
10
|
+
const dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
11
|
+
const miscDir = path.resolve(dirname, "../misc");
|
|
12
|
+
const miscFiles = Scope.treeWithScope(new OrigamiFiles(miscDir), builtins);
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* @this {AsyncTree|null}
|
|
16
|
+
*/
|
|
17
|
+
export default async function explore() {
  const scope = Scope.getScope(this);

  // Render the explorer page from the bundled template, feeding it a
  // description of the trees in scope.
  const templateFile = await miscFiles.get("explore.orit");
  const template = await templateFile.unpack();
  const text = await template(await getScopeData(scope));

  // Extend the scope with an ambient `@current` entry referring to the tree
  // this builtin was invoked on.
  const ambientsTree = new ObjectTree({ "@current": this });
  ambientsTree[keySymbol] = "explore command";
  const extendedScope = new Scope(ambientsTree, scope);

  // The result is the rendered text, wrapped in a String object so an
  // `unpack` method can be attached that yields a debuggable view of the
  // extended scope.
  /** @type {any} */
  const result = new String(text);
  result.unpack = () => debug.call(scope, extendedScope);
  return result;
}
|
|
37
|
+
|
|
38
|
+
// Return true if the given tree is the builtins tree or has the builtins
// tree somewhere in its prototype chain.
function isBuiltins(tree) {
  for (
    let candidate = tree;
    candidate;
    candidate = Object.getPrototypeOf(candidate)
  ) {
    if (candidate === builtins) {
      return true;
    }
  }
  return false;
}
|
|
49
|
+
|
|
50
|
+
// Describe the trees that make up a scope: for each tree other than the
// builtins, its name (stored under keySymbol) and its visible keys. A scope
// without a `trees` property is treated as a single tree.
async function getScopeData(scope) {
  // System-ish files that start with a period are left out.
  const isVisibleKey = (key) => !key.startsWith?.(".");

  const data = [];
  for (const tree of scope.trees ?? [scope]) {
    if (!isBuiltins(tree)) {
      const name = tree[keySymbol];
      const keys = Array.from(await tree.keys()).filter(isVisibleKey);
      data.push({ name, keys });
    }
  }
  return data;
}
|
|
66
|
+
|
|
67
|
+
explore.usage = "@explore\tExplore the current scope in the browser";
|
|
68
|
+
explore.documentation = "https://graphorigami.org/language/@explore.html";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export default false;
|