@mui/internal-code-infra 0.0.3-canary.6 → 0.0.3-canary.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -0
- package/build/babel-config.d.mts +40 -0
- package/build/brokenLinksChecker/index.d.mts +138 -0
- package/build/cli/cmdArgosPush.d.mts +13 -0
- package/build/cli/cmdBuild.d.mts +56 -0
- package/build/cli/cmdCopyFiles.d.mts +20 -0
- package/build/cli/cmdExtractErrorCodes.d.mts +3 -0
- package/build/cli/cmdGithubAuth.d.mts +6 -0
- package/build/cli/cmdListWorkspaces.d.mts +18 -0
- package/build/cli/cmdPublish.d.mts +27 -0
- package/build/cli/cmdPublishCanary.d.mts +30 -0
- package/build/cli/cmdPublishNewPackage.d.mts +8 -0
- package/build/cli/cmdSetVersionOverrides.d.mts +9 -0
- package/build/cli/cmdValidateBuiltTypes.d.mts +2 -0
- package/build/cli/index.d.mts +1 -0
- package/build/eslint/baseConfig.d.mts +10 -0
- package/build/eslint/docsConfig.d.mts +4 -0
- package/build/eslint/extensions.d.mts +8 -0
- package/build/eslint/index.d.mts +4 -0
- package/build/eslint/jsonConfig.d.mts +4 -0
- package/build/eslint/material-ui/config.d.mts +8 -0
- package/build/eslint/material-ui/index.d.mts +2 -0
- package/build/eslint/material-ui/rules/disallow-active-element-as-key-event-target.d.mts +5 -0
- package/build/eslint/material-ui/rules/disallow-react-api-in-server-components.d.mts +2 -0
- package/build/eslint/material-ui/rules/docgen-ignore-before-comment.d.mts +2 -0
- package/build/eslint/material-ui/rules/mui-name-matches-component-name.d.mts +5 -0
- package/build/eslint/material-ui/rules/no-empty-box.d.mts +5 -0
- package/build/eslint/material-ui/rules/no-restricted-resolved-imports.d.mts +12 -0
- package/build/eslint/material-ui/rules/no-styled-box.d.mts +5 -0
- package/build/eslint/material-ui/rules/rules-of-use-theme-variants.d.mts +9 -0
- package/build/eslint/material-ui/rules/straight-quotes.d.mts +5 -0
- package/build/eslint/testConfig.d.mts +14 -0
- package/build/markdownlint/duplicate-h1.d.mts +27 -0
- package/build/markdownlint/git-diff.d.mts +8 -0
- package/build/markdownlint/index.d.mts +56 -0
- package/build/markdownlint/straight-quotes.d.mts +8 -0
- package/build/markdownlint/table-alignment.d.mts +8 -0
- package/build/markdownlint/terminal-language.d.mts +8 -0
- package/build/prettier.d.mts +20 -0
- package/build/stylelint/index.d.mts +32 -0
- package/build/utils/babel.d.mts +71 -0
- package/build/utils/build.d.mts +50 -0
- package/build/utils/changelog.d.mts +64 -0
- package/build/utils/credentials.d.mts +17 -0
- package/build/utils/extractErrorCodes.d.mts +19 -0
- package/build/utils/git.d.mts +26 -0
- package/build/utils/github.d.mts +41 -0
- package/build/utils/pnpm.d.mts +238 -0
- package/build/utils/typescript.d.mts +35 -0
- package/package.json +92 -42
- package/src/babel-config.mjs +52 -8
- package/src/brokenLinksChecker/__fixtures__/static-site/broken-links.html +20 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/broken-targets.html +22 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/example.md +9 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/external-links.html +21 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/ignored-page.html +17 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/index.html +26 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/known-targets.json +5 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/nested/page.html +19 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/orphaned-page.html +20 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/page-with-api-links.html +20 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/page-with-custom-targets.html +24 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/page-with-ignored-content.html +28 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/page-with-known-target-links.html +19 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/valid.html +20 -0
- package/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html +31 -0
- package/src/brokenLinksChecker/index.mjs +641 -0
- package/src/brokenLinksChecker/index.test.ts +178 -0
- package/src/cli/cmdArgosPush.mjs +13 -2
- package/src/cli/cmdBuild.mjs +228 -31
- package/src/cli/cmdGithubAuth.mjs +36 -0
- package/src/cli/cmdListWorkspaces.mjs +2 -2
- package/src/cli/cmdPublish.mjs +203 -49
- package/src/cli/cmdPublishCanary.mjs +404 -46
- package/src/cli/cmdPublishNewPackage.mjs +86 -0
- package/src/cli/cmdSetVersionOverrides.mjs +17 -1
- package/src/cli/cmdValidateBuiltTypes.mjs +49 -0
- package/src/cli/index.mjs +6 -2
- package/src/cli/packageJson.d.ts +729 -0
- package/src/eslint/baseConfig.mjs +96 -78
- package/src/eslint/docsConfig.mjs +13 -13
- package/src/eslint/extensions.mjs +8 -8
- package/src/eslint/jsonConfig.mjs +40 -0
- package/src/eslint/material-ui/config.mjs +8 -9
- package/src/eslint/material-ui/rules/mui-name-matches-component-name.mjs +4 -2
- package/src/eslint/material-ui/rules/rules-of-use-theme-variants.mjs +2 -1
- package/src/eslint/testConfig.mjs +72 -66
- package/src/stylelint/index.mjs +46 -0
- package/src/untyped-plugins.d.ts +13 -0
- package/src/{cli → utils}/babel.mjs +10 -3
- package/src/utils/build.mjs +27 -1
- package/src/utils/changelog.mjs +157 -0
- package/src/utils/credentials.mjs +71 -0
- package/src/utils/extractErrorCodes.mjs +2 -2
- package/src/utils/git.mjs +67 -0
- package/src/utils/github.mjs +263 -0
- package/src/{cli → utils}/pnpm.mjs +23 -13
- package/src/{cli → utils}/typescript.mjs +13 -7
- package/src/cli/cmdJsonLint.mjs +0 -69
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8" />
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
6
|
+
<title>Page with Anchors</title>
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<h1>Page with Anchors</h1>
|
|
10
|
+
<nav>
|
|
11
|
+
<ul>
|
|
12
|
+
<li><a href="/">Home</a></li>
|
|
13
|
+
<li><a href="#section1">Jump to section 1</a></li>
|
|
14
|
+
<li><a href="#section2">Jump to section 2</a></li>
|
|
15
|
+
<li><a href="#section3">Jump to section 3</a></li>
|
|
16
|
+
</ul>
|
|
17
|
+
</nav>
|
|
18
|
+
<section id="section1">
|
|
19
|
+
<h2>Section 1</h2>
|
|
20
|
+
<p>Content for section 1</p>
|
|
21
|
+
</section>
|
|
22
|
+
<section id="section2">
|
|
23
|
+
<h2>Section 2</h2>
|
|
24
|
+
<p>Content for section 2</p>
|
|
25
|
+
</section>
|
|
26
|
+
<section id="section3">
|
|
27
|
+
<h2>Section 3</h2>
|
|
28
|
+
<p>Content for section 3</p>
|
|
29
|
+
</section>
|
|
30
|
+
</body>
|
|
31
|
+
</html>
|
|
@@ -0,0 +1,641 @@
|
|
|
1
|
+
/* eslint-disable no-console */
|
|
2
|
+
import { execaCommand } from 'execa';
|
|
3
|
+
import timers from 'node:timers/promises';
|
|
4
|
+
import { parse } from 'node-html-parser';
|
|
5
|
+
import * as fs from 'node:fs/promises';
|
|
6
|
+
import * as path from 'node:path';
|
|
7
|
+
import chalk from 'chalk';
|
|
8
|
+
import { Transform } from 'node:stream';
|
|
9
|
+
import contentType from 'content-type';
|
|
10
|
+
|
|
11
|
+
// Default number of pages fetched in parallel when CrawlOptions.concurrency is not provided.
const DEFAULT_CONCURRENCY = 4;
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Creates a Transform stream that prefixes each line with a given string.
|
|
15
|
+
* Useful for distinguishing server logs from other output.
|
|
16
|
+
* @param {string} prefix - String to prepend to each line
|
|
17
|
+
* @returns {Transform} Transform stream that adds the prefix to each line
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Creates a Transform stream that prefixes each line with a given string.
 * Useful for distinguishing server logs from other output.
 * @param {string} prefix - String to prepend to each line
 * @returns {Transform} Transform stream that adds the prefix to each line
 */
const prefixLines = (prefix) => {
  // Trailing partial line carried over between chunks.
  let buffered = '';
  return new Transform({
    transform(chunk, enc, cb) {
      const pieces = (buffered + chunk.toString()).split(/\r?\n/);
      // The last piece may be an incomplete line; hold it back until the
      // next chunk (or flush) completes it.
      buffered = /** @type {string} */ (pieces.pop());
      let out = '';
      for (const piece of pieces) {
        out += `${prefix + piece}\n`;
      }
      this.push(out);
      cb();
    },
    flush(cb) {
      // Emit any final line that did not end with a newline.
      if (buffered) {
        this.push(`${prefix + buffered}\n`);
      }
      cb();
    },
  });
};
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Maps page URLs to sets of known target IDs (anchors) on that page.
|
|
39
|
+
* Used to track which link targets (e.g., #section-id) exist on each page.
|
|
40
|
+
* @typedef {Map<string, Set<string>>} LinkStructure
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Serialized representation of LinkStructure for JSON storage.
|
|
45
|
+
* Converts Maps and Sets to plain objects and arrays for file persistence.
|
|
46
|
+
* @typedef {Object} SerializedLinkStructure
|
|
47
|
+
* @property {Record<string, string[]>} targets - Object mapping page URLs to arrays of target IDs
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Fetches a URL and throws an error if the response is not OK.
|
|
52
|
+
* @param {string | URL} url - URL to fetch
|
|
53
|
+
* @returns {Promise<Response>} Fetch response if successful
|
|
54
|
+
* @throws {Error} If the response status is not OK (not in 200-299 range)
|
|
55
|
+
*/
|
|
56
|
+
/**
 * Fetches a URL and throws an error if the response is not OK.
 * @param {string | URL} url - URL to fetch
 * @returns {Promise<Response>} Fetch response if successful
 * @throws {Error} If the response status is not OK (not in 200-299 range)
 */
async function fetchUrl(url) {
  const response = await fetch(url);
  if (response.ok) {
    return response;
  }
  throw new Error(`Failed to fetch ${url}: [${response.status}] ${response.statusText}`);
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Polls a URL until it responds successfully or times out.
|
|
66
|
+
* Used to wait for a dev server to start.
|
|
67
|
+
* @param {string} url - URL to poll
|
|
68
|
+
* @param {number} timeout - Maximum milliseconds to wait before timing out
|
|
69
|
+
* @returns {Promise<void>} Resolves when URL responds successfully
|
|
70
|
+
* @throws {Error} If timeout is reached before URL responds
|
|
71
|
+
*/
|
|
72
|
+
/**
 * Polls a URL until it responds successfully or times out.
 * Used to wait for a dev server to start.
 * @param {string} url - URL to poll
 * @param {number} timeout - Maximum milliseconds to wait before timing out
 * @returns {Promise<void>} Resolves when URL responds successfully
 * @throws {Error} If timeout is reached before URL responds
 */
async function pollUrl(url, timeout) {
  // Absolute point in time after which we stop retrying.
  const deadline = Date.now() + timeout;
  for (;;) {
    try {
      // eslint-disable-next-line no-await-in-loop
      await fetchUrl(url);
      return;
    } catch (/** @type {any} */ error) {
      if (Date.now() > deadline) {
        throw new Error(`Timeout waiting for ${url}: ${error.message}`, { cause: error });
      }
      // Back off for a second before the next attempt.
      // eslint-disable-next-line no-await-in-loop
      await timers.setTimeout(1000);
    }
  }
}
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Converts serialized link structure (from JSON) back to Map/Set form.
|
|
91
|
+
* @param {SerializedLinkStructure} data - Serialized structure with plain objects/arrays
|
|
92
|
+
* @returns {LinkStructure} Deserialized structure using Map and Set
|
|
93
|
+
*/
|
|
94
|
+
/**
 * Converts serialized link structure (from JSON) back to Map/Set form.
 * @param {SerializedLinkStructure} data - Serialized structure with plain objects/arrays
 * @returns {LinkStructure} Deserialized structure using Map and Set
 */
function deserializeLinkStructure(data) {
  const structure = new Map();
  for (const [url, ids] of Object.entries(data.targets)) {
    structure.set(url, new Set(ids));
  }
  return structure;
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Data about a crawled page including its URL, HTTP status, and available link targets.
|
|
104
|
+
* @typedef {Object} PageData
|
|
105
|
+
* @property {string} url - The normalized page URL (without trailing slash unless root)
|
|
106
|
+
* @property {number} status - HTTP status code from the response (e.g., 200, 404, 500)
|
|
107
|
+
* @property {Set<string>} targets - Set of available anchor targets on the page, keyed by hash (e.g., '#intro')
|
|
108
|
+
*/
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Serializes and writes discovered page targets to a JSON file.
|
|
112
|
+
* @param {Map<string, PageData>} pages - Map of crawled pages with their targets
|
|
113
|
+
* @param {string} outPath - File path to write the JSON output
|
|
114
|
+
* @returns {Promise<void>}
|
|
115
|
+
*/
|
|
116
|
+
/**
 * Serializes and writes discovered page targets to a JSON file.
 * Creates intermediate directories as needed.
 * @param {Map<string, PageData>} pages - Map of crawled pages with their targets
 * @param {string} outPath - File path to write the JSON output
 * @returns {Promise<void>}
 */
async function writePagesToFile(pages, outPath) {
  /** @type {SerializedLinkStructure} */
  const payload = {
    targets: Object.fromEntries(
      Array.from(pages.entries(), ([url, pageData]) => [url, Array.from(pageData.targets)]),
    ),
  };
  await fs.mkdir(path.dirname(outPath), { recursive: true });
  await fs.writeFile(outPath, JSON.stringify(payload, null, 2), 'utf-8');
}
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* Computes the accessible name of an element according to ARIA rules.
|
|
129
|
+
* Polyfill for `node.computedName` available only in Chrome v112+.
|
|
130
|
+
* Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText.
|
|
131
|
+
* @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for
|
|
132
|
+
* @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element
|
|
133
|
+
* @returns {string} The computed accessible name, or empty string if none found
|
|
134
|
+
*/
|
|
135
|
+
/**
 * Computes the accessible name of an element according to ARIA rules.
 * Polyfill for `node.computedName` available only in Chrome v112+.
 * Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText.
 * @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for
 * @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element
 * @returns {string} The computed accessible name, or empty string if none found
 */
function getAccessibleName(elm, ownerDocument) {
  if (!elm) {
    return '';
  }

  // 1. aria-label wins outright when present and non-empty.
  const ariaLabel = elm.getAttribute('aria-label')?.trim();
  if (ariaLabel) {
    return ariaLabel;
  }

  // 2. aria-labelledby: concatenate the names of the referenced elements.
  const labelledby = elm.getAttribute('aria-labelledby');
  if (labelledby) {
    const combined = labelledby
      .split(/\s+/)
      .map((id) => getAccessibleName(ownerDocument.getElementById(id), ownerDocument))
      .filter((part) => part !== '')
      .join(' ')
      .trim();
    if (combined) {
      return combined;
    }
  }

  // 3. An associated <label for="..."> element names form controls.
  if (elm.id) {
    const label = ownerDocument.querySelector(`label[for="${elm.id}"]`);
    if (label) {
      return getAccessibleName(label, ownerDocument);
    }
  }

  // 4. Image alternative text.
  if (elm.tagName === 'IMG') {
    const alt = elm.getAttribute('alt')?.trim();
    if (alt) {
      return alt;
    }
  }

  // 5. Fall back to the element's visible text.
  return elm.innerText.trim();
}
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Generic concurrent task queue with configurable concurrency limit.
|
|
184
|
+
* Processes tasks in FIFO order with a maximum number of concurrent workers.
|
|
185
|
+
* @template T
|
|
186
|
+
*/
|
|
187
|
+
/**
 * Generic concurrent task queue with configurable concurrency limit.
 * Processes tasks in FIFO order with a maximum number of concurrent workers.
 * @template T
 */
class Queue {
  /** Array of pending tasks waiting to be processed */
  /** @type {T[]} */
  tasks = [];

  /** Set of currently running task promises */
  /** @type {Set<Promise<void>>} */
  pending = new Set();

  /**
   * Creates a new queue with a worker function and concurrency limit.
   * @param {(task: T) => Promise<void>} worker - Async function to process each task
   * @param {number} concurrency - Maximum number of tasks to run simultaneously
   */
  constructor(worker, concurrency) {
    this.worker = worker;
    this.concurrency = concurrency;
  }

  /**
   * Adds a task to the queue and starts processing if under concurrency limit.
   * @param {T} task - Task to add to the queue
   */
  add(task) {
    this.tasks.push(task);
    this.run();
  }

  /**
   * Starts as many queued tasks as the concurrency limit allows.
   * Each completed task re-invokes run() so the queue drains itself.
   */
  async run() {
    while (this.pending.size < this.concurrency && this.tasks.length > 0) {
      const task = /** @type {T} */ (this.tasks.shift());
      // Wrap the worker call so that a synchronously-throwing worker becomes
      // a rejected promise instead of an exception escaping out of add()/run(),
      // which would skip the pending-set bookkeeping below and stall the queue.
      const p = Promise.resolve()
        .then(() => this.worker(task))
        .finally(() => {
          this.pending.delete(p);
          this.run();
        });
      this.pending.add(p);
    }
  }

  /**
   * Waits for all pending and queued tasks to complete.
   * Rejects if any worker invocation rejects.
   * @returns {Promise<void>}
   */
  async waitAll() {
    // Loop because completed tasks may have enqueued and started new ones.
    while (this.pending.size > 0) {
      // eslint-disable-next-line no-await-in-loop
      await Promise.all(this.pending);
    }
  }
}
|
|
237
|
+
|
|
238
|
+
/**
|
|
239
|
+
* Represents a hyperlink found during crawling.
|
|
240
|
+
* @typedef {Object} Link
|
|
241
|
+
* @property {string | null} src - URL of the page where this link was found, or null for seed URLs
|
|
242
|
+
* @property {string | null} text - Accessible name/text content of the link element, or null for seed URLs
|
|
243
|
+
* @property {string} href - The href attribute value (may be relative or absolute, with or without hash)
|
|
244
|
+
*/
|
|
245
|
+
|
|
246
|
+
/**
|
|
247
|
+
* Extracts and normalizes the page URL from a link href.
|
|
248
|
+
* Returns null for external links, ignored paths, or non-standard URLs.
|
|
249
|
+
* Normalizes by removing trailing slashes (except root) and preserving query params.
|
|
250
|
+
* @param {string} href - Link href to process (e.g., '/docs/api#section?query=1')
|
|
251
|
+
* @param {RegExp[]} ignoredPaths - Array of patterns to exclude
|
|
252
|
+
* @returns {string | null} Normalized page URL with query but without hash, or null if external/ignored
|
|
253
|
+
*/
|
|
254
|
+
/**
 * Extracts and normalizes the page URL from a link href.
 * Returns null for external links, ignored paths, or non-standard URLs.
 * Normalizes by removing trailing slashes (except root) and preserving query params.
 * @param {string} href - Link href to process (e.g., '/docs/api#section?query=1')
 * @param {RegExp[]} ignoredPaths - Array of patterns to exclude
 * @returns {string | null} Normalized page URL with query but without hash, or null if external/ignored
 */
function getPageUrl(href, ignoredPaths = []) {
  // Only same-site absolute paths are crawlable; anything else is external.
  if (!href.startsWith('/')) {
    return null;
  }
  const { pathname, search } = new URL(href, 'http://localhost');
  if (ignoredPaths.some((pattern) => pattern.test(pathname))) {
    return null;
  }
  // Strip a trailing slash so '/docs/' and '/docs' compare equal ('/' stays as-is).
  const normalized = pathname !== '/' && pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;
  // Keep the query string, drop the hash.
  return normalized + search;
}
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Configuration options for the broken links crawler.
|
|
273
|
+
* @typedef {Object} CrawlOptions
|
|
274
|
+
* @property {string | null} [startCommand] - Shell command to start the dev server (e.g., 'npm run dev'). If null, assumes server is already running
|
|
275
|
+
* @property {string} host - Base URL of the site to crawl (e.g., 'http://localhost:3000')
|
|
276
|
+
* @property {string | null} [outPath] - File path to write discovered link targets to. If null, targets are not persisted
|
|
277
|
+
* @property {RegExp[]} [ignoredPaths] - Array of regex patterns to exclude from crawling (e.g., [/^\/api\//] to skip /api/* routes)
|
|
278
|
+
* @property {string[]} [ignoredContent] - CSS selectors for elements whose nested links should be ignored (e.g., ['.sidebar', 'footer'])
|
|
279
|
+
* @property {Set<string>} [ignoredTargets] - Set of element IDs to ignore as link targets (defaults to '__next', '__NEXT_DATA__')
|
|
280
|
+
* @property {Map<string, Set<string>>} [knownTargets] - Pre-populated map of known valid targets to skip crawling (useful for external pages)
|
|
281
|
+
* @property {string[]} [knownTargetsDownloadUrl] - URLs to fetch known targets from (fetched JSON will be merged with knownTargets)
|
|
282
|
+
* @property {number} [concurrency] - Number of concurrent page fetches (defaults to 4)
|
|
283
|
+
* @property {string[]} [seedUrls] - Starting URLs for the crawl (defaults to ['/'])
|
|
284
|
+
*/
|
|
285
|
+
|
|
286
|
+
/**
|
|
287
|
+
* Fully resolved configuration with all optional fields filled with defaults.
|
|
288
|
+
* @typedef {Required<CrawlOptions>} ResolvedCrawlOptions
|
|
289
|
+
*/
|
|
290
|
+
|
|
291
|
+
/**
|
|
292
|
+
* Resolves partial crawl options by filling in defaults for all optional fields.
|
|
293
|
+
* @param {CrawlOptions} rawOptions - Partial options from user
|
|
294
|
+
* @returns {ResolvedCrawlOptions} Fully resolved options with all defaults applied
|
|
295
|
+
*/
|
|
296
|
+
/**
 * Resolves partial crawl options by filling in defaults for all optional fields.
 * Only `host` is required; every other field falls back to its default when the
 * user-supplied value is null or undefined (mirroring `??` semantics).
 * @param {CrawlOptions} rawOptions - Partial options from user
 * @returns {ResolvedCrawlOptions} Fully resolved options with all defaults applied
 */
function resolveOptions(rawOptions) {
  /** @type {ResolvedCrawlOptions} */
  const resolved = {
    startCommand: null,
    host: rawOptions.host,
    outPath: null,
    ignoredPaths: [],
    ignoredContent: [],
    ignoredTargets: new Set(['__next', '__NEXT_DATA__']),
    knownTargets: new Map(),
    knownTargetsDownloadUrl: [],
    concurrency: DEFAULT_CONCURRENCY,
    seedUrls: ['/'],
  };
  for (const key of Object.keys(resolved)) {
    // `!= null` deliberately matches both null and undefined, like `??`.
    if (rawOptions[key] != null) {
      resolved[key] = rawOptions[key];
    }
  }
  return resolved;
}
|
|
310
|
+
|
|
311
|
+
/**
|
|
312
|
+
* Merges multiple Maps, similar to Object.assign for objects.
|
|
313
|
+
* Later sources override earlier ones for duplicate keys.
|
|
314
|
+
* @template K, V
|
|
315
|
+
* @param {Map<K, V>} target - Target map to merge into (will be mutated)
|
|
316
|
+
* @param {...Map<K, V>} sources - Source maps to merge from
|
|
317
|
+
* @returns {Map<K, V>} The mutated target map
|
|
318
|
+
*/
|
|
319
|
+
/**
 * Merges multiple Maps, similar to Object.assign for objects.
 * Later sources override earlier ones for duplicate keys.
 * @template K, V
 * @param {Map<K, V>} target - Target map to merge into (will be mutated)
 * @param {...Map<K, V>} sources - Source maps to merge from
 * @returns {Map<K, V>} The mutated target map
 */
function mergeMaps(target, ...sources) {
  sources.forEach((source) => {
    source.forEach((value, key) => {
      target.set(key, value);
    });
  });
  return target;
}
|
|
327
|
+
|
|
328
|
+
/**
|
|
329
|
+
* Downloads and deserializes known link targets from remote URLs.
|
|
330
|
+
* Fetches JSON files containing serialized link structures in parallel.
|
|
331
|
+
* @param {string[]} urls - Array of URLs to fetch known targets from
|
|
332
|
+
* @returns {Promise<LinkStructure[]>} Array of deserialized link structures
|
|
333
|
+
*/
|
|
334
|
+
/**
 * Downloads and deserializes known link targets from remote URLs.
 * Fetches JSON files containing serialized link structures in parallel.
 * @param {string[]} urls - Array of URLs to fetch known targets from
 * @returns {Promise<LinkStructure[]>} Array of deserialized link structures
 */
async function downloadKnownTargets(urls) {
  // Nothing to download; avoid logging a misleading progress message.
  if (urls.length === 0) {
    return [];
  }

  console.log(chalk.blue(`Downloading known targets from ${urls.length} URL(s)...`));

  return Promise.all(
    urls.map(async (url) => {
      console.log(`  Fetching ${chalk.underline(url)}`);
      const res = await fetchUrl(url);
      return deserializeLinkStructure(await res.json());
    }),
  );
}
|
|
352
|
+
|
|
353
|
+
/**
|
|
354
|
+
* Resolves all known targets by downloading remote ones and merging with user-provided.
|
|
355
|
+
* User-provided targets take priority over downloaded ones.
|
|
356
|
+
* @param {ResolvedCrawlOptions} options - Resolved crawl options
|
|
357
|
+
* @returns {Promise<LinkStructure>} Merged map of all known targets
|
|
358
|
+
*/
|
|
359
|
+
/**
 * Resolves all known targets by downloading remote ones and merging with user-provided.
 * User-provided targets take priority over downloaded ones.
 * @param {ResolvedCrawlOptions} options - Resolved crawl options
 * @returns {Promise<LinkStructure>} Merged map of all known targets
 */
async function resolveKnownTargets(options) {
  const remote = await downloadKnownTargets(options.knownTargetsDownloadUrl);
  const merged = new Map();
  // Remote structures go in first, then the user-provided map, so that
  // user-provided entries win for duplicate pages.
  mergeMaps(merged, ...remote, options.knownTargets);
  return merged;
}
|
|
364
|
+
|
|
365
|
+
/**
|
|
366
|
+
* Represents a broken link or broken link target discovered during crawling.
|
|
367
|
+
* @typedef {Object} Issue
|
|
368
|
+
* @property {'broken-link' | 'broken-target'} type - Type of issue: 'broken-link' for 404 pages, 'broken-target' for missing anchors
|
|
369
|
+
* @property {string} message - Human-readable description of the issue (e.g., 'Target not found', 'Page returned error 404')
|
|
370
|
+
* @property {Link} link - The link object that has the issue
|
|
371
|
+
*/
|
|
372
|
+
|
|
373
|
+
/**
|
|
374
|
+
* Results from a complete crawl operation.
|
|
375
|
+
* @typedef {Object} CrawlResult
|
|
376
|
+
* @property {Set<Link>} links - All links discovered during the crawl
|
|
377
|
+
* @property {Map<string, PageData>} pages - All pages crawled, keyed by normalized URL
|
|
378
|
+
* @property {Issue[]} issues - All broken links and broken targets found
|
|
379
|
+
*/
|
|
380
|
+
|
|
381
|
+
/**
|
|
382
|
+
* Reports broken links to stderr, grouped by source page for better readability.
|
|
383
|
+
* @param {Issue[]} issuesList - Array of issues to report
|
|
384
|
+
*/
|
|
385
|
+
/**
 * Reports broken links to stderr, grouped by source page for better readability.
 * No-op when there are no issues.
 * @param {Issue[]} issuesList - Array of issues to report
 */
function reportIssues(issuesList) {
  if (issuesList.length === 0) {
    return;
  }

  console.error('\nBroken links found:\n');

  // Group issues by the page they were found on.
  /** @type {Map<string, Issue[]>} */
  const issuesBySource = new Map();
  for (const issue of issuesList) {
    const sourceUrl = issue.link.src ?? '(unknown)';
    let bucket = issuesBySource.get(sourceUrl);
    if (!bucket) {
      bucket = [];
      issuesBySource.set(sourceUrl, bucket);
    }
    bucket.push(issue);
  }

  // Print one header per source page, then each broken link under it.
  for (const [sourceUrl, bucket] of issuesBySource.entries()) {
    console.error(`Source ${chalk.cyan(sourceUrl)}:`);
    for (const issue of bucket) {
      const reason = issue.type === 'broken-target' ? 'target not found' : 'returned status 404';
      console.error(`  [${issue.link.text}](${chalk.cyan(issue.link.href)}) (${reason})`);
    }
  }
}
|
|
413
|
+
|
|
414
|
+
/**
|
|
415
|
+
* Crawls a website starting from seed URLs, discovering all internal links and checking for broken links/targets.
|
|
416
|
+
* @param {CrawlOptions} rawOptions - Configuration options for the crawl
|
|
417
|
+
* @returns {Promise<CrawlResult>} Crawl results including all links, pages, and issues found
|
|
418
|
+
*/
|
|
419
|
+
export async function crawl(rawOptions) {
  const options = resolveOptions(rawOptions);
  const startTime = Date.now();

  // Phase 1: optionally boot the dev server and wait until it responds.
  /** @type {AbortController | null} */
  let controller = null;
  if (options.startCommand) {
    console.log(chalk.blue(`Starting server with "${options.startCommand}"...`));
    controller = new AbortController();
    const appProcess = execaCommand(options.startCommand, {
      stdout: 'pipe',
      stderr: 'pipe',
      cancelSignal: controller.signal,
      env: {
        // FORCE_COLOR is only a default here: spreading process.env afterwards
        // lets the caller's environment override it.
        FORCE_COLOR: '1',
        ...process.env,
      },
    });

    // Prefix server logs
    const serverPrefix = chalk.gray('server: ');
    appProcess.stdout.pipe(prefixLines(serverPrefix)).pipe(process.stdout);
    appProcess.stderr.pipe(prefixLines(serverPrefix)).pipe(process.stderr);
    // Attach a no-op handler so the process promise does not surface an
    // unhandled rejection when the server exits abnormally or is aborted below.
    appProcess.catch(() => {});

    // NOTE(review): server startup timeout is hard-coded at 10 seconds.
    await pollUrl(options.host, 10000);

    console.log(`Server started on ${chalk.underline(options.host)}`);
  }

  const knownTargets = await resolveKnownTargets(options);

  // Pages keyed by normalized URL; the promise resolves to the page's data.
  /** @type {Map<string, Promise<PageData>>} */
  const crawledPages = new Map();
  // Every link encountered, including duplicates pointing at the same page.
  /** @type {Set<Link>} */
  const crawledLinks = new Set();

  // Phase 2: breadth-first crawl. Each worker fetches a page, records its
  // anchor targets, and enqueues every link found on it.
  const queue = new Queue(async (/** @type {Link} */ link) => {
    crawledLinks.add(link);

    const pageUrl = getPageUrl(link.href, options.ignoredPaths);
    if (pageUrl === null) {
      // External or ignored link; validated (or skipped) in phase 3.
      return;
    }

    if (knownTargets.has(pageUrl)) {
      // Targets for this page were supplied up-front; no need to fetch it.
      return;
    }

    if (crawledPages.has(pageUrl)) {
      // Already fetched (or being fetched) by another task.
      return;
    }

    const pagePromise = Promise.resolve().then(async () => {
      console.log(`Crawling ${chalk.cyan(pageUrl)}...`);
      const res = await fetch(new URL(pageUrl, options.host));

      /** @type {PageData} */
      const pageData = {
        url: pageUrl,
        status: res.status,
        targets: new Set(),
      };

      // Error statuses are recorded but the body is not parsed; phase 3 turns
      // links to this page into 'broken-link' issues.
      if (pageData.status < 200 || pageData.status >= 400) {
        console.warn(chalk.yellow(`Warning: ${pageUrl} returned status ${pageData.status}`));
        return pageData;
      }

      // Default to HTML when the header is missing or unparseable.
      const contentTypeHeader = res.headers.get('content-type');
      let type = 'text/html';

      if (contentTypeHeader) {
        try {
          const parsed = contentType.parse(contentTypeHeader);
          type = parsed.type;
        } catch {
          console.warn(
            chalk.yellow(`Warning: ${pageUrl} returned invalid content-type: ${contentTypeHeader}`),
          );
        }
      }

      if (type.startsWith('image/')) {
        // Skip images
        return pageData;
      }

      if (type !== 'text/html') {
        console.warn(chalk.yellow(`Warning: ${pageUrl} returned non-HTML content-type: ${type}`));
        // TODO: Handle text/markdown. Parse content as markdown and extract links/targets.
        return pageData;
      }

      const content = await res.text();

      const dom = parse(content);

      // Build a selector excluding links nested inside ignored containers.
      let ignoredSelector = ':not(*)'; // matches nothing
      if (options.ignoredContent.length > 0) {
        ignoredSelector = Array.from(options.ignoredContent)
          .flatMap((selector) => [selector, `${selector} *`])
          .join(',');
      }
      const linksSelector = `a[href]:not(${ignoredSelector})`;

      const pageLinks = dom.querySelectorAll(linksSelector).map((a) => ({
        src: pageUrl,
        text: getAccessibleName(a, dom),
        href: a.getAttribute('href') ?? '',
      }));

      // Every element with an id is a potential anchor target on this page.
      for (const target of dom.querySelectorAll('*[id]')) {
        if (!options.ignoredTargets.has(target.id)) {
          pageData.targets.add(`#${target.id}`);
        }
      }

      // Enqueue discovered links; the crawledPages check above deduplicates.
      for (const pageLink of pageLinks) {
        queue.add(pageLink);
      }

      return pageData;
    });

    crawledPages.set(pageUrl, pagePromise);

    await pagePromise;
  }, options.concurrency);

  for (const seedUrl of options.seedUrls) {
    queue.add({ src: null, text: null, href: seedUrl });
  }

  await queue.waitAll();

  // Shut down the dev server (if we started one) before validation.
  if (controller) {
    console.log(chalk.blue('Stopping server...'));
    controller.abort();
  }

  // Materialize the page promises into a plain Map of PageData.
  const results = new Map(
    await Promise.all(
      Array.from(crawledPages.entries(), async ([a, b]) => /** @type {const} */ ([a, await b])),
    ),
  );

  if (options.outPath) {
    await writePagesToFile(results, options.outPath);
  }

  /** Array to collect all issues found during validation */
  /** @type {Issue[]} */
  const issues = [];

  /**
   * Records a broken link or target issue.
   * @param {Link} link - The link with the issue
   * @param {'broken-target' | 'broken-link'} type - Type of issue
   * @param {string} message - Human-readable error message
   */
  function recordBrokenLink(link, type, message) {
    issues.push({
      type,
      message,
      link,
    });
  }

  // Phase 3: validate every discovered link against the crawled pages and
  // the pre-supplied known targets. External links (pageUrl === null) are skipped.
  for (const crawledLink of crawledLinks) {
    const pageUrl = getPageUrl(crawledLink.href, options.ignoredPaths);
    if (pageUrl !== null) {
      // Internal link
      const parsed = new URL(crawledLink.href, 'http://localhost');

      const knownPage = knownTargets.get(pageUrl);
      if (knownPage) {
        // Page targets were supplied up-front: only the hash can be broken.
        if (parsed.hash && !knownPage.has(parsed.hash)) {
          recordBrokenLink(crawledLink, 'broken-target', 'Target not found');
        } else {
          // all good
        }
      } else {
        const page = results.get(pageUrl);

        if (!page) {
          recordBrokenLink(crawledLink, 'broken-link', 'Page not crawled');
        } else if (page.status >= 400) {
          recordBrokenLink(crawledLink, 'broken-link', `Page returned error ${page.status}`);
        } else if (parsed.hash) {
          if (!page.targets.has(parsed.hash)) {
            recordBrokenLink(crawledLink, 'broken-target', 'Target not found');
          }
        } else {
          // all good
        }
      }
    }
  }

  reportIssues(issues);

  // Derive counts from issues
  const brokenLinks = issues.filter((issue) => issue.type === 'broken-link').length;
  const brokenLinkTargets = issues.filter((issue) => issue.type === 'broken-target').length;

  // Phase 4: print a summary and return the raw results to the caller.
  const endTime = Date.now();
  const durationSeconds = (endTime - startTime) / 1000;
  const duration = new Intl.NumberFormat('en-US', {
    style: 'unit',
    unit: 'second',
    maximumFractionDigits: 2,
  }).format(durationSeconds);
  console.log(chalk.blue(`\nCrawl completed in ${duration}`));
  console.log(`  Total links found: ${chalk.cyan(crawledLinks.size)}`);
  console.log(`  Total broken links: ${chalk.cyan(brokenLinks)}`);
  console.log(`  Total broken link targets: ${chalk.cyan(brokenLinkTargets)}`);
  if (options.outPath) {
    console.log(chalk.blue(`Output written to: ${options.outPath}`));
  }

  return { links: crawledLinks, pages: results, issues };
}
|