@mui/internal-code-infra 0.0.3-canary.6 → 0.0.3-canary.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/README.md +55 -0
  2. package/build/babel-config.d.mts +40 -0
  3. package/build/brokenLinksChecker/index.d.mts +138 -0
  4. package/build/cli/cmdArgosPush.d.mts +13 -0
  5. package/build/cli/cmdBuild.d.mts +56 -0
  6. package/build/cli/cmdCopyFiles.d.mts +20 -0
  7. package/build/cli/cmdExtractErrorCodes.d.mts +3 -0
  8. package/build/cli/cmdGithubAuth.d.mts +6 -0
  9. package/build/cli/cmdListWorkspaces.d.mts +18 -0
  10. package/build/cli/cmdPublish.d.mts +27 -0
  11. package/build/cli/cmdPublishCanary.d.mts +30 -0
  12. package/build/cli/cmdPublishNewPackage.d.mts +8 -0
  13. package/build/cli/cmdSetVersionOverrides.d.mts +9 -0
  14. package/build/cli/cmdValidateBuiltTypes.d.mts +2 -0
  15. package/build/cli/index.d.mts +1 -0
  16. package/build/eslint/baseConfig.d.mts +10 -0
  17. package/build/eslint/docsConfig.d.mts +4 -0
  18. package/build/eslint/extensions.d.mts +8 -0
  19. package/build/eslint/index.d.mts +4 -0
  20. package/build/eslint/jsonConfig.d.mts +4 -0
  21. package/build/eslint/material-ui/config.d.mts +8 -0
  22. package/build/eslint/material-ui/index.d.mts +2 -0
  23. package/build/eslint/material-ui/rules/disallow-active-element-as-key-event-target.d.mts +5 -0
  24. package/build/eslint/material-ui/rules/disallow-react-api-in-server-components.d.mts +2 -0
  25. package/build/eslint/material-ui/rules/docgen-ignore-before-comment.d.mts +2 -0
  26. package/build/eslint/material-ui/rules/mui-name-matches-component-name.d.mts +5 -0
  27. package/build/eslint/material-ui/rules/no-empty-box.d.mts +5 -0
  28. package/build/eslint/material-ui/rules/no-restricted-resolved-imports.d.mts +12 -0
  29. package/build/eslint/material-ui/rules/no-styled-box.d.mts +5 -0
  30. package/build/eslint/material-ui/rules/rules-of-use-theme-variants.d.mts +9 -0
  31. package/build/eslint/material-ui/rules/straight-quotes.d.mts +5 -0
  32. package/build/eslint/testConfig.d.mts +14 -0
  33. package/build/markdownlint/duplicate-h1.d.mts +27 -0
  34. package/build/markdownlint/git-diff.d.mts +8 -0
  35. package/build/markdownlint/index.d.mts +56 -0
  36. package/build/markdownlint/straight-quotes.d.mts +8 -0
  37. package/build/markdownlint/table-alignment.d.mts +8 -0
  38. package/build/markdownlint/terminal-language.d.mts +8 -0
  39. package/build/prettier.d.mts +20 -0
  40. package/build/stylelint/index.d.mts +32 -0
  41. package/build/utils/babel.d.mts +71 -0
  42. package/build/utils/build.d.mts +50 -0
  43. package/build/utils/changelog.d.mts +64 -0
  44. package/build/utils/credentials.d.mts +17 -0
  45. package/build/utils/extractErrorCodes.d.mts +19 -0
  46. package/build/utils/git.d.mts +26 -0
  47. package/build/utils/github.d.mts +41 -0
  48. package/build/utils/pnpm.d.mts +238 -0
  49. package/build/utils/typescript.d.mts +35 -0
  50. package/package.json +92 -42
  51. package/src/babel-config.mjs +52 -8
  52. package/src/brokenLinksChecker/__fixtures__/static-site/broken-links.html +20 -0
  53. package/src/brokenLinksChecker/__fixtures__/static-site/broken-targets.html +22 -0
  54. package/src/brokenLinksChecker/__fixtures__/static-site/example.md +9 -0
  55. package/src/brokenLinksChecker/__fixtures__/static-site/external-links.html +21 -0
  56. package/src/brokenLinksChecker/__fixtures__/static-site/ignored-page.html +17 -0
  57. package/src/brokenLinksChecker/__fixtures__/static-site/index.html +26 -0
  58. package/src/brokenLinksChecker/__fixtures__/static-site/known-targets.json +5 -0
  59. package/src/brokenLinksChecker/__fixtures__/static-site/nested/page.html +19 -0
  60. package/src/brokenLinksChecker/__fixtures__/static-site/orphaned-page.html +20 -0
  61. package/src/brokenLinksChecker/__fixtures__/static-site/page-with-api-links.html +20 -0
  62. package/src/brokenLinksChecker/__fixtures__/static-site/page-with-custom-targets.html +24 -0
  63. package/src/brokenLinksChecker/__fixtures__/static-site/page-with-ignored-content.html +28 -0
  64. package/src/brokenLinksChecker/__fixtures__/static-site/page-with-known-target-links.html +19 -0
  65. package/src/brokenLinksChecker/__fixtures__/static-site/valid.html +20 -0
  66. package/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html +31 -0
  67. package/src/brokenLinksChecker/index.mjs +641 -0
  68. package/src/brokenLinksChecker/index.test.ts +178 -0
  69. package/src/cli/cmdArgosPush.mjs +13 -2
  70. package/src/cli/cmdBuild.mjs +228 -31
  71. package/src/cli/cmdGithubAuth.mjs +36 -0
  72. package/src/cli/cmdListWorkspaces.mjs +2 -2
  73. package/src/cli/cmdPublish.mjs +203 -49
  74. package/src/cli/cmdPublishCanary.mjs +404 -46
  75. package/src/cli/cmdPublishNewPackage.mjs +86 -0
  76. package/src/cli/cmdSetVersionOverrides.mjs +17 -1
  77. package/src/cli/cmdValidateBuiltTypes.mjs +49 -0
  78. package/src/cli/index.mjs +6 -2
  79. package/src/cli/packageJson.d.ts +729 -0
  80. package/src/eslint/baseConfig.mjs +96 -78
  81. package/src/eslint/docsConfig.mjs +13 -13
  82. package/src/eslint/extensions.mjs +8 -8
  83. package/src/eslint/jsonConfig.mjs +40 -0
  84. package/src/eslint/material-ui/config.mjs +8 -9
  85. package/src/eslint/material-ui/rules/mui-name-matches-component-name.mjs +4 -2
  86. package/src/eslint/material-ui/rules/rules-of-use-theme-variants.mjs +2 -1
  87. package/src/eslint/testConfig.mjs +72 -66
  88. package/src/stylelint/index.mjs +46 -0
  89. package/src/untyped-plugins.d.ts +13 -0
  90. package/src/{cli → utils}/babel.mjs +10 -3
  91. package/src/utils/build.mjs +27 -1
  92. package/src/utils/changelog.mjs +157 -0
  93. package/src/utils/credentials.mjs +71 -0
  94. package/src/utils/extractErrorCodes.mjs +2 -2
  95. package/src/utils/git.mjs +67 -0
  96. package/src/utils/github.mjs +263 -0
  97. package/src/{cli → utils}/pnpm.mjs +23 -13
  98. package/src/{cli → utils}/typescript.mjs +13 -7
  99. package/src/cli/cmdJsonLint.mjs +0 -69

package/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Page with Anchors</title>
+  </head>
+  <body>
+    <h1>Page with Anchors</h1>
+    <nav>
+      <ul>
+        <li><a href="/">Home</a></li>
+        <li><a href="#section1">Jump to section 1</a></li>
+        <li><a href="#section2">Jump to section 2</a></li>
+        <li><a href="#section3">Jump to section 3</a></li>
+      </ul>
+    </nav>
+    <section id="section1">
+      <h2>Section 1</h2>
+      <p>Content for section 1</p>
+    </section>
+    <section id="section2">
+      <h2>Section 2</h2>
+      <p>Content for section 2</p>
+    </section>
+    <section id="section3">
+      <h2>Section 3</h2>
+      <p>Content for section 3</p>
+    </section>
+  </body>
+</html>
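
This fixture gives the checker a page whose navigation links all resolve to anchors declared further down the same page. As a rough sketch of how it could be driven with the crawl export from the index.mjs hunk below — the static file server command, port, and relative import are illustrative assumptions, not part of the package:

  import { crawl } from './index.mjs';

  // Serve the fixture directory (any static file server works) and start
  // crawling from this page; crawl() polls `host` until the server responds.
  const { pages, issues } = await crawl({
    startCommand: 'npx serve src/brokenLinksChecker/__fixtures__/static-site -l 3000', // assumed command
    host: 'http://localhost:3000',
    seedUrls: ['/with-anchors.html'],
  });

  // Expect '#section1', '#section2', and '#section3' among the recorded
  // targets, so none of the in-page anchor links is reported broken.
  console.log(pages.get('/with-anchors.html')?.targets, issues);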

package/src/brokenLinksChecker/index.mjs
@@ -0,0 +1,641 @@
+/* eslint-disable no-console */
+import { execaCommand } from 'execa';
+import timers from 'node:timers/promises';
+import { parse } from 'node-html-parser';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+import chalk from 'chalk';
+import { Transform } from 'node:stream';
+import contentType from 'content-type';
+
+const DEFAULT_CONCURRENCY = 4;
+
+/**
+ * Creates a Transform stream that prefixes each line with a given string.
+ * Useful for distinguishing server logs from other output.
+ * @param {string} prefix - String to prepend to each line
+ * @returns {Transform} Transform stream that adds the prefix to each line
+ */
+const prefixLines = (prefix) => {
+  let leftover = '';
+  return new Transform({
+    transform(chunk, enc, cb) {
+      const lines = (leftover + chunk.toString()).split(/\r?\n/);
+      leftover = /** @type {string} */ (lines.pop());
+      this.push(lines.map((l) => `${prefix + l}\n`).join(''));
+      cb();
+    },
+    flush(cb) {
+      if (leftover) {
+        this.push(`${prefix + leftover}\n`);
+      }
+      cb();
+    },
+  });
+};
+
+/**
+ * Maps page URLs to sets of known target IDs (anchors) on that page.
+ * Used to track which link targets (e.g., #section-id) exist on each page.
+ * @typedef {Map<string, Set<string>>} LinkStructure
+ */
+
+/**
+ * Serialized representation of LinkStructure for JSON storage.
+ * Converts Maps and Sets to plain objects and arrays for file persistence.
+ * @typedef {Object} SerializedLinkStructure
+ * @property {Record<string, string[]>} targets - Object mapping page URLs to arrays of target IDs
+ */
+
+/**
+ * Fetches a URL and throws an error if the response is not OK.
+ * @param {string | URL} url - URL to fetch
+ * @returns {Promise<Response>} Fetch response if successful
+ * @throws {Error} If the response status is not OK (not in 200-299 range)
+ */
+async function fetchUrl(url) {
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`Failed to fetch ${url}: [${res.status}] ${res.statusText}`);
+  }
+  return res;
+}
+
+/**
+ * Polls a URL until it responds successfully or times out.
+ * Used to wait for a dev server to start.
+ * @param {string} url - URL to poll
+ * @param {number} timeout - Maximum milliseconds to wait before timing out
+ * @returns {Promise<void>} Resolves when URL responds successfully
+ * @throws {Error} If timeout is reached before URL responds
+ */
+async function pollUrl(url, timeout) {
+  const start = Date.now();
+  while (true) {
+    try {
+      // eslint-disable-next-line no-await-in-loop
+      await fetchUrl(url);
+      return;
+    } catch (/** @type {any} */ error) {
+      if (Date.now() - start > timeout) {
+        throw new Error(`Timeout waiting for ${url}: ${error.message}`, { cause: error });
+      }
+      // eslint-disable-next-line no-await-in-loop
+      await timers.setTimeout(1000);
+    }
+  }
+}
+
+/**
+ * Converts serialized link structure (from JSON) back to Map/Set form.
+ * @param {SerializedLinkStructure} data - Serialized structure with plain objects/arrays
+ * @returns {LinkStructure} Deserialized structure using Map and Set
+ */
+function deserializeLinkStructure(data) {
+  const linkStructure = new Map();
+  for (const url of Object.keys(data.targets)) {
+    linkStructure.set(url, new Set(data.targets[url]));
+  }
+  return linkStructure;
+}
+
+/**
+ * Data about a crawled page including its URL, HTTP status, and available link targets.
+ * @typedef {Object} PageData
+ * @property {string} url - The normalized page URL (without trailing slash unless root)
+ * @property {number} status - HTTP status code from the response (e.g., 200, 404, 500)
+ * @property {Set<string>} targets - Set of available anchor targets on the page, keyed by hash (e.g., '#intro')
+ */
+
+/**
+ * Serializes and writes discovered page targets to a JSON file.
+ * @param {Map<string, PageData>} pages - Map of crawled pages with their targets
+ * @param {string} outPath - File path to write the JSON output
+ * @returns {Promise<void>}
+ */
+async function writePagesToFile(pages, outPath) {
+  /** @type {SerializedLinkStructure} */
+  const fileContent = { targets: {} };
+  for (const [url, pageData] of pages.entries()) {
+    fileContent.targets[url] = Array.from(pageData.targets.keys());
+  }
+  const dir = path.dirname(outPath);
+  await fs.mkdir(dir, { recursive: true });
+  await fs.writeFile(outPath, JSON.stringify(fileContent, null, 2), 'utf-8');
+}
+
+/**
+ * Computes the accessible name of an element according to ARIA rules.
+ * Polyfill for `node.computedName` available only in Chrome v112+.
+ * Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText.
+ * @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for
+ * @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element
+ * @returns {string} The computed accessible name, or empty string if none found
+ */
+function getAccessibleName(elm, ownerDocument) {
+  if (!elm) {
+    return '';
+  }
+
+  // 1. aria-label
+  const ariaLabel = elm.getAttribute('aria-label')?.trim();
+  if (ariaLabel) {
+    return ariaLabel;
+  }
+
+  // 2. aria-labelledby
+  const labelledby = elm.getAttribute('aria-labelledby');
+  if (labelledby) {
+    const labels = [];
+    for (const id of labelledby.split(/\s+/)) {
+      const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument);
+      if (label) {
+        labels.push(label);
+      }
+    }
+    const label = labels.join(' ').trim();
+    if (label) {
+      return label;
+    }
+  }
+
+  // 3. <label for="id">
+  if (elm.id) {
+    const label = ownerDocument.querySelector(`label[for="${elm.id}"]`);
+    if (label) {
+      return getAccessibleName(label, ownerDocument);
+    }
+  }
+
+  // 4. <img alt="">
+  if (elm.tagName === 'IMG') {
+    const alt = elm.getAttribute('alt')?.trim();
+    if (alt) {
+      return alt;
+    }
+  }
+
+  // 5. Fallback: visible text
+  return elm.innerText.trim();
+}
+
+/**
+ * Generic concurrent task queue with configurable concurrency limit.
+ * Processes tasks in FIFO order with a maximum number of concurrent workers.
+ * @template T
+ */
+class Queue {
+  /** Array of pending tasks waiting to be processed */
+  /** @type {T[]} */
+  tasks = [];
+
+  /** Set of currently running task promises */
+  /** @type {Set<Promise<void>>} */
+  pending = new Set();
+
+  /**
+   * Creates a new queue with a worker function and concurrency limit.
+   * @param {(task: T) => Promise<void>} worker - Async function to process each task
+   * @param {number} concurrency - Maximum number of tasks to run simultaneously
+   */
+  constructor(worker, concurrency) {
+    this.worker = worker;
+    this.concurrency = concurrency;
+  }
+
+  /**
+   * Adds a task to the queue and starts processing if under concurrency limit.
+   * @param {T} task - Task to add to the queue
+   */
+  add(task) {
+    this.tasks.push(task);
+    this.run();
+  }
+
+  async run() {
+    while (this.pending.size < this.concurrency && this.tasks.length > 0) {
+      const task = /** @type {T} */ (this.tasks.shift());
+      const p = this.worker(task).finally(() => {
+        this.pending.delete(p);
+        this.run();
+      });
+      this.pending.add(p);
+    }
+  }
+
+  /**
+   * Waits for all pending and queued tasks to complete.
+   * @returns {Promise<void>}
+   */
+  async waitAll() {
+    while (this.pending.size > 0) {
+      // eslint-disable-next-line no-await-in-loop
+      await Promise.all(this.pending);
+    }
+  }
+}
+
+/**
+ * Represents a hyperlink found during crawling.
+ * @typedef {Object} Link
+ * @property {string | null} src - URL of the page where this link was found, or null for seed URLs
+ * @property {string | null} text - Accessible name/text content of the link element, or null for seed URLs
+ * @property {string} href - The href attribute value (may be relative or absolute, with or without hash)
+ */
+
+/**
+ * Extracts and normalizes the page URL from a link href.
+ * Returns null for external links, ignored paths, or non-standard URLs.
+ * Normalizes by removing trailing slashes (except root) and preserving query params.
+ * @param {string} href - Link href to process (e.g., '/docs/api#section?query=1')
+ * @param {RegExp[]} ignoredPaths - Array of patterns to exclude
+ * @returns {string | null} Normalized page URL with query but without hash, or null if external/ignored
+ */
+function getPageUrl(href, ignoredPaths = []) {
+  if (!href.startsWith('/')) {
+    return null;
+  }
+  const parsed = new URL(href, 'http://localhost');
+  if (ignoredPaths.some((pattern) => pattern.test(parsed.pathname))) {
+    return null;
+  }
+  // Normalize pathname by removing trailing slash (except for root)
+  let pathname = parsed.pathname;
+  if (pathname !== '/' && pathname.endsWith('/')) {
+    pathname = pathname.slice(0, -1);
+  }
+  const link = pathname + parsed.search;
+  return link;
+}
+
+/**
+ * Configuration options for the broken links crawler.
+ * @typedef {Object} CrawlOptions
+ * @property {string | null} [startCommand] - Shell command to start the dev server (e.g., 'npm run dev'). If null, assumes server is already running
+ * @property {string} host - Base URL of the site to crawl (e.g., 'http://localhost:3000')
+ * @property {string | null} [outPath] - File path to write discovered link targets to. If null, targets are not persisted
+ * @property {RegExp[]} [ignoredPaths] - Array of regex patterns to exclude from crawling (e.g., [/^\/api\//] to skip /api/* routes)
+ * @property {string[]} [ignoredContent] - CSS selectors for elements whose nested links should be ignored (e.g., ['.sidebar', 'footer'])
+ * @property {Set<string>} [ignoredTargets] - Set of element IDs to ignore as link targets (defaults to '__next', '__NEXT_DATA__')
+ * @property {Map<string, Set<string>>} [knownTargets] - Pre-populated map of known valid targets to skip crawling (useful for external pages)
+ * @property {string[]} [knownTargetsDownloadUrl] - URLs to fetch known targets from (fetched JSON will be merged with knownTargets)
+ * @property {number} [concurrency] - Number of concurrent page fetches (defaults to 4)
+ * @property {string[]} [seedUrls] - Starting URLs for the crawl (defaults to ['/'])
+ */
+
+/**
+ * Fully resolved configuration with all optional fields filled with defaults.
+ * @typedef {Required<CrawlOptions>} ResolvedCrawlOptions
+ */
+
+/**
+ * Resolves partial crawl options by filling in defaults for all optional fields.
+ * @param {CrawlOptions} rawOptions - Partial options from user
+ * @returns {ResolvedCrawlOptions} Fully resolved options with all defaults applied
+ */
+function resolveOptions(rawOptions) {
+  return {
+    startCommand: rawOptions.startCommand ?? null,
+    host: rawOptions.host,
+    outPath: rawOptions.outPath ?? null,
+    ignoredPaths: rawOptions.ignoredPaths ?? [],
+    ignoredContent: rawOptions.ignoredContent ?? [],
+    ignoredTargets: rawOptions.ignoredTargets ?? new Set(['__next', '__NEXT_DATA__']),
+    knownTargets: rawOptions.knownTargets ?? new Map(),
+    knownTargetsDownloadUrl: rawOptions.knownTargetsDownloadUrl ?? [],
+    concurrency: rawOptions.concurrency ?? DEFAULT_CONCURRENCY,
+    seedUrls: rawOptions.seedUrls ?? ['/'],
+  };
+}
+
+/**
+ * Merges multiple Maps, similar to Object.assign for objects.
+ * Later sources override earlier ones for duplicate keys.
+ * @template K, V
+ * @param {Map<K, V>} target - Target map to merge into (will be mutated)
+ * @param {...Map<K, V>} sources - Source maps to merge from
+ * @returns {Map<K, V>} The mutated target map
+ */
+function mergeMaps(target, ...sources) {
+  for (const source of sources) {
+    for (const [key, value] of source.entries()) {
+      target.set(key, value);
+    }
+  }
+  return target;
+}
+
+/**
+ * Downloads and deserializes known link targets from remote URLs.
+ * Fetches JSON files containing serialized link structures in parallel.
+ * @param {string[]} urls - Array of URLs to fetch known targets from
+ * @returns {Promise<LinkStructure[]>} Array of deserialized link structures
+ */
+async function downloadKnownTargets(urls) {
+  if (urls.length === 0) {
+    return [];
+  }
+
+  console.log(chalk.blue(`Downloading known targets from ${urls.length} URL(s)...`));
+
+  const results = await Promise.all(
+    urls.map(async (url) => {
+      console.log(` Fetching ${chalk.underline(url)}`);
+      const res = await fetchUrl(url);
+      const data = await res.json();
+      return deserializeLinkStructure(data);
+    }),
+  );
+
+  return results;
+}
+
+/**
+ * Resolves all known targets by downloading remote ones and merging with user-provided.
+ * User-provided targets take priority over downloaded ones.
+ * @param {ResolvedCrawlOptions} options - Resolved crawl options
+ * @returns {Promise<LinkStructure>} Merged map of all known targets
+ */
+async function resolveKnownTargets(options) {
+  const downloaded = await downloadKnownTargets(options.knownTargetsDownloadUrl);
+  // Merge downloaded with user-provided, user-provided takes priority
+  return mergeMaps(new Map(), ...downloaded, options.knownTargets);
+}
+
+/**
+ * Represents a broken link or broken link target discovered during crawling.
+ * @typedef {Object} Issue
+ * @property {'broken-link' | 'broken-target'} type - Type of issue: 'broken-link' for 404 pages, 'broken-target' for missing anchors
+ * @property {string} message - Human-readable description of the issue (e.g., 'Target not found', 'Page returned error 404')
+ * @property {Link} link - The link object that has the issue
+ */
+
+/**
+ * Results from a complete crawl operation.
+ * @typedef {Object} CrawlResult
+ * @property {Set<Link>} links - All links discovered during the crawl
+ * @property {Map<string, PageData>} pages - All pages crawled, keyed by normalized URL
+ * @property {Issue[]} issues - All broken links and broken targets found
+ */
+
+/**
+ * Reports broken links to stderr, grouped by source page for better readability.
+ * @param {Issue[]} issuesList - Array of issues to report
+ */
+function reportIssues(issuesList) {
+  if (issuesList.length === 0) {
+    return;
+  }
+
+  console.error('\nBroken links found:\n');
+
+  // Group issues by source URL
+  /** @type {Map<string, Issue[]>} */
+  const issuesBySource = new Map();
+  for (const issue of issuesList) {
+    const sourceUrl = issue.link.src ?? '(unknown)';
+    const sourceIssues = issuesBySource.get(sourceUrl) ?? [];
+    if (sourceIssues.length === 0) {
+      issuesBySource.set(sourceUrl, sourceIssues);
+    }
+    sourceIssues.push(issue);
+  }
+
+  // Report issues grouped by source
+  for (const [sourceUrl, sourceIssues] of issuesBySource.entries()) {
+    console.error(`Source ${chalk.cyan(sourceUrl)}:`);
+    for (const issue of sourceIssues) {
+      const reason = issue.type === 'broken-target' ? 'target not found' : 'returned status 404';
+      console.error(` [${issue.link.text}](${chalk.cyan(issue.link.href)}) (${reason})`);
+    }
+  }
+}
+
+/**
+ * Crawls a website starting from seed URLs, discovering all internal links and checking for broken links/targets.
+ * @param {CrawlOptions} rawOptions - Configuration options for the crawl
+ * @returns {Promise<CrawlResult>} Crawl results including all links, pages, and issues found
+ */
+export async function crawl(rawOptions) {
+  const options = resolveOptions(rawOptions);
+  const startTime = Date.now();
+
+  /** @type {AbortController | null} */
+  let controller = null;
+  if (options.startCommand) {
+    console.log(chalk.blue(`Starting server with "${options.startCommand}"...`));
+    controller = new AbortController();
+    const appProcess = execaCommand(options.startCommand, {
+      stdout: 'pipe',
+      stderr: 'pipe',
+      cancelSignal: controller.signal,
+      env: {
+        FORCE_COLOR: '1',
+        ...process.env,
+      },
+    });
+
+    // Prefix server logs
+    const serverPrefix = chalk.gray('server: ');
+    appProcess.stdout.pipe(prefixLines(serverPrefix)).pipe(process.stdout);
+    appProcess.stderr.pipe(prefixLines(serverPrefix)).pipe(process.stderr);
+    appProcess.catch(() => {});
+
+    await pollUrl(options.host, 10000);
+
+    console.log(`Server started on ${chalk.underline(options.host)}`);
+  }
+
+  const knownTargets = await resolveKnownTargets(options);
+
+  /** @type {Map<string, Promise<PageData>>} */
+  const crawledPages = new Map();
+  /** @type {Set<Link>} */
+  const crawledLinks = new Set();
+
+  const queue = new Queue(async (/** @type {Link} */ link) => {
+    crawledLinks.add(link);
+
+    const pageUrl = getPageUrl(link.href, options.ignoredPaths);
+    if (pageUrl === null) {
+      return;
+    }
+
+    if (knownTargets.has(pageUrl)) {
+      return;
+    }
+
+    if (crawledPages.has(pageUrl)) {
+      return;
+    }
+
+    const pagePromise = Promise.resolve().then(async () => {
+      console.log(`Crawling ${chalk.cyan(pageUrl)}...`);
+      const res = await fetch(new URL(pageUrl, options.host));
+
+      /** @type {PageData} */
+      const pageData = {
+        url: pageUrl,
+        status: res.status,
+        targets: new Set(),
+      };
+
+      if (pageData.status < 200 || pageData.status >= 400) {
+        console.warn(chalk.yellow(`Warning: ${pageUrl} returned status ${pageData.status}`));
+        return pageData;
+      }
+
+      const contentTypeHeader = res.headers.get('content-type');
+      let type = 'text/html';
+
+      if (contentTypeHeader) {
+        try {
+          const parsed = contentType.parse(contentTypeHeader);
+          type = parsed.type;
+        } catch {
+          console.warn(
+            chalk.yellow(`Warning: ${pageUrl} returned invalid content-type: ${contentTypeHeader}`),
+          );
+        }
+      }
+
+      if (type.startsWith('image/')) {
+        // Skip images
+        return pageData;
+      }
+
+      if (type !== 'text/html') {
+        console.warn(chalk.yellow(`Warning: ${pageUrl} returned non-HTML content-type: ${type}`));
+        // TODO: Handle text/markdown. Parse content as markdown and extract links/targets.
+        return pageData;
+      }
+
+      const content = await res.text();
+
+      const dom = parse(content);
+
+      let ignoredSelector = ':not(*)'; // matches nothing
+      if (options.ignoredContent.length > 0) {
+        ignoredSelector = Array.from(options.ignoredContent)
+          .flatMap((selector) => [selector, `${selector} *`])
+          .join(',');
+      }
+      const linksSelector = `a[href]:not(${ignoredSelector})`;
+
+      const pageLinks = dom.querySelectorAll(linksSelector).map((a) => ({
+        src: pageUrl,
+        text: getAccessibleName(a, dom),
+        href: a.getAttribute('href') ?? '',
+      }));
+
+      for (const target of dom.querySelectorAll('*[id]')) {
+        if (!options.ignoredTargets.has(target.id)) {
+          pageData.targets.add(`#${target.id}`);
+        }
+      }
+
+      for (const pageLink of pageLinks) {
+        queue.add(pageLink);
+      }
+
+      return pageData;
+    });
+
+    crawledPages.set(pageUrl, pagePromise);
+
+    await pagePromise;
+  }, options.concurrency);
+
+  for (const seedUrl of options.seedUrls) {
+    queue.add({ src: null, text: null, href: seedUrl });
+  }
+
+  await queue.waitAll();
+
+  if (controller) {
+    console.log(chalk.blue('Stopping server...'));
+    controller.abort();
+  }
+
+  const results = new Map(
+    await Promise.all(
+      Array.from(crawledPages.entries(), async ([a, b]) => /** @type {const} */ ([a, await b])),
+    ),
+  );
+
+  if (options.outPath) {
+    await writePagesToFile(results, options.outPath);
+  }
+
+  /** Array to collect all issues found during validation */
+  /** @type {Issue[]} */
+  const issues = [];
+
+  /**
+   * Records a broken link or target issue.
+   * @param {Link} link - The link with the issue
+   * @param {'broken-target' | 'broken-link'} type - Type of issue
+   * @param {string} message - Human-readable error message
+   */
+  function recordBrokenLink(link, type, message) {
+    issues.push({
+      type,
+      message,
+      link,
+    });
+  }
+
+  for (const crawledLink of crawledLinks) {
+    const pageUrl = getPageUrl(crawledLink.href, options.ignoredPaths);
+    if (pageUrl !== null) {
+      // Internal link
+      const parsed = new URL(crawledLink.href, 'http://localhost');
+
+      const knownPage = knownTargets.get(pageUrl);
+      if (knownPage) {
+        if (parsed.hash && !knownPage.has(parsed.hash)) {
+          recordBrokenLink(crawledLink, 'broken-target', 'Target not found');
+        } else {
+          // all good
+        }
+      } else {
+        const page = results.get(pageUrl);
+
+        if (!page) {
+          recordBrokenLink(crawledLink, 'broken-link', 'Page not crawled');
+        } else if (page.status >= 400) {
+          recordBrokenLink(crawledLink, 'broken-link', `Page returned error ${page.status}`);
+        } else if (parsed.hash) {
+          if (!page.targets.has(parsed.hash)) {
+            recordBrokenLink(crawledLink, 'broken-target', 'Target not found');
+          }
+        } else {
+          // all good
+        }
+      }
+    }
+  }
+
+  reportIssues(issues);
+
+  // Derive counts from issues
+  const brokenLinks = issues.filter((issue) => issue.type === 'broken-link').length;
+  const brokenLinkTargets = issues.filter((issue) => issue.type === 'broken-target').length;
+
+  const endTime = Date.now();
+  const durationSeconds = (endTime - startTime) / 1000;
+  const duration = new Intl.NumberFormat('en-US', {
+    style: 'unit',
+    unit: 'second',
+    maximumFractionDigits: 2,
+  }).format(durationSeconds);
+  console.log(chalk.blue(`\nCrawl completed in ${duration}`));
+  console.log(` Total links found: ${chalk.cyan(crawledLinks.size)}`);
+  console.log(` Total broken links: ${chalk.cyan(brokenLinks)}`);
+  console.log(` Total broken link targets: ${chalk.cyan(brokenLinkTargets)}`);
+  if (options.outPath) {
+    console.log(chalk.blue(`Output written to: ${options.outPath}`));
+  }
+
+  return { links: crawledLinks, pages: results, issues };
+}
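
Taken together, crawl() starts (or attaches to) a server, walks internal links through the concurrency-limited Queue, records every element id on each page as a '#'-prefixed target, and finally validates each collected link against the crawled pages and any known targets. A hedged sketch of a CI-style invocation — the script name, paths, and URL below are illustrative assumptions, not defaults of this package:

  import { crawl } from './index.mjs';

  const { issues } = await crawl({
    startCommand: 'pnpm docs:serve', // assumed script; use null when a server is already running
    host: 'http://localhost:3000',
    outPath: 'known-targets.json', // persist this site's targets for other sites to validate against
    ignoredPaths: [/^\/api-preview\//], // example exclusion pattern
    ignoredContent: ['footer'], // links nested under these selectors are not followed
    knownTargetsDownloadUrl: ['https://example.com/known-targets.json'], // illustrative URL
  });

  process.exitCode = issues.length > 0 ? 1 : 0; // fail CI when anything is broken

The file written to outPath (and consumed via knownTargets/knownTargetsDownloadUrl) is a SerializedLinkStructure, matching the known-targets.json fixture: a single "targets" object mapping each normalized page URL to the array of anchor ids found on it, for example:

  {
    "targets": {
      "/": ["#main-content"],
      "/with-anchors.html": ["#section1", "#section2", "#section3"]
    }
  }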