afdocs 0.3.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -30
- package/dist/checks/agent-discoverability/llms-txt-directive.js +206 -5
- package/dist/checks/agent-discoverability/llms-txt-directive.js.map +1 -1
- package/dist/checks/authentication/auth-alternative-access.js +109 -6
- package/dist/checks/authentication/auth-alternative-access.js.map +1 -1
- package/dist/checks/authentication/auth-gate-detection.js +8 -3
- package/dist/checks/authentication/auth-gate-detection.js.map +1 -1
- package/dist/checks/content-structure/markdown-code-fence-validity.js +1 -1
- package/dist/checks/content-structure/markdown-code-fence-validity.js.map +1 -1
- package/dist/checks/content-structure/section-header-quality.js +240 -6
- package/dist/checks/content-structure/section-header-quality.js.map +1 -1
- package/dist/checks/content-structure/tabbed-content-serialization.js +200 -5
- package/dist/checks/content-structure/tabbed-content-serialization.js.map +1 -1
- package/dist/checks/index.d.ts +1 -0
- package/dist/checks/index.d.ts.map +1 -1
- package/dist/checks/index.js +1 -0
- package/dist/checks/index.js.map +1 -1
- package/dist/checks/llms-txt/llms-txt-exists.js +17 -10
- package/dist/checks/llms-txt/llms-txt-exists.js.map +1 -1
- package/dist/checks/observability/llms-txt-freshness.d.ts +24 -1
- package/dist/checks/observability/llms-txt-freshness.d.ts.map +1 -1
- package/dist/checks/observability/llms-txt-freshness.js +391 -5
- package/dist/checks/observability/llms-txt-freshness.js.map +1 -1
- package/dist/checks/observability/markdown-content-parity.js +599 -5
- package/dist/checks/observability/markdown-content-parity.js.map +1 -1
- package/dist/checks/page-size/content-start-position.js +3 -7
- package/dist/checks/page-size/content-start-position.js.map +1 -1
- package/dist/checks/page-size/page-size-html.js +4 -8
- package/dist/checks/page-size/page-size-html.js.map +1 -1
- package/dist/checks/page-size/rendering-strategy.d.ts +2 -0
- package/dist/checks/page-size/rendering-strategy.d.ts.map +1 -0
- package/dist/checks/page-size/rendering-strategy.js +154 -0
- package/dist/checks/page-size/rendering-strategy.js.map +1 -0
- package/dist/checks/url-stability/redirect-behavior.js +127 -5
- package/dist/checks/url-stability/redirect-behavior.js.map +1 -1
- package/dist/cli/commands/check.d.ts.map +1 -1
- package/dist/cli/commands/check.js +9 -0
- package/dist/cli/commands/check.js.map +1 -1
- package/dist/cli/formatters/text.d.ts.map +1 -1
- package/dist/cli/formatters/text.js +13 -3
- package/dist/cli/formatters/text.js.map +1 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +4 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +1 -0
- package/dist/constants.js.map +1 -1
- package/dist/helpers/detect-rendering.d.ts +31 -0
- package/dist/helpers/detect-rendering.d.ts.map +1 -0
- package/dist/helpers/detect-rendering.js +85 -0
- package/dist/helpers/detect-rendering.js.map +1 -0
- package/dist/helpers/detect-tabs.d.ts +12 -0
- package/dist/helpers/detect-tabs.d.ts.map +1 -0
- package/dist/helpers/detect-tabs.js +309 -0
- package/dist/helpers/detect-tabs.js.map +1 -0
- package/dist/helpers/fetch-page.d.ts +8 -0
- package/dist/helpers/fetch-page.d.ts.map +1 -0
- package/dist/helpers/fetch-page.js +20 -0
- package/dist/helpers/fetch-page.js.map +1 -0
- package/dist/helpers/get-page-urls.d.ts +9 -0
- package/dist/helpers/get-page-urls.d.ts.map +1 -1
- package/dist/helpers/get-page-urls.js +153 -21
- package/dist/helpers/get-page-urls.js.map +1 -1
- package/dist/helpers/index.d.ts +5 -0
- package/dist/helpers/index.d.ts.map +1 -1
- package/dist/helpers/index.js +3 -0
- package/dist/helpers/index.js.map +1 -1
- package/dist/helpers/to-md-urls.d.ts +4 -0
- package/dist/helpers/to-md-urls.d.ts.map +1 -1
- package/dist/helpers/to-md-urls.js +13 -0
- package/dist/helpers/to-md-urls.js.map +1 -1
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +1 -0
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +19 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +2 -1
|
@@ -1,10 +1,604 @@
|
|
|
1
|
+
import { parse } from 'node-html-parser';
|
|
1
2
|
import { registerCheck } from '../registry.js';
|
|
2
|
-
|
|
3
|
+
import { fetchPage } from '../../helpers/fetch-page.js';
|
|
4
|
+
/** Thresholds for the percentage of HTML segments not found in markdown. */
|
|
5
|
+
const WARN_THRESHOLD = 5;
|
|
6
|
+
const FAIL_THRESHOLD = 20;
|
|
7
|
+
/** Minimum character length for a text segment to be considered meaningful. */
|
|
8
|
+
const MIN_SEGMENT_LENGTH = 20;
|
|
9
|
+
/**
|
|
10
|
+
* Minimum number of unique HTML segments required for a meaningful comparison.
|
|
11
|
+
* Pages below this threshold auto-pass because the percentage is too volatile
|
|
12
|
+
* (e.g., 3 breadcrumb items on a 10-segment page = 30% "missing").
|
|
13
|
+
*/
|
|
14
|
+
const MIN_SEGMENTS_FOR_COMPARISON = 10;
|
|
15
|
+
/** HTML tags to strip before extracting text (non-content chrome). */
|
|
16
|
+
const STRIP_TAGS = [
|
|
17
|
+
'script',
|
|
18
|
+
'style',
|
|
19
|
+
'nav',
|
|
20
|
+
'footer',
|
|
21
|
+
'header',
|
|
22
|
+
'noscript',
|
|
23
|
+
'button',
|
|
24
|
+
'svg',
|
|
25
|
+
'aside',
|
|
26
|
+
];
|
|
27
|
+
/** CSS selectors for common doc-site chrome that lives inside <main>. */
|
|
28
|
+
const STRIP_SELECTORS = [
|
|
29
|
+
'[aria-label="breadcrumb"]',
|
|
30
|
+
'[aria-label="pagination"]',
|
|
31
|
+
'[class*="breadcrumb"]',
|
|
32
|
+
'[class*="pagination"]',
|
|
33
|
+
'[class*="prev-next"]',
|
|
34
|
+
'[class*="prevnext"]',
|
|
35
|
+
'[class*="page-nav"]',
|
|
36
|
+
'[class*="feedback"]',
|
|
37
|
+
'[class*="helpful"]',
|
|
38
|
+
'[class*="table-of-contents"]',
|
|
39
|
+
'[class*="toc"]',
|
|
40
|
+
'[rel="prev"]',
|
|
41
|
+
'[rel="next"]',
|
|
42
|
+
'.sr-only',
|
|
43
|
+
];
|
|
44
|
+
/**
|
|
45
|
+
* Segment-level patterns for common non-content text that survives DOM stripping.
|
|
46
|
+
* Matched against normalized (lowercased, whitespace-collapsed) segments.
|
|
47
|
+
*/
|
|
48
|
+
const NOISE_PATTERNS = [
|
|
49
|
+
/^last updated/,
|
|
50
|
+
/^was this page helpful/,
|
|
51
|
+
/^thank you for your feedback/,
|
|
52
|
+
/^previous\s+\S.*next\s+\S/, // "Previous X Next Y" pagination
|
|
53
|
+
/^start from the beginning$/,
|
|
54
|
+
/^join our .* server/, // "Join our Discord Server..."
|
|
55
|
+
/^loading video content/,
|
|
56
|
+
/^\/.+\/.+/, // breadcrumb paths like "/Connect to Neon/..."
|
|
57
|
+
];
|
|
58
|
+
/**
|
|
59
|
+
* Known HTML tag names used to distinguish real tags from angle-bracket
|
|
60
|
+
* placeholders like <YOUR_API_KEY> or <clusterName> in code examples.
|
|
61
|
+
* Only needs to cover tags that appear in node-html-parser's .text output
|
|
62
|
+
* (i.e., tags inside <pre> that survive as raw text).
|
|
63
|
+
*/
|
|
64
|
+
const HTML_TAG_NAMES = new Set([
|
|
65
|
+
'a',
|
|
66
|
+
'abbr',
|
|
67
|
+
'address',
|
|
68
|
+
'article',
|
|
69
|
+
'aside',
|
|
70
|
+
'audio',
|
|
71
|
+
'b',
|
|
72
|
+
'bdi',
|
|
73
|
+
'bdo',
|
|
74
|
+
'blockquote',
|
|
75
|
+
'body',
|
|
76
|
+
'br',
|
|
77
|
+
'button',
|
|
78
|
+
'canvas',
|
|
79
|
+
'caption',
|
|
80
|
+
'cite',
|
|
81
|
+
'code',
|
|
82
|
+
'col',
|
|
83
|
+
'colgroup',
|
|
84
|
+
'data',
|
|
85
|
+
'dd',
|
|
86
|
+
'del',
|
|
87
|
+
'details',
|
|
88
|
+
'dfn',
|
|
89
|
+
'dialog',
|
|
90
|
+
'div',
|
|
91
|
+
'dl',
|
|
92
|
+
'dt',
|
|
93
|
+
'em',
|
|
94
|
+
'embed',
|
|
95
|
+
'fieldset',
|
|
96
|
+
'figcaption',
|
|
97
|
+
'figure',
|
|
98
|
+
'footer',
|
|
99
|
+
'form',
|
|
100
|
+
'h1',
|
|
101
|
+
'h2',
|
|
102
|
+
'h3',
|
|
103
|
+
'h4',
|
|
104
|
+
'h5',
|
|
105
|
+
'h6',
|
|
106
|
+
'head',
|
|
107
|
+
'header',
|
|
108
|
+
'hr',
|
|
109
|
+
'html',
|
|
110
|
+
'i',
|
|
111
|
+
'iframe',
|
|
112
|
+
'img',
|
|
113
|
+
'input',
|
|
114
|
+
'ins',
|
|
115
|
+
'kbd',
|
|
116
|
+
'label',
|
|
117
|
+
'legend',
|
|
118
|
+
'li',
|
|
119
|
+
'link',
|
|
120
|
+
'main',
|
|
121
|
+
'map',
|
|
122
|
+
'mark',
|
|
123
|
+
'meta',
|
|
124
|
+
'meter',
|
|
125
|
+
'nav',
|
|
126
|
+
'noscript',
|
|
127
|
+
'object',
|
|
128
|
+
'ol',
|
|
129
|
+
'optgroup',
|
|
130
|
+
'option',
|
|
131
|
+
'output',
|
|
132
|
+
'p',
|
|
133
|
+
'param',
|
|
134
|
+
'picture',
|
|
135
|
+
'pre',
|
|
136
|
+
'progress',
|
|
137
|
+
'q',
|
|
138
|
+
'rp',
|
|
139
|
+
'rt',
|
|
140
|
+
'ruby',
|
|
141
|
+
's',
|
|
142
|
+
'samp',
|
|
143
|
+
'script',
|
|
144
|
+
'section',
|
|
145
|
+
'select',
|
|
146
|
+
'slot',
|
|
147
|
+
'small',
|
|
148
|
+
'source',
|
|
149
|
+
'span',
|
|
150
|
+
'strong',
|
|
151
|
+
'style',
|
|
152
|
+
'sub',
|
|
153
|
+
'summary',
|
|
154
|
+
'sup',
|
|
155
|
+
'table',
|
|
156
|
+
'tbody',
|
|
157
|
+
'td',
|
|
158
|
+
'template',
|
|
159
|
+
'textarea',
|
|
160
|
+
'tfoot',
|
|
161
|
+
'th',
|
|
162
|
+
'thead',
|
|
163
|
+
'time',
|
|
164
|
+
'title',
|
|
165
|
+
'tr',
|
|
166
|
+
'track',
|
|
167
|
+
'u',
|
|
168
|
+
'ul',
|
|
169
|
+
'var',
|
|
170
|
+
'video',
|
|
171
|
+
'wbr',
|
|
172
|
+
]);
|
|
173
|
+
/** Block-level HTML elements that should produce line breaks in extracted text. */
|
|
174
|
+
const BLOCK_TAGS = new Set([
|
|
175
|
+
'p',
|
|
176
|
+
'div',
|
|
177
|
+
'h1',
|
|
178
|
+
'h2',
|
|
179
|
+
'h3',
|
|
180
|
+
'h4',
|
|
181
|
+
'h5',
|
|
182
|
+
'h6',
|
|
183
|
+
'li',
|
|
184
|
+
'tr',
|
|
185
|
+
'td',
|
|
186
|
+
'th',
|
|
187
|
+
'blockquote',
|
|
188
|
+
'pre',
|
|
189
|
+
'dt',
|
|
190
|
+
'dd',
|
|
191
|
+
'figcaption',
|
|
192
|
+
'section',
|
|
193
|
+
'article',
|
|
194
|
+
'details',
|
|
195
|
+
'summary',
|
|
196
|
+
'br',
|
|
197
|
+
'hr',
|
|
198
|
+
]);
|
|
199
|
+
/**
|
|
200
|
+
* Minimum link density (0–1) and minimum link count for an element to be
|
|
201
|
+
* classified as navigation chrome. Navigation panels are structurally
|
|
202
|
+
* distinguishable from content: they consist almost entirely of links with
|
|
203
|
+
* very little non-link text between them. Content sections, even link-heavy
|
|
204
|
+
* ones like "Related resources", include enough description text to stay
|
|
205
|
+
* well below this threshold.
|
|
206
|
+
*/
|
|
207
|
+
const NAV_LINK_DENSITY_THRESHOLD = 0.7;
|
|
208
|
+
const NAV_MIN_LINK_COUNT = 10;
|
|
209
|
+
/**
|
|
210
|
+
* Extract plain text from HTML, stripping chrome elements.
|
|
211
|
+
* Inserts newlines between block-level elements so that paragraphs,
|
|
212
|
+
* list items, etc. become separate lines in the output.
|
|
213
|
+
*/
|
|
214
|
+
/**
|
|
215
|
+
* Heuristic selectors for content containers, tried in order when
|
|
216
|
+
* <main> and <article> are not present. Common across doc platforms
|
|
217
|
+
* like Mintlify, ReadMe, Docusaurus/Starlight, and custom sites.
|
|
218
|
+
*/
|
|
219
|
+
const CONTENT_SELECTORS = [
|
|
220
|
+
'[role="main"]',
|
|
221
|
+
'#content',
|
|
222
|
+
'.sl-markdown-content',
|
|
223
|
+
'.markdown-content',
|
|
224
|
+
'.markdown-body',
|
|
225
|
+
'.docs-content',
|
|
226
|
+
'.doc-content',
|
|
227
|
+
'.main-pane',
|
|
228
|
+
'.page-content',
|
|
229
|
+
'.prose',
|
|
230
|
+
];
|
|
231
|
+
function extractHtmlText(html) {
|
|
232
|
+
const root = parse(html);
|
|
233
|
+
// Prefer the tightest content container available.
|
|
234
|
+
// Priority: heuristic selector inside article/main > article inside main
|
|
235
|
+
// > article > heuristic selector inside main > main > heuristic on root > body
|
|
236
|
+
const main = root.querySelector('main');
|
|
237
|
+
const article = main?.querySelector('article') ?? root.querySelector('article');
|
|
238
|
+
let content = null;
|
|
239
|
+
// Look for a heuristic content selector inside the best semantic container
|
|
240
|
+
const semanticContainer = article ?? main;
|
|
241
|
+
if (semanticContainer) {
|
|
242
|
+
for (const selector of CONTENT_SELECTORS) {
|
|
243
|
+
content = semanticContainer.querySelector(selector);
|
|
244
|
+
if (content)
|
|
245
|
+
break;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
// Fall back to the semantic container itself
|
|
249
|
+
if (!content)
|
|
250
|
+
content = semanticContainer;
|
|
251
|
+
// If no semantic container, try heuristic selectors on the root
|
|
252
|
+
if (!content) {
|
|
253
|
+
for (const selector of CONTENT_SELECTORS) {
|
|
254
|
+
content = root.querySelector(selector);
|
|
255
|
+
if (content)
|
|
256
|
+
break;
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
if (!content)
|
|
260
|
+
content = root.querySelector('body');
|
|
261
|
+
if (!content)
|
|
262
|
+
return root.text;
|
|
263
|
+
// Remove non-content elements by tag
|
|
264
|
+
for (const tag of STRIP_TAGS) {
|
|
265
|
+
for (const el of content.querySelectorAll(tag)) {
|
|
266
|
+
el.remove();
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
// Remove common doc-site chrome by CSS selector
|
|
270
|
+
for (const selector of STRIP_SELECTORS) {
|
|
271
|
+
for (const el of content.querySelectorAll(selector)) {
|
|
272
|
+
el.remove();
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
// Remove elements that look like navigation based on link density.
|
|
276
|
+
// Navigation panels (sidebars, header menus) are structurally distinct
|
|
277
|
+
// from content: they consist almost entirely of links. This catches
|
|
278
|
+
// nav-like elements that use <div> instead of <nav>/<aside>.
|
|
279
|
+
for (const el of content.querySelectorAll('*')) {
|
|
280
|
+
const text = el.text || '';
|
|
281
|
+
if (text.length < 100)
|
|
282
|
+
continue;
|
|
283
|
+
const links = el.querySelectorAll('a');
|
|
284
|
+
if (links.length < NAV_MIN_LINK_COUNT)
|
|
285
|
+
continue;
|
|
286
|
+
const linkTextLen = links.reduce((sum, a) => sum + (a.text?.length || 0), 0);
|
|
287
|
+
if (linkTextLen / text.length > NAV_LINK_DENSITY_THRESHOLD) {
|
|
288
|
+
el.remove();
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
// Insert newlines before block-level elements so .text produces
|
|
292
|
+
// separated lines instead of smashing paragraphs together
|
|
293
|
+
for (const tag of BLOCK_TAGS) {
|
|
294
|
+
for (const el of content.querySelectorAll(tag)) {
|
|
295
|
+
el.insertAdjacentHTML('beforebegin', '\n');
|
|
296
|
+
el.insertAdjacentHTML('afterend', '\n');
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
// node-html-parser treats <pre> content as raw text, so <style> tags
|
|
300
|
+
// injected inside code blocks (e.g., Emotion CSS-in-JS / Leafygreen)
|
|
301
|
+
// survive DOM-level stripping. Remove <style>...</style> blocks first,
|
|
302
|
+
// inject newlines before <div tags to separate code lines (e.g.,
|
|
303
|
+
// Expressive Code / Shiki use <div class="ec-line"> inside <pre>),
|
|
304
|
+
// then strip HTML tags while preserving angle-bracket placeholders
|
|
305
|
+
// like <YOUR_API_KEY> or <clusterName> (decoded from <...> entities).
|
|
306
|
+
return content.text
|
|
307
|
+
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
|
|
308
|
+
.replace(/<!--[\s\S]*?-->/g, '')
|
|
309
|
+
.replace(/<div[\s>]/gi, '\n<div ')
|
|
310
|
+
.replace(/<\/[^>\s]+>/g, '')
|
|
311
|
+
.replace(/<([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>/g, (_match, tag, rest) => HTML_TAG_NAMES.has(tag.toLowerCase()) ? '' : tag + rest);
|
|
312
|
+
}
|
|
313
|
+
/**
|
|
314
|
+
* Extract plain text from markdown by stripping all formatting.
|
|
315
|
+
*/
|
|
316
|
+
function extractMarkdownText(markdown) {
|
|
317
|
+
return (markdown
|
|
318
|
+
// Remove code fences but keep code content
|
|
319
|
+
.replace(/^```[\w]*\n?/gm, '')
|
|
320
|
+
// Remove heading markers
|
|
321
|
+
.replace(/^#{1,6}\s+/gm, '')
|
|
322
|
+
// Remove setext-style heading underlines
|
|
323
|
+
.replace(/^[=-]+$/gm, '')
|
|
324
|
+
// Remove link/image URLs, keep text: [text](url) → text
|
|
325
|
+
.replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
|
|
326
|
+
// Remove reference-style link definitions
|
|
327
|
+
.replace(/^\[.*?\]:\s+.*$/gm, '')
|
|
328
|
+
// Remove list bullets/numbers (before emphasis, so leading * isn't
|
|
329
|
+
// misinterpreted as an emphasis marker)
|
|
330
|
+
.replace(/^[\s]*[-*+]\s+/gm, '')
|
|
331
|
+
.replace(/^[\s]*\d+\.\s+/gm, '')
|
|
332
|
+
// Remove inline code backticks but keep content (before emphasis
|
|
333
|
+
// stripping so that underscores in code identifiers aren't mangled)
|
|
334
|
+
.replace(/`([^`]+)`/g, '$1')
|
|
335
|
+
// Remove emphasis markers (* only — underscores are too common in
|
|
336
|
+
// code identifiers like mongoc_client_get_database and cause false
|
|
337
|
+
// mismatches when stripped as emphasis)
|
|
338
|
+
.replace(/(\*{1,3})(.*?)\1/g, '$2')
|
|
339
|
+
// Remove blockquote markers
|
|
340
|
+
.replace(/^>\s?/gm, '')
|
|
341
|
+
// Remove horizontal rules
|
|
342
|
+
.replace(/^[-*_]{3,}$/gm, ''));
|
|
343
|
+
}
|
|
344
|
+
/**
|
|
345
|
+
* Normalize text for fuzzy containment matching:
|
|
346
|
+
* strip zero-width characters, normalize typographic quotes,
|
|
347
|
+
* strip angle brackets around placeholders, collapse whitespace, and lowercase.
|
|
348
|
+
*/
|
|
349
|
+
function normalize(text) {
|
|
350
|
+
return (text
|
|
351
|
+
.replace(/\u200B/g, '')
|
|
352
|
+
.replace(/\u200C/g, '')
|
|
353
|
+
.replace(/\u200D/g, '')
|
|
354
|
+
.replace(/\uFEFF/g, '')
|
|
355
|
+
.replace(/[\u2018\u2019\u201A]/g, "'")
|
|
356
|
+
.replace(/[\u201C\u201D\u201E]/g, '"')
|
|
357
|
+
.replace(/[\u2013\u2014]/g, '-')
|
|
358
|
+
.replace(/\u2026/g, '...')
|
|
359
|
+
// Strip angle brackets but keep content — normalizes <YOUR_API_KEY> to
|
|
360
|
+
// YOUR_API_KEY so HTML-side (entities decoded, tags stripped) and
|
|
361
|
+
// markdown-side (raw angle brackets) produce the same text.
|
|
362
|
+
.replace(/<([^>]+)>/g, '$1')
|
|
363
|
+
.toLowerCase()
|
|
364
|
+
.replace(/\s+/g, ' ')
|
|
365
|
+
.trim());
|
|
366
|
+
}
|
|
367
|
+
/**
|
|
368
|
+
* Check if a normalized segment matches any common noise pattern.
|
|
369
|
+
*/
|
|
370
|
+
function isNoiseSegment(normalized) {
|
|
371
|
+
return NOISE_PATTERNS.some((pattern) => pattern.test(normalized));
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Split text into meaningful segments: non-empty lines of at least
|
|
375
|
+
* MIN_SEGMENT_LENGTH characters, trimmed, with common noise filtered out.
|
|
376
|
+
*/
|
|
377
|
+
function toSegments(text) {
|
|
378
|
+
return text
|
|
379
|
+
.split('\n')
|
|
380
|
+
.map((line) => line.trim())
|
|
381
|
+
.filter((line) => line.length >= MIN_SEGMENT_LENGTH)
|
|
382
|
+
.filter((line) => !isNoiseSegment(line.toLowerCase()));
|
|
383
|
+
}
|
|
384
|
+
/**
|
|
385
|
+
* Check what fraction of HTML segments can be found in the markdown text.
|
|
386
|
+
* Uses normalized substring containment rather than positional diffing,
|
|
387
|
+
* so reordering and formatting differences don't cause false positives.
|
|
388
|
+
*/
|
|
389
|
+
function computeParity(htmlText, markdownText) {
|
|
390
|
+
// Deduplicate segments so repeated chrome (breadcrumbs, nav titles) or
|
|
391
|
+
// repeated content is only counted once when checking for presence.
|
|
392
|
+
const allSegments = toSegments(htmlText);
|
|
393
|
+
const seen = new Set();
|
|
394
|
+
const htmlSegments = [];
|
|
395
|
+
for (const seg of allSegments) {
|
|
396
|
+
const key = normalize(seg);
|
|
397
|
+
if (!seen.has(key)) {
|
|
398
|
+
seen.add(key);
|
|
399
|
+
htmlSegments.push(seg);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
if (htmlSegments.length === 0) {
|
|
403
|
+
return {
|
|
404
|
+
status: 'pass',
|
|
405
|
+
missingPercent: 0,
|
|
406
|
+
totalSegments: 0,
|
|
407
|
+
missingSegments: 0,
|
|
408
|
+
sampleDiffs: [],
|
|
409
|
+
};
|
|
410
|
+
}
|
|
411
|
+
// Pages with very few segments produce volatile percentages (a couple of
|
|
412
|
+
// breadcrumb items on a 7-segment page = 30%+). Auto-pass these.
|
|
413
|
+
if (htmlSegments.length < MIN_SEGMENTS_FOR_COMPARISON) {
|
|
414
|
+
return {
|
|
415
|
+
status: 'pass',
|
|
416
|
+
missingPercent: 0,
|
|
417
|
+
totalSegments: htmlSegments.length,
|
|
418
|
+
missingSegments: 0,
|
|
419
|
+
sampleDiffs: [],
|
|
420
|
+
};
|
|
421
|
+
}
|
|
422
|
+
const normalizedMd = normalize(extractMarkdownText(markdownText));
|
|
423
|
+
const sampleDiffs = [];
|
|
424
|
+
let missingCount = 0;
|
|
425
|
+
for (const segment of htmlSegments) {
|
|
426
|
+
const normalizedSegment = normalize(segment);
|
|
427
|
+
if (!normalizedMd.includes(normalizedSegment)) {
|
|
428
|
+
missingCount++;
|
|
429
|
+
if (sampleDiffs.length < 5) {
|
|
430
|
+
sampleDiffs.push(`- ${segment}`);
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
const missingPercent = htmlSegments.length > 0 ? Math.round((missingCount / htmlSegments.length) * 100) : 0;
|
|
435
|
+
let status;
|
|
436
|
+
if (missingPercent < WARN_THRESHOLD) {
|
|
437
|
+
status = 'pass';
|
|
438
|
+
}
|
|
439
|
+
else if (missingPercent < FAIL_THRESHOLD) {
|
|
440
|
+
status = 'warn';
|
|
441
|
+
}
|
|
442
|
+
else {
|
|
443
|
+
status = 'fail';
|
|
444
|
+
}
|
|
445
|
+
return {
|
|
446
|
+
status,
|
|
447
|
+
missingPercent,
|
|
448
|
+
totalSegments: htmlSegments.length,
|
|
449
|
+
missingSegments: missingCount,
|
|
450
|
+
sampleDiffs,
|
|
451
|
+
};
|
|
452
|
+
}
|
|
453
|
+
/**
|
|
454
|
+
* Derive the HTML page URL from a cached page URL.
|
|
455
|
+
* Inverts the transforms from toMdUrls():
|
|
456
|
+
* /docs/guide.md → /docs/guide
|
|
457
|
+
* /docs/guide/index.md → /docs/guide/
|
|
458
|
+
* /docs/guide.mdx → /docs/guide
|
|
459
|
+
* If the URL doesn't end in .md/.mdx, return it unchanged.
|
|
460
|
+
*/
|
|
461
|
+
function toHtmlUrl(url) {
|
|
462
|
+
const parsed = new URL(url);
|
|
463
|
+
if (parsed.pathname.endsWith('/index.md') || parsed.pathname.endsWith('/index.mdx')) {
|
|
464
|
+
parsed.pathname = parsed.pathname.replace(/\/index\.mdx?$/, '/');
|
|
465
|
+
return parsed.toString();
|
|
466
|
+
}
|
|
467
|
+
if (/\.mdx?$/i.test(parsed.pathname)) {
|
|
468
|
+
parsed.pathname = parsed.pathname.replace(/\.mdx?$/i, '');
|
|
469
|
+
return parsed.toString();
|
|
470
|
+
}
|
|
471
|
+
return url;
|
|
472
|
+
}
|
|
473
|
+
function worstStatus(statuses) {
|
|
474
|
+
if (statuses.includes('fail'))
|
|
475
|
+
return 'fail';
|
|
476
|
+
if (statuses.includes('warn'))
|
|
477
|
+
return 'warn';
|
|
478
|
+
return 'pass';
|
|
479
|
+
}
|
|
480
|
+
async function check(ctx) {
|
|
481
|
+
const id = 'markdown-content-parity';
|
|
482
|
+
const category = 'observability';
|
|
483
|
+
// Collect pages that have cached markdown from upstream checks
|
|
484
|
+
const pagesToCompare = [];
|
|
485
|
+
for (const [url, cached] of ctx.pageCache) {
|
|
486
|
+
if (cached.markdown?.content) {
|
|
487
|
+
pagesToCompare.push({
|
|
488
|
+
url,
|
|
489
|
+
markdownContent: cached.markdown.content,
|
|
490
|
+
markdownSource: cached.markdown.source,
|
|
491
|
+
});
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
if (pagesToCompare.length === 0) {
|
|
495
|
+
return {
|
|
496
|
+
id,
|
|
497
|
+
category,
|
|
498
|
+
status: 'skip',
|
|
499
|
+
message: 'No pages with markdown versions available to compare',
|
|
500
|
+
};
|
|
501
|
+
}
|
|
502
|
+
const results = [];
|
|
503
|
+
const concurrency = ctx.options.maxConcurrency;
|
|
504
|
+
for (let i = 0; i < pagesToCompare.length; i += concurrency) {
|
|
505
|
+
const batch = pagesToCompare.slice(i, i + concurrency);
|
|
506
|
+
const batchResults = await Promise.all(batch.map(async ({ url, markdownContent, markdownSource }) => {
|
|
507
|
+
try {
|
|
508
|
+
// Fetch the HTML version of the page
|
|
509
|
+
const htmlUrl = toHtmlUrl(url);
|
|
510
|
+
const page = await fetchPage(ctx, htmlUrl);
|
|
511
|
+
if (page.status >= 400) {
|
|
512
|
+
// HTML URL returned an error (e.g., 404) — skip this page
|
|
513
|
+
return {
|
|
514
|
+
url,
|
|
515
|
+
markdownSource,
|
|
516
|
+
status: 'pass',
|
|
517
|
+
missingPercent: 0,
|
|
518
|
+
totalSegments: 0,
|
|
519
|
+
missingSegments: 0,
|
|
520
|
+
sampleDiffs: [],
|
|
521
|
+
error: `HTML page returned ${page.status}`,
|
|
522
|
+
};
|
|
523
|
+
}
|
|
524
|
+
if (!page.isHtml) {
|
|
525
|
+
// The "HTML" version is already markdown/plain text — no meaningful comparison
|
|
526
|
+
return {
|
|
527
|
+
url,
|
|
528
|
+
markdownSource,
|
|
529
|
+
status: 'pass',
|
|
530
|
+
missingPercent: 0,
|
|
531
|
+
totalSegments: 0,
|
|
532
|
+
missingSegments: 0,
|
|
533
|
+
sampleDiffs: [],
|
|
534
|
+
};
|
|
535
|
+
}
|
|
536
|
+
const htmlText = extractHtmlText(page.body);
|
|
537
|
+
const parity = computeParity(htmlText, markdownContent);
|
|
538
|
+
return { url, markdownSource, ...parity };
|
|
539
|
+
}
|
|
540
|
+
catch (err) {
|
|
541
|
+
return {
|
|
542
|
+
url,
|
|
543
|
+
markdownSource,
|
|
544
|
+
status: 'fail',
|
|
545
|
+
missingPercent: 100,
|
|
546
|
+
totalSegments: 0,
|
|
547
|
+
missingSegments: 0,
|
|
548
|
+
sampleDiffs: [],
|
|
549
|
+
error: err instanceof Error ? err.message : String(err),
|
|
550
|
+
};
|
|
551
|
+
}
|
|
552
|
+
}));
|
|
553
|
+
results.push(...batchResults);
|
|
554
|
+
}
|
|
555
|
+
const successful = results.filter((r) => !r.error);
|
|
556
|
+
const fetchErrors = results.filter((r) => r.error).length;
|
|
557
|
+
if (successful.length === 0) {
|
|
558
|
+
return {
|
|
559
|
+
id,
|
|
560
|
+
category,
|
|
561
|
+
status: 'fail',
|
|
562
|
+
message: `Could not fetch HTML for any pages to compare${fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : ''}`,
|
|
563
|
+
details: {
|
|
564
|
+
pagesCompared: 0,
|
|
565
|
+
fetchErrors,
|
|
566
|
+
pageResults: results,
|
|
567
|
+
},
|
|
568
|
+
};
|
|
569
|
+
}
|
|
570
|
+
const overallStatus = worstStatus(successful.map((r) => r.status));
|
|
571
|
+
const passBucket = successful.filter((r) => r.status === 'pass').length;
|
|
572
|
+
const warnBucket = successful.filter((r) => r.status === 'warn').length;
|
|
573
|
+
const failBucket = successful.filter((r) => r.status === 'fail').length;
|
|
574
|
+
const avgMissingPercent = successful.length > 0
|
|
575
|
+
? Math.round(successful.reduce((sum, r) => sum + r.missingPercent, 0) / successful.length)
|
|
576
|
+
: 0;
|
|
577
|
+
const suffix = fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '';
|
|
578
|
+
let message;
|
|
579
|
+
if (overallStatus === 'pass') {
|
|
580
|
+
message = `All ${successful.length} pages have equivalent markdown and HTML content (avg ${avgMissingPercent}% missing)${suffix}`;
|
|
581
|
+
}
|
|
582
|
+
else if (overallStatus === 'warn') {
|
|
583
|
+
message = `${warnBucket} of ${successful.length} pages have minor content differences between markdown and HTML${suffix}`;
|
|
584
|
+
}
|
|
585
|
+
else {
|
|
586
|
+
message = `${failBucket} of ${successful.length} pages have substantive content differences between markdown and HTML (avg ${avgMissingPercent}% missing)${suffix}`;
|
|
587
|
+
}
|
|
3
588
|
return {
|
|
4
|
-
id
|
|
5
|
-
category
|
|
6
|
-
status:
|
|
7
|
-
message
|
|
589
|
+
id,
|
|
590
|
+
category,
|
|
591
|
+
status: overallStatus,
|
|
592
|
+
message,
|
|
593
|
+
details: {
|
|
594
|
+
pagesCompared: successful.length,
|
|
595
|
+
passBucket,
|
|
596
|
+
warnBucket,
|
|
597
|
+
failBucket,
|
|
598
|
+
fetchErrors,
|
|
599
|
+
avgMissingPercent,
|
|
600
|
+
pageResults: results,
|
|
601
|
+
},
|
|
8
602
|
};
|
|
9
603
|
}
|
|
10
604
|
registerCheck({
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown-content-parity.js","sourceRoot":"","sources":["../../../src/checks/observability/markdown-content-parity.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAG/C,KAAK,UAAU,KAAK,CAAC,IAAkB;IACrC,OAAO;QACL,EAAE,EAAE,yBAAyB;QAC7B,QAAQ,EAAE,eAAe;QACzB,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,qBAAqB;KAC/B,CAAC;AACJ,CAAC;AAED,aAAa,CAAC;IACZ,EAAE,EAAE,yBAAyB;IAC7B,QAAQ,EAAE,eAAe;IACzB,WAAW,EAAE,+DAA+D;IAC5E,SAAS,EAAE,CAAC,CAAC,sBAAsB,EAAE,qBAAqB,CAAC,CAAC;IAC5D,GAAG,EAAE,KAAK;CACX,CAAC,CAAC"}
|
|
1
|
+
{"version":3,"file":"markdown-content-parity.js","sourceRoot":"","sources":["../../../src/checks/observability/markdown-content-parity.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACzC,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAC/C,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AAGxD,4EAA4E;AAC5E,MAAM,cAAc,GAAG,CAAC,CAAC;AACzB,MAAM,cAAc,GAAG,EAAE,CAAC;AAE1B,+EAA+E;AAC/E,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAE9B;;;;GAIG;AACH,MAAM,2BAA2B,GAAG,EAAE,CAAC;AAEvC,sEAAsE;AACtE,MAAM,UAAU,GAAG;IACjB,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,KAAK;IACL,OAAO;CACR,CAAC;AAEF,yEAAyE;AACzE,MAAM,eAAe,GAAG;IACtB,2BAA2B;IAC3B,2BAA2B;IAC3B,uBAAuB;IACvB,uBAAuB;IACvB,sBAAsB;IACtB,qBAAqB;IACrB,qBAAqB;IACrB,qBAAqB;IACrB,oBAAoB;IACpB,8BAA8B;IAC9B,gBAAgB;IAChB,cAAc;IACd,cAAc;IACd,UAAU;CACX,CAAC;AAEF;;;GAGG;AACH,MAAM,cAAc,GAAG;IACrB,eAAe;IACf,wBAAwB;IACxB,8BAA8B;IAC9B,2BAA2B,EAAE,iCAAiC;IAC9D,4BAA4B;IAC5B,qBAAqB,EAAE,+BAA+B;IACtD,wBAAwB;IACxB,WAAW,EAAE,+CAA+C;CAC7D,CAAC;AAiBF;;;;;GAKG;AACH,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;IAC7B,GAAG;IACH,MAAM;IACN,SAAS;IACT,SAAS;IACT,OAAO;IACP,OAAO;IACP,GAAG;IACH,KAAK;IACL,KAAK;IACL,YAAY;IACZ,MAAM;IACN,IAAI;IACJ,QAAQ;IACR,QAAQ;IACR,SAAS;IACT,MAAM;IACN,MAAM;IACN,KAAK;IACL,UAAU;IACV,MAAM;IACN,IAAI;IACJ,KAAK;IACL,SAAS;IACT,KAAK;IACL,QAAQ;IACR,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,OAAO;IACP,UAAU;IACV,YAAY;IACZ,QAAQ;IACR,QAAQ;IACR,MAAM;IACN,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,MAAM;IACN,QAAQ;IACR,IAAI;IACJ,MAAM;IACN,GAAG;IACH,QAAQ;IACR,KAAK;IACL,OAAO;IACP,KAAK;IACL,KAAK;IACL,OAAO;IACP,QAAQ;IACR,IAAI;IACJ,MAAM;IACN,MAAM;IACN,KAAK;IACL,MAAM;IACN,MAAM;IACN,OAAO;IACP,KAAK;IACL,UAAU;IACV,QAAQ;IACR,IAAI;IACJ,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,GAAG;IACH,OAAO;IACP,SAAS;IACT,KAAK;IACL,UAAU;IACV,GAAG;IACH,IAAI;IACJ,IAAI;IACJ,MAAM;IACN,GAAG;IACH,MAAM;IACN,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,MAAM;IACN,OAAO;IACP,QAAQ;IACR,MAAM;IACN,QAAQ;IACR,OAAO;IACP,KAAK;IACL,SAAS;IACT,KAAK;IACL,OAAO;IACP,OAAO;IACP,IAAI;IACJ,UAAU;IACV,UAAU;IACV,OAAO;IACP,IAAI;IACJ,OAAO;IACP,MAAM;IACN,OAAO;IACP,IAAI;IACJ,OAAO;IACP,GAAG;IACH,IAAI;IACJ,KAAK;IACL,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAEH,mFAAmF;AACnF,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;IACzB,GAAG;IACH,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,YAAY;IACZ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,YAAY;IACZ,SAAS;IACT,SAAS;IACT,SAAS;IACT,SAAS;IACT,IAAI;IACJ,IAAI;CACL,CAAC,CAAC;AAEH;;;;;;;GAOG;AACH,MAAM,0BAA0B,GAAG,GAAG,CAAC;AACvC,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAE9B;;;;GAIG;AACH;;;;GAIG;AACH,MAAM,iBAAiB,GAAG;IACxB,eAAe;IACf,UAAU;IACV,sBAAsB;IACtB,mBAAmB;IACnB,gBAAgB;IAChB,eAAe;IACf,cAAc;IACd,YAAY;IACZ,eAAe;IACf,QAAQ;CACT,CAAC;AAEF,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;IAEzB,mDAAmD;IACnD,yEAAyE;IACzE,+EAA+E;IAC/E,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACxC,MAAM,OAAO,GAAG,IAAI,EAAE,aAAa,CAAC,SAAS,CAAC,IAAI,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;IAChF,IAAI,OAAO,GAA0C,IAAI,CAAC;IAE1D,2EAA2E;IAC3E,MAAM,iBAAiB,GAAG,OAAO,IAAI,IAAI,CAAC;IAC1C,IAAI,iBAAiB,EAAE,CAAC;QACtB,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;YACzC,OAAO,GAAG,iBAAiB,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YACpD,IAAI,OAAO;gBAAE,MAAM;QACrB,CAAC;IACH,CAAC;IACD,6CAA6C;IAC7C,IAAI,CAAC,OAAO;QAAE,OAAO,GAAG,iBAAiB,CAAC;IAE1C,gEAAgE;IAChE,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;YACzC,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YACvC,IAAI,OAAO;gBAAE,MAAM;QACrB,CAAC;IACH,CAAC;IAED,IAAI,CAAC,OAAO;QAAE,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACnD,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC,IAAI,CAAC;IAE/B,qCAAqC;IACrC,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,KAAK,MAAM,EAAE,IAAI,OAAO,CAAC,gBAAgB,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,gDAAgD;IAChD,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;QACvC,KAAK,MAAM,EAAE,IAAI,OAAO,CAAC,gBAAgB,CAAC,QAAQ,CAAC,EAAE,CAAC;YACpD,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,uEAAuE;IACvE,oEAAoE;IACpE,6DAA6D;IAC7D,KAAK,MAAM,EAAE,IAAI,OAAO,CAAC,gBAAgB,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,IAAI,EAAE,CAAC;QAC3B,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG;YAAE,SAAS;QAChC,MAAM,KAAK,GAAG,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;QACvC,IAAI,KAAK,CAAC,MAAM,GAAG,kBAAkB;YAAE,SAAS;QAChD,MAAM,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC7E,IAAI,WAAW,GAAG,IAAI,CAAC,MAAM,GAAG,0BAA0B,EAAE,CAAC;YAC3D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,gEAAgE;IAChE,0DAA0D;IAC1D,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,KAAK,MAAM,EAAE,IAAI,OAAO,CAAC,gBAAgB,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,EAAE,CAAC,kBAAkB,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;YAC3C,EAAE,CAAC,kBAAkB,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,qEAAqE;IACrE,qEAAqE;IACrE,uEAAuE;IACvE,iEAAiE;IACjE,mEAAmE;IACnE,mEAAmE;IACnE,4EAA4E;IAC5E,OAAO,OAAO,CAAC,IAAI;SAChB,OAAO,CAAC,iCAAiC,EAAE,EAAE,CAAC;SAC9C,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;SAC/B,OAAO,CAAC,aAAa,EAAE,SAAS,CAAC;SACjC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;SAC3B,OAAO,CAAC,mCAAmC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,CAClE,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,CACxD,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,OAAO,CACL,QAAQ;QACN,2CAA2C;SAC1C,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC;QAC9B,yBAAyB;SACxB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;QAC5B,yCAAyC;SACxC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;QACzB,wDAAwD;SACvD,OAAO,CAAC,0BAA0B,EAAE,IAAI,CAAC;QAC1C,0CAA0C;SACzC,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC;QACjC,mEAAmE;QACnE,wCAAwC;SACvC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;SAC/B,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAChC,iEAAiE;QACjE,oEAAoE;SACnE,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;QAC5B,kEAAkE;QAClE,mEAAmE;QACnE,wCAAwC;SACvC,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC;QACnC,4BAA4B;SAC3B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;QACvB,0BAA0B;SACzB,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC,CAChC,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,CACL,IAAI;SACD,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,uBAAuB,EAAE,GAAG,CAAC;SACrC,OAAO,CAAC,uBAAuB,EAAE,GAAG,CAAC;SACrC,OAAO,CAAC,iBAAiB,EAAE,GAAG,CAAC;SAC/B,OAAO,CAAC,SAAS,EAAE,KAAK,CAAC;QAC1B,uEAAuE;QACvE,kEAAkE;QAClE,4DAA4D;SAC3D,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;SAC3B,WAAW,EAAE;SACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CACV,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,UAAkB;IACxC,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC;AACpE,CAAC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,IAAY;IAC9B,OAAO,IAAI;SACR,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,IAAI,kBAAkB,CAAC;SACnD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED;;;;GAIG;AACH,SAAS,aAAa,CACpB,QAAgB,EAChB,YAAoB;IAEpB,uEAAuE;IACvE,oEAAoE;IACpE,MAAM,WAAW,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IACzC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;QAC3B,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO;YACL,MAAM,EAAE,MAAM;YACd,cAAc,EAAE,CAAC;YACjB,aAAa,EAAE,CAAC;YAChB,eAAe,EAAE,CAAC;YAClB,WAAW,EAAE,EAAE;SAChB,CAAC;IACJ,CAAC;IAED,yEAAyE;IACzE,iEAAiE;IACjE,IAAI,YAAY,CAAC,MAAM,GAAG,2BAA2B,EAAE,CAAC;QACtD,OAAO;YACL,MAAM,EAAE,MAAM;YACd,cAAc,EAAE,CAAC;YACjB,aAAa,EAAE,YAAY,CAAC,MAAM;YAClC,eAAe,EAAE,CAAC;YAClB,WAAW,EAAE,EAAE;SAChB,CAAC;IACJ,CAAC;IAED,MAAM,YAAY,GAAG,SAAS,CAAC,mBAAmB,CAAC,YAAY,CAAC,CAAC,CAAC;IAClE,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,iBAAiB,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;QAC7C,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC;YAC9C,YAAY,EAAE,CAAC;YACf,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,cAAc,GAClB,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,YAAY,GAAG,YAAY,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAmB,CAAC;IACxB,IAAI,cAAc,GAAG,cAAc,EAAE,CAAC;QACpC,MAAM,GAAG,MAAM,CAAC;IAClB,CAAC;SAAM,IAAI,cAAc,GAAG,cAAc,EAAE,CAAC;QAC3C,MAAM,GAAG,MAAM,CAAC;IAClB,CAAC;SAAM,CAAC;QACN,MAAM,GAAG,MAAM,CAAC;IAClB,CAAC;IAED,OAAO;QACL,MAAM;QACN,cAAc;QACd,aAAa,EAAE,YAAY,CAAC,MAAM;QAClC,eAAe,EAAE,YAAY;QAC7B,WAAW;KACZ,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,SAAS,CAAC,GAAW;IAC5B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,IAAI,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;QACpF,MAAM,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC;QACjE,OAAO,MAAM,CAAC,QAAQ,EAAE,CAAC;IAC3B,CAAC;IACD,IAAI,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;QACrC,MAAM,CAAC,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC1D,OAAO,MAAM,CAAC,QAAQ,EAAE,CAAC;IAC3B,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,WAAW,CAAC,QAAuB;IAC1C,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IAC7C,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IAC7C,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,KAAK,UAAU,KAAK,CAAC,GAAiB;IACpC,MAAM,EAAE,GAAG,yBAAyB,CAAC;IACrC,MAAM,QAAQ,GAAG,eAAe,CAAC;IAEjC,+DAA+D;IAC/D,MAAM,cAAc,GAIf,EAAE,CAAC;IAER,KAAK,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,SAAS,EAAE,CAAC;QAC1C,IAAI,MAAM,CAAC,QAAQ,EAAE,OAAO,EAAE,CAAC;YAC7B,cAAc,CAAC,IAAI,CAAC;gBAClB,GAAG;gBACH,eAAe,EAAE,MAAM,CAAC,QAAQ,CAAC,OAAO;gBACxC,cAAc,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM;aACvC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO;YACL,EAAE;YACF,QAAQ;YACR,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,sDAAsD;SAChE,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAuB,EAAE,CAAC;IACvC,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC;IAE/C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;QAC5D,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC;QACvD,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,GAAG,CACpC,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,GAAG,EAAE,eAAe,EAAE,cAAc,EAAE,EAA6B,EAAE;YACtF,IAAI,CAAC;gBACH,qCAAqC;gBACrC,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBAC/B,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;gBAE3C,IAAI,IAAI,CAAC,MAAM,IAAI,GAAG,EAAE,CAAC;oBACvB,0DAA0D;oBAC1D,OAAO;wBACL,GAAG;wBACH,cAAc;wBACd,MAAM,EAAE,MAAM;wBACd,cAAc,EAAE,CAAC;wBACjB,aAAa,EAAE,CAAC;wBAChB,eAAe,EAAE,CAAC;wBAClB,WAAW,EAAE,EAAE;wBACf,KAAK,EAAE,sBAAsB,IAAI,CAAC,MAAM,EAAE;qBAC3C,CAAC;gBACJ,CAAC;gBAED,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;oBACjB,+EAA+E;oBAC/E,OAAO;wBACL,GAAG;wBACH,cAAc;wBACd,MAAM,EAAE,MAAM;wBACd,cAAc,EAAE,CAAC;wBACjB,aAAa,EAAE,CAAC;wBAChB,eAAe,EAAE,CAAC;wBAClB,WAAW,EAAE,EAAE;qBAChB,CAAC;gBACJ,CAAC;gBAED,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC5C,MAAM,MAAM,GAAG,aAAa,CAAC,QAAQ,EAAE,eAAe,CAAC,CAAC;gBAExD,OAAO,EAAE,GAAG,EAAE,cAAc,EAAE,GAAG,MAAM,EAAE,CAAC;YAC5C,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO;oBACL,GAAG;oBACH,cAAc;oBACd,MAAM,EAAE,MAAM;oBACd,cAAc,EAAE,GAAG;oBACnB,aAAa,EAAE,CAAC;oBAChB,eAAe,EAAE,CAAC;oBAClB,WAAW,EAAE,EAAE;oBACf,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;iBACxD,CAAC;YACJ,CAAC;QACH,CAAC,CAAC,CACH,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC;IAChC,CAAC;IAED,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACnD,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;IAE1D,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,OAAO;YACL,EAAE;YACF,QAAQ;YACR,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,gDAAgD,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,WAAW,kBAAkB,CAAC,CAAC,CAAC,EAAE,EAAE;YACpH,OAAO,EAAE;gBACP,aAAa,EAAE,CAAC;gBAChB,WAAW;gBACX,WAAW,EAAE,OAAO;aACrB;SACF,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,WAAW,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;IACnE,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,iBAAiB,GACrB,UAAU,CAAC,MAAM,GAAG,CAAC;QACnB,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,cAAc,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QAC1F,CAAC,CAAC,CAAC,CAAC;IACR,MAAM,MAAM,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,WAAW,kBAAkB,CAAC,CAAC,CAAC,EAAE,CAAC;IAEzE,IAAI,OAAe,CAAC;IACpB,IAAI,aAAa,KAAK,MAAM,EAAE,CAAC;QAC7B,OAAO,GAAG,OAAO,UAAU,CAAC,MAAM,yDAAyD,iBAAiB,aAAa,MAAM,EAAE,CAAC;IACpI,CAAC;SAAM,IAAI,aAAa,KAAK,MAAM,EAAE,CAAC;QACpC,OAAO,GAAG,GAAG,UAAU,OAAO,UAAU,CAAC,MAAM,kEAAkE,MAAM,EAAE,CAAC;IAC5H,CAAC;SAAM,CAAC;QACN,OAAO,GAAG,GAAG,UAAU,OAAO,UAAU,CAAC,MAAM,8EAA8E,iBAAiB,aAAa,MAAM,EAAE,CAAC;IACtK,CAAC;IAED,OAAO;QACL,EAAE;QACF,QAAQ;QACR,MAAM,EAAE,aAAa;QACrB,OAAO;QACP,OAAO,EAAE;YACP,aAAa,EAAE,UAAU,CAAC,MAAM;YAChC,UAAU;YACV,UAAU;YACV,UAAU;YACV,WAAW;YACX,iBAAiB;YACjB,WAAW,EAAE,OAAO;SACrB;KACF,CAAC;AACJ,CAAC;AAED,aAAa,CAAC;IACZ,EAAE,EAAE,yBAAyB;IAC7B,QAAQ,EAAE,eAAe;IACzB,WAAW,EAAE,+DAA+D;IAC5E,SAAS,EAAE,CAAC,CAAC,sBAAsB,EAAE,qBAAqB,CAAC,CAAC;IAC5D,GAAG,EAAE,KAAK;CACX,CAAC,CAAC"}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { registerCheck } from '../registry.js';
|
|
2
|
-
import { looksLikeHtml } from '../../helpers/detect-markdown.js';
|
|
3
2
|
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
|
|
4
3
|
import { htmlToMarkdown } from '../../helpers/html-to-markdown.js';
|
|
4
|
+
import { fetchPage } from '../../helpers/fetch-page.js';
|
|
5
5
|
const CSS_PATTERN = /[{}\s]*[a-z0-9_-]+\s*:\s*[^;]+;/;
|
|
6
6
|
const JS_PATTERNS = [/^\s*(function|var|const|let|import|export)\b/, /^\s*\/\//, /[{};]\s*$/];
|
|
7
7
|
const INLINE_SCRIPT_MIN_LENGTH = 200;
|
|
@@ -155,12 +155,8 @@ async function check(ctx) {
|
|
|
155
155
|
const batch = pageUrls.slice(i, i + concurrency);
|
|
156
156
|
const batchResults = await Promise.all(batch.map(async (url) => {
|
|
157
157
|
try {
|
|
158
|
-
const
|
|
159
|
-
const
|
|
160
|
-
const contentType = response.headers.get('content-type') ?? '';
|
|
161
|
-
const isMarkdownType = contentType.includes('text/markdown') || contentType.includes('text/plain');
|
|
162
|
-
const isHtml = !isMarkdownType && (contentType.includes('text/html') || looksLikeHtml(body));
|
|
163
|
-
const markdown = isHtml ? htmlToMarkdown(body) : body;
|
|
158
|
+
const page = await fetchPage(ctx, url);
|
|
159
|
+
const markdown = page.isHtml ? htmlToMarkdown(page.body) : page.body;
|
|
164
160
|
const totalChars = markdown.length;
|
|
165
161
|
const contentStartChar = findContentStart(markdown);
|
|
166
162
|
const contentStartPercent = totalChars > 0 ? Math.round((contentStartChar / totalChars) * 100) : 0;
|