@se-studio/site-check 1.1.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/cms-seo/bulk-action-publish.d.ts +56 -0
- package/dist/cms-seo/bulk-action-publish.d.ts.map +1 -0
- package/dist/cms-seo/bulk-action-publish.js +488 -0
- package/dist/cms-seo/bulk-action-publish.js.map +1 -0
- package/dist/cms-seo/cma-client.d.ts +8 -0
- package/dist/cms-seo/cma-client.d.ts.map +1 -0
- package/dist/cms-seo/cma-client.js +11 -0
- package/dist/cms-seo/cma-client.js.map +1 -0
- package/dist/cms-seo/cma-types.d.ts +2 -0
- package/dist/cms-seo/cma-types.d.ts.map +1 -0
- package/dist/cms-seo/cma-types.js +2 -0
- package/dist/cms-seo/cma-types.js.map +1 -0
- package/dist/cms-seo/featured-image-backfill.d.ts +16 -0
- package/dist/cms-seo/featured-image-backfill.d.ts.map +1 -0
- package/dist/cms-seo/featured-image-backfill.js +357 -0
- package/dist/cms-seo/featured-image-backfill.js.map +1 -0
- package/dist/cms-seo/index.d.ts +6 -0
- package/dist/cms-seo/index.d.ts.map +1 -0
- package/dist/cms-seo/index.js +5 -0
- package/dist/cms-seo/index.js.map +1 -0
- package/dist/cms-seo/seo-audit.d.ts +74 -0
- package/dist/cms-seo/seo-audit.d.ts.map +1 -0
- package/dist/cms-seo/seo-audit.js +926 -0
- package/dist/cms-seo/seo-audit.js.map +1 -0
- package/dist/production-audit/index.d.ts +84 -0
- package/dist/production-audit/index.d.ts.map +1 -0
- package/dist/production-audit/index.js +794 -0
- package/dist/production-audit/index.js.map +1 -0
- package/package.json +24 -2
|
@@ -0,0 +1,794 @@
|
|
|
1
|
+
import { mkdir, writeFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
// Words ignored by significantTokens() when comparing slugs, titles and headings:
// common English function words plus first/second-person pronouns.
const STOPWORDS = new Set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
    'for', 'from', 'in', 'is', 'it', 'of', 'on', 'or',
    'the', 'to', 'with', 'our', 'your', 'we', 'us',
]);
|
|
28
|
+
/**
 * Render one value as a CSV field.
 *
 * The field is wrapped in double quotes (with embedded quotes doubled) when it
 * contains a quote, a comma, or a line break (CR or LF); otherwise it is
 * returned as-is.
 *
 * @param {unknown} value - Cell value; non-strings are stringified, null/undefined become ''.
 * @returns {string} The CSV-safe field text.
 */
function escapeCsv(value) {
    const text = typeof value === 'string' ? value : String(value ?? '');
    // A bare carriage return also splits a record for most CSV readers, so it
    // must trigger quoting just like "\n".
    if (!/[",\r\n]/.test(text)) {
        return text;
    }
    return `"${text.replaceAll('"', '""')}"`;
}
|
|
33
|
+
/**
 * Drop a single trailing slash from a base URL so paths can be appended with a leading slash.
 *
 * @param {string} baseUrl - Site origin, with or without a trailing slash.
 * @returns {string} The base URL without its final slash (only one slash is removed).
 */
function normalizeBaseUrl(baseUrl) {
    return baseUrl.endsWith('/') ? baseUrl.slice(0, -1) : baseUrl;
}
|
|
36
|
+
/**
 * Canonicalise a page URL's path to the site's trailing-slash convention.
 *
 * The root path "/" is never altered. Query string and fragment are kept.
 *
 * @param {string} url - Absolute URL; `new URL` throws if it cannot be parsed.
 * @param {boolean} trailingSlash - true to force a trailing slash, false to remove one.
 * @returns {string} The serialised, normalised URL.
 */
function normalizePageUrl(url, trailingSlash) {
    const target = new URL(url);
    const current = target.pathname;
    let next = current;
    if (current !== '/') {
        const endsWithSlash = current.endsWith('/');
        if (trailingSlash && !endsWithSlash) {
            next = `${current}/`;
        }
        else if (!trailingSlash && endsWithSlash) {
            next = current.slice(0, -1);
        }
    }
    target.pathname = next;
    return target.toString();
}
|
|
48
|
+
/**
 * Map an HTML page URL to the URL of its Markdown twin.
 *
 * The site root becomes `/index.md`; any other path loses its trailing slash
 * (if present) and gains a `.md` extension.
 *
 * @param {string} pageUrl - Absolute page URL.
 * @returns {string} Absolute URL of the Markdown rendition.
 */
export function htmlUrlToMarkdownUrl(pageUrl) {
    const target = new URL(pageUrl);
    const { pathname } = target;
    if (pathname === '/' || pathname === '') {
        target.pathname = '/index.md';
    }
    else {
        const stem = pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;
        target.pathname = `${stem}.md`;
    }
    return target.toString();
}
|
|
60
|
+
/**
 * Extract the unique `<loc>` URLs from sitemap XML, in document order.
 *
 * Sitemap XML must entity-escape `&`, `<`, `>`, `"` and `'` inside `<loc>`,
 * so the five predefined XML entities are decoded before the URL is returned;
 * otherwise a URL with a query string would be requested with a literal
 * `&amp;`. Empty locs are dropped.
 *
 * @param {string} xml - Raw sitemap document.
 * @returns {string[]} De-duplicated, trimmed, entity-decoded URLs.
 */
function parseSitemapLocs(xml) {
    const entities = { '&lt;': '<', '&gt;': '>', '&quot;': '"', '&apos;': "'", '&amp;': '&' };
    // Single pass, so "&amp;lt;" decodes to "&lt;" and is not decoded twice.
    const decode = (text) => text.replace(/&(?:lt|gt|quot|apos|amp);/g, (entity) => entities[entity]);
    const unique = new Set();
    for (const match of xml.matchAll(/<loc>\s*([^<]+?)\s*<\/loc>/gi)) {
        const loc = decode(match[1]).trim();
        if (loc) {
            unique.add(loc);
        }
    }
    return [...unique];
}
|
|
72
|
+
/**
 * Convert the lines of a markdown index file into normalised HTML page URLs.
 *
 * Only lines starting with "http" are considered. A line ending in
 * `/index.md` maps to the site root, any other `.md` line has the extension
 * replaced by a trailing slash, and every result is normalised with a
 * trailing slash. Lines that cannot be parsed as a URL are skipped, so one
 * malformed entry does not discard the rest of the index.
 *
 * @param {string} text - Contents of the markdown index, one URL per line.
 * @param {string} baseUrl - Site base URL used for the root entry.
 * @returns {string[]} HTML page URLs in file order.
 */
export function parseMarkdownIndexUrls(text, baseUrl) {
    const base = normalizeBaseUrl(baseUrl);
    const urls = [];
    for (const line of text.split('\n')) {
        const entry = line.trim();
        if (!entry.startsWith('http')) {
            continue;
        }
        let htmlUrl = entry;
        if (entry.endsWith('/index.md')) {
            htmlUrl = `${base}/`;
        }
        else if (entry.endsWith('.md')) {
            htmlUrl = `${entry.slice(0, -3)}/`;
        }
        try {
            urls.push(normalizePageUrl(htmlUrl, true));
        }
        catch (err) {
            // `new URL` signals an unparseable line with a TypeError; anything
            // else is unexpected and must surface.
            if (!(err instanceof TypeError)) {
                throw err;
            }
        }
    }
    return urls;
}
|
|
90
|
+
/**
 * Reduce an HTML fragment to plain text.
 *
 * `<script>` and `<style>` elements are removed with their contents, every
 * remaining tag is replaced by a space, and whitespace runs are collapsed.
 * HTML entities are left as written.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Trimmed single-spaced text.
 */
function stripHtml(html) {
    const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, ' ');
    const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, ' ');
    const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
    return withoutTags.replace(/\s+/g, ' ').trim();
}
|
|
98
|
+
/**
 * Read a quoted attribute value from a single HTML tag string.
 *
 * The closing quote must be the same character as the opening quote, so a
 * double-quoted value may contain apostrophes (and vice versa). The attribute
 * name must not be the tail of a longer name (`href` does not match
 * `data-href`). Matching is case-insensitive.
 *
 * @param {string} tag - Source text of one tag, e.g. `<link rel="x" href="y">`.
 * @param {string} attr - Attribute name to look up (treated literally).
 * @returns {string|undefined} The attribute value, or undefined when absent.
 */
function extractAttr(tag, attr) {
    const literalName = attr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`(?<![\\w:-])${literalName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i');
    const found = pattern.exec(tag);
    return found ? (found[1] ?? found[2]) : undefined;
}
|
|
103
|
+
/**
 * Return the `content` of the first `<meta name="…">` tag with the given name.
 *
 * Attribute order inside the tag does not matter and the name comparison is
 * case-insensitive. Attribute values are read up to their own closing quote,
 * so `content="We're open"` yields the full text instead of stopping at the
 * apostrophe. HTML entities in the value are not decoded.
 *
 * @param {string} html - Page HTML.
 * @param {string} name - Meta name to look for, e.g. "description".
 * @returns {string} Trimmed content, or '' when no such tag (with a content attribute) exists.
 */
function extractMetaContent(html, name) {
    // A tag ends at the first ">" that is outside a quoted attribute value.
    const metaTags = html.match(/<meta\b(?:"[^"]*"|'[^']*'|[^'">])*>/gi) ?? [];
    const readAttr = (tag, attr) => {
        const found = new RegExp(`(?<![\\w:-])${attr}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i').exec(tag);
        return found ? (found[1] ?? found[2]) : undefined;
    };
    const wanted = name.toLowerCase();
    for (const tag of metaTags) {
        if (readAttr(tag, 'name')?.toLowerCase() !== wanted) {
            continue;
        }
        const content = readAttr(tag, 'content');
        if (content !== undefined) {
            return content.trim();
        }
    }
    return '';
}
|
|
108
|
+
/**
 * Return the plain text of the first `<title>` element.
 *
 * @param {string} html - Page HTML.
 * @returns {string} Title text with markup stripped, or '' when missing or empty.
 */
function extractTitle(html) {
    const inner = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html)?.[1];
    if (!inner) {
        return '';
    }
    return stripHtml(inner);
}
|
|
112
|
+
/**
 * Return the href of the first `<link rel="canonical">` tag.
 *
 * @param {string} html - Page HTML.
 * @returns {string} The canonical href, or '' when no canonical link (or no href on it) is found.
 */
function extractCanonical(html) {
    const linkTags = html.match(/<link[^>]+>/gi) ?? [];
    const canonicalTag = linkTags.find((tag) => /rel=["']canonical["']/i.test(tag));
    if (canonicalTag === undefined) {
        return '';
    }
    return extractAttr(canonicalTag, 'href') ?? '';
}
|
|
121
|
+
/**
 * Return the plain text of the first `<h1>` element.
 *
 * @param {string} html - Page HTML.
 * @returns {string} Heading text with markup stripped, or '' when missing or empty.
 */
function extractH1(html) {
    const inner = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html)?.[1];
    if (!inner) {
        return '';
    }
    return stripHtml(inner);
}
|
|
125
|
+
/**
 * Measure how much visible text the page's main content holds.
 *
 * Uses the first `<main>` element when it has content. Otherwise `<nav>` and
 * `<footer>` elements are removed and the `<body>` is measured; if no body is
 * found either, the whole (unmodified) document is measured.
 *
 * @param {string} html - Page HTML.
 * @returns {number} Character count of the stripped text.
 */
function extractMainTextLen(html) {
    const mainInner = /<main[^>]*>([\s\S]*?)<\/main>/i.exec(html)?.[1];
    if (mainInner) {
        return stripHtml(mainInner).length;
    }
    const withoutChrome = html
        .replace(/<nav[\s\S]*?<\/nav>/gi, ' ')
        .replace(/<footer[\s\S]*?<\/footer>/gi, ' ');
    const bodyInner = /<body[^>]*>([\s\S]*?)<\/body>/i.exec(withoutChrome)?.[1];
    return stripHtml(bodyInner ? bodyInner : html).length;
}
|
|
134
|
+
/**
 * Collect the raw text of every `<script type="application/ld+json">` element.
 *
 * Scripts with an empty body are skipped; each remaining payload is trimmed.
 * The `type` attribute must be quoted to be recognised.
 *
 * @param {string} html - Page HTML.
 * @returns {string[]} JSON-LD payloads in document order (not yet parsed).
 */
function extractJsonLdBlocks(html) {
    const pattern = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
    return Array.from(html.matchAll(pattern), (found) => found[1])
        .filter(Boolean)
        .map((payload) => payload.trim());
}
|
|
146
|
+
/**
 * Tokenise text into its distinct meaningful words.
 *
 * The text is lower-cased and split on anything other than a-z and 0-9;
 * tokens shorter than two characters and STOPWORDS entries are discarded.
 *
 * @param {string} text - Free text such as a title, heading or slug.
 * @returns {Set<string>} Unique remaining tokens.
 */
function significantTokens(text) {
    const words = text
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter((word) => word.length >= 2 && !STOPWORDS.has(word));
    return new Set(words);
}
|
|
155
|
+
/**
 * Count how many tokens of `a` are also members of `b`.
 *
 * @param {Iterable<string>} a - Tokens to test.
 * @param {Set<string>} b - Set to test membership against.
 * @returns {number} Number of shared tokens.
 */
function tokenOverlap(a, b) {
    return [...a].reduce((shared, token) => (b.has(token) ? shared + 1 : shared), 0);
}
|
|
163
|
+
/**
 * Remove a trailing "| <siteTitle>" brand suffix from a page title.
 *
 * @param {string} title - Full page title.
 * @param {string} siteTitle - Brand name expected after the pipe.
 * @returns {string} The title without the suffix (trimmed), or the title unchanged when the suffix is absent.
 */
function stripBrandSuffix(title, siteTitle) {
    const brandTail = `| ${siteTitle}`;
    if (!title.endsWith(brandTail)) {
        return title;
    }
    return title.slice(0, title.length - brandTail.length).trim();
}
|
|
169
|
+
/**
 * Tokenise the last path segment of a URL path, treating hyphens as word breaks.
 *
 * @param {string} pathname - URL path such as "/resources/news/my-story/".
 * @returns {Set<string>} Significant tokens of the final non-empty segment (empty set for "/").
 */
function slugTokens(pathname) {
    const finalSegment = pathname.split('/').filter(Boolean).at(-1) ?? '';
    return significantTokens(finalSegment.replace(/-/g, ' '));
}
|
|
177
|
+
/**
 * Recursively gather every string `@type` found anywhere in a parsed JSON-LD value.
 *
 * Arrays are walked element by element; for objects the node's own `@type`
 * (a string or an array of strings) is recorded, then `@graph` and every
 * nested object value are visited.
 *
 * @param {unknown} value - Parsed JSON value; non-objects are ignored.
 * @param {Set<string>} types - Accumulator that receives the type names (mutated).
 * @returns {void}
 */
function collectSchemaTypes(value, types) {
    if (value === null || typeof value !== 'object') {
        return;
    }
    if (Array.isArray(value)) {
        for (const entry of value) {
            collectSchemaTypes(entry, types);
        }
        return;
    }
    const declared = value['@type'];
    const names = Array.isArray(declared) ? declared : [declared];
    for (const name of names) {
        if (typeof name === 'string') {
            types.add(name);
        }
    }
    if (value['@graph']) {
        collectSchemaTypes(value['@graph'], types);
    }
    for (const child of Object.values(value)) {
        if (child && typeof child === 'object') {
            collectSchemaTypes(child, types);
        }
    }
}
|
|
201
|
+
/**
 * Find the canonical target in an HTTP `Link` response header.
 *
 * Accepts the forms the header grammar allows for the relation parameter:
 * quoted or unquoted (`rel="canonical"` / `rel=canonical`), a space-separated
 * list of relation types, and `rel` appearing after other parameters.
 *
 * @param {{ get(name: string): string | null | undefined }} headers - Response headers (anything with a `get` method).
 * @returns {string|undefined} The target of the first link whose rel includes "canonical", or undefined.
 */
function parseLinkCanonical(headers) {
    const link = headers.get('link');
    if (!link) {
        return undefined;
    }
    // Individual links are comma-separated; split only where the next "<target>" starts.
    for (const part of link.split(/,(?=\s*<)/)) {
        const parsed = /^\s*<([^>]+)>([\s\S]*)$/.exec(part);
        if (!parsed) {
            continue;
        }
        const [, target, params] = parsed;
        const rel = /;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,"]+))/i.exec(params);
        const relations = (rel?.[1] ?? rel?.[2] ?? '').toLowerCase().split(/\s+/);
        if (relations.includes('canonical')) {
            return target;
        }
    }
    return undefined;
}
|
|
208
|
+
/**
 * Compare two URLs after normalising both to the same trailing-slash convention.
 *
 * @param {string} a - First URL.
 * @param {string} b - Second URL.
 * @param {boolean} trailingSlash - Convention passed to normalizePageUrl.
 * @returns {boolean} true when the normalised URLs match; if either fails to normalise, falls back to strict string equality.
 */
function urlsEquivalent(a, b, trailingSlash) {
    let left;
    let right;
    try {
        left = normalizePageUrl(a, trailingSlash);
        right = normalizePageUrl(b, trailingSlash);
    }
    catch {
        return a === b;
    }
    return left === right;
}
|
|
216
|
+
/**
 * Detect CMS Mustache placeholders ({{var}} / {{{var}}}) left unrendered in text.
 *
 * JSON `}}` closers and single-brace tokens such as `{search_term_string}` do
 * not match. One pattern is enough: a triple-brace tag always contains a
 * double-brace match (`{{` + `{var` + `}}`).
 *
 * @param {string} text - Text to scan.
 * @returns {boolean} true when at least one placeholder is present.
 */
export function hasUnfilledMustache(text) {
    const placeholder = /\{\{[^}]+\}\}/;
    return placeholder.test(text);
}
|
|
220
|
+
/**
 * Decide whether a path is exempt from the Markdown-twin check.
 *
 * @param {string} pathname - URL path of the page.
 * @param {string[]} prefixes - Path prefixes to exempt; a missing leading slash is added.
 * @returns {boolean} true when the path starts with any of the prefixes.
 */
function shouldSkipMarkdown(pathname, prefixes) {
    for (const prefix of prefixes) {
        const rooted = prefix.startsWith('/') ? prefix : `/${prefix}`;
        if (pathname.startsWith(rooted)) {
            return true;
        }
    }
    return false;
}
|
|
223
|
+
/**
 * Derive the SEO flags and overall severity for one crawled page.
 *
 * Reads from `input`: pathname, httpStatus, title, siteTitle, h1, description,
 * mainTextLen, thinContentThreshold, noindex, jsonLdBlocks,
 * articlePathPrefixes, expectMarkdown, mdStatus, mdLen, mdCanonical, pageUrl
 * and trailingSlash.
 *
 * Severity is 'error' when any flag is in the error list below, 'warn' when
 * only other flags were raised, and 'ok' when there are none.
 *
 * @param {object} input - Facts gathered for the page (see list above).
 * @returns {{titleFlags: string[], descFlags: string[], schemaFlags: string[], contentFlags: string[], jsonldErrors: string[], schemaTypes: Set<string>, severity: string}}
 */
function classifyRow(input) {
    const titleFlags = [];
    const descFlags = [];
    const schemaFlags = [];
    const contentFlags = [];
    const jsonldErrors = [];
    const schemaTypes = new Set();
    const isHome = input.pathname === '/' || input.pathname === '';
    if (input.httpStatus !== 200) {
        contentFlags.push('http_error');
    }
    // --- Title ---
    if (!input.title) {
        titleFlags.push('missing_title');
    }
    else if (isHome) {
        if (input.title !== input.siteTitle) {
            titleFlags.push('homepage_title_unexpected');
        }
    }
    else {
        const titleCore = stripBrandSuffix(input.title, input.siteTitle);
        if (titleCore === input.siteTitle) {
            titleFlags.push('generic_title');
        }
        const slugTok = slugTokens(input.pathname);
        const titleTok = significantTokens(titleCore);
        const h1Tok = significantTokens(input.h1);
        // The slug should share at least one word with either the title or the H1.
        if (slugTok.size > 0 &&
            tokenOverlap(slugTok, titleTok) === 0 &&
            tokenOverlap(slugTok, h1Tok) === 0) {
            titleFlags.push('title_slug_mismatch');
        }
        // Only compare title and H1 when the H1 is long enough to be meaningful.
        const h1WordCount = input.h1.split(/\s+/).filter(Boolean).length;
        if (h1WordCount >= 4 && tokenOverlap(titleTok, h1Tok) < 1) {
            titleFlags.push('title_h1_mismatch');
        }
        for (const prefix of input.articlePathPrefixes) {
            if (!input.pathname.startsWith(`/${prefix}/`)) {
                continue;
            }
            const segments = input.pathname.replace(`/${prefix}/`, '').split('/').filter(Boolean);
            // Fewer than two segments below the prefix is treated as a hub, not an article.
            if (segments.length < 2) {
                continue;
            }
            const articleSlug = segments.at(-1) ?? '';
            const slugWords = significantTokens(articleSlug.replace(/-/g, ' '));
            if (slugWords.size > 0 && tokenOverlap(slugWords, titleTok) === 0) {
                titleFlags.push('article_title_slug_mismatch');
            }
        }
    }
    // --- Meta description ---
    if (!input.description) {
        descFlags.push('missing_description');
    }
    else {
        if (hasUnfilledMustache(input.description)) {
            descFlags.push('mustache_leak');
        }
        if (input.description.length < 50) {
            descFlags.push('short_description');
        }
        else if (input.description.length > 160) {
            descFlags.push('long_description');
        }
        else if (input.description.length < 120) {
            descFlags.push('suboptimal_description');
        }
    }
    // --- Thin content (shallow listing pages under these sections are exempt) ---
    const thinContentExempt = /^\/(tags|categories|resources\/news|resources\/publications)\//.test(input.pathname) &&
        input.pathname.split('/').filter(Boolean).length <= 3;
    if (input.mainTextLen < input.thinContentThreshold &&
        input.httpStatus === 200 &&
        !thinContentExempt) {
        contentFlags.push('thin_content');
    }
    // --- JSON-LD ---
    if (input.jsonLdBlocks.length === 0 && !input.noindex && input.httpStatus === 200) {
        schemaFlags.push('missing_jsonld');
    }
    for (const block of input.jsonLdBlocks) {
        if (hasUnfilledMustache(block)) {
            schemaFlags.push('unfilled_mustache');
        }
        try {
            const parsed = JSON.parse(block);
            const serialized = JSON.stringify(parsed);
            if (!/schema\.org/i.test(serialized)) {
                schemaFlags.push('missing_schema_context');
            }
            // Collect this block's types separately so a typeless block is
            // flagged even when an earlier block already contributed types.
            const blockTypes = new Set();
            collectSchemaTypes(parsed, blockTypes);
            if (blockTypes.size === 0) {
                schemaFlags.push('missing_schema_type');
            }
            for (const type of blockTypes) {
                schemaTypes.add(type);
            }
        }
        catch {
            jsonldErrors.push('invalid_json');
        }
    }
    // --- Expected schema types per page kind ---
    if (isHome && input.httpStatus === 200 && !input.noindex) {
        if (!schemaTypes.has('Organization')) {
            schemaFlags.push('expected_organization');
        }
        if (!schemaTypes.has('WebSite')) {
            schemaFlags.push('expected_website');
        }
    }
    if (input.pathname.startsWith('/people/') && input.httpStatus === 200) {
        if (!schemaTypes.has('Person') && !schemaTypes.has('ProfilePage')) {
            schemaFlags.push('expected_person_schema');
        }
    }
    for (const prefix of input.articlePathPrefixes) {
        if (input.pathname.startsWith(`/${prefix}/`) && input.pathname.split('/').length > 3) {
            if (!schemaTypes.has('BlogPosting') &&
                !schemaTypes.has('ScholarlyArticle') &&
                !schemaTypes.has('NewsArticle')) {
                schemaFlags.push('expected_article_schema');
            }
        }
        if (input.pathname === `/${prefix}/` || input.pathname === `/${prefix}`) {
            if (!schemaTypes.has('CollectionPage')) {
                schemaFlags.push('expected_collection_page');
            }
        }
    }
    if (input.pathname.startsWith('/categories/') && input.httpStatus === 200) {
        if (!schemaTypes.has('CollectionPage')) {
            schemaFlags.push('expected_collection_page');
        }
    }
    // --- Markdown twin ---
    if (input.expectMarkdown) {
        if (input.mdStatus !== 200) {
            contentFlags.push('markdown_missing');
        }
        else if (input.mdLen < 50) {
            contentFlags.push('markdown_empty');
        }
        if (input.mdCanonical &&
            !urlsEquivalent(input.mdCanonical, input.pageUrl, input.trailingSlash)) {
            contentFlags.push('markdown_canonical_mismatch');
        }
    }
    // --- Severity ---
    const errorFlags = new Set([
        'http_error',
        'missing_title',
        'missing_description',
        'mustache_leak',
        'missing_jsonld',
        'invalid_json',
        'unfilled_mustache',
        'markdown_missing',
        'markdown_empty',
        'markdown_canonical_mismatch',
    ]);
    const all = [...titleFlags, ...descFlags, ...schemaFlags, ...contentFlags, ...jsonldErrors];
    let severity = 'ok';
    if (all.some((flag) => errorFlags.has(flag))) {
        severity = 'error';
    }
    else if (all.length > 0) {
        severity = 'warn';
    }
    return {
        titleFlags,
        descFlags,
        schemaFlags,
        contentFlags,
        jsonldErrors,
        schemaTypes,
        severity,
    };
}
|
|
377
|
+
/**
 * `fetch` with an abort timer, always following redirects.
 *
 * The timer is cleared as soon as the `fetch` call settles, so it does not
 * bound any later reading of the response body by the caller. Any `signal` or
 * `redirect` supplied in `init` is overridden.
 *
 * @param {string} url - URL to request.
 * @param {object} [init] - Extra fetch options (e.g. headers).
 * @param {number} [timeoutMs=30000] - Milliseconds before the request is aborted.
 * @returns {Promise<Response>} The fetch response; rejects on network failure or abort.
 */
async function fetchWithTimeout(url, init, timeoutMs = 30_000) {
    const aborter = new AbortController();
    const handle = setTimeout(() => {
        aborter.abort();
    }, timeoutMs);
    const options = { ...init, signal: aborter.signal, redirect: 'follow' };
    try {
        const response = await fetch(url, options);
        return response;
    }
    finally {
        clearTimeout(handle);
    }
}
|
|
387
|
+
/**
 * Map an async function over `items` with a bounded number of concurrent calls.
 *
 * Results are stored at the index of their input. `undefined` items are
 * skipped (their result slot stays unset) without stopping the worker that
 * encountered them. A concurrency below 1 or NaN is treated as 1, so every
 * item is always processed.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} concurrency - Maximum number of simultaneous `fn` calls.
 * @param {(item: any, index: number) => Promise<any>} fn - Async mapper.
 * @returns {Promise<Array>} Results in input order; rejects if any `fn` call rejects.
 */
async function mapPool(items, concurrency, fn) {
    const results = new Array(items.length);
    const requested = Math.floor(concurrency);
    const workerCount = Math.min(items.length, requested >= 1 ? requested : 1);
    let cursor = 0;
    const worker = async () => {
        // `cursor` is only touched synchronously between awaits, so each index
        // is claimed by exactly one worker.
        while (cursor < items.length) {
            const index = cursor++;
            const item = items[index];
            if (item === undefined) {
                continue;
            }
            results[index] = await fn(item, index);
        }
    };
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
|
|
404
|
+
/**
 * Crawl every URL in the site's sitemap and classify each page for SEO issues.
 *
 * Steps: fetch the sitemap (required), fetch the markdown index (optional
 * cross-check), then for each sitemap URL fetch the HTML, extract its
 * metadata, optionally fetch the Markdown twin, and classify the result.
 *
 * A page whose HTML request fails (network error, timeout) is reported as a
 * row with `httpStatus` 0 instead of aborting the whole audit, mirroring how
 * Markdown fetch failures are handled.
 *
 * Config fields read: baseUrl, siteTitle, trailingSlash (default true),
 * sitemapPath ('/sitemap.xml'), markdownIndexPath ('/markdown-index.txt'),
 * concurrency (8), thinContentThreshold (200), minMarkdownLength (50),
 * requestDelayMs (0), articlePathPrefixes ([]), skipMarkdownPathPrefixes
 * (['/download/']).
 *
 * @param {object} config - Audit configuration (see fields above).
 * @returns {Promise<{rows: object[], sitemapUrlCount: number, markdownIndexUrlCount: number, missingFromMarkdownIndex: string[], flagCounts: Record<string, number>, errorCount: number, warnCount: number}>}
 * @throws {Error} When the sitemap cannot be fetched with a 2xx status.
 */
export async function runProductionSeoAudit(config) {
    const base = normalizeBaseUrl(config.baseUrl);
    const trailingSlash = config.trailingSlash ?? true;
    const sitemapUrl = `${base}${config.sitemapPath ?? '/sitemap.xml'}`;
    const markdownIndexUrl = `${base}${config.markdownIndexPath ?? '/markdown-index.txt'}`;
    const concurrency = config.concurrency ?? 8;
    const thinContentThreshold = config.thinContentThreshold ?? 200;
    const minMarkdownLength = config.minMarkdownLength ?? 50;
    const requestDelayMs = config.requestDelayMs ?? 0;
    const articlePathPrefixes = config.articlePathPrefixes ?? [];
    const skipMarkdownPathPrefixes = config.skipMarkdownPathPrefixes ?? ['/download/'];
    const sitemapRes = await fetchWithTimeout(sitemapUrl);
    if (!sitemapRes.ok) {
        throw new Error(`Failed to fetch sitemap: ${sitemapRes.status} ${sitemapUrl}`);
    }
    const sitemapXml = await sitemapRes.text();
    const sitemapUrls = parseSitemapLocs(sitemapXml).map((u) => normalizePageUrl(u, trailingSlash));
    let markdownIndexUrls = [];
    try {
        const indexRes = await fetchWithTimeout(markdownIndexUrl);
        if (indexRes.ok) {
            const indexText = await indexRes.text();
            markdownIndexUrls = parseMarkdownIndexUrls(indexText, base);
        }
    }
    catch {
        // The markdown index is an optional cross-check; without it every
        // sitemap URL is simply reported as missing from the index.
    }
    const markdownSet = new Set(markdownIndexUrls.map((u) => normalizePageUrl(u, trailingSlash)));
    const missingFromMarkdownIndex = sitemapUrls.filter((u) => !markdownSet.has(u));
    const rows = await mapPool(sitemapUrls, concurrency, async (pageUrl) => {
        if (requestDelayMs > 0) {
            await new Promise((resolve) => setTimeout(resolve, requestDelayMs));
        }
        let httpStatus = 0;
        let finalUrl = pageUrl;
        let html = '';
        try {
            const htmlRes = await fetchWithTimeout(pageUrl, {
                headers: { Accept: 'text/html', 'User-Agent': 'PedestalProductionSeoAudit/1.0' },
            });
            finalUrl = htmlRes.url || pageUrl;
            if (htmlRes.ok) {
                html = await htmlRes.text();
            }
            httpStatus = htmlRes.status;
        }
        catch {
            // Request or body read failed: report status 0 so classifyRow
            // raises `http_error` for this page only.
            httpStatus = 0;
            html = '';
        }
        const pathname = new URL(finalUrl).pathname;
        const title = extractTitle(html);
        const description = extractMetaContent(html, 'description');
        const h1 = extractH1(html);
        const mainTextLen = extractMainTextLen(html);
        const canonical = extractCanonical(html) || pageUrl;
        const robots = extractMetaContent(html, 'robots');
        const noindex = /noindex/i.test(robots);
        const jsonLdBlocks = extractJsonLdBlocks(html);
        const normalizedPageUrl = normalizePageUrl(finalUrl, trailingSlash);
        const inMarkdownIndex = markdownSet.has(normalizedPageUrl);
        const expectMarkdown = inMarkdownIndex && !shouldSkipMarkdown(pathname, skipMarkdownPathPrefixes);
        const mdUrl = htmlUrlToMarkdownUrl(pageUrl);
        let mdStatus = 0;
        let mdLen = 0;
        let mdCanonical;
        if (expectMarkdown) {
            try {
                const mdRes = await fetchWithTimeout(mdUrl, {
                    headers: { Accept: 'text/markdown', 'User-Agent': 'PedestalProductionSeoAudit/1.0' },
                });
                mdStatus = mdRes.status;
                if (mdRes.ok) {
                    const mdText = await mdRes.text();
                    mdLen = mdText.trim().length;
                    mdCanonical = parseLinkCanonical(mdRes.headers);
                }
            }
            catch {
                mdStatus = 0;
            }
        }
        const classified = classifyRow({
            url: pageUrl,
            pathname,
            siteTitle: config.siteTitle,
            title,
            description,
            h1,
            mainTextLen,
            httpStatus,
            noindex,
            jsonLdBlocks,
            articlePathPrefixes,
            thinContentThreshold,
            mdStatus,
            mdLen,
            mdCanonical,
            pageUrl: normalizedPageUrl,
            trailingSlash,
            inMarkdownIndex,
            expectMarkdown,
        });
        const allFlags = [
            ...classified.titleFlags,
            ...classified.descFlags,
            ...classified.schemaFlags,
            ...classified.contentFlags,
            ...classified.jsonldErrors,
        ];
        // Added after classification, so it appears in allFlags but does not
        // influence the row's severity.
        if (mdLen < minMarkdownLength && mdStatus === 200) {
            allFlags.push('markdown_short');
        }
        return {
            url: pageUrl,
            finalUrl,
            httpStatus,
            title,
            description,
            descLen: description.length,
            h1,
            mainTextLen,
            canonical,
            noindex,
            jsonldCount: jsonLdBlocks.length,
            jsonldErrors: classified.jsonldErrors.join(';'),
            schemaTypes: [...classified.schemaTypes].sort().join(', '),
            titleFlags: classified.titleFlags.join(';'),
            descFlags: classified.descFlags.join(';'),
            schemaFlags: classified.schemaFlags.join(';'),
            contentFlags: classified.contentFlags.join(';'),
            mdStatus,
            mdLen,
            mdCanonicalOk: !expectMarkdown
                ? true
                : mdCanonical
                    ? urlsEquivalent(mdCanonical, normalizedPageUrl, trailingSlash)
                    : mdStatus === 200,
            allFlags: [...new Set(allFlags)].join(';'),
            severity: classified.severity,
        };
    });
    const flagCounts = {};
    let errorCount = 0;
    let warnCount = 0;
    for (const row of rows) {
        if (row.severity === 'error') {
            errorCount++;
        }
        if (row.severity === 'warn') {
            warnCount++;
        }
        for (const flag of row.allFlags.split(';').filter(Boolean)) {
            flagCounts[flag] = (flagCounts[flag] ?? 0) + 1;
        }
    }
    if (missingFromMarkdownIndex.length > 0) {
        flagCounts.missing_from_markdown_index = missingFromMarkdownIndex.length;
    }
    return {
        rows,
        sitemapUrlCount: sitemapUrls.length,
        markdownIndexUrlCount: markdownIndexUrls.length,
        missingFromMarkdownIndex,
        flagCounts,
        errorCount,
        warnCount,
    };
}
|
|
557
|
+
// Column names of the audit CSV, in output order. rowsToCsv() emits its cells
// in this same order, so the two must be kept in sync.
const CSV_HEADERS = [
    'url', 'finalUrl', 'httpStatus',
    'title', 'description', 'descLen', 'h1', 'mainTextLen',
    'canonical', 'noindex',
    'jsonldCount', 'jsonldErrors', 'schemaTypes',
    'titleFlags', 'descFlags', 'schemaFlags', 'contentFlags',
    'mdStatus', 'mdLen', 'mdCanonicalOk',
    'allFlags', 'severity',
];
|
|
581
|
+
/**
 * Serialise audit rows to CSV text.
 *
 * The first line is the CSV_HEADERS row; each audit row follows with its
 * cells in the same order, numbers stringified and booleans written as
 * 'true'/'false'. Lines are joined with "\n" and the text ends with a newline.
 *
 * @param {object[]} rows - Rows as produced by runProductionSeoAudit.
 * @returns {string} CSV document.
 */
export function rowsToCsv(rows) {
    const asFlag = (value) => (value ? 'true' : 'false');
    const records = rows.map((row) => {
        const cells = [
            row.url,
            row.finalUrl,
            String(row.httpStatus),
            row.title,
            row.description,
            String(row.descLen),
            row.h1,
            String(row.mainTextLen),
            row.canonical,
            asFlag(row.noindex),
            String(row.jsonldCount),
            row.jsonldErrors,
            row.schemaTypes,
            row.titleFlags,
            row.descFlags,
            row.schemaFlags,
            row.contentFlags,
            String(row.mdStatus),
            String(row.mdLen),
            asFlag(row.mdCanonicalOk),
            row.allFlags,
            row.severity,
        ];
        return cells.map(escapeCsv).join(',');
    });
    const lines = [CSV_HEADERS.join(','), ...records];
    return `${lines.join('\n')}\n`;
}
|
|
613
|
+
/**
 * Judge whether a sampled page carries the JSON-LD types expected for its kind.
 *
 * Fails immediately on a non-200 status, on JSON-LD errors, or when no types
 * were found. Otherwise the expectation is chosen from the URL path (home,
 * /people/, /categories/, news article, publication, news hub). Path checks
 * work with and without a trailing slash, so sites configured with
 * `trailingSlash: false` are checked too.
 *
 * @param {{label: string, url: string, httpStatus: number, schemaTypes: Set<string>, jsonldErrors: string[]}} input
 * @returns {{label: string, url: string, result: string, notes: string}} `result` is 'pass', 'warn' (a CollectionPage is missing) or 'fail'.
 */
export function evaluateRichResultsSpotCheck(input) {
    const { label, url } = input;
    const types = input.schemaTypes;
    if (input.httpStatus !== 200) {
        return { label, url, result: 'fail', notes: `HTTP ${input.httpStatus}` };
    }
    if (input.jsonldErrors.length > 0) {
        return { label, url, result: 'fail', notes: input.jsonldErrors.join(', ') };
    }
    if (types.size === 0) {
        return { label, url, result: 'fail', notes: 'No JSON-LD types' };
    }
    const pathname = new URL(url).pathname;
    // Counting non-empty segments makes depth checks independent of a trailing slash.
    const depth = pathname.split('/').filter(Boolean).length;
    const hasAny = (...names) => names.some((name) => types.has(name));
    const notes = [];
    if (pathname === '/' || pathname === '') {
        if (!types.has('Organization')) {
            notes.push('missing Organization');
        }
        if (!types.has('WebSite')) {
            notes.push('missing WebSite');
        }
    }
    else if (pathname.startsWith('/people/')) {
        if (!hasAny('Person', 'ProfilePage')) {
            notes.push('missing Person/ProfilePage');
        }
    }
    else if (pathname.startsWith('/categories/')) {
        if (!types.has('CollectionPage')) {
            notes.push('missing CollectionPage (warn)');
        }
    }
    else if (/^\/resources\/news\/[^/]+\/[^/]+\/?$/.test(pathname)) {
        if (!hasAny('BlogPosting', 'NewsArticle', 'ScholarlyArticle')) {
            notes.push('missing article schema');
        }
    }
    else if (pathname.startsWith('/resources/publications/') && depth >= 4) {
        if (!hasAny('ScholarlyArticle', 'BlogPosting')) {
            notes.push('missing ScholarlyArticle/BlogPosting');
        }
    }
    else if (pathname === '/resources/news/' || pathname === '/resources/news') {
        if (!types.has('CollectionPage')) {
            notes.push('missing CollectionPage (warn)');
        }
    }
    let result = 'pass';
    if (notes.length > 0) {
        result = notes.some((note) => note.startsWith('missing CollectionPage')) ? 'warn' : 'fail';
    }
    return {
        label,
        url,
        result,
        notes: notes.length > 0 ? notes.join('; ') : `Types: ${[...types].sort().join(', ')}`,
    };
}
|
|
675
|
+
/**
 * Choose representative URLs for the rich-results spot check.
 *
 * The homepage and the news hub are always included. One news article, one
 * publication, one person page and one category page are added when a
 * matching row with HTTP 200 exists (the person page must also have JSON-LD).
 *
 * @param {object[]} rows - Audit rows (uses url, httpStatus, jsonldCount).
 * @param {string} baseUrl - Site base URL.
 * @returns {{label: string, url: string}[]} Labelled sample URLs in a fixed order.
 */
export function pickRichResultsSampleUrls(rows, baseUrl) {
    const base = normalizeBaseUrl(baseUrl);
    const segmentsOf = (row) => row.url.replace(base, '').split('/').filter(Boolean);
    const isLive = (row) => row.httpStatus === 200;
    const firstUrl = (predicate) => rows.find(predicate)?.url;
    const newsArticle = firstUrl((row) => row.url.includes('/resources/news/') && segmentsOf(row).length >= 4 && isLive(row));
    const publication = firstUrl((row) => row.url.includes('/resources/publications/') &&
        row.url.includes('/abstract-manuscript/') &&
        isLive(row));
    const person = firstUrl((row) => {
        const segments = segmentsOf(row);
        return segments[0] === 'people' && segments.length >= 2 && isLive(row) && row.jsonldCount > 0;
    });
    const category = firstUrl((row) => {
        const segments = segmentsOf(row);
        return segments[0] === 'categories' && segments.length >= 2 && isLive(row);
    });
    const samples = [
        { label: 'Homepage', url: `${base}/` },
        { label: 'News hub', url: `${base}/resources/news/` },
    ];
    const optional = [
        ['News article', newsArticle],
        ['Publication', publication],
        ['Person', person],
        ['Category hub', category],
    ];
    for (const [label, url] of optional) {
        if (url) {
            samples.push({ label, url });
        }
    }
    return samples;
}
|
|
701
|
+
/**
 * Run the rich-results spot check on the sample URLs picked from the audit rows.
 *
 * Each sample is matched to its audit row by normalised URL; a sample with no
 * matching row is evaluated with status 0 and no types.
 *
 * @param {object[]} rows - Audit rows.
 * @param {object} config - Audit configuration (uses baseUrl and trailingSlash, default true).
 * @returns {{label: string, url: string, result: string, notes: string}[]} One verdict per sample.
 */
export function buildRichResultsSpotChecks(rows, config) {
    const samples = pickRichResultsSampleUrls(rows, config.baseUrl);
    const keyOf = (url) => normalizePageUrl(url, config.trailingSlash ?? true);
    const rowByUrl = new Map();
    for (const row of rows) {
        rowByUrl.set(keyOf(row.url), row);
    }
    return samples.map(({ label, url }) => {
        const row = rowByUrl.get(keyOf(url));
        // Rows store schema types as a ", "-joined string and JSON-LD errors
        // as a ";"-joined string; turn both back into collections.
        const typeNames = row?.schemaTypes
            ? row.schemaTypes
                .split(',')
                .map((name) => name.trim())
                .filter(Boolean)
            : [];
        const jsonldErrors = row?.jsonldErrors ? row.jsonldErrors.split(';').filter(Boolean) : [];
        return evaluateRichResultsSpotCheck({
            label,
            url,
            httpStatus: row?.httpStatus ?? 0,
            schemaTypes: new Set(typeNames),
            jsonldErrors,
        });
    });
}
|
|
721
|
+
/**
 * Render the audit result as a Markdown report.
 *
 * Sections: summary table, flag counts (highest first), a sample of up to 40
 * error rows and 30 warning rows, up to 50 URLs missing from the markdown
 * index, and the rich-results spot-check table. The "Generated" line carries
 * the current time.
 *
 * @param {object} result - Value returned by runProductionSeoAudit.
 * @param {object} config - Audit configuration (uses baseUrl).
 * @param {{label: string, url: string, result: string, notes: string}[]} [richResults] - Spot-check verdicts.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function buildMarkdownReport(result, config, richResults) {
    const out = [];
    const emit = (...chunk) => {
        out.push(...chunk);
    };
    // Shared renderer for the error and warning sample tables.
    const flagTable = (heading, sample, cap) => {
        if (sample.length === 0) {
            return;
        }
        emit(heading, '', '| URL | Flags |', '|-----|-------|');
        emit(...sample.slice(0, cap).map((row) => `| ${row.url} | ${row.allFlags} |`));
        if (sample.length > cap) {
            emit('', `_…and ${sample.length - cap} more. See CSV._`);
        }
        emit('');
    };
    const byCountDesc = Object.entries(result.flagCounts).sort((a, b) => b[1] - a[1]);
    const missing = result.missingFromMarkdownIndex;
    emit('# Pedestal production SEO audit', '');
    emit(`**Site:** ${config.baseUrl}`);
    emit(`**Generated:** ${new Date().toISOString()}`, '');
    emit('## Summary', '');
    emit(`| Metric | Count |`, `|--------|------:|`);
    emit(`| Sitemap URLs crawled | ${result.sitemapUrlCount} |`);
    emit(`| Markdown index URLs | ${result.markdownIndexUrlCount} |`);
    emit(`| Rows with errors | ${result.errorCount} |`);
    emit(`| Rows with warnings | ${result.warnCount} |`);
    emit(`| Missing from markdown-index.txt | ${missing.length} |`, '');
    emit('## Flag counts', '', '| Flag | Count |', '|------|------:|');
    emit(...byCountDesc.map(([flag, count]) => `| ${flag} | ${count} |`));
    emit('');
    flagTable('## Errors (sample)', result.rows.filter((row) => row.severity === 'error'), 40);
    flagTable('## Warnings (sample)', result.rows.filter((row) => row.severity === 'warn'), 30);
    if (missing.length > 0) {
        emit('## Missing from markdown-index.txt', '');
        emit(...missing.slice(0, 50).map((url) => `- ${url}`));
        if (missing.length > 50) {
            emit(`- _…and ${missing.length - 50} more_`);
        }
        emit('');
    }
    const spotChecks = richResults ?? [];
    emit('## Rich Results spot-check', '');
    emit('Automated JSON-LD type checks on representative URLs. Confirm in [Google Rich Results Test](https://search.google.com/test/rich-results) if needed.', '');
    emit('| Page | URL | Result | Notes |', '|------|-----|--------|-------|');
    if (spotChecks.length > 0) {
        emit(...spotChecks.map((s) => `| ${s.label} | ${s.url} | ${s.result} | ${s.notes} |`));
    }
    else {
        emit('| (no samples) | | | |');
    }
    emit('');
    return out.join('\n');
}
|
|
782
|
+
/**
 * Write the audit CSV and Markdown report into `config.outputDir`.
 *
 * The directory is created (recursively) if needed. File names default to
 * 'pedestal-production-audit.csv' and 'pedestal-production-audit.md'. When
 * `richResults` is not supplied, spot checks are computed from the rows.
 *
 * @param {object} result - Value returned by runProductionSeoAudit.
 * @param {object} config - Audit configuration (uses outputDir, outputCsvName, outputMdName).
 * @param {object[]} [richResults] - Precomputed spot-check verdicts.
 * @returns {Promise<{csvPath: string, mdPath: string}>} Paths of the written files.
 */
export async function writeProductionSeoAuditOutputs(result, config, richResults) {
    const { outputDir } = config;
    await mkdir(outputDir, { recursive: true });
    const csvName = config.outputCsvName ?? 'pedestal-production-audit.csv';
    const mdName = config.outputMdName ?? 'pedestal-production-audit.md';
    const csvPath = path.join(outputDir, csvName);
    const mdPath = path.join(outputDir, mdName);
    const spotChecks = richResults ?? buildRichResultsSpotChecks(result.rows, config);
    const csvText = rowsToCsv(result.rows);
    await writeFile(csvPath, csvText, 'utf8');
    const reportText = buildMarkdownReport(result, config, spotChecks);
    await writeFile(mdPath, reportText, 'utf8');
    return { csvPath, mdPath };
}
|
|
791
|
+
/**
 * Translate an audit result into a process exit code.
 *
 * @param {{errorCount: number}} result - Value returned by runProductionSeoAudit.
 * @returns {number} 1 when at least one row has error severity, otherwise 0.
 */
export function getProductionAuditExitCode(result) {
    if (result.errorCount > 0) {
        return 1;
    }
    return 0;
}
|
|
794
|
+
//# sourceMappingURL=index.js.map
|