@growth-labs/seo 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +8 -2
  2. package/dist/index.d.ts.map +1 -1
  3. package/dist/index.js +56 -37
  4. package/dist/index.js.map +1 -1
  5. package/dist/middleware/seo.d.ts.map +1 -1
  6. package/dist/middleware/seo.js +8 -3
  7. package/dist/middleware/seo.js.map +1 -1
  8. package/dist/options.d.ts +140 -0
  9. package/dist/options.d.ts.map +1 -1
  10. package/dist/options.js +18 -0
  11. package/dist/options.js.map +1 -1
  12. package/dist/routes/aeo-twin.d.ts.map +1 -1
  13. package/dist/routes/aeo-twin.js +33 -2
  14. package/dist/routes/aeo-twin.js.map +1 -1
  15. package/dist/routes/sitemap-index.d.ts.map +1 -1
  16. package/dist/routes/sitemap-index.js +45 -32
  17. package/dist/routes/sitemap-index.js.map +1 -1
  18. package/dist/types.d.ts +1 -1
  19. package/dist/types.d.ts.map +1 -1
  20. package/dist/utils/define-content-provider.d.ts +1 -0
  21. package/dist/utils/define-content-provider.d.ts.map +1 -1
  22. package/dist/utils/define-content-provider.js.map +1 -1
  23. package/dist/utils/json-ld/video.d.ts.map +1 -1
  24. package/dist/utils/json-ld/video.js +18 -0
  25. package/dist/utils/json-ld/video.js.map +1 -1
  26. package/dist/utils/sitemap.d.ts.map +1 -1
  27. package/dist/utils/sitemap.js +1 -1
  28. package/dist/utils/sitemap.js.map +1 -1
  29. package/dist/utils/validation.d.ts +5 -0
  30. package/dist/utils/validation.d.ts.map +1 -1
  31. package/dist/utils/validation.js +78 -4
  32. package/dist/utils/validation.js.map +1 -1
  33. package/package.json +1 -1
  34. package/src/index.ts +57 -37
  35. package/src/middleware/seo.ts +8 -3
  36. package/src/options.ts +21 -0
  37. package/src/routes/aeo-twin.ts +38 -2
  38. package/src/routes/sitemap-index.ts +48 -35
  39. package/src/types.ts +1 -1
  40. package/src/utils/define-content-provider.ts +1 -0
  41. package/src/utils/json-ld/video.ts +20 -0
  42. package/src/utils/sitemap.ts +4 -2
  43. package/src/utils/validation.ts +95 -4
package/src/index.ts CHANGED
@@ -89,53 +89,65 @@ export default function seo(userOptions: SeoOptions): AstroIntegration {
89
89
 
90
90
  const injected: string[] = []
91
91
  const skipped: Array<{ pattern: string; reason: string }> = []
92
+ const injectedRoutes = options.injectedRoutes
92
93
 
93
- // ─── Sitemaps (provider-wired gated) ───
94
- if (providerWired) {
95
- injectRoute({
96
- pattern: SITEMAP_INDEX_PATH,
97
- entrypoint: resolveEntrypoint('./routes/sitemap-index'),
98
- prerender: false,
99
- })
100
- injected.push(SITEMAP_INDEX_PATH)
94
+ const injectSeoRoute = (pattern: string, entrypoint: string) => {
101
95
  injectRoute({
102
- pattern: '/sitemap-articles.xml',
103
- entrypoint: resolveEntrypoint('./routes/sitemap-articles'),
96
+ pattern,
97
+ entrypoint: resolveEntrypoint(entrypoint),
104
98
  prerender: false,
105
99
  })
106
- injected.push('/sitemap-articles.xml')
107
- injectRoute({
108
- pattern: '/sitemap-pages.xml',
109
- entrypoint: resolveEntrypoint('./routes/sitemap-pages'),
110
- prerender: false,
111
- })
112
- injected.push('/sitemap-pages.xml')
113
- injectRoute({
114
- pattern: '/sitemap-videos.xml',
115
- entrypoint: resolveEntrypoint('./routes/sitemap-videos'),
116
- prerender: false,
100
+ injected.push(pattern)
101
+ }
102
+
103
+ const skipDisabledRoute = (pattern: string, optionName: string) => {
104
+ skipped.push({
105
+ pattern,
106
+ reason: `injectedRoutes.${optionName} is false`,
117
107
  })
118
- injected.push('/sitemap-videos.xml')
119
- if (options.commerce?.enabled) {
120
- injectRoute({
121
- pattern: '/sitemap-products.xml',
122
- entrypoint: resolveEntrypoint('./routes/sitemap-products'),
123
- prerender: false,
124
- })
125
- injected.push('/sitemap-products.xml')
108
+ }
109
+
110
+ // ─── Sitemaps (provider-wired gated) ───
111
+ if (providerWired) {
112
+ if (injectedRoutes.sitemapIndex) {
113
+ injectSeoRoute(SITEMAP_INDEX_PATH, './routes/sitemap-index')
114
+ } else {
115
+ skipDisabledRoute(SITEMAP_INDEX_PATH, 'sitemapIndex')
116
+ }
117
+
118
+ if (injectedRoutes.sitemapArticles) {
119
+ injectSeoRoute('/sitemap-articles.xml', './routes/sitemap-articles')
120
+ } else {
121
+ skipDisabledRoute('/sitemap-articles.xml', 'sitemapArticles')
122
+ }
123
+
124
+ if (injectedRoutes.sitemapPages) {
125
+ injectSeoRoute('/sitemap-pages.xml', './routes/sitemap-pages')
126
+ } else {
127
+ skipDisabledRoute('/sitemap-pages.xml', 'sitemapPages')
128
+ }
129
+
130
+ if (injectedRoutes.sitemapVideos) {
131
+ injectSeoRoute('/sitemap-videos.xml', './routes/sitemap-videos')
132
+ } else {
133
+ skipDisabledRoute('/sitemap-videos.xml', 'sitemapVideos')
134
+ }
135
+
136
+ if (!injectedRoutes.sitemapProducts) {
137
+ skipDisabledRoute('/sitemap-products.xml', 'sitemapProducts')
138
+ } else if (options.commerce?.enabled) {
139
+ injectSeoRoute('/sitemap-products.xml', './routes/sitemap-products')
126
140
  } else {
127
141
  skipped.push({
128
142
  pattern: '/sitemap-products.xml',
129
143
  reason: 'commerce.enabled is false',
130
144
  })
131
145
  }
132
- if (options.markdownSitemap && aeo && aeo.mode !== 'middleware') {
133
- injectRoute({
134
- pattern: '/sitemap-markdown.xml',
135
- entrypoint: resolveEntrypoint('./routes/sitemap-markdown'),
136
- prerender: false,
137
- })
138
- injected.push('/sitemap-markdown.xml')
146
+
147
+ if (!injectedRoutes.sitemapMarkdown) {
148
+ skipDisabledRoute('/sitemap-markdown.xml', 'sitemapMarkdown')
149
+ } else if (options.markdownSitemap && aeo && aeo.mode !== 'middleware') {
150
+ injectSeoRoute('/sitemap-markdown.xml', './routes/sitemap-markdown')
139
151
  } else if (options.markdownSitemap) {
140
152
  skipped.push({
141
153
  pattern: '/sitemap-markdown.xml',
@@ -313,12 +325,17 @@ export default function seo(userOptions: SeoOptions): AstroIntegration {
313
325
 
314
326
  for (const file of htmlFiles) {
315
327
  const html = readFileSync(file, 'utf-8')
328
+ const relPath = file.replace(outDir, '')
316
329
  const result = validatePage(html, {
317
330
  titleMaxLength: options.validation.titleMaxLength,
318
331
  descriptionMaxLength: options.validation.descriptionMaxLength,
319
332
  heroMinWidth: options.validation.heroMinWidth,
333
+ pagePath: relPath,
334
+ requireH1: options.validation.requireH1,
335
+ requireHeroImage: options.validation.requireHeroImage,
336
+ requireArticleSchema: options.validation.requireArticleSchema,
337
+ requireMaxImagePreviewLarge: options.validation.requireMaxImagePreviewLarge,
320
338
  })
321
- const relPath = file.replace(outDir, '')
322
339
  for (const error of result.errors) {
323
340
  logger.error(`${relPath}: ${error}`)
324
341
  errorCount++
@@ -352,6 +369,9 @@ export default function seo(userOptions: SeoOptions): AstroIntegration {
352
369
 
353
370
  if (errorCount || warningCount) {
354
371
  logger.info(`SEO validation: ${errorCount} errors, ${warningCount} warnings`)
372
+ if (errorCount > 0) {
373
+ throw new Error(`[@growth-labs/seo] SEO validation failed with ${errorCount} errors`)
374
+ }
355
375
  } else {
356
376
  logger.info('SEO validation: all checks passed')
357
377
  }
package/src/middleware/seo.ts CHANGED
@@ -109,13 +109,18 @@ export function createSeoMiddleware(
109
109
  // - CF-Connecting-IP: FCrDNS is keyed on it
110
110
  appendVaryHeaders(newHeaders)
111
111
 
112
- // Link alternate header on HTML responses when aeoTwins enabled, except for
113
- // members items (they have no markdown counterpart at rest).
112
+ // Link alternate header on HTML responses when aeoTwins enabled.
113
+ //
114
+ // Emit only when contentProvider returns an item for this URL AND that item
115
+ // is not members-gated. Previously this also emitted Link headers for URLs
116
+ // the contentProvider did not know about (hubs, region pages, the home page,
117
+ // etc.) — those URLs have no twin at rest, so the advertised .md target 404s.
118
+ // Gating to items-only matches actually-emitted twin paths.
114
119
  if (aeo && isHtmlResponse(response)) {
115
120
  const linkItem = contentProvider
116
121
  ? await findItemForPath(contentProvider, context, url.pathname)
117
122
  : undefined
118
- if (!linkItem || linkItem.access !== 'members') {
123
+ if (linkItem && linkItem.access !== 'members') {
119
124
  const target =
120
125
  aeo.mode === 'static' || aeo.mode === 'both' ? twinUrlFor(url, aeo) : url.toString()
121
126
  const linkValue = `<${target}>; rel="alternate"; type="text/markdown"`
package/src/options.ts CHANGED
@@ -185,6 +185,19 @@ const crawlerPolicySchema = z
185
185
  })
186
186
  .default({})
187
187
 
188
+ // ─── Injected route ownership ───
189
+
190
+ const injectedRoutesSchema = z
191
+ .object({
192
+ sitemapIndex: z.boolean().default(true),
193
+ sitemapArticles: z.boolean().default(true),
194
+ sitemapPages: z.boolean().default(true),
195
+ sitemapVideos: z.boolean().default(true),
196
+ sitemapProducts: z.boolean().default(true),
197
+ sitemapMarkdown: z.boolean().default(true),
198
+ })
199
+ .default({})
200
+
188
201
  // ─── Main schema ───
189
202
 
190
203
  const siteUrlSchema = z.union([
@@ -217,6 +230,10 @@ export const seoOptionsSchema = z.object({
217
230
  markdownSitemap: z.boolean().default(true),
218
231
  rss: z.boolean().default(false),
219
232
 
233
+ // ─── Injected route ownership ───
234
+ // Set a route false when the consumer application owns that public path.
235
+ injectedRoutes: injectedRoutesSchema,
236
+
220
237
  // ─── AEO twins ───
221
238
  // Boolean form = { mode: 'static' } when true, no twins emitted when false.
222
239
  aeoTwins: z.union([z.boolean(), aeoTwinsObjectSchema]).default(false),
@@ -269,6 +286,10 @@ export const seoOptionsSchema = z.object({
269
286
  heroMinWidth: z.number().default(1200),
270
287
  titleMaxLength: z.number().default(110),
271
288
  descriptionMaxLength: z.number().default(160),
289
+ requireH1: z.boolean().default(true),
290
+ requireHeroImage: z.boolean().default(true),
291
+ requireArticleSchema: z.boolean().default(true),
292
+ requireMaxImagePreviewLarge: z.boolean().default(true),
272
293
  enabled: z.boolean().default(true),
273
294
  })
274
295
  .default({}),
package/src/routes/aeo-twin.ts CHANGED
@@ -51,11 +51,35 @@ export const getStaticPaths: GetStaticPaths = async () => {
51
51
  // Emit twins for static + both modes; middleware mode serves on-demand.
52
52
  if (!aeo || aeo.mode === 'middleware' || !contentProvider) return []
53
53
 
54
+ // Twin generation now covers articles + pages + videos + podcasts. Hubs,
55
+ // region pillars, series indexes, and the home page typically come through
56
+ // `type: 'pages'`. Each consumer's contentProvider decides which surfaces
57
+ // belong to which type; if a type is unknown it returns [] and we skip.
58
+ const TWIN_TYPES = ['articles', 'pages', 'videos', 'podcasts'] as const
59
+
54
60
  let items: ContentItem[]
55
61
  try {
56
62
  // contentProvider resolves here because this function runs inside Astro's
57
63
  // build pipeline — `astro:content` and other Vite virtual modules work.
58
- items = await contentProvider({ type: 'articles' }, {} as never)
64
+ const batches = await Promise.all(
65
+ TWIN_TYPES.map((type) =>
66
+ (contentProvider({ type }, {} as never) as Promise<ContentItem[]>).catch(
67
+ () => [] as ContentItem[],
68
+ ),
69
+ ),
70
+ )
71
+ // Deduplicate by URL — a contentProvider might surface the same item under
72
+ // multiple types (rare, but cheap to defend against).
73
+ const seen = new Set<string>()
74
+ items = []
75
+ for (const batch of batches) {
76
+ for (const it of batch) {
77
+ if (!it || typeof it.url !== 'string') continue
78
+ if (seen.has(it.url)) continue
79
+ seen.add(it.url)
80
+ items.push(it)
81
+ }
82
+ }
59
83
  } catch {
60
84
  // Don't fail the build; log nothing here because astro's getStaticPaths
61
85
  // swallows console output. The consumer sees "zero paths emitted" which
@@ -74,7 +98,19 @@ export const getStaticPaths: GetStaticPaths = async () => {
74
98
  for (const item of filtered) {
75
99
  const primaryUrl = twinUrl(item.url)
76
100
  const primaryPath = stripOrigin(primaryUrl).replace(/^\//, '')
77
- const body = item.description ?? ''
101
+ // Paywall-aware twin body. For free items the twin echoes the description
102
+ // (consumer-provided summary). For gated items we never emit the full body
103
+ // — twins are public regardless of the HTML paywall — so we keep the same
104
+ // description-only body and append an explicit "Full analysis is gated"
105
+ // footer that AI engines can cite without claiming the paid content as
106
+ // theirs. The HTML page carries the structured-data paywall markup
107
+ // (`isAccessibleForFree: false` + `hasPart`); the twin's textual boundary
108
+ // must agree with that signal.
109
+ const isGated = item.access === 'members' || item.isAccessibleForFree === false
110
+ const sample = item.description ?? ''
111
+ const body = isGated
112
+ ? `${sample}${sample ? '\n\n' : ''}---\n\nFull analysis is behind the paywall. Read the gated piece at ${item.url}.`
113
+ : sample
78
114
 
79
115
  const contentHash =
80
116
  stalenessMode === 'content-hash' ? await computeContentHash(item, body) : undefined
package/src/routes/sitemap-index.ts CHANGED
@@ -14,6 +14,7 @@ export const GET: APIRoute = async (context) => {
14
14
  const sitemaps: SitemapEntry[] = []
15
15
 
16
16
  const { site } = config
17
+ const injectedRoutes = config.injectedRoutes
17
18
 
18
19
  // Fetch articles lastmod if possible
19
20
  let articlesLastmod: string | undefined
@@ -22,43 +23,49 @@ export const GET: APIRoute = async (context) => {
22
23
  let productsLastmod: string | undefined
23
24
 
24
25
  if (contentProvider) {
25
- try {
26
- const articles = await contentProvider({ type: 'articles' }, context as any)
27
- if (articles.length > 0) {
28
- const dates = articles
29
- .map((a) => a.dateModified ?? a.datePublished)
30
- .filter(Boolean) as string[]
31
- if (dates.length > 0) {
32
- articlesLastmod = dates.sort().at(-1)
26
+ if (injectedRoutes.sitemapArticles) {
27
+ try {
28
+ const articles = await contentProvider({ type: 'articles' }, context as any)
29
+ if (articles.length > 0) {
30
+ const dates = articles
31
+ .map((a) => a.dateModified ?? a.datePublished)
32
+ .filter(Boolean) as string[]
33
+ if (dates.length > 0) {
34
+ articlesLastmod = dates.sort().at(-1)
35
+ }
33
36
  }
34
- }
35
- } catch {}
37
+ } catch {}
38
+ }
36
39
 
37
- try {
38
- const pages = await contentProvider({ type: 'pages' }, context as any)
39
- if (pages.length > 0) {
40
- const dates = pages
41
- .map((p) => p.dateModified ?? p.datePublished)
42
- .filter(Boolean) as string[]
43
- if (dates.length > 0) {
44
- pagesLastmod = dates.sort().at(-1)
40
+ if (injectedRoutes.sitemapPages) {
41
+ try {
42
+ const pages = await contentProvider({ type: 'pages' }, context as any)
43
+ if (pages.length > 0) {
44
+ const dates = pages
45
+ .map((p) => p.dateModified ?? p.datePublished)
46
+ .filter(Boolean) as string[]
47
+ if (dates.length > 0) {
48
+ pagesLastmod = dates.sort().at(-1)
49
+ }
45
50
  }
46
- }
47
- } catch {}
51
+ } catch {}
52
+ }
48
53
 
49
- try {
50
- const videos = await contentProvider({ type: 'videos' }, context as any)
51
- if (videos.length > 0) {
52
- const dates = videos
53
- .map((v) => v.dateModified ?? v.datePublished)
54
- .filter(Boolean) as string[]
55
- if (dates.length > 0) {
56
- videosLastmod = dates.sort().at(-1)
54
+ if (injectedRoutes.sitemapVideos) {
55
+ try {
56
+ const videos = await contentProvider({ type: 'videos' }, context as any)
57
+ if (videos.length > 0) {
58
+ const dates = videos
59
+ .map((v) => v.dateModified ?? v.datePublished)
60
+ .filter(Boolean) as string[]
61
+ if (dates.length > 0) {
62
+ videosLastmod = dates.sort().at(-1)
63
+ }
57
64
  }
58
- }
59
- } catch {}
65
+ } catch {}
66
+ }
60
67
 
61
- if (config.commerce?.enabled) {
68
+ if (config.commerce?.enabled && injectedRoutes.sitemapProducts) {
62
69
  try {
63
70
  const products = await contentProvider({ type: 'products' }, context as any)
64
71
  if (products.length > 0) {
@@ -73,11 +80,17 @@ export const GET: APIRoute = async (context) => {
73
80
  }
74
81
  }
75
82
 
76
- sitemaps.push({ loc: `${site}/sitemap-articles.xml`, lastmod: articlesLastmod })
77
- sitemaps.push({ loc: `${site}/sitemap-pages.xml`, lastmod: pagesLastmod })
78
- sitemaps.push({ loc: `${site}/sitemap-videos.xml`, lastmod: videosLastmod })
83
+ if (injectedRoutes.sitemapArticles) {
84
+ sitemaps.push({ loc: `${site}/sitemap-articles.xml`, lastmod: articlesLastmod })
85
+ }
86
+ if (injectedRoutes.sitemapPages) {
87
+ sitemaps.push({ loc: `${site}/sitemap-pages.xml`, lastmod: pagesLastmod })
88
+ }
89
+ if (injectedRoutes.sitemapVideos) {
90
+ sitemaps.push({ loc: `${site}/sitemap-videos.xml`, lastmod: videosLastmod })
91
+ }
79
92
 
80
- if (config.commerce?.enabled) {
93
+ if (config.commerce?.enabled && injectedRoutes.sitemapProducts) {
81
94
  sitemaps.push({ loc: `${site}/sitemap-products.xml`, lastmod: productsLastmod })
82
95
  }
83
96
 
package/src/types.ts CHANGED
@@ -117,7 +117,7 @@ export interface ContentItem {
117
117
 
118
118
  // ─── Content provider ───
119
119
 
120
- export type ContentType = 'articles' | 'pages' | 'videos' | 'products' | 'authors'
120
+ export type ContentType = 'articles' | 'pages' | 'videos' | 'podcasts' | 'products' | 'authors'
121
121
 
122
122
  export interface ContentProviderParams {
123
123
  type: ContentType
package/src/utils/define-content-provider.ts CHANGED
@@ -16,6 +16,7 @@ export interface ContentItemByType {
16
16
  articles: ContentItem
17
17
  pages: ContentItem
18
18
  videos: ContentItem & { video: NonNullable<ContentItem['video']> }
19
+ podcasts: ContentItem
19
20
  products: ContentItem & { product: NonNullable<ContentItem['product']> }
20
21
  authors: ContentItem
21
22
  }
package/src/utils/json-ld/video.ts CHANGED
@@ -3,12 +3,22 @@ import type { ContentItem, JsonLdObject } from '../../types.js'
3
3
  export function generateVideoJsonLd(item: ContentItem): JsonLdObject {
4
4
  const video = item.video!
5
5
 
6
+ // Derive isAccessibleForFree: explicit field wins, otherwise derive from access.
7
+ // Matches the article emitter so paywalled videos carry the same Google-recognized
8
+ // markup as paywalled articles. Without this, Googlebot has no way to know the
9
+ // gated playback is intentional and may flag the page as cloaking.
10
+ const isAccessibleForFree = item.isAccessibleForFree ?? item.access !== 'members'
11
+
6
12
  const result: JsonLdObject = {
7
13
  '@context': 'https://schema.org',
8
14
  '@type': 'VideoObject',
9
15
  name: item.title,
10
16
  thumbnailUrl: video.thumbnailUrl,
11
17
  duration: video.duration,
18
+ // Google requires the string form 'True'/'False' for Rich Results when paywall
19
+ // markup is emitted. Same convention as the article schema.
20
+ // https://developers.google.com/search/docs/appearance/structured-data/paywalled-content
21
+ isAccessibleForFree: isAccessibleForFree ? 'True' : 'False',
12
22
  }
13
23
 
14
24
  if (item.description) result.description = item.description
@@ -16,5 +26,15 @@ export function generateVideoJsonLd(item: ContentItem): JsonLdObject {
16
26
  if (video.embedUrl) result.embedUrl = video.embedUrl
17
27
  if (item.datePublished) result.uploadDate = item.datePublished
18
28
 
29
+ // hasPart paywall marker — emitted whenever the item is gated and a cssSelector
30
+ // is configured. Mirrors the article emitter behaviour.
31
+ if (!isAccessibleForFree && item.paywallCssSelector) {
32
+ result.hasPart = {
33
+ '@type': 'WebPageElement',
34
+ isAccessibleForFree: 'False',
35
+ cssSelector: item.paywallCssSelector,
36
+ }
37
+ }
38
+
19
39
  return result
20
40
  }
package/src/utils/sitemap.ts CHANGED
@@ -105,9 +105,11 @@ ${entries}
105
105
 
106
106
  export function generateVideoSitemap(items: ContentItem[]): string {
107
107
  const entries = items
108
- .filter((item) => item.video)
108
+ .filter((item): item is ContentItem & { video: NonNullable<ContentItem['video']> } =>
109
+ Boolean(item.video && (item.video.contentUrl || item.video.embedUrl)),
110
+ )
109
111
  .map((item) => {
110
- const v = item.video!
112
+ const v = item.video
111
113
  const contentUrlTag = v.contentUrl
112
114
  ? `\n <video:content_loc>${escapeXml(v.contentUrl)}</video:content_loc>`
113
115
  : ''
package/src/utils/validation.ts CHANGED
@@ -7,6 +7,11 @@ export interface PageValidationOptions {
7
7
  titleMaxLength: number
8
8
  descriptionMaxLength: number
9
9
  heroMinWidth?: number
10
+ pagePath?: string
11
+ requireH1?: boolean
12
+ requireHeroImage?: boolean
13
+ requireArticleSchema?: boolean
14
+ requireMaxImagePreviewLarge?: boolean
10
15
  }
11
16
 
12
17
  /**
@@ -139,7 +144,7 @@ export function validatePage(html: string, options: PageValidationOptions): Vali
139
144
  html.match(/<meta\s[^>]*property=["']og:image["'][^>]*>/i) ??
140
145
  html.match(/<meta\s[^>]*property=og:image[^>]*>/i)
141
146
  if (!ogImageMatch) {
142
- warnings.push('Missing og:image meta tag')
147
+ pushIssue(options.requireHeroImage, errors, warnings, 'Missing hero image og:image meta tag')
143
148
  }
144
149
 
145
150
  // og:image:width check against heroMinWidth
@@ -154,18 +159,27 @@ export function validatePage(html: string, options: PageValidationOptions): Vali
154
159
  if (widthContentMatch) {
155
160
  const width = Number(widthContentMatch[1])
156
161
  if (!Number.isNaN(width) && width < options.heroMinWidth) {
157
- warnings.push(`Hero image width ${width}px is below minimum ${options.heroMinWidth}px`)
162
+ pushIssue(
163
+ options.requireHeroImage,
164
+ errors,
165
+ warnings,
166
+ `Hero image width ${width}px is below minimum ${options.heroMinWidth}px`,
167
+ )
158
168
  }
169
+ } else if (options.requireHeroImage) {
170
+ errors.push(`Hero image width is missing; minimum is ${options.heroMinWidth}px`)
159
171
  }
172
+ } else if (options.requireHeroImage && ogImageMatch) {
173
+ errors.push(`Hero image width is missing; minimum is ${options.heroMinWidth}px`)
160
174
  }
161
175
  }
162
176
 
163
177
  // H1 checks
164
178
  const h1Matches = html.match(/<h1[\s>]/gi) ?? []
165
179
  if (h1Matches.length === 0) {
166
- warnings.push('Missing H1 tag')
180
+ pushIssue(options.requireH1, errors, warnings, 'Missing H1 tag')
167
181
  } else if (h1Matches.length > 1) {
168
- warnings.push(`Multiple H1 tags found (${h1Matches.length})`)
182
+ pushIssue(options.requireH1, errors, warnings, `Multiple H1 tags found (${h1Matches.length})`)
169
183
  }
170
184
 
171
185
  // JSON-LD presence check
@@ -174,9 +188,86 @@ export function validatePage(html: string, options: PageValidationOptions): Vali
174
188
  warnings.push('Missing JSON-LD structured data')
175
189
  }
176
190
 
191
+ if (
192
+ options.requireArticleSchema &&
193
+ isLikelyArticlePath(options.pagePath) &&
194
+ !hasArticleJsonLd(html)
195
+ ) {
196
+ errors.push('Missing valid Article JSON-LD for article route')
197
+ }
198
+
199
+ if (options.requireMaxImagePreviewLarge && !hasMaxImagePreviewLarge(html)) {
200
+ errors.push('Missing robots max-image-preview:large directive')
201
+ }
202
+
177
203
  return { errors, warnings }
178
204
  }
179
205
 
206
+ function pushIssue(
207
+ asError: boolean | undefined,
208
+ errors: string[],
209
+ warnings: string[],
210
+ message: string,
211
+ ) {
212
+ if (asError) errors.push(message)
213
+ else warnings.push(message)
214
+ }
215
+
216
+ function isLikelyArticlePath(pagePath: string | undefined): boolean {
217
+ if (!pagePath) return false
218
+ return /\/(article|articles|news|story|stories)\//i.test(pagePath)
219
+ }
220
+
221
+ function hasArticleJsonLd(html: string): boolean {
222
+ for (const rawJson of extractJsonLdBodies(html)) {
223
+ try {
224
+ const parsed = JSON.parse(rawJson) as unknown
225
+ if (hasArticleType(parsed)) return true
226
+ } catch {}
227
+ }
228
+ return false
229
+ }
230
+
231
+ function extractJsonLdBodies(html: string): string[] {
232
+ const bodies: string[] = []
233
+ const pattern = /<script\s[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi
234
+ for (const match of html.matchAll(pattern)) {
235
+ if (match[1]) bodies.push(match[1].trim())
236
+ }
237
+ return bodies
238
+ }
239
+
240
+ function hasArticleType(value: unknown): boolean {
241
+ if (Array.isArray(value)) return value.some(hasArticleType)
242
+ if (!isRecord(value)) return false
243
+
244
+ const type = value['@type']
245
+ if (type === 'Article' || type === 'NewsArticle' || type === 'BlogPosting') return true
246
+ if (
247
+ Array.isArray(type) &&
248
+ type.some((item) => item === 'Article' || item === 'NewsArticle' || item === 'BlogPosting')
249
+ ) {
250
+ return true
251
+ }
252
+
253
+ return hasArticleType(value['@graph'])
254
+ }
255
+
256
+ function hasMaxImagePreviewLarge(html: string): boolean {
257
+ const metaTags = html.match(/<meta\s+[^>]*>/gi) ?? []
258
+ for (const tag of metaTags) {
259
+ if (getHtmlAttr(tag, 'name')?.toLowerCase() !== 'robots') continue
260
+ const content = getHtmlAttr(tag, 'content')?.toLowerCase() ?? ''
261
+ const directives = content.split(',').map((part) => part.trim())
262
+ if (directives.includes('max-image-preview:large')) return true
263
+ }
264
+ return false
265
+ }
266
+
267
+ function isRecord(value: unknown): value is Record<string, unknown> {
268
+ return typeof value === 'object' && value !== null
269
+ }
270
+
180
271
  function isNoindexMetaRefreshRedirect(html: string): boolean {
181
272
  const metaTags = html.match(/<meta\s+[^>]*>/gi) ?? []
182
273
  const hasRefresh = metaTags.some(