@growth-labs/seo 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/dist/utils/validation.d.ts.map +1 -1
  2. package/dist/utils/validation.js +22 -0
  3. package/dist/utils/validation.js.map +1 -1
  4. package/package.json +9 -5
  5. package/src/_internal/state.ts +26 -0
  6. package/src/bindings.ts +146 -0
  7. package/src/cron/prune-aeo-r2.ts +140 -0
  8. package/src/durable-objects/aeo-revalidation-coord.ts +246 -0
  9. package/src/index.ts +380 -0
  10. package/src/middleware/seo.ts +350 -0
  11. package/src/options.ts +456 -0
  12. package/src/routes/aeo-twin.ts +130 -0
  13. package/src/routes/apple-news.ts +36 -0
  14. package/src/routes/llms-full.ts +36 -0
  15. package/src/routes/llms.ts +15 -0
  16. package/src/routes/podcast-narration.ts +45 -0
  17. package/src/routes/podcast.ts +27 -0
  18. package/src/routes/revalidate.ts +298 -0
  19. package/src/routes/robots.ts +21 -0
  20. package/src/routes/rss.ts +29 -0
  21. package/src/routes/sitemap-articles.ts +25 -0
  22. package/src/routes/sitemap-index.ts +89 -0
  23. package/src/routes/sitemap-markdown.ts +39 -0
  24. package/src/routes/sitemap-pages.ts +24 -0
  25. package/src/routes/sitemap-products.ts +24 -0
  26. package/src/routes/sitemap-videos.ts +24 -0
  27. package/src/runtime.ts +17 -0
  28. package/src/site-url-core.ts +71 -0
  29. package/src/site-url.ts +21 -0
  30. package/src/types.ts +166 -0
  31. package/src/utils/aeo-summary.ts +176 -0
  32. package/src/utils/aeo-twin-emitter.ts +173 -0
  33. package/src/utils/aeo.ts +223 -0
  34. package/src/utils/apple-news-anf.ts +163 -0
  35. package/src/utils/apple-news-rss.ts +136 -0
  36. package/src/utils/content-filter.ts +87 -0
  37. package/src/utils/crawler-class.ts +155 -0
  38. package/src/utils/define-content-provider.ts +65 -0
  39. package/src/utils/effective-auth.ts +44 -0
  40. package/src/utils/fcrdns.ts +269 -0
  41. package/src/utils/fresh-layer.ts +175 -0
  42. package/src/utils/hreflang.ts +26 -0
  43. package/src/utils/index.ts +91 -0
  44. package/src/utils/json-ld/article.ts +120 -0
  45. package/src/utils/json-ld/audio.ts +32 -0
  46. package/src/utils/json-ld/breadcrumb.ts +28 -0
  47. package/src/utils/json-ld/faq.ts +18 -0
  48. package/src/utils/json-ld/howto.ts +23 -0
  49. package/src/utils/json-ld/index.ts +12 -0
  50. package/src/utils/json-ld/item-list.ts +26 -0
  51. package/src/utils/json-ld/organization.ts +42 -0
  52. package/src/utils/json-ld/person.ts +25 -0
  53. package/src/utils/json-ld/product.ts +155 -0
  54. package/src/utils/json-ld/video.ts +20 -0
  55. package/src/utils/json-ld/website.ts +27 -0
  56. package/src/utils/llms-full.ts +90 -0
  57. package/src/utils/llms.ts +45 -0
  58. package/src/utils/meta.ts +184 -0
  59. package/src/utils/podcast.ts +112 -0
  60. package/src/utils/robots.ts +47 -0
  61. package/src/utils/rss.ts +64 -0
  62. package/src/utils/seo-head.ts +81 -0
  63. package/src/utils/sitemap-markdown.ts +80 -0
  64. package/src/utils/sitemap.ts +169 -0
  65. package/src/utils/staleness.ts +61 -0
  66. package/src/utils/validation.ts +308 -0
  67. package/src/virtual.d.ts +8 -0
  68. package/src/vite-plugin.ts +66 -0
@@ -0,0 +1,350 @@
1
+ import 'virtual:growth-labs/seo/config'
2
+ import { getConfig, getContentProvider } from '../_internal/state.js'
3
+ import type { SeoEnv } from '../bindings.js'
4
+ import { type ResolvedSeoOptions, resolveAeoTwins } from '../options.js'
5
+ import { getRuntimeEnv, getWaitUntil } from '../runtime.js'
6
+ import type { ContentItem, ContentProvider, CrawlerClass, EffectiveAuthSegment } from '../types.js'
7
+ import { estimateTokenCount, generateAeoMarkdown } from '../utils/aeo.js'
8
+ import { classifyRequest } from '../utils/crawler-class.js'
9
+ import { computeEffectiveAuthSegment, type RawAuthSegment } from '../utils/effective-auth.js'
10
+ import { createDohResolver, createFcrdnsVerifier, type FcrdnsVerifier } from '../utils/fcrdns.js'
11
+ import { type FreshLayerBinding, readFreshTwin, writeFreshTwin } from '../utils/fresh-layer.js'
12
+
13
+ // ─── Public middleware factory ───
14
+
15
/**
 * Optional collaborators that can be injected into `createSeoMiddleware`.
 * Every member may be omitted; the notes below describe what is used instead.
 */
export interface SeoMiddlewareDeps {
  /**
   * Resolves the crawler class of an incoming request. When omitted, the
   * factory builds its default classifier (backed by a DoH FCrDNS verifier).
   */
  classify?: (request: Request) => Promise<CrawlerClass>
  /**
   * R2/KV binding that backs the fresh-twin layer. Needed when AEO twins run
   * in 'middleware' or 'both' mode.
   */
  freshLayer?: FreshLayerBinding
  /**
   * Assets binding used to read the stale build-time twin when the fresh
   * layer has no entry.
   */
  assets?: { fetch(request: Request): Promise<Response> }
  /**
   * Maps a request to the consumer's raw auth segment. The middleware treats
   * the request as 'anon' when this is not supplied.
   */
  rawAuthSegment?: (request: Request) => RawAuthSegment
  /**
   * Equivalent of `ctx.waitUntil`: keeps a promise alive after the response
   * has been returned. Used to schedule background revalidation writes.
   */
  waitUntil?: (promise: Promise<unknown>) => void
}
27
+
28
/**
 * The slice of the framework request context that the SEO middleware needs.
 */
export interface SeoMiddlewareContext {
  /** The incoming request being handled. */
  request: Request
  /**
   * Per-request scratch object. When present, the middleware writes
   * `crawlerClass` and `effectiveAuthSegment` onto it.
   */
  locals?: Record<string, unknown>
}
32
+
33
/**
 * Build the SEO request middleware for a resolved configuration.
 *
 * For every request the returned handler:
 *  1. classifies the request and, when `context.locals` exists, publishes
 *     `crawlerClass` and `effectiveAuthSegment` on it;
 *  2. answers 403 to LLM training crawlers that request a members-only item
 *     (only when `crawlerPolicy.blockLlmTrainingCrawlers` is set and a content
 *     provider is available);
 *  3. tries to serve the markdown twin when the client accepts `text/markdown`
 *     and AEO twins are enabled in a non-'static' mode;
 *  4. otherwise delegates to `next()`;
 *  5. re-wraps that response with `content-signal`, `Vary` and, for HTML
 *     responses of non-members items, a `Link: rel="alternate"` header.
 *
 * @param options Resolved SEO options.
 * @param contentProvider Optional content lookup. Without it steps 2 and 3 are
 *   skipped and the Link header is emitted for every HTML response.
 * @param deps Optional injected collaborators (classifier, bindings, waitUntil).
 * @returns An async `(context, next)` handler producing the final response.
 */
export function createSeoMiddleware(
  options: ResolvedSeoOptions,
  contentProvider?: ContentProvider,
  deps: SeoMiddlewareDeps = {},
) {
  const aeo = resolveAeoTwins(options.aeoTwins)
  const classify = deps.classify ?? createDefaultClassifier()

  return async (
    context: SeoMiddlewareContext,
    next: () => Promise<Response>,
  ): Promise<Response> => {
    const { request } = context
    // Media types are case-insensitive, so normalise before matching.
    const accept = (request.headers.get('accept') ?? '').toLowerCase()
    const url = new URL(request.url)

    // The item for this path is needed by both the 403 check and the Link
    // header. Resolve it lazily and at most once per request so the content
    // provider is not hit twice for the same pathname.
    let pendingItem: Promise<ContentItem | undefined> | undefined
    const lookupItem = (provider: ContentProvider): Promise<ContentItem | undefined> => {
      if (!pendingItem) {
        pendingItem = findItemForPath(provider, context, url.pathname)
      }
      return pendingItem
    }

    // 1. Classify the request. Expose the classification on locals for the
    //    consumer's cache-key builder.
    const crawlerClass: CrawlerClass = await classify(request)
    const rawAuth = deps.rawAuthSegment?.(request) ?? 'anon'
    const effectiveAuthSegment: EffectiveAuthSegment = computeEffectiveAuthSegment(
      crawlerClass,
      rawAuth,
    )

    if (context.locals) {
      context.locals.crawlerClass = crawlerClass
      context.locals.effectiveAuthSegment = effectiveAuthSegment
    }

    // 2. LLM training crawlers are 403'd on members items, before any other
    //    work. The item is only looked up for that crawler class so anonymous
    //    requests do not pay for a content-provider call here.
    const blockLlmTraining = options.crawlerPolicy.blockLlmTrainingCrawlers
    if (blockLlmTraining && crawlerClass === 'llmTrainingCrawler' && contentProvider) {
      const item = await lookupItem(contentProvider)
      if (item?.access === 'members') {
        return new Response('Forbidden', {
          status: 403,
          headers: {
            'Content-Type': 'text/plain; charset=utf-8',
            'X-Robots-Tag': 'noindex',
            ...varyHeaders(),
          },
        })
      }
    }

    // 3. Accept: text/markdown handling (middleware mode of AEO twins).
    const wantsMarkdown = accept.includes('text/markdown')
    if (aeo && aeo.mode !== 'static' && wantsMarkdown && contentProvider) {
      const markdownResponse = await serveAeoMarkdown({
        request,
        context,
        options,
        contentProvider,
        freshLayer: deps.freshLayer,
        assets: deps.assets,
        waitUntil: deps.waitUntil,
      })
      // A null result means "no twin for this path": fall through to HTML.
      if (markdownResponse) return markdownResponse
    }

    // 4. Pass through to the next handler (SSR page or static asset route).
    const response = await next()

    // 5. Wrap the response with our observability + policy headers.
    const newHeaders = new Headers(response.headers)

    // Content-Signal on every response.
    newHeaders.set('content-signal', buildContentSignalHeader(options))

    // Vary must cover every axis that branches the response:
    //  - Accept: response body differs on text/markdown
    //  - User-Agent: crawler class depends on it
    //  - Cookie: auth segment depends on it
    //  - CF-Connecting-IP: FCrDNS is keyed on it
    appendVaryHeaders(newHeaders)

    // Link alternate header on HTML responses when aeoTwins is enabled, except
    // for members items (they have no markdown counterpart at rest).
    if (aeo && isHtmlResponse(response)) {
      const linkItem = contentProvider ? await lookupItem(contentProvider) : undefined
      if (!linkItem || linkItem.access !== 'members') {
        const target =
          aeo.mode === 'static' || aeo.mode === 'both' ? twinUrlFor(url, aeo) : url.toString()
        const linkValue = `<${target}>; rel="alternate"; type="text/markdown"`
        const existing = newHeaders.get('Link')
        newHeaders.set('Link', existing ? `${existing}, ${linkValue}` : linkValue)
      }
    }

    return new Response(response.body, {
      status: response.status,
      statusText: response.statusText,
      headers: newHeaders,
    })
  }
}
134
+
135
+ // ─── Astro middleware entry ───
136
+
137
/**
 * Module-level cache for the default FCrDNS verifier. It is populated on first
 * use by `createDefaultClassifier` and reused afterwards.
 */
let _classifier: FcrdnsVerifier | undefined = undefined
138
+
139
/**
 * Framework middleware entry point.
 *
 * Reads the registered config and content provider, resolves the runtime env
 * for this request, wires the fresh-layer binding when AEO twins are enabled
 * in a non-'static' mode, then builds a middleware instance and runs it.
 *
 * @param context Request context handed over by the framework.
 * @param next Continuation that renders the downstream response.
 * @returns The response produced by the SEO middleware.
 */
export const onRequest = async (
  context: SeoMiddlewareContext,
  next: () => Promise<Response>,
): Promise<Response> => {
  const config = getConfig()
  const contentProvider = getContentProvider()
  const env = getRuntimeEnv(context) as SeoEnv
  const aeo = resolveAeoTwins(config.aeoTwins)

  // The fresh layer is only relevant for non-static AEO modes, and only when
  // a runtime env is available to take the binding from.
  const freshLayerConfig = env && aeo && aeo.mode !== 'static' ? aeo.freshLayer : undefined

  let freshLayer: FreshLayerBinding | undefined
  if (freshLayerConfig) {
    // Bindings are looked up by their configured name on the env object.
    const bindings = env as unknown as Record<string, unknown>
    const impl = bindings[freshLayerConfig.bindingName]
    if (impl) {
      freshLayer = {
        type: freshLayerConfig.type,
        impl: impl as FreshLayerBinding['impl'],
        // Falls back to 'dev' when no version metadata is available.
        deploymentId: env.CF_VERSION_METADATA?.id ?? 'dev',
      }
    }
  }

  const deps: SeoMiddlewareDeps = {
    freshLayer,
    assets: env?.ASSETS,
    waitUntil: getWaitUntil(context),
  }
  const middleware = createSeoMiddleware(config, contentProvider, deps)
  return middleware(context, next)
}
170
+
171
+ // ─── Internals ───
172
+
173
/**
 * Build the default request classifier.
 *
 * The FCrDNS verifier behind it is created once (using a DoH resolver) and
 * cached at module level, so later calls share the same verifier.
 *
 * @returns A function that classifies a request via `classifyRequest`.
 */
function createDefaultClassifier() {
  const fcrdnsVerify = _classifier || (_classifier = createFcrdnsVerifier(createDohResolver()))
  return (request: Request) => classifyRequest({ request, fcrdnsVerify })
}
180
+
181
/**
 * Render the `content-signal` header value from the configured signals.
 *
 * @param config Resolved SEO options; only `contentSignal` is read.
 * @returns A string of the form `ai-train=…, search=…, ai-input=…`.
 */
function buildContentSignalHeader(config: ResolvedSeoOptions): string {
  const signal = config.contentSignal
  return `ai-train=${signal.aiTrain}, search=${signal.search}, ai-input=${signal.aiInput}`
}
185
+
186
/**
 * Header fragment carrying the `Vary` value used on responses this module
 * creates itself (403, 503 stub, markdown).
 *
 * @returns A fresh object with a single `Vary` entry.
 */
function varyHeaders(): Record<string, string> {
  const fragment: Record<string, string> = {}
  fragment.Vary = 'Accept, User-Agent, Cookie, CF-Connecting-IP'
  return fragment
}
189
+
190
/**
 * Make sure the `Vary` header lists every axis this middleware branches on,
 * keeping whatever the downstream response already declared.
 *
 * @param headers Mutable header bag of the outgoing response; updated in place.
 */
function appendVaryHeaders(headers: Headers): void {
  const required = 'Accept, User-Agent, Cookie, CF-Connecting-IP'
  const current = headers.get('Vary')
  if (!current) {
    // Nothing declared upstream: our list is the whole value.
    headers.set('Vary', required)
    return
  }
  // Merge upstream members with ours and drop repeats.
  headers.set('Vary', dedupVary(`${current}, ${required}`))
}
195
+
196
/**
 * Remove repeated members from a comma-separated `Vary` value.
 *
 * Header field names are case-insensitive, so `accept` and `Accept` count as
 * the same member. The first occurrence wins and keeps its original casing and
 * position. Empty members (stray commas, surrounding whitespace) are dropped.
 *
 * @param combined Comma-separated list of field names.
 * @returns The de-duplicated list joined with `', '`.
 */
function dedupVary(combined: string): string {
  const seen = new Set<string>()
  const kept: string[] = []
  for (const raw of combined.split(',')) {
    const name = raw.trim()
    if (!name) continue
    // Compare case-insensitively but emit the spelling that appeared first.
    const key = name.toLowerCase()
    if (seen.has(key)) continue
    seen.add(key)
    kept.push(name)
  }
  return kept.join(', ')
}
203
+
204
/**
 * Tell whether a response declares an HTML body.
 *
 * Media types are case-insensitive, so the Content-Type value is lower-cased
 * before matching; parameters such as `charset` are ignored by the substring
 * test.
 *
 * @param response Response whose `content-type` header is inspected.
 * @returns `true` when the Content-Type contains `text/html`.
 */
function isHtmlResponse(response: Response): boolean {
  const contentType = response.headers.get('content-type') ?? ''
  return contentType.toLowerCase().includes('text/html')
}
208
+
209
/**
 * Compute the URL of the markdown twin for a page URL.
 *
 * A consumer-supplied `aeo.twinUrl` always wins and receives the full page URL
 * string. The default mapping works on the pathname only: trailing slashes are
 * removed and `.md` is appended, while query string and fragment are dropped.
 * Appending to the full href would put `.md` after the query or fragment, or
 * (for the site root) onto the host name.
 *
 * @param url Page URL.
 * @param aeo AEO twin options; only the optional `twinUrl` mapper is read.
 * @returns Absolute twin URL.
 */
function twinUrlFor(url: URL, aeo: { twinUrl?: (u: string) => string }): string {
  if (aeo.twinUrl) {
    return aeo.twinUrl(url.toString())
  }
  const path = url.pathname.replace(/\/+$/, '')
  // NOTE(review): the site root has no path segment to suffix, so it maps to
  // /index.md here. Confirm this matches where the build-time emitter writes
  // the root twin.
  const twinPath = path === '' ? '/index' : path
  return `${url.origin}${twinPath}.md`
}
213
+
214
/**
 * Look up the content item served at `pathname`.
 *
 * The provider is queried for articles with the last path segment as the only
 * slug, so CMS-backed providers can short-circuit instead of loading the whole
 * catalog. The returned items are then matched on pathname. Trailing slashes
 * are ignored on both sides, so `/blog/post/` and `/blog/post` resolve to the
 * same item and a trailing slash no longer produces an empty slug.
 *
 * Provider failures are swallowed and reported as "no item". Callers therefore
 * treat an unavailable provider the same as an unknown path.
 *
 * @param contentProvider Content lookup function.
 * @param context Middleware context, forwarded opaquely to the provider.
 * @param pathname Request pathname.
 * @returns The matching item, or `undefined` when none matches or the lookup fails.
 */
async function findItemForPath(
  contentProvider: ContentProvider,
  context: SeoMiddlewareContext,
  pathname: string,
): Promise<ContentItem | undefined> {
  // Strip trailing slashes; the bare root stays '/'.
  const normalize = (path: string): string => path.replace(/\/+$/, '') || '/'
  const wanted = normalize(pathname)
  const slug = wanted.split('/').pop() ?? ''
  try {
    // Type assertion: the provider expects the framework's full request
    // context, while this layer only has the narrower middleware context and
    // passes it through opaquely.
    const items = await contentProvider({ type: 'articles', slugs: [slug] }, context as never)
    return items.find((item) => {
      let itemPath: string
      try {
        itemPath = new URL(item.url).pathname
      } catch {
        // item.url is not an absolute URL; compare it as a path.
        itemPath = item.url
      }
      return normalize(itemPath) === wanted
    })
  } catch {
    return undefined
  }
}
238
+
239
+ // ─── AEO markdown serve path ───
240
+
241
/** Everything `serveAeoMarkdown` needs to answer one markdown request. */
interface ServeMarkdownInput {
  /** Incoming request; its URL determines the twin path. */
  request: Request
  /** Middleware context, forwarded to the content provider. */
  context: SeoMiddlewareContext
  /** Resolved SEO options. */
  options: ResolvedSeoOptions
  /** Content lookup used for the fallthrough render. */
  contentProvider: ContentProvider
  /** Fresh-twin storage binding, consulted first when present. */
  freshLayer?: FreshLayerBinding
  /** Assets binding, consulted second for the build-time twin. */
  assets?: { fetch(request: Request): Promise<Response> }
  /** Background-task scheduler used for the deferred fresh-layer write. */
  waitUntil?: (promise: Promise<unknown>) => void
}
250
+
251
/**
 * Try to answer a `text/markdown` request for the current URL.
 *
 * Sources are consulted in order:
 *  1. the fresh layer (when a binding is supplied);
 *  2. the build-time twin via the assets binding (when supplied);
 *  3. a render from the matching content item.
 *
 * @param input Request, context, options and optional bindings.
 * @returns A markdown response, or a 503 stub while the twin is being written
 *   to the fresh layer. Returns `null` when AEO twins are disabled, no item
 *   matches the path, or the item is members-only, so the caller can fall
 *   through to the HTML route.
 */
async function serveAeoMarkdown(input: ServeMarkdownInput): Promise<Response | null> {
  const {
    request,
    context,
    options,
    contentProvider,
    freshLayer,
    assets,
    waitUntil: schedule,
  } = input
  const requestUrl = new URL(request.url)
  const aeo = resolveAeoTwins(options.aeoTwins)
  if (!aeo) return null
  const twinPath = urlPathFromTwinUrl(twinUrlFor(requestUrl, aeo))

  // Layer 1: fresh layer (R2/KV).
  if (freshLayer) {
    const cached = await readFreshTwin(freshLayer, twinPath)
    if (cached) return markdownResponse(cached.body, options, cached.contentType)
  }

  // Layer 2: stale baseline via the ASSETS binding.
  if (assets) {
    const baselineRequest = new Request(`https://assets.local${twinPath}`, { method: 'GET' })
    const baseline = await assets.fetch(baselineRequest)
    if (baseline.status === 200) {
      const baselineBody = await baseline.text()
      return markdownResponse(baselineBody, options)
    }
  }

  // Layer 3: fallthrough render from the content item.
  const item = await findItemForPath(contentProvider, context, requestUrl.pathname)
  // No item: let the route handler serve HTML as usual.
  // Members item: never serve markdown for gated items via negotiation.
  if (!item || item.access === 'members') return null

  const markdown = generateAeoMarkdown(item, {
    publisherName: options.organization.name,
    schemaType: options.schemaType,
    content: item.description ?? '',
    ragChunkMarkers: aeo.ragChunkMarkers,
    canonical: item.url,
    twinUrl: twinUrlFor(requestUrl, aeo),
  })

  // Simple mode: no fresh layer wired (tests, static-first consumers). Render
  // inline and serve the generated markdown directly.
  if (!schedule || !freshLayer) {
    return markdownResponse(markdown, options)
  }

  // Fresh layer wired: write the twin in the background and answer with a 503
  // stub. A 200 with a thin body would risk a crawler indexing the
  // description-only response as canonical and not re-crawling for days.
  schedule(writeFreshTwin(freshLayer, twinPath, markdown).catch(() => {}))
  return new Response('aeo twin generating', {
    status: 503,
    headers: {
      'Content-Type': 'text/plain; charset=utf-8',
      'Retry-After': '5',
      'X-Robots-Tag': 'noindex',
      'Cache-Control': 'no-store',
      'content-signal': buildContentSignalHeader(options),
      ...varyHeaders(),
    },
  })
}
319
+
320
/**
 * Wrap a markdown body in a response carrying this module's standard headers:
 * content type, estimated token count, `X-Robots-Tag: noindex`,
 * `content-signal` and `Vary`.
 *
 * @param body Markdown text to send.
 * @param options Resolved SEO options, used for the `content-signal` value.
 * @param contentType Content-Type to declare; defaults to UTF-8 markdown.
 * @returns The response wrapping `body`, created with the default status.
 */
function markdownResponse(
  body: string,
  options: ResolvedSeoOptions,
  contentType = 'text/markdown; charset=utf-8',
): Response {
  const headers: Record<string, string> = {
    'Content-Type': contentType,
    'x-markdown-tokens': String(estimateTokenCount(body)),
    'X-Robots-Tag': 'noindex',
    'content-signal': buildContentSignalHeader(options),
    ...varyHeaders(),
  }
  return new Response(body, { headers })
}
336
+
337
/**
 * Reduce a twin URL to the path used as its storage/asset key.
 *
 * @param twinUrl Absolute URL, or a path when a custom mapper returned one.
 * @returns The URL's pathname when `twinUrl` parses as an absolute URL;
 *   otherwise the input itself, prefixed with `/` if it lacks one.
 */
function urlPathFromTwinUrl(twinUrl: string): string {
  let parsed: URL | undefined
  try {
    parsed = new URL(twinUrl)
  } catch {
    parsed = undefined
  }
  if (parsed) return parsed.pathname
  return twinUrl.startsWith('/') ? twinUrl : `/${twinUrl}`
}
344
+
345
/**
 * Private helpers re-exported under one object so they can be reached from
 * outside this module without each being exported by name.
 */
export const _internals = {
  appendVaryHeaders: appendVaryHeaders,
  dedupVary: dedupVary,
  twinUrlFor: twinUrlFor,
  findItemForPath: findItemForPath,
}