@growth-labs/seo 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/utils/validation.d.ts.map +1 -1
- package/dist/utils/validation.js +22 -0
- package/dist/utils/validation.js.map +1 -1
- package/package.json +9 -5
- package/src/_internal/state.ts +26 -0
- package/src/bindings.ts +146 -0
- package/src/cron/prune-aeo-r2.ts +140 -0
- package/src/durable-objects/aeo-revalidation-coord.ts +246 -0
- package/src/index.ts +380 -0
- package/src/middleware/seo.ts +350 -0
- package/src/options.ts +456 -0
- package/src/routes/aeo-twin.ts +130 -0
- package/src/routes/apple-news.ts +36 -0
- package/src/routes/llms-full.ts +36 -0
- package/src/routes/llms.ts +15 -0
- package/src/routes/podcast-narration.ts +45 -0
- package/src/routes/podcast.ts +27 -0
- package/src/routes/revalidate.ts +298 -0
- package/src/routes/robots.ts +21 -0
- package/src/routes/rss.ts +29 -0
- package/src/routes/sitemap-articles.ts +25 -0
- package/src/routes/sitemap-index.ts +89 -0
- package/src/routes/sitemap-markdown.ts +39 -0
- package/src/routes/sitemap-pages.ts +24 -0
- package/src/routes/sitemap-products.ts +24 -0
- package/src/routes/sitemap-videos.ts +24 -0
- package/src/runtime.ts +17 -0
- package/src/site-url-core.ts +71 -0
- package/src/site-url.ts +21 -0
- package/src/types.ts +166 -0
- package/src/utils/aeo-summary.ts +176 -0
- package/src/utils/aeo-twin-emitter.ts +173 -0
- package/src/utils/aeo.ts +223 -0
- package/src/utils/apple-news-anf.ts +163 -0
- package/src/utils/apple-news-rss.ts +136 -0
- package/src/utils/content-filter.ts +87 -0
- package/src/utils/crawler-class.ts +155 -0
- package/src/utils/define-content-provider.ts +65 -0
- package/src/utils/effective-auth.ts +44 -0
- package/src/utils/fcrdns.ts +269 -0
- package/src/utils/fresh-layer.ts +175 -0
- package/src/utils/hreflang.ts +26 -0
- package/src/utils/index.ts +91 -0
- package/src/utils/json-ld/article.ts +120 -0
- package/src/utils/json-ld/audio.ts +32 -0
- package/src/utils/json-ld/breadcrumb.ts +28 -0
- package/src/utils/json-ld/faq.ts +18 -0
- package/src/utils/json-ld/howto.ts +23 -0
- package/src/utils/json-ld/index.ts +12 -0
- package/src/utils/json-ld/item-list.ts +26 -0
- package/src/utils/json-ld/organization.ts +42 -0
- package/src/utils/json-ld/person.ts +25 -0
- package/src/utils/json-ld/product.ts +155 -0
- package/src/utils/json-ld/video.ts +20 -0
- package/src/utils/json-ld/website.ts +27 -0
- package/src/utils/llms-full.ts +90 -0
- package/src/utils/llms.ts +45 -0
- package/src/utils/meta.ts +184 -0
- package/src/utils/podcast.ts +112 -0
- package/src/utils/robots.ts +47 -0
- package/src/utils/rss.ts +64 -0
- package/src/utils/seo-head.ts +81 -0
- package/src/utils/sitemap-markdown.ts +80 -0
- package/src/utils/sitemap.ts +169 -0
- package/src/utils/staleness.ts +61 -0
- package/src/utils/validation.ts +308 -0
- package/src/virtual.d.ts +8 -0
- package/src/vite-plugin.ts +66 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import 'virtual:growth-labs/seo/config'
|
|
2
|
+
import { getConfig, getContentProvider } from '../_internal/state.js'
|
|
3
|
+
import type { SeoEnv } from '../bindings.js'
|
|
4
|
+
import { type ResolvedSeoOptions, resolveAeoTwins } from '../options.js'
|
|
5
|
+
import { getRuntimeEnv, getWaitUntil } from '../runtime.js'
|
|
6
|
+
import type { ContentItem, ContentProvider, CrawlerClass, EffectiveAuthSegment } from '../types.js'
|
|
7
|
+
import { estimateTokenCount, generateAeoMarkdown } from '../utils/aeo.js'
|
|
8
|
+
import { classifyRequest } from '../utils/crawler-class.js'
|
|
9
|
+
import { computeEffectiveAuthSegment, type RawAuthSegment } from '../utils/effective-auth.js'
|
|
10
|
+
import { createDohResolver, createFcrdnsVerifier, type FcrdnsVerifier } from '../utils/fcrdns.js'
|
|
11
|
+
import { type FreshLayerBinding, readFreshTwin, writeFreshTwin } from '../utils/fresh-layer.js'
|
|
12
|
+
|
|
13
|
+
// ─── Public middleware factory ───
|
|
14
|
+
|
|
15
|
+
/**
 * Optional collaborators injected into {@link createSeoMiddleware}.
 *
 * Every member may be omitted; the middleware falls back to a default or
 * simply skips the feature that depends on the missing collaborator.
 */
export interface SeoMiddlewareDeps {
  /**
   * Maps a request to its crawler class. When omitted, the middleware builds a
   * classifier backed by a DoH-based FCrDNS verifier.
   */
  classify?: (request: Request) => Promise<CrawlerClass>

  /**
   * Storage binding (R2/KV) for the fresh-twin layer. Needed when AEO twins run
   * in 'middleware' or 'both' mode.
   */
  freshLayer?: FreshLayerBinding

  /**
   * Assets binding used to read the build-time (stale) twin when the fresh
   * layer has no entry.
   */
  assets?: { fetch(request: Request): Promise<Response> }

  /**
   * Returns the consumer's raw auth segment for a request. The middleware uses
   * 'anon' when this is omitted or returns a nullish value.
   */
  rawAuthSegment?: (request: Request) => RawAuthSegment

  /**
   * Equivalent of `ctx.waitUntil`: keeps a fire-and-forget promise alive so
   * background revalidation can finish after the response is returned.
   */
  waitUntil?: (promise: Promise<unknown>) => void
}
|
|
27
|
+
|
|
28
|
+
/**
 * The slice of the request context this middleware relies on.
 */
export interface SeoMiddlewareContext {
  /** The incoming request being handled. */
  request: Request

  /**
   * Mutable per-request bag. When present, the middleware writes
   * `crawlerClass` and `effectiveAuthSegment` onto it.
   */
  locals?: Record<string, unknown>
}
|
|
32
|
+
|
|
33
|
+
/**
 * Builds the SEO request middleware.
 *
 * For each request the returned handler:
 *  1. classifies the caller and publishes `crawlerClass` /
 *     `effectiveAuthSegment` on `context.locals` (when `locals` exists);
 *  2. answers 403 to LLM-training crawlers requesting a members-only item
 *     (when `crawlerPolicy.blockLlmTrainingCrawlers` is on);
 *  3. serves the AEO markdown twin when the client accepts `text/markdown`
 *     and AEO twins are enabled in a non-static mode;
 *  4. otherwise calls `next()` and re-wraps its response with the
 *     `content-signal`, `Vary` and (for HTML) `Link: rel="alternate"` headers.
 *
 * @param options Resolved SEO options.
 * @param contentProvider Optional provider used to look up the item behind the
 *   requested path. Without it the 403 gate and markdown negotiation are skipped.
 * @param deps Optional injected collaborators; see {@link SeoMiddlewareDeps}.
 * @returns An async `(context, next) => Response` handler.
 */
export function createSeoMiddleware(
  options: ResolvedSeoOptions,
  contentProvider?: ContentProvider,
  deps: SeoMiddlewareDeps = {},
) {
  const aeo = resolveAeoTwins(options.aeoTwins)
  const classify = deps.classify ?? createDefaultClassifier()

  return async (
    context: SeoMiddlewareContext,
    next: () => Promise<Response>,
  ): Promise<Response> => {
    const { request } = context
    const accept = request.headers.get('accept') ?? ''
    const url = new URL(request.url)

    // The item behind this path may be needed by both the 403 gate and the Link
    // header logic. Memoise the lookup so this handler calls the content
    // provider at most once per request, and only when a branch needs it.
    let itemLookup: Promise<ContentItem | undefined> | undefined
    const lookupItem = (): Promise<ContentItem | undefined> => {
      if (!contentProvider) return Promise.resolve(undefined)
      if (!itemLookup) {
        itemLookup = findItemForPath(contentProvider, context, url.pathname)
      }
      return itemLookup
    }

    // 1. Classify the request. Expose classification on Astro.locals for the
    //    consumer's cache-key builder.
    const crawlerClass: CrawlerClass = await classify(request)
    const rawAuth = deps.rawAuthSegment?.(request) ?? 'anon'
    const effectiveAuthSegment: EffectiveAuthSegment = computeEffectiveAuthSegment(
      crawlerClass,
      rawAuth,
    )

    if (context.locals) {
      context.locals.crawlerClass = crawlerClass
      context.locals.effectiveAuthSegment = effectiveAuthSegment
    }

    // 2. LLM training crawlers are 403'd on members items, before any other work.
    //    The item is looked up only for that crawler class so ordinary requests
    //    do not pay for a contentProvider call here.
    const blockLlmTraining = options.crawlerPolicy.blockLlmTrainingCrawlers
    if (blockLlmTraining && crawlerClass === 'llmTrainingCrawler' && contentProvider) {
      const item = await lookupItem()
      if (item?.access === 'members') {
        return new Response('Forbidden', {
          status: 403,
          headers: {
            'Content-Type': 'text/plain; charset=utf-8',
            'X-Robots-Tag': 'noindex',
            ...varyHeaders(),
          },
        })
      }
    }

    // 3. Accept: text/markdown handling (middleware mode of AEO twins).
    //    Media types are case-insensitive, so compare in lower case.
    const wantsMarkdown = accept.toLowerCase().includes('text/markdown')
    if (aeo && aeo.mode !== 'static' && wantsMarkdown && contentProvider) {
      const markdownResponse = await serveAeoMarkdown({
        request,
        context,
        options,
        contentProvider,
        freshLayer: deps.freshLayer,
        assets: deps.assets,
        waitUntil: deps.waitUntil,
      })
      if (markdownResponse) return markdownResponse
    }

    // 4. Pass through to the next handler (SSR page or static asset route).
    const response = await next()

    // A 101 (protocol upgrade) response cannot be rebuilt: the Response
    // constructor only accepts statuses in the 200–599 range and would throw.
    // Hand it back untouched instead of decorating it.
    if (response.status === 101) return response

    // 5. Wrap response with our observability + policy headers.
    const newHeaders = new Headers(response.headers)

    // Content-Signal on every response.
    newHeaders.set('content-signal', buildContentSignalHeader(options))

    // Vary must cover every axis that branches the response:
    //   - Accept: response body differs on text/markdown
    //   - User-Agent: crawler class depends on it
    //   - Cookie: auth segment depends on it
    //   - CF-Connecting-IP: FCrDNS is keyed on it
    appendVaryHeaders(newHeaders)

    // Link alternate header on HTML responses when aeoTwins enabled, except for
    // members items (they have no markdown counterpart at rest).
    if (aeo && isHtmlResponse(response)) {
      const linkItem = await lookupItem()
      if (!linkItem || linkItem.access !== 'members') {
        // Static/both modes point at the twin file; middleware-only mode points
        // back at the same URL, which is content-negotiated.
        const target =
          aeo.mode === 'static' || aeo.mode === 'both' ? twinUrlFor(url, aeo) : url.toString()
        const linkValue = `<${target}>; rel="alternate"; type="text/markdown"`
        const existing = newHeaders.get('Link')
        newHeaders.set('Link', existing ? `${existing}, ${linkValue}` : linkValue)
      }
    }

    return new Response(response.body, {
      status: response.status,
      statusText: response.statusText,
      headers: newHeaders,
    })
  }
}
|
|
134
|
+
|
|
135
|
+
// ─── Astro middleware entry ───
|
|
136
|
+
|
|
137
|
+
/**
 * Module-level cache for the FCrDNS verifier used by the default classifier.
 * Populated on first use by `createDefaultClassifier`.
 */
let _classifier: FcrdnsVerifier | undefined

/**
 * Astro middleware entry point.
 *
 * Reads the registered config and content provider, resolves the runtime env
 * bindings for this request, and delegates to a middleware built by
 * `createSeoMiddleware`.
 *
 * @param context Request context (request plus optional locals).
 * @param next Continuation that produces the downstream response.
 * @returns The response produced by the SEO middleware.
 */
export const onRequest = async (
  context: SeoMiddlewareContext,
  next: () => Promise<Response>,
): Promise<Response> => {
  const config = getConfig()
  const contentProvider = getContentProvider()
  const env = getRuntimeEnv(context) as SeoEnv
  const aeo = resolveAeoTwins(config.aeoTwins)

  // The fresh layer is only wired when AEO twins run in a non-static mode and
  // the consumer configured one; static-only consumers take the lighter path.
  const layerConfig = env && aeo && aeo.mode !== 'static' ? aeo.freshLayer : undefined

  let freshLayer: FreshLayerBinding | undefined
  if (layerConfig) {
    // The binding is looked up on env by its configured name; if the env does
    // not carry it, the fresh layer stays disabled.
    const binding = (env as unknown as Record<string, unknown>)[layerConfig.bindingName]
    if (binding) {
      freshLayer = {
        type: layerConfig.type,
        impl: binding as FreshLayerBinding['impl'],
        // Falls back to 'dev' when no version metadata is available.
        deploymentId: env.CF_VERSION_METADATA?.id ?? 'dev',
      }
    }
  }

  const deps: SeoMiddlewareDeps = {
    freshLayer,
    assets: env?.ASSETS,
    waitUntil: getWaitUntil(context),
  }
  return createSeoMiddleware(config, contentProvider, deps)(context, next)
}
|
|
170
|
+
|
|
171
|
+
// ─── Internals ───
|
|
172
|
+
|
|
173
|
+
/**
 * Returns the default request classifier.
 *
 * The underlying FCrDNS verifier (built on a DoH resolver) is created once and
 * kept in the module-level `_classifier` variable, so every classifier
 * returned from here shares the same verifier instance.
 */
function createDefaultClassifier() {
  const verifier = _classifier ?? (_classifier = createFcrdnsVerifier(createDohResolver()))
  return (request: Request) => classifyRequest({ request, fcrdnsVerify: verifier })
}
|
|
180
|
+
|
|
181
|
+
/**
 * Serialises `config.contentSignal` into the `content-signal` header value,
 * in the fixed order `ai-train`, `search`, `ai-input`.
 */
function buildContentSignalHeader(config: ResolvedSeoOptions): string {
  const signal = config.contentSignal
  const aiTrain = signal.aiTrain
  const search = signal.search
  const aiInput = signal.aiInput
  return `ai-train=${aiTrain}, search=${search}, ai-input=${aiInput}`
}
|
|
185
|
+
|
|
186
|
+
/**
 * Returns a fresh header map holding the `Vary` value this middleware puts on
 * the responses it builds itself (403, 503 stub, markdown).
 */
function varyHeaders(): Record<string, string> {
  const headers: Record<string, string> = {}
  headers.Vary = 'Accept, User-Agent, Cookie, CF-Connecting-IP'
  return headers
}
|
|
189
|
+
|
|
190
|
+
/**
 * Ensures `headers` carries the `Vary` axes this middleware requires.
 *
 * If a `Vary` value is already present, the required axes are appended to it
 * and the combined list is passed through `dedupVary`; otherwise the required
 * list is set as-is. Mutates `headers` in place.
 */
function appendVaryHeaders(headers: Headers): void {
  const required = 'Accept, User-Agent, Cookie, CF-Connecting-IP'
  const current = headers.get('Vary')

  if (!current) {
    headers.set('Vary', required)
    return
  }

  headers.set('Vary', dedupVary(`${current}, ${required}`))
}
|
|
195
|
+
|
|
196
|
+
/**
 * Removes duplicate entries from a comma-separated `Vary` value.
 *
 * Entries are trimmed and empty entries dropped. Header field names are
 * case-insensitive, so `accept` and `Accept` count as the same entry; the
 * first spelling encountered is the one kept. Order of first appearance is
 * preserved.
 *
 * @param combined Comma-separated list of header field names.
 * @returns The de-duplicated list joined with `', '`.
 */
function dedupVary(combined: string): string {
  const seen = new Set<string>()
  const unique: string[] = []

  for (const raw of combined.split(',')) {
    const name = raw.trim()
    if (!name) continue

    // Compare case-insensitively but emit the original spelling.
    const key = name.toLowerCase()
    if (seen.has(key)) continue

    seen.add(key)
    unique.push(name)
  }

  return unique.join(', ')
}
|
|
203
|
+
|
|
204
|
+
/**
 * Reports whether a response declares an HTML body.
 *
 * Media types are case-insensitive, so the `Content-Type` value is compared in
 * lower case (`Text/HTML; charset=UTF-8` counts as HTML). A response without a
 * `Content-Type` header is not HTML.
 */
function isHtmlResponse(response: Response): boolean {
  const contentType = response.headers.get('content-type')
  if (contentType === null) return false
  return contentType.toLowerCase().includes('text/html')
}
|
|
208
|
+
|
|
209
|
+
/**
 * Computes the markdown-twin URL for a page URL.
 *
 * A consumer-supplied `aeo.twinUrl` always wins and receives the full page URL
 * string unchanged. Otherwise the default twin is derived from the *path*
 * only: trailing slashes are stripped, `.md` is appended, and the query string
 * and fragment are dropped. Appending `.md` to the whole URL string would land
 * it inside the query (`/post?utm=1.md`) or, for the site root, inside the
 * host name (`https://host.md`).
 *
 * @param url The page URL.
 * @param aeo Object optionally carrying a custom `twinUrl` builder.
 * @returns The twin URL as a string.
 */
function twinUrlFor(url: URL, aeo: { twinUrl?: (u: string) => string }): string {
  if (aeo.twinUrl) return aeo.twinUrl(url.toString())

  // Work on a copy so the caller's URL object is never mutated.
  const twin = new URL(url.toString())
  twin.search = ''
  twin.hash = ''

  const trimmedPath = twin.pathname.replace(/\/+$/, '')
  // NOTE(review): the site root has no path segment to suffix; `/index.md` is
  // used here — confirm it matches the name the build-time emitter writes.
  twin.pathname = trimmedPath === '' ? '/index.md' : `${trimmedPath}.md`

  return twin.toString()
}
|
|
213
|
+
|
|
214
|
+
/**
 * Looks up the content item served at `pathname`.
 *
 * The provider is queried for `articles` only, narrowed by `slugs` (the last
 * non-empty path segment) so CMS-backed providers can short-circuit instead of
 * loading the whole catalog. The returned items are then matched on path.
 *
 * Trailing slashes are ignored on both sides, so `/blog/post/` resolves to the
 * item published at `/blog/post` and vice versa. Without that, the
 * trailing-slash form of a URL would produce an empty slug and never match,
 * which skips the members-only checks built on this lookup.
 *
 * Best-effort: any error thrown by the provider yields `undefined`.
 *
 * @param contentProvider Provider to query.
 * @param context Request context, forwarded opaquely to the provider.
 * @param pathname Request path to resolve.
 * @returns The matching item, or `undefined` when none matches or the lookup fails.
 */
async function findItemForPath(
  contentProvider: ContentProvider,
  context: SeoMiddlewareContext,
  pathname: string,
): Promise<ContentItem | undefined> {
  // '/a/b/' -> '/a/b', '//' -> '/', '/' -> '/', '' -> ''.
  const stripTrailingSlashes = (path: string): string => {
    const stripped = path.replace(/\/+$/, '')
    return stripped === '' && path !== '' ? '/' : stripped
  }

  const targetPath = stripTrailingSlashes(pathname)
  const slug = targetPath.split('/').filter(Boolean).pop() ?? ''

  try {
    // Type assertion: APIContext is the Astro request context in real code but our
    // middleware context is narrower; contentProvider's APIContext parameter is
    // treated opaquely at this layer.
    const items = await contentProvider({ type: 'articles', slugs: [slug] }, context as never)
    return items.find((item) => {
      try {
        return stripTrailingSlashes(new URL(item.url).pathname) === targetPath
      } catch {
        // item.url is not an absolute URL; compare it as a bare path.
        return stripTrailingSlashes(item.url) === targetPath
      }
    })
  } catch {
    // Deliberate best-effort: a failing provider must not break the request.
    return undefined
  }
}
|
|
238
|
+
|
|
239
|
+
// ─── AEO markdown serve path ───
|
|
240
|
+
|
|
241
|
+
/** Everything `serveAeoMarkdown` needs to answer a markdown-negotiated request. */
interface ServeMarkdownInput {
  /** The incoming request; its URL determines the twin path. */
  request: Request
  /** Request context, forwarded to the content provider. */
  context: SeoMiddlewareContext
  /** Resolved SEO options. */
  options: ResolvedSeoOptions
  /** Provider used for the render-on-miss fallback. */
  contentProvider: ContentProvider
  /** Fresh-twin storage (R2/KV); checked first when present. */
  freshLayer?: FreshLayerBinding
  /** Assets binding holding the build-time twin; checked second when present. */
  assets?: { fetch(request: Request): Promise<Response> }
  /** Keeps the background fresh-layer write alive after the response is sent. */
  waitUntil?: (promise: Promise<unknown>) => void
}
|
|
250
|
+
|
|
251
|
+
/**
 * Resolves the markdown twin for a request, trying three layers in order:
 *
 *  1. the fresh layer (R2/KV), when wired;
 *  2. the build-time twin behind the assets binding, when wired;
 *  3. an on-the-fly render from the content provider.
 *
 * On a layer-3 render with both `waitUntil` and a fresh layer available, the
 * generated markdown is written to the fresh layer in the background and the
 * caller receives a 503 stub with `Retry-After`. Without them the markdown is
 * returned inline.
 *
 * @returns A response to send, or `null` to let the normal route handler
 *   answer (AEO twins disabled, no matching item, or a members-only item).
 */
async function serveAeoMarkdown(input: ServeMarkdownInput): Promise<Response | null> {
  const { request, context, options, contentProvider, freshLayer, assets, waitUntil } = input
  const requestUrl = new URL(request.url)

  const aeo = resolveAeoTwins(options.aeoTwins)
  if (!aeo) return null

  const twinPath = urlPathFromTwinUrl(twinUrlFor(requestUrl, aeo))

  // Layer 1: fresh layer (R2/KV).
  const fresh = freshLayer ? await readFreshTwin(freshLayer, twinPath) : undefined
  if (fresh) {
    return markdownResponse(fresh.body, options, fresh.contentType)
  }

  // Layer 2: stale baseline via ASSETS binding. The host is a placeholder; only
  // the path is meaningful to the binding.
  if (assets) {
    const baseline = await assets.fetch(
      new Request(`https://assets.local${twinPath}`, { method: 'GET' }),
    )
    if (baseline.status === 200) {
      const baselineBody = await baseline.text()
      return markdownResponse(baselineBody, options)
    }
  }

  // Layer 3: fallthrough render.
  const item = await findItemForPath(contentProvider, context, requestUrl.pathname)
  // No item: let the route handler serve HTML as usual.
  // Members item: never serve markdown for gated items via negotiation.
  if (!item || item.access === 'members') {
    return null
  }

  const content = item.description ?? ''
  const markdown = generateAeoMarkdown(item, {
    publisherName: options.organization.name,
    schemaType: options.schemaType,
    content,
    ragChunkMarkers: aeo.ragChunkMarkers,
    canonical: item.url,
    twinUrl: twinUrlFor(requestUrl, aeo),
  })

  // Simple mode: no fresh layer wired (tests, static-first consumers without a
  // Worker DO). Render inline synchronously. Still safe — we're not caching a
  // thin stub, we're serving the full markdown.
  if (!(waitUntil && freshLayer)) {
    return markdownResponse(markdown, options)
  }

  // When the fresh layer is wired, return a 503 stub and schedule the write in
  // the background. 200 OK with thin body would risk Googlebot indexing the
  // description-only response as canonical and not re-crawling for days.
  // Write failures are swallowed on purpose: the write is best-effort.
  waitUntil(writeFreshTwin(freshLayer, twinPath, markdown).catch(() => {}))

  const stubHeaders: Record<string, string> = {
    'Content-Type': 'text/plain; charset=utf-8',
    'Retry-After': '5',
    'X-Robots-Tag': 'noindex',
    'Cache-Control': 'no-store',
    'content-signal': buildContentSignalHeader(options),
    ...varyHeaders(),
  }
  return new Response('aeo twin generating', { status: 503, headers: stubHeaders })
}
|
|
319
|
+
|
|
320
|
+
/**
 * Wraps a markdown body in a response carrying the twin's standard headers:
 * content type, estimated token count (`x-markdown-tokens`), `noindex`,
 * `content-signal` and `Vary`.
 *
 * @param body Markdown text to send.
 * @param options Resolved SEO options (source of the content-signal value).
 * @param contentType Content type to declare; defaults to UTF-8 markdown.
 */
function markdownResponse(
  body: string,
  options: ResolvedSeoOptions,
  contentType = 'text/markdown; charset=utf-8',
): Response {
  const headers: Record<string, string> = {
    'Content-Type': contentType,
    'x-markdown-tokens': String(estimateTokenCount(body)),
    'X-Robots-Tag': 'noindex',
    'content-signal': buildContentSignalHeader(options),
    ...varyHeaders(),
  }
  return new Response(body, { headers })
}
|
|
336
|
+
|
|
337
|
+
/**
 * Extracts the path component of a twin URL.
 *
 * Absolute URLs yield their pathname. Relative values (a custom `twinUrl`
 * builder may return `/blog/post.md` or `blog/post.md`) are resolved against a
 * placeholder origin so they get the same treatment: query string and
 * fragment are dropped and dot segments are resolved. The result always starts
 * with `/`.
 *
 * @param twinUrl Absolute or relative twin URL.
 * @returns The path to use as the twin's storage/asset key.
 */
function urlPathFromTwinUrl(twinUrl: string): string {
  try {
    // The base is ignored when `twinUrl` is already absolute.
    return new URL(twinUrl, 'https://twin.invalid/').pathname
  } catch {
    // Unparseable even as a relative reference: fall back to the raw string,
    // forced to start with a slash.
    return twinUrl.startsWith('/') ? twinUrl : `/${twinUrl}`
  }
}
|
|
344
|
+
|
|
345
|
+
/**
 * Private helpers re-exported under one object so they can be reached from
 * outside the module (presumably for unit tests — not part of the public API).
 */
export const _internals = {
  appendVaryHeaders: appendVaryHeaders,
  dedupVary: dedupVary,
  twinUrlFor: twinUrlFor,
  findItemForPath: findItemForPath,
}
|