npm - @growth-labs/seo - Versions diffs - 0.4.0 → 0.4.2 - Mend

@growth-labs/seo 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/dist/utils/validation.d.ts.map +1 -1
package/dist/utils/validation.js +22 -0
package/dist/utils/validation.js.map +1 -1
package/package.json +9 -5
package/src/_internal/state.ts +26 -0
package/src/bindings.ts +146 -0
package/src/cron/prune-aeo-r2.ts +140 -0
package/src/durable-objects/aeo-revalidation-coord.ts +246 -0
package/src/index.ts +380 -0
package/src/middleware/seo.ts +350 -0
package/src/options.ts +456 -0
package/src/routes/aeo-twin.ts +130 -0
package/src/routes/apple-news.ts +36 -0
package/src/routes/llms-full.ts +36 -0
package/src/routes/llms.ts +15 -0
package/src/routes/podcast-narration.ts +45 -0
package/src/routes/podcast.ts +27 -0
package/src/routes/revalidate.ts +298 -0
package/src/routes/robots.ts +21 -0
package/src/routes/rss.ts +29 -0
package/src/routes/sitemap-articles.ts +25 -0
package/src/routes/sitemap-index.ts +89 -0
package/src/routes/sitemap-markdown.ts +39 -0
package/src/routes/sitemap-pages.ts +24 -0
package/src/routes/sitemap-products.ts +24 -0
package/src/routes/sitemap-videos.ts +24 -0
package/src/runtime.ts +17 -0
package/src/site-url-core.ts +71 -0
package/src/site-url.ts +21 -0
package/src/types.ts +166 -0
package/src/utils/aeo-summary.ts +176 -0
package/src/utils/aeo-twin-emitter.ts +173 -0
package/src/utils/aeo.ts +223 -0
package/src/utils/apple-news-anf.ts +163 -0
package/src/utils/apple-news-rss.ts +136 -0
package/src/utils/content-filter.ts +87 -0
package/src/utils/crawler-class.ts +155 -0
package/src/utils/define-content-provider.ts +65 -0
package/src/utils/effective-auth.ts +44 -0
package/src/utils/fcrdns.ts +269 -0
package/src/utils/fresh-layer.ts +175 -0
package/src/utils/hreflang.ts +26 -0
package/src/utils/index.ts +91 -0
package/src/utils/json-ld/article.ts +120 -0
package/src/utils/json-ld/audio.ts +32 -0
package/src/utils/json-ld/breadcrumb.ts +28 -0
package/src/utils/json-ld/faq.ts +18 -0
package/src/utils/json-ld/howto.ts +23 -0
package/src/utils/json-ld/index.ts +12 -0
package/src/utils/json-ld/item-list.ts +26 -0
package/src/utils/json-ld/organization.ts +42 -0
package/src/utils/json-ld/person.ts +25 -0
package/src/utils/json-ld/product.ts +155 -0
package/src/utils/json-ld/video.ts +20 -0
package/src/utils/json-ld/website.ts +27 -0
package/src/utils/llms-full.ts +90 -0
package/src/utils/llms.ts +45 -0
package/src/utils/meta.ts +184 -0
package/src/utils/podcast.ts +112 -0
package/src/utils/robots.ts +47 -0
package/src/utils/rss.ts +64 -0
package/src/utils/seo-head.ts +81 -0
package/src/utils/sitemap-markdown.ts +80 -0
package/src/utils/sitemap.ts +169 -0
package/src/utils/staleness.ts +61 -0
package/src/utils/validation.ts +308 -0
package/src/virtual.d.ts +8 -0
package/src/vite-plugin.ts +66 -0

package/src/utils/crawler-class.ts ADDED Viewed

@@ -0,0 +1,155 @@
+import type { CrawlerClass } from '../types.js'
+import type { FcrdnsVerifier } from './fcrdns.js'
+// ─── User-Agent patterns ───
+// Known LLM-training crawlers. Blocked at robots.txt AND 403'd in middleware on
+// access: 'members' items. Matched as case-insensitive substrings of the UA string.
+const LLM_TRAINING_UAS = [
+	'GPTBot',
+	'ClaudeBot',
+	'CCBot',
+	'Google-Extended',
+	'PerplexityBot',
+	'Applebot-Extended',
+	'Bytespider',
+	'FacebookBot',
+	'OAI-SearchBot',
+	'anthropic-ai',
+	'cohere-ai',
+	'AI2Bot',
+	'Diffbot',
+	'ImagesiftBot',
+	'Omgilibot', // also matched by the shorter 'Omgili' token below
+	'Omgili',
+	'Timpibot',
+] as const
+// User-directed LLM agents — fetch on behalf of a user who may be logged in. Treated as
+// anonymous: these agents forward responses to a third-party model that may retain them,
+// so gated content is never served even if member cookies are present.
+// Matched as case-insensitive substrings of the UA string.
+const USER_DIRECTED_UAS = [
+	'ChatGPT-User',
+	'Claude-User',
+	// Perplexity's user-triggered fetcher identifies itself as `Perplexity-User`; without
+	// this token it matches neither UA list and would be classified as anonymous.
+	'Perplexity-User',
+	// Kept for backward compatibility with the previous list.
+	'PerplexityBot-User',
+	'Google-NotebookLM',
+] as const
+// Hostname suffixes that FCrDNS accepts for verified search crawlers. Matched
+// case-insensitively on a dot boundary. Deliberately excludes googleusercontent.com (Google
+// Cloud VMs, Apps Script, proxy infrastructure that would let any GCP user impersonate Googlebot).
+export const VERIFIED_SEARCH_SUFFIXES = [
+	'googlebot.com',
+	'google.com',
+	'search.msn.com',
+	'applebot.apple.com',
+	'duckduckbot.com',
+] as const
+// ─── Classification ───
+export interface ClassifyRequestInput {
+	request: Request // incoming request; its `cf` object and headers are inspected
+	// FCrDNS verifier, injected so tests can mock DNS without hitting the network. In production,
+	// wired to the DoH-backed implementation in fcrdns.ts. When omitted, the FCrDNS step is skipped.
+	fcrdnsVerify?: FcrdnsVerifier
+	// If true, skips the Cloudflare Bot Management fast path. Used in tests. Defaults to false.
+	skipBotManagementFastPath?: boolean
+}
+/**
+ * Classify a request into one of four crawler classes.
+ *
+ * Order of checks (first match wins):
+ *   1. Cloudflare Bot Management fast path (zero subrequests) when available and
+ *      not disabled via `skipBotManagementFastPath`.
+ *   2. User-directed LLM agent UA match — deliberately before step 3, so a more
+ *      specific token such as `PerplexityBot-User` is not shadowed by `PerplexityBot`.
+ *   3. LLM-training UA match.
+ *   4. FCrDNS against verified-search-crawler suffixes (DoH subrequests if uncached).
+ *      Runs only when `fcrdnsVerify` is supplied and a client IP can be extracted.
+ *   5. Fallthrough → anonymous.
+ *
+ * FCrDNS is reached only when neither UA list matches. The User-Agent is never taken
+ * as proof of being a search crawler: a request that merely claims to be Googlebot is
+ * classified `verifiedSearchCrawler` only through step 1 or step 4.
+ *
+ * @param input the request plus the optional FCrDNS verifier and test-only switch.
+ * @returns the crawler class for cache-key segmentation, body-variant selection,
+ *          and JSON-LD redaction decisions downstream.
+ */
+export async function classifyRequest(input: ClassifyRequestInput): Promise<CrawlerClass> {
+	const { request, fcrdnsVerify, skipBotManagementFastPath = false } = input
+	// 1. Cloudflare Bot Management fast path — zero subrequests, Cloudflare-stood-behind.
+	//    Only present on Enterprise zones with BM enabled. Absent on Free/Pro/Business.
+	if (!skipBotManagementFastPath) {
+		const cf = (
+			request as {
+				cf?: { botManagement?: { verifiedBot?: boolean }; verifiedBotCategory?: string }
+			}
+		).cf
+		if (cf?.botManagement?.verifiedBot && cf.verifiedBotCategory === 'Search Engine Crawler') {
+			return 'verifiedSearchCrawler'
+		}
+	}
+	const ua = request.headers.get('user-agent') ?? ''
+	// 2. User-directed LLM agent — checked BEFORE training-bot UAs, because many
+	//    user-directed UAs contain training-bot substrings (e.g. `PerplexityBot-User`
+	//    contains `PerplexityBot`; `ChatGPT-User` contains `ChatGPT`). The more
+	//    specific rule must match first.
+	if (matchesAnyToken(ua, USER_DIRECTED_UAS)) {
+		return 'userDirectedLlmAgent'
+	}
+	// 3. LLM-training crawler — matches a training-bot UA substring.
+	if (matchesAnyToken(ua, LLM_TRAINING_UAS)) {
+		return 'llmTrainingCrawler'
+	}
+	// 4. FCrDNS for verified search crawlers. Only attempted if a verifier is wired
+	//    (the classifier is pure when FCrDNS isn't available — production middleware
+	//    always wires it; pure-function tests can skip it).
+	if (fcrdnsVerify) {
+		const clientIp = getClientIp(request)
+		if (clientIp) {
+			const verified = await fcrdnsVerify({
+				clientIp,
+				trustedSuffixes: VERIFIED_SEARCH_SUFFIXES,
+			})
+			if (verified) return 'verifiedSearchCrawler'
+		}
+	}
+	// 5. Fallthrough.
+	return 'anonymous'
+}
+// ─── Internals ───
+function matchesAnyToken(ua: string, tokens: readonly string[]): boolean { // true when any token is a case-insensitive substring of `ua`
+	if (!ua) return false // an empty UA never matches
+	const lowered = ua.toLowerCase()
+	return tokens.some((token) => lowered.includes(token.toLowerCase()))
+}
+/**
+ * Extract the effective client IP from a request. Prefers CF-Connecting-IP
+ * (Cloudflare-verified); falls back to X-Forwarded-For's first entry.
+ * Returns null when neither header yields a non-empty value.
+ *
+ * NOTE(review): X-Forwarded-For is client-supplied unless a trusted proxy
+ * overwrites it. classifyRequest passes this value to the FCrDNS verifier, so a
+ * forged first entry could present a genuine crawler IP. Confirm the fallback is
+ * only reachable in environments where the header can be trusted.
+ */
+function getClientIp(request: Request): string | null {
+	const cf = request.headers.get('cf-connecting-ip')
+	if (cf) return cf
+	const xff = request.headers.get('x-forwarded-for')
+	if (xff) {
+		const first = xff.split(',')[0]?.trim()
+		if (first) return first
+	}
+	return null
+}
+// ─── Exports for testing ───
+export const _internals = { // module-private constants and helpers, exposed for tests
+	LLM_TRAINING_UAS,
+	USER_DIRECTED_UAS,
+	matchesAnyToken,
+	getClientIp,
+}

package/src/utils/define-content-provider.ts ADDED Viewed

@@ -0,0 +1,65 @@
+import type { APIContext } from 'astro'
+import type { ContentItem, ContentProviderParams, ContentType } from '../types.js'
+/**
+ * Narrows `ContentItem` per `ContentType` so a consumer-authored provider
+ * body gets compile-time feedback when a required field for that type is
+ * missing from the returned objects.
+ *
+ * The base `ContentItem` is intentionally permissive because most fields are
+ * optional at the JSON-LD layer. These narrowed variants add the fields that
+ * the feed and sitemap generators actually need for each content type —
+ * `video` for videos, `product` for products — so an author can't silently
+ * ship an item with `type: 'products'` but no `product` payload.
+ *
+ * Keys are the content-type names used to index this map in
+ * `TypedContentProvider`; values are the item shape required for that type.
+ */
+export interface ContentItemByType {
+	articles: ContentItem // no additional required fields
+	pages: ContentItem // no additional required fields
+	videos: ContentItem & { video: NonNullable<ContentItem['video']> } // `video` must be present and non-null
+	products: ContentItem & { product: NonNullable<ContentItem['product']> } // `product` must be present and non-null
+	authors: ContentItem // no additional required fields
+}
+export type TypedContentProvider = <T extends ContentType>( // generic over the requested content type
+	params: ContentProviderParams & { type: T }, // provider params with `type` pinned to T
+	context: APIContext, // Astro API context passed through to the provider
+) => Promise<ContentItemByType[T][]> // resolves to items of the shape required for T
+/**
+ * Identity helper that preserves the narrowest `type` → return-shape binding
+ * for a consumer-authored content provider. Use inside your
+ * `contentProviderModule` default export so TypeScript catches mismatches
+ * between the `type` the caller requested and the shape you returned.
+ *
+ * ```ts
+ * // src/lib/content-provider.mjs
+ * import { defineContentProvider } from '@growth-labs/seo/utils'
+ *
+ * export default defineContentProvider(async ({ type }, ctx) => {
+ *   if (type === 'products') {
+ *     return [{
+ *       url: 'https://example.com/widget',
+ *       title: 'Widget',
+ *       // Required for `products` — compile error if omitted.
+ *       product: {
+ *         name: 'Widget', description: 'Blue', price: 10, currency: 'USD',
+ *         availability: 'InStock', images: ['https://example.com/widget.jpg'],
+ *       },
+ *     }]
+ *   }
+ *   return []
+ * })
+ * ```
+ *
+ * NOTE(review): the compile-time feedback only applies when the provider file is
+ * type-checked (a `.mjs` file presumably needs `checkJs`). Inside the provider
+ * body `T` stays generic, so narrowing on `type` may not narrow the expected
+ * return type — verify that the example above type-checks as written.
+ *
+ * @param provider the consumer's content provider function.
+ * @returns the same function, unchanged; the helper has no runtime effect.
+ */
+export function defineContentProvider(provider: TypedContentProvider): TypedContentProvider {
+	return provider
+}
+/**
+ * Module-style variant of {@link defineContentProvider}. Semantically identical
+ * but communicates intent at the call site: "this is the default export of my
+ * content-provider module file." Use whichever reads better.
+ *
+ * @param provider the consumer's content provider function.
+ * @returns the same function, unchanged.
+ */
+export function defineContentProviderModule(provider: TypedContentProvider): TypedContentProvider {
+	return provider
+}

package/src/utils/effective-auth.ts ADDED Viewed

@@ -0,0 +1,44 @@
+import type { CrawlerClass, EffectiveAuthSegment } from '../types.js'
+/**
+ * Raw auth segment from the consumer's auth layer. Consumers produce this (typically
+ * 'anon' for unauthenticated, 'member' for a logged-in subscriber) and feed it into
+ * the package, which derives the effective segment from it and the crawler class.
+ */
+export type RawAuthSegment = 'anon' | 'member'
+/**
+ * Compute the effective auth segment given a crawler class and a raw consumer auth segment.
+ *
+ * This function encodes the policy from the spec section "Effective auth segment"
+ * ("crawler class overrides member cookies"):
+ *
+ *   - Verified search crawler → 'search-full' (regardless of cookies). Gets the
+ *     sanctioned paywall-marked full body under Flexible Sampling.
+ *   - LLM training crawler → caller should 403 before calling this; if it does reach
+ *     here, we surface 'anon' defensively.
+ *   - User-directed LLM agent → 'anon', ALWAYS. Even if member cookies are present.
+ *     Load-bearing: ChatGPT-User / Claude-User forward responses to a third-party
+ *     model that may retain them; we cannot verify the cookies belong to a paid
+ *     subscriber, and leaking gated content to a third-party LLM defeats gating.
+ *   - Anonymous → raw as-is.
+ *
+ * Consumers use `effectiveAuthSegment` (NOT raw `authSegment`) in cache keys so
+ * that cache segmentation reflects the crawler-class override.
+ *
+ * @param crawlerClass the class assigned to the request by the crawler classifier.
+ * @param rawAuthSegment the consumer's own auth verdict; used only for 'anonymous'.
+ * @returns the segment to use for cache keys and body selection.
+ */
+export function computeEffectiveAuthSegment(
+	crawlerClass: CrawlerClass,
+	rawAuthSegment: RawAuthSegment,
+): EffectiveAuthSegment {
+	switch (crawlerClass) {
+		case 'verifiedSearchCrawler':
+			return 'search-full'
+		case 'llmTrainingCrawler':
+			// Caller should reject with 403 before computing this; defensive fallback.
+			return 'anon'
+		case 'userDirectedLlmAgent':
+			return 'anon' // Override: never 'member', even with cookies.
+		case 'anonymous':
+			return rawAuthSegment
+	}
+}

package/src/utils/fcrdns.ts ADDED Viewed

@@ -0,0 +1,269 @@
+// Forward-confirmed reverse DNS verification. Used to classify incoming requests
+// as verified search crawlers when Cloudflare Bot Management isn't available.
+//
+// Algorithm (spec "FCrDNS algorithm"):
+//   1. PTR lookup on the client IP → hostname
+//   2. Hostname must end with a trusted suffix, on a dot boundary
+//   3. A/AAAA lookup on that hostname → IPs
+//   4. Client IP must be in the result set
+//
+// Caching (spec "FCrDNS cache semantics"):
+//   - Positive: keyed by clientIp (the matched hostname is stored on the entry); TTL = min(10min, rDNS TTL, fDNS TTL)
+//   - Negative: 60s fixed TTL
+//   - Process-local only; do NOT persist across deploys
+// ─── Resolver interface ───
+export interface DnsAnswer {
+	name: string // owner name of the record
+	type: number // numeric record type (12 = PTR, 1 = A, 28 = AAAA)
+	TTL: number // seconds
+	data: string // record payload: a hostname for PTR, an IP literal for A/AAAA
+}
+export interface DnsResolver {
+	/**
+	 * Perform a DoH-JSON-style DNS query.
+	 *
+	 * Returns the Answer array or null on failure. `type` is the record-type
+	 * mnemonic to query; the numeric `type` on each returned answer is the
+	 * corresponding record-type code (12 = PTR, 1 = A, 28 = AAAA).
+	 *
+	 * Production implementation: {@link createDohResolver}. Tests inject a mock.
+	 */
+	query(name: string, type: 'PTR' | 'A' | 'AAAA'): Promise<DnsAnswer[] | null>
+}
+// ─── Public API ───
+export interface FcrdnsVerifyInput {
+	clientIp: string // IPv4 or IPv6 literal of the requesting client
+	trustedSuffixes: readonly string[] // hostname suffixes the PTR name must end with (dot-boundary match)
+}
+export type FcrdnsVerifier = (input: FcrdnsVerifyInput) => Promise<boolean> // resolves true only when FCrDNS confirms a trusted hostname
+/**
+ * Create an FCrDNS verifier. Returns a function that, given an IP and suffix list,
+ * returns true iff the IP's forward-confirmed reverse DNS lands on a trusted suffix.
+ *
+ * The returned verifier carries its own cache; create one per Worker isolate.
+ * Cache entries are keyed by client IP only, so a cached verdict is reused even if
+ * a later call passes a different `trustedSuffixes` list.
+ * Any error thrown during lookup is caught: the verifier resolves to false and
+ * records a negative cache entry.
+ *
+ * @param resolver DNS resolver. In production: the DoH resolver from createDohResolver.
+ *                 Tests inject a mock.
+ * @param now time source (for testable TTL expiry). Default: Date.now.
+ * @returns an async verifier bound to its own cache.
+ */
+export function createFcrdnsVerifier(
+	resolver: DnsResolver,
+	now: () => number = Date.now,
+): FcrdnsVerifier {
+	const cache = new FcrdnsCache(now)
+	return async ({ clientIp, trustedSuffixes }: FcrdnsVerifyInput): Promise<boolean> => {
+		// Cache check (positive or negative).
+		const cached = cache.get(clientIp)
+		if (cached !== undefined) return cached
+		try {
+			// 1. Reverse DNS lookup.
+			const ptrName = reverseIpToArpa(clientIp)
+			if (!ptrName) {
+				cache.setNegative(clientIp)
+				return false
+			}
+			const ptrAnswers = await resolver.query(ptrName, 'PTR')
+			if (!ptrAnswers || ptrAnswers.length === 0) {
+				cache.setNegative(clientIp)
+				return false
+			}
+			// 2. Suffix match on any PTR answer (dot-boundary, case-insensitive); first match wins.
+			let matchedHostname: string | null = null
+			let ptrTtl = Number.POSITIVE_INFINITY
+			for (const answer of ptrAnswers) {
+				const hostname = stripTrailingDot(answer.data).toLowerCase()
+				if (matchesAnyTrustedSuffix(hostname, trustedSuffixes)) {
+					matchedHostname = hostname
+					ptrTtl = Math.min(ptrTtl, answer.TTL)
+					break
+				}
+			}
+			if (!matchedHostname) {
+				cache.setNegative(clientIp)
+				return false
+			}
+			// 3. Forward DNS lookup in the client's address family: AAAA for IPv6, A for IPv4.
+			const isIpv6 = clientIp.includes(':')
+			const fwdType = isIpv6 ? 'AAAA' : 'A'
+			const fwdAnswers = await resolver.query(matchedHostname, fwdType)
+			if (!fwdAnswers || fwdAnswers.length === 0) {
+				cache.setNegative(clientIp)
+				return false
+			}
+			// 4. Client IP must be in the result set.
+			const normalizedClient = normalizeIp(clientIp)
+			const matched = fwdAnswers.some((a) => normalizeIp(a.data) === normalizedClient)
+			if (!matched) {
+				cache.setNegative(clientIp)
+				return false
+			}
+			// Positive: cache with TTL = min(10min, rDNS TTL, fDNS TTL).
+			const fwdTtl = fwdAnswers.reduce((min, a) => Math.min(min, a.TTL), Number.POSITIVE_INFINITY)
+			const ttlSeconds = Math.min(600, ptrTtl, fwdTtl)
+			cache.setPositive(clientIp, matchedHostname, ttlSeconds * 1000)
+			return true
+		} catch {
+			// Any error → fail closed (anonymous), short negative cache to avoid repeat hits.
+			cache.setNegative(clientIp)
+			return false
+		}
+	}
+}
+// ─── DoH resolver (production default) ───
+/**
+ * Create a DoH-JSON resolver that queries Cloudflare's 1.1.1.1. One subrequest per
+ * DNS lookup. Not a singleton — resolver has no persistent state of its own.
+ *
+ * `query` resolves to null on a non-2xx response, a non-zero DNS `Status`, or a
+ * missing `Answer`. It does not catch errors from `fetchImpl` or from JSON
+ * parsing; those propagate to the caller.
+ *
+ * Docs: https://developers.cloudflare.com/1.1.1.1/encryption/dns-over-https/make-api-requests/dns-json/
+ *
+ * @param baseUrl DoH JSON endpoint; `name` and `type` are appended as query parameters.
+ * @param fetchImpl fetch implementation to use. Default: the global `fetch`.
+ * @returns a resolver that issues one request per query.
+ */
+export function createDohResolver(
+	baseUrl = 'https://1.1.1.1/dns-query',
+	fetchImpl: typeof fetch = fetch,
+): DnsResolver {
+	return {
+		async query(name: string, type: 'PTR' | 'A' | 'AAAA'): Promise<DnsAnswer[] | null> {
+			const url = `${baseUrl}?name=${encodeURIComponent(name)}&type=${type}`
+			const response = await fetchImpl(url, {
+				headers: { accept: 'application/dns-json' },
+			})
+			if (!response.ok) return null
+			const body = (await response.json()) as { Status?: number; Answer?: DnsAnswer[] }
+			// Status 0 = NOERROR. Anything else means no usable answer.
+			if (body.Status !== 0 || !body.Answer) return null
+			return body.Answer
+		},
+	}
+}
+// ─── Internals ───
+/**
+ * Convert an IP address to its in-addr.arpa / ip6.arpa PTR query name.
+ *
+ * Returns null for malformed input: an IPv6 literal that expandIpv6 rejects, or
+ * an IPv4 literal that is not exactly four decimal octets in the range 0–255.
+ * Rejecting bad input here avoids spending a DNS subrequest on a name that
+ * cannot resolve.
+ */
+function reverseIpToArpa(ip: string): string | null {
+	if (ip.includes(':')) {
+		// IPv6 — expand and nibble-reverse into ip6.arpa.
+		const expanded = expandIpv6(ip)
+		if (!expanded) return null
+		const nibbles = expanded.replace(/:/g, '').split('').reverse().join('.')
+		return `${nibbles}.ip6.arpa`
+	}
+	// IPv4 — reverse octets into in-addr.arpa.
+	const octets = ip.split('.')
+	// One to three digits and numerically at most 255; digits alone would admit "999".
+	const isOctet = (o: string): boolean => /^\d{1,3}$/.test(o) && Number(o) <= 255
+	if (octets.length !== 4 || !octets.every(isOctet)) return null
+	return `${octets.reverse().join('.')}.in-addr.arpa`
+}
+/**
+ * Expand an IPv6 address to its full 32-hex-character form (colon-separated groups
+ * of 4, lowercase). Handles "::" compression.
+ *
+ * Returns null for malformed input: more than one "::", a "::" that stands for
+ * zero groups, a wrong group count, an empty group, or a group that is not 1–4
+ * hex digits. This also rejects zone IDs ("%eth0") and dotted IPv4 tails
+ * ("::ffff:1.2.3.4"), neither of which has a nibble form this module can use.
+ */
+function expandIpv6(ip: string): string | null {
+	const doubleColon = ip.indexOf('::')
+	let parts: string[]
+	if (doubleColon === -1) {
+		parts = ip.split(':')
+	} else {
+		// "::" may appear at most once (this also rejects ":::").
+		if (ip.indexOf('::', doubleColon + 1) !== -1) return null
+		const headText = ip.slice(0, doubleColon)
+		const tailText = ip.slice(doubleColon + 2)
+		// Split without filtering so a stray empty group is caught by validation below.
+		const head = headText === '' ? [] : headText.split(':')
+		const tail = tailText === '' ? [] : tailText.split(':')
+		const missing = 8 - head.length - tail.length
+		// "::" must stand for at least one all-zero group.
+		if (missing < 1) return null
+		parts = [...head, ...Array<string>(missing).fill('0'), ...tail]
+	}
+	if (parts.length !== 8) return null
+	if (!parts.every((p) => /^[0-9a-f]{1,4}$/i.test(p))) return null
+	// Lowercase so textual comparison of two expanded addresses ignores hex case.
+	return parts.map((p) => p.toLowerCase().padStart(4, '0')).join(':')
+}
+function matchesAnyTrustedSuffix(hostname: string, suffixes: readonly string[]): boolean { // case-insensitive, dot-boundary suffix match
+	const lowered = hostname.toLowerCase()
+	return suffixes.some((suffix) => {
+		const lsuffix = suffix.toLowerCase()
+		return lowered === lsuffix || lowered.endsWith(`.${lsuffix}`) // exact host, or a subdomain; "evilgooglebot.com" does not match "googlebot.com"
+	})
+}
+function stripTrailingDot(s: string): string { // drops one trailing "." (fully-qualified DNS name form), if present
+	return s.endsWith('.') ? s.slice(0, -1) : s
+}
+/**
+ * Normalize an IP for comparison. IPv4 addresses pass through; IPv6 are canonicalized
+ * by expanding "::", lowercasing hex digits, and stripping leading zeros in each group,
+ * so "2001:DB8::1" and "2001:0db8:0:0:0:0:0:1" compare equal.
+ *
+ * An IPv6-looking string that expandIpv6 cannot parse is returned unchanged.
+ */
+function normalizeIp(ip: string): string {
+	if (!ip.includes(':')) return ip // IPv4
+	const expanded = expandIpv6(ip)
+	if (!expanded) return ip
+	// Lowercase first: hex digits are case-insensitive, string equality is not.
+	return expanded
+		.toLowerCase()
+		.split(':')
+		.map((g) => g.replace(/^0+/, '') || '0') // an all-zero group collapses to "0"
+		.join(':')
+}
+// ─── Cache ───
+interface CacheEntry {
+	verified: boolean // cached verdict: true = FCrDNS confirmed, false = negative result
+	expiresAt: number // absolute expiry, in the same units as the cache's `now()` (milliseconds)
+	hostname?: string // matched PTR hostname; set on positive entries only
+}
+/**
+ * Process-local verdict cache for FCrDNS, keyed by client IP.
+ *
+ * The cache is bounded: once `maxEntries` is reached, expired entries are pruned
+ * and, if that frees nothing, the oldest-written entries are dropped. Without a
+ * bound, every distinct client IP would leave an entry behind for the lifetime
+ * of the process, because an entry is otherwise only removed when the same IP is
+ * looked up again after expiry.
+ */
+class FcrdnsCache {
+	private readonly entries = new Map<string, CacheEntry>()
+	/**
+	 * @param now time source in milliseconds.
+	 * @param maxEntries upper bound on stored entries (values below 1 are treated as 1).
+	 */
+	constructor(
+		private readonly now: () => number,
+		private readonly maxEntries = 10_000,
+	) {}
+	/** Returns the cached verdict, or undefined when absent or expired. */
+	get(clientIp: string): boolean | undefined {
+		const entry = this.entries.get(clientIp)
+		if (!entry) return undefined
+		if (entry.expiresAt <= this.now()) {
+			this.entries.delete(clientIp)
+			return undefined
+		}
+		return entry.verified
+	}
+	/** Record a positive verdict for `ttlMs` milliseconds (minimum 1 second). */
+	setPositive(clientIp: string, hostname: string, ttlMs: number): void {
+		// A NaN TTL (e.g. an answer without a TTL field) would yield expiresAt = NaN,
+		// which never compares as expired; fall back to the 1 s floor instead.
+		const safeTtlMs = Number.isFinite(ttlMs) ? Math.max(1000, ttlMs) : 1000
+		this.store(clientIp, {
+			verified: true,
+			hostname,
+			expiresAt: this.now() + safeTtlMs,
+		})
+	}
+	/** Record a negative verdict for a fixed 60 seconds. */
+	setNegative(clientIp: string): void {
+		this.store(clientIp, {
+			verified: false,
+			expiresAt: this.now() + 60_000,
+		})
+	}
+	/** Insert or replace an entry, making room first when the cache is full. */
+	private store(clientIp: string, entry: CacheEntry): void {
+		// Delete before set so Map iteration order reflects the most recent write.
+		this.entries.delete(clientIp)
+		if (this.entries.size >= Math.max(1, this.maxEntries)) this.evict()
+		this.entries.set(clientIp, entry)
+	}
+	/** Prune expired entries; if still full, drop oldest writes down to ~90% capacity. */
+	private evict(): void {
+		const limit = Math.max(1, this.maxEntries)
+		const current = this.now()
+		for (const [key, entry] of this.entries) {
+			if (entry.expiresAt <= current) this.entries.delete(key)
+		}
+		if (this.entries.size < limit) return
+		// Evict a batch rather than one entry so a full cache is not rescanned on every insert.
+		const target = Math.floor(limit * 0.9)
+		for (const key of this.entries.keys()) {
+			if (this.entries.size <= target) break
+			this.entries.delete(key)
+		}
+	}
+}
+// ─── Exports for testing ───
+export const _internals = { // module-private helpers and the cache class, exposed for tests
+	reverseIpToArpa,
+	expandIpv6,
+	matchesAnyTrustedSuffix,
+	normalizeIp,
+	FcrdnsCache,
+}
+}