@growth-labs/seo 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/utils/validation.d.ts.map +1 -1
- package/dist/utils/validation.js +22 -0
- package/dist/utils/validation.js.map +1 -1
- package/package.json +9 -5
- package/src/_internal/state.ts +26 -0
- package/src/bindings.ts +146 -0
- package/src/cron/prune-aeo-r2.ts +140 -0
- package/src/durable-objects/aeo-revalidation-coord.ts +246 -0
- package/src/index.ts +380 -0
- package/src/middleware/seo.ts +350 -0
- package/src/options.ts +456 -0
- package/src/routes/aeo-twin.ts +130 -0
- package/src/routes/apple-news.ts +36 -0
- package/src/routes/llms-full.ts +36 -0
- package/src/routes/llms.ts +15 -0
- package/src/routes/podcast-narration.ts +45 -0
- package/src/routes/podcast.ts +27 -0
- package/src/routes/revalidate.ts +298 -0
- package/src/routes/robots.ts +21 -0
- package/src/routes/rss.ts +29 -0
- package/src/routes/sitemap-articles.ts +25 -0
- package/src/routes/sitemap-index.ts +89 -0
- package/src/routes/sitemap-markdown.ts +39 -0
- package/src/routes/sitemap-pages.ts +24 -0
- package/src/routes/sitemap-products.ts +24 -0
- package/src/routes/sitemap-videos.ts +24 -0
- package/src/runtime.ts +17 -0
- package/src/site-url-core.ts +71 -0
- package/src/site-url.ts +21 -0
- package/src/types.ts +166 -0
- package/src/utils/aeo-summary.ts +176 -0
- package/src/utils/aeo-twin-emitter.ts +173 -0
- package/src/utils/aeo.ts +223 -0
- package/src/utils/apple-news-anf.ts +163 -0
- package/src/utils/apple-news-rss.ts +136 -0
- package/src/utils/content-filter.ts +87 -0
- package/src/utils/crawler-class.ts +155 -0
- package/src/utils/define-content-provider.ts +65 -0
- package/src/utils/effective-auth.ts +44 -0
- package/src/utils/fcrdns.ts +269 -0
- package/src/utils/fresh-layer.ts +175 -0
- package/src/utils/hreflang.ts +26 -0
- package/src/utils/index.ts +91 -0
- package/src/utils/json-ld/article.ts +120 -0
- package/src/utils/json-ld/audio.ts +32 -0
- package/src/utils/json-ld/breadcrumb.ts +28 -0
- package/src/utils/json-ld/faq.ts +18 -0
- package/src/utils/json-ld/howto.ts +23 -0
- package/src/utils/json-ld/index.ts +12 -0
- package/src/utils/json-ld/item-list.ts +26 -0
- package/src/utils/json-ld/organization.ts +42 -0
- package/src/utils/json-ld/person.ts +25 -0
- package/src/utils/json-ld/product.ts +155 -0
- package/src/utils/json-ld/video.ts +20 -0
- package/src/utils/json-ld/website.ts +27 -0
- package/src/utils/llms-full.ts +90 -0
- package/src/utils/llms.ts +45 -0
- package/src/utils/meta.ts +184 -0
- package/src/utils/podcast.ts +112 -0
- package/src/utils/robots.ts +47 -0
- package/src/utils/rss.ts +64 -0
- package/src/utils/seo-head.ts +81 -0
- package/src/utils/sitemap-markdown.ts +80 -0
- package/src/utils/sitemap.ts +169 -0
- package/src/utils/staleness.ts +61 -0
- package/src/utils/validation.ts +308 -0
- package/src/virtual.d.ts +8 -0
- package/src/vite-plugin.ts +66 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { ResolvedSeoOptions, SeoOptionsWithResolvedSite } from './options.js'
|
|
2
|
+
|
|
3
|
+
/** Indirect site-URL source: names the environment variable that holds the URL. */
export interface SiteUrlEnvVarRef {
  /** Name of the env binding / environment variable to read the site URL from. */
  envVar: string
}
|
|
6
|
+
|
|
7
|
+
/** A site URL given either literally or as a reference to an environment variable. */
export type SiteUrlSource = SiteUrlEnvVarRef | string
|
|
8
|
+
|
|
9
|
+
/**
 * Type guard for {@link SiteUrlEnvVarRef}.
 *
 * @param value - Any value.
 * @returns `true` only for a non-null object whose `envVar` property is a string.
 */
export function isSiteUrlEnvVarRef(value: unknown): value is SiteUrlEnvVarRef {
  if (typeof value !== 'object' || value === null) return false
  if (!('envVar' in value)) return false
  return typeof value.envVar === 'string'
}
|
|
17
|
+
|
|
18
|
+
/**
 * Check that `value` parses as a URL and hand it back unchanged.
 *
 * @param value - Candidate URL string.
 * @param label - Name of the source, interpolated into the error message.
 * @returns The original `value` (not a normalised form).
 * @throws Error when the `URL` constructor rejects `value`.
 */
export function validateSiteUrl(value: string, label = 'site'): string {
  let parsed: URL | undefined
  try {
    parsed = new URL(value)
  } catch {
    parsed = undefined
  }
  if (parsed === undefined) {
    throw new Error(`@growth-labs/seo: ${label} must resolve to a valid URL.`)
  }
  return value
}
|
|
26
|
+
|
|
27
|
+
/**
 * Resolve a {@link SiteUrlSource} to a validated URL string.
 *
 * Lookup order for an `{ envVar }` reference:
 *   1. `bindingsEnv[envVar]` — any defined value wins, and must be a string.
 *   2. `process.env[envVar]`, when a Node-style `process` global exists.
 *
 * @param site - Literal URL or `{ envVar }` reference.
 * @param bindingsEnv - Optional env-bindings object searched before `process.env`.
 * @returns The validated URL string.
 * @throws Error when `site` has the wrong shape, the binding is defined but not a
 *   string, the variable is not set in either place, or the value is not a valid URL.
 */
export function resolveSiteUrl(site: SiteUrlSource, bindingsEnv?: Record<string, unknown>): string {
  // Literal URL: nothing to look up.
  if (typeof site === 'string') return validateSiteUrl(site)

  if (!isSiteUrlEnvVarRef(site)) {
    throw new Error('@growth-labs/seo: site must be a URL string or { envVar }.')
  }

  const { envVar } = site

  // Bindings take precedence; a defined non-string binding is an error, not a fall-through.
  const bound = bindingsEnv?.[envVar]
  if (bound !== undefined) {
    if (typeof bound !== 'string') {
      throw new Error(`@growth-labs/seo: site env binding ${envVar} must be a string URL.`)
    }
    return validateSiteUrl(bound, `site env binding ${envVar}`)
  }

  const nodeValue = getNodeEnv(envVar)
  if (typeof nodeValue === 'string') {
    return validateSiteUrl(nodeValue, `process.env.${envVar}`)
  }

  throw new Error(
    `@growth-labs/seo: site env binding ${envVar} is not set. ` +
      'Provide it as a Cloudflare Worker env binding, or use process.env only in Node tooling/tests.',
  )
}
|
|
54
|
+
|
|
55
|
+
/**
 * Return a shallow copy of `config` whose `site` has been resolved to a URL string.
 *
 * @param config - Options whose `site` may still be an `{ envVar }` reference.
 * @param bindingsEnv - Optional env bindings forwarded to `resolveSiteUrl`.
 * @throws Error whatever `resolveSiteUrl` throws for an unresolvable or invalid site.
 */
export function resolveSeoConfig(
  config: ResolvedSeoOptions,
  bindingsEnv?: Record<string, unknown>,
): SeoOptionsWithResolvedSite {
  const site = resolveSiteUrl(config.site, bindingsEnv)
  return { ...config, site }
}
|
|
64
|
+
|
|
65
|
+
/**
 * Read `process.env[name]` without assuming a `process` global exists.
 *
 * @returns The variable's value, or `undefined` when `process`, `process.env`,
 *   or the variable itself is absent.
 */
function getNodeEnv(name: string): string | undefined {
  type NodeLikeGlobal = { process?: { env?: Record<string, string | undefined> } }
  const nodeProcess = (globalThis as NodeLikeGlobal).process
  return nodeProcess?.env?.[name]
}
|
package/src/site-url.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { env as cloudflareEnv } from 'cloudflare:workers'
|
|
2
|
+
import type { ResolvedSeoOptions, SeoOptionsWithResolvedSite } from './options.js'
|
|
3
|
+
import {
|
|
4
|
+
resolveSeoConfig as resolveSeoConfigCore,
|
|
5
|
+
resolveSiteUrl as resolveSiteUrlCore,
|
|
6
|
+
type SiteUrlSource,
|
|
7
|
+
} from './site-url-core.js'
|
|
8
|
+
|
|
9
|
+
/**
 * Workers-flavoured `resolveSiteUrl`: same as the core resolver, but `bindingsEnv`
 * defaults to the `env` export of `cloudflare:workers`.
 *
 * @param site - Literal URL or `{ envVar }` reference.
 * @param bindingsEnv - Env bindings to search; defaults to the Workers env.
 */
export function resolveSiteUrl(
  site: SiteUrlSource,
  bindingsEnv: Record<string, unknown> = cloudflareEnv as Record<string, unknown>,
): string {
  const resolved = resolveSiteUrlCore(site, bindingsEnv)
  return resolved
}
|
|
15
|
+
|
|
16
|
+
/**
 * Workers-flavoured `resolveSeoConfig`: delegates to the core implementation,
 * substituting the `cloudflare:workers` env when no bindings are supplied.
 *
 * @param config - Options whose `site` may still be an `{ envVar }` reference.
 * @param bindingsEnv - Env bindings to search; falls back to the Workers env.
 */
export function resolveSeoConfig(
  config: ResolvedSeoOptions,
  bindingsEnv?: Record<string, unknown>,
): SeoOptionsWithResolvedSite {
  const env = bindingsEnv ?? (cloudflareEnv as Record<string, unknown>)
  return resolveSeoConfigCore(config, env)
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import type { APIContext } from 'astro'
|
|
2
|
+
|
|
3
|
+
// ─── Content data interfaces ───
|
|
4
|
+
|
|
5
|
+
/**
 * An author credited on a content item. Only `name` is required.
 * NOTE(review): the optional field names look like schema.org Person properties —
 * confirm against the JSON-LD person builder.
 */
export interface ContentAuthor {
  /** Author display name. */
  name: string
  /** URL associated with the author. */
  url?: string
  jobTitle?: string
  knowsAbout?: string[]
  sameAs?: string[]
}
|
|
12
|
+
|
|
13
|
+
/** Audio rendition attached to a content item. */
export interface ContentAudio {
  /** Location of the audio file. */
  url: string
  /** ISO 8601 duration, e.g. 'PT8M30S'. */
  duration: string
  narrator?: string
}
|
|
18
|
+
|
|
19
|
+
/** One alternate-language version of a content item: a language tag and its URL. */
export interface ContentLocaleAlternate {
  lang: string
  url: string
}
|
|
23
|
+
|
|
24
|
+
/** A purchasable variant of a product. `currency` is optional here, unlike on ContentProduct. */
export interface ProductVariant {
  name: string
  sku?: string
  price: number
  currency?: string
  /** Stock status; same value set as ContentProduct.availability. */
  availability:
    | 'InStock'
    | 'OutOfStock'
    | 'PreOrder'
    | 'Discontinued'
  image?: string
}
|
|
32
|
+
|
|
33
|
+
/** Commerce data for a content item that represents a product. */
export interface ContentProduct {
  name: string
  description: string
  price: number
  currency: string
  /** Stock status. */
  availability:
    | 'InStock'
    | 'OutOfStock'
    | 'PreOrder'
    | 'Discontinued'
  brand?: string
  sku?: string
  gtin?: string
  mpn?: string
  /** Product image URLs; required, but may be empty as far as the type is concerned. */
  images: string[]
  /** Aggregate rating. */
  rating?: {
    value: number
    count: number
    bestRating?: number
  }
  /** Individual reviews. */
  reviews?: {
    author: string
    rating: number
    body?: string
    datePublished?: string
  }[]
  variants?: ProductVariant[]
  condition?: 'NewCondition' | 'UsedCondition' | 'RefurbishedCondition'
}
|
|
58
|
+
|
|
59
|
+
/** Optional podcast episode metadata for a content item. */
export interface PodcastEpisodeMeta {
  episodeNumber?: number
  seasonNumber?: number
  episodeType?:
    | 'full'
    | 'trailer'
    | 'bonus'
}
|
|
64
|
+
|
|
65
|
+
/**
 * Access level of a content item.
 *
 * - `'public'`  — freely available to crawlers and LLMs; AEO twins are emitted.
 * - `'members'` — gated content. No `.md` twin is emitted, and the item is
 *   unconditionally excluded from llms.txt, llms-full.txt, /feed.xml,
 *   /apple-news.xml, /listen.xml and sitemap-markdown.xml. It remains eligible
 *   for the primary sitemap so the paywalled URL stays indexable.
 *   `includeInFeed` is a no-op for members.
 */
export type ContentAccess = 'members' | 'public'
|
|
71
|
+
|
|
72
|
+
/**
 * A single piece of content as returned by a ContentProvider.
 * Only `url` and `title` are required; every other group is optional.
 */
export interface ContentItem {
  // ─── Core ───
  url: string
  title: string
  description?: string
  image?: string
  datePublished?: string
  dateModified?: string
  authors?: ContentAuthor[]

  // ─── Audio narration ───
  audio?: ContentAudio
  podcastEpisode?: PodcastEpisodeMeta

  // ─── Multilingual ───
  locale?: string
  alternateLocales?: ContentLocaleAlternate[]

  // ─── Apple News ───
  appleNewsId?: string
  appleNewsPublishable?: 'yes' | 'no'
  appleNewsSection?: string
  newsKeywords?: string[]

  // ─── Access / paywall ───
  access?: ContentAccess
  /** Default true. */
  includeInSitemap?: boolean
  /** Public-only; no-op for members. */
  includeInFeed?: boolean
  /** Consumer-provided summary for the summary-twin generator. */
  summary?: string
  /** JSON-LD; derived from `access` if omitted. */
  isAccessibleForFree?: boolean
  /** e.g. '.premium-content' — used in JSON-LD hasPart. */
  paywallCssSelector?: string

  // ─── Commerce ───
  product?: ContentProduct

  // ─── FAQ / HowTo / Video ───
  faq?: { question: string; answer: string }[]
  howToSteps?: { name: string; text: string; image?: string }[]
  video?: {
    thumbnailUrl: string
    duration: string
    contentUrl?: string
    embedUrl?: string
  }
}
|
|
117
|
+
|
|
118
|
+
// ─── Content provider ───
|
|
119
|
+
|
|
120
|
+
/** Kinds of content a ContentProvider can be asked for. */
export type ContentType =
  | 'articles'
  | 'pages'
  | 'videos'
  | 'products'
  | 'authors'
|
|
121
|
+
|
|
122
|
+
/** Query passed to a ContentProvider. */
export interface ContentProviderParams {
  /** Which kind of content to return. */
  type: ContentType
  /** Optional slug list — presumably restricts the result to these items; confirm with callers. */
  slugs?: string[]
}
|
|
126
|
+
|
|
127
|
+
/**
 * Consumer-supplied async function that returns the content items for a query.
 * It receives the query parameters and the Astro API context.
 */
export type ContentProvider = (
  params: ContentProviderParams,
  context: APIContext,
) => Promise<ContentItem[]>
|
|
131
|
+
|
|
132
|
+
// ─── Crawler-class dispatch ───
|
|
133
|
+
|
|
134
|
+
/**
 * Classification of the requesting client. Set at request time by the SEO
 * middleware on Astro.locals. Drives body-variant selection, cache-key
 * segmentation, and JSON-LD redaction. See spec "Crawler-class policy".
 */
export type CrawlerClass =
  | 'anonymous'
  | 'llmTrainingCrawler'
  | 'userDirectedLlmAgent'
  | 'verifiedSearchCrawler'
|
|
141
|
+
|
|
142
|
+
/**
 * Derived auth segment that incorporates the crawler-class override of the raw
 * consumer auth. Consumers use this (never the raw authSegment) in cache keys,
 * to avoid leaking member bodies to user-directed LLM agents bearing member cookies.
 */
export type EffectiveAuthSegment =
  | 'anon'
  | 'member'
  | 'search-full'
|
|
146
|
+
|
|
147
|
+
// ─── Utility return types ───
|
|
148
|
+
|
|
149
|
+
/** A meta tag represented as a free-form map of string attributes. */
export interface MetaTag {
  [key: string]: string
}
|
|
152
|
+
|
|
153
|
+
/** Attributes of a `rel="canonical"` link. */
export interface CanonicalLink {
  rel: 'canonical'
  href: string
}
|
|
157
|
+
|
|
158
|
+
/** Attributes of a `rel="alternate"` link carrying an `hreflang`. */
export interface HreflangLink {
  rel: 'alternate'
  hreflang: string
  href: string
}
|
|
163
|
+
|
|
164
|
+
// ─── JSON-LD base ───
|
|
165
|
+
|
|
166
|
+
/** Base shape for JSON-LD payloads: string keys mapped to unvalidated values. */
export type JsonLdObject = { [key: string]: unknown }
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import type { ContentItem } from '../types.js'
|
|
2
|
+
import { estimateTokenCount } from './aeo.js'
|
|
3
|
+
|
|
4
|
+
// ─── Public API ───
|
|
5
|
+
|
|
6
|
+
/** Options accepted by `generateSummaryTwin`. */
export interface GenerateSummaryTwinOptions {
  /** Written to the `publisher` frontmatter key. */
  publisherName: string
  /** Written to the `schemaType` frontmatter key. */
  schemaType: string
  /** Full article body in markdown; consulted by tiers 2 and 3 when `item.summary` is absent. */
  content?: string
  /** URL of the full twin, used for the `fullUrl` frontmatter cross-link. */
  fullUrl: string
  /** Token cap applied to the summary body. Defaults to 400. */
  maxTokens?: number
  /** Invoked with the item when the tier-4 (minimal) fallback is used; intended for build-time telemetry. */
  onMinimalFallback?: (item: ContentItem) => void
}
|
|
18
|
+
|
|
19
|
+
/** Output of `generateSummaryTwin`. */
export interface GenerateSummaryTwinResult {
  /** Frontmatter plus summary body. */
  markdown: string
  /** Which fallback tier produced the body (1 = explicit summary … 4 = minimal). */
  tier:
    | 1
    | 2
    | 3
    | 4
}
|
|
23
|
+
|
|
24
|
+
// Token cap applied to summary bodies when the caller does not pass `maxTokens`.
const DEFAULT_MAX_TOKENS = 400
|
|
25
|
+
|
|
26
|
+
/**
 * Build the summary twin (frontmatter + short body) for a ContentItem.
 *
 * The body comes from the first tier that applies (spec "Summary twin"):
 *
 *   1. `item.summary` (consumer-provided), used verbatim after trimming.
 *   2. Bullet extraction — up to five `- ` / `* ` list items from the article body
 *      (needs at least three).
 *   3. Narrative — description + first sentence of each `## ` section (max 5)
 *      + the article's final sentence.
 *   4. Minimal — description (or title) only. No frontmatter flag is written for
 *      this case; it is reported solely through `onMinimalFallback`.
 *
 * The body is capped to `maxTokens` (default 400). Frontmatter carries
 * `type: summary`, `title`, `url`, `fullUrl`, `datePublished` / `dateModified`
 * when set, `publisher` and `schemaType`.
 *
 * @param item - Content item to summarise.
 * @param options - Publisher info, article body, cross-link URL and token cap.
 * @returns The markdown document and the tier that produced its body.
 */
export function generateSummaryTwin(
  item: ContentItem,
  options: GenerateSummaryTwinOptions,
): GenerateSummaryTwinResult {
  const picked = pickBody(item, options.content, options.onMinimalFallback)
  const tokenBudget = options.maxTokens ?? DEFAULT_MAX_TOKENS
  const summaryBody = capToTokens(picked.body, tokenBudget)
  const frontmatter = buildFrontmatter(item, options)

  return {
    markdown: ['---', frontmatter, '---', '', summaryBody].join('\n'),
    tier: picked.tier,
  }
}
|
|
56
|
+
|
|
57
|
+
// ─── Internals ───
|
|
58
|
+
|
|
59
|
+
/**
 * Choose the summary body using the 4-tier fallback chain; first match wins.
 *
 * A description that is empty or whitespace-only is treated as absent, so the
 * tier-2 lede and the tier-4 body fall back to the title instead of being blank.
 *
 * @param item - Content item being summarised.
 * @param content - Article body in markdown; tiers 2 and 3 are skipped without it.
 * @param onMinimalFallback - Called with `item` just before the tier-4 result is returned.
 * @returns The chosen body text and the tier that produced it.
 */
function pickBody(
  item: ContentItem,
  content: string | undefined,
  onMinimalFallback: ((item: ContentItem) => void) | undefined,
): { body: string; tier: 1 | 2 | 3 | 4 } {
  // Tier 1: explicit summary
  const summary = item.summary?.trim()
  if (summary) {
    return { body: summary, tier: 1 }
  }

  // `||` rather than `??`: an empty string after trimming must count as "no description".
  const description = item.description?.trim() || undefined

  if (content) {
    // Tier 2: bullet-list extraction (needs at least 3 bullets, keeps at most 5)
    const bullets = extractTopLevelBullets(content)
    if (bullets.length >= 3) {
      const lede = description ?? item.title
      const list = bullets
        .slice(0, 5)
        .map((b) => `- ${b}`)
        .join('\n')
      return { body: `${lede}\n\n${list}`, tier: 2 }
    }

    // Tier 3: narrative fallback
    const narrativeParts: string[] = []
    if (description) narrativeParts.push(description)
    const sectionFirsts = extractFirstSentencePerSection(content).slice(0, 5)
    if (sectionFirsts.length > 0) {
      narrativeParts.push(sectionFirsts.map((s) => `- ${s}`).join('\n'))
    }
    const lastSentence = extractLastSentence(content)
    if (lastSentence) narrativeParts.push(lastSentence)
    const narrative = narrativeParts.join('\n\n')
    // Too-short narratives are rejected in favour of the minimal tier.
    if (narrative.length >= 100) {
      return { body: narrative, tier: 3 }
    }
  }

  // Tier 4: minimal fallback
  onMinimalFallback?.(item)
  return { body: description ?? item.title, tier: 4 }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Collect the text of top-level markdown bullets (`- ` or `* ` at column 0).
 *
 * Indented (nested) bullets are ignored, the marker must be followed by a space
 * or tab on the same line, and thematic breaks such as `* * *` / `- - -` are
 * not treated as bullets.
 *
 * @param md - Markdown source.
 * @returns Trimmed bullet texts in document order.
 */
function extractTopLevelBullets(md: string): string[] {
  // No leading whitespace allowed, and `[ \t]+` (not `\s+`) so the match cannot
  // spill onto the following line when a bullet marker stands alone.
  const bulletLine = /^[-*][ \t]+(.+)$/gm
  // A captured "text" made only of markers and spaces is a horizontal rule.
  const ruleRemainder = /^[-*\s]+$/
  const out: string[] = []
  for (const match of md.matchAll(bulletLine)) {
    const text = match[1]?.trim()
    if (text && !ruleRemainder.test(text)) out.push(text)
  }
  return out
}
|
|
116
|
+
|
|
117
|
+
/**
 * For every `## ` section, return the first sentence of its first non-empty,
 * non-heading paragraph. Text before the first `## ` heading is ignored.
 *
 * @param md - Markdown source.
 * @returns One sentence per section that has body text, in document order.
 */
function extractFirstSentencePerSection(md: string): string[] {
  // Element 0 is whatever precedes the first `## ` heading — discard it.
  const [, ...sectionBodies] = md.split(/^## .+$/gm)
  const sentences: string[] = []

  for (const sectionBody of sectionBodies) {
    let opening: string | undefined
    for (const chunk of sectionBody.split(/\n{2,}/)) {
      const para = chunk.trim()
      if (para.length > 0 && !para.startsWith('#')) {
        opening = para
        break
      }
    }
    if (!opening) continue

    const sentence = firstSentence(opening)
    if (sentence) sentences.push(sentence)
  }

  return sentences
}
|
|
133
|
+
|
|
134
|
+
/**
 * Return the final sentence of the last non-empty paragraph that does not start with `#`.
 *
 * @param md - Markdown source.
 * @returns The trimmed sentence, or `undefined` when no such paragraph exists.
 */
function extractLastSentence(md: string): string | undefined {
  let closing: string | undefined
  for (const chunk of md.split(/\n{2,}/)) {
    const para = chunk.trim()
    if (para.length > 0 && !para.startsWith('#')) closing = para
  }
  if (!closing) return undefined

  // Split after sentence-ending punctuation that is followed by whitespace.
  const pieces = closing.split(/(?<=[.!?])\s+/)
  return pieces[pieces.length - 1]?.trim()
}
|
|
145
|
+
|
|
146
|
+
/**
 * Return the leading sentence of `text`: everything up to the first `.`, `!` or `?`
 * that is followed by whitespace or end of input. When the first line has no such
 * terminator, the trimmed first line is returned instead.
 */
function firstSentence(text: string): string | undefined {
  const hit = /^(.+?[.!?])(?:\s|$)/.exec(text)
  const sentence = hit?.[1]
  if (sentence !== undefined) return sentence.trim()
  return text.split('\n')[0]?.trim()
}
|
|
150
|
+
|
|
151
|
+
/**
 * Truncate `body` when its estimated token count exceeds `maxTokens`.
 *
 * Text within the budget is returned untouched. Otherwise the text is cut at
 * `maxTokens * 4` UTF-16 code units, trailing whitespace is removed and `…` is
 * appended. The cut never leaves half of a surrogate pair behind, so astral
 * characters (emoji, etc.) cannot produce invalid output.
 *
 * NOTE(review): the 4-characters-per-token factor presumably mirrors
 * `estimateTokenCount` — confirm in ./aeo.js. The appended ellipsis is added
 * on top of the cut.
 */
function capToTokens(body: string, maxTokens: number): string {
  if (estimateTokenCount(body) <= maxTokens) return body
  const maxChars = maxTokens * 4
  let cut = body.slice(0, maxChars)
  // A high surrogate (0xD800–0xDBFF) at the end means its low half was sliced off.
  const lastUnit = cut.charCodeAt(cut.length - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut = cut.slice(0, -1)
  return `${cut.trimEnd()}…`
}
|
|
156
|
+
|
|
157
|
+
/**
 * Render the summary twin's frontmatter lines (without the `---` fences).
 *
 * Keys, in order: `type`, `title`, `url` (the full URL plus `.summary.md`),
 * `fullUrl`, `datePublished` and `dateModified` when set, `publisher`, `schemaType`.
 */
function buildFrontmatter(item: ContentItem, options: GenerateSummaryTwinOptions): string {
  // Date keys are emitted only when the item carries a non-empty value.
  const optionalLine = (key: string, value: string | undefined): string[] =>
    value ? [`${key}: ${yv(value)}`] : []

  return [
    `type: summary`,
    `title: ${yv(item.title)}`,
    `url: ${yv(`${options.fullUrl}.summary.md`)}`,
    `fullUrl: ${yv(options.fullUrl)}`,
    ...optionalLine('datePublished', item.datePublished),
    ...optionalLine('dateModified', item.dateModified),
    `publisher: ${yv(options.publisherName)}`,
    `schemaType: ${yv(options.schemaType)}`,
  ].join('\n')
}
|
|
169
|
+
|
|
170
|
+
/**
 * Format a string as a YAML scalar for frontmatter, double-quoting it only
 * when a plain scalar would be misread.
 *
 * A value is quoted when it:
 *   - is empty, or has leading/trailing whitespace;
 *   - starts with `- ` (would parse as a sequence entry);
 *   - contains a YAML indicator character, quote, newline, carriage return or tab;
 *   - would be resolved to a non-string (`true`, `no`, `null`, `~`, numbers);
 *   - contains a colon — except that http(s) URLs and date-prefixed values may
 *     keep colons that are not followed by whitespace or end of string.
 *
 * Inside quotes, backslashes, double quotes, `\n`, `\r` and `\t` are escaped.
 */
function yv(v: string): string {
  const isUrl = /^https?:\/\//.test(v)
  const isDate = /^\d{4}-\d{2}-\d{2}/.test(v)
  // ': ' (or a trailing ':') always starts a mapping, even in a URL- or date-prefixed value.
  const unsafeColon = v.includes(':') && (!(isUrl || isDate) || /:(?:\s|$)/.test(v))
  const needsQuotes =
    v === '' ||
    v !== v.trim() ||
    /^-(?:\s|$)/.test(v) ||
    /[#{}[\],&*?|>!%@`'"\n\r\t]/.test(v) ||
    /^(?:~|null|true|false|yes|no|on|off)$/i.test(v) ||
    /^[-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?$/i.test(v) ||
    unsafeColon
  if (!needsQuotes) return v
  const escaped = v
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r')
    .replace(/\t/g, '\\t')
  return `"${escaped}"`
}
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import type { ContentItem } from '../types.js'
|
|
2
|
+
import { type GenerateAeoMarkdownOptions, generateAeoMarkdown } from './aeo.js'
|
|
3
|
+
import { generateSummaryTwin } from './aeo-summary.js'
|
|
4
|
+
import { forMarkdownTwin } from './content-filter.js'
|
|
5
|
+
import { computeContentHash } from './staleness.js'
|
|
6
|
+
|
|
7
|
+
// Public API ─────────────────────────────────────────────────────────────
|
|
8
|
+
|
|
9
|
+
/** Resolver that returns an item's article body, synchronously or as a promise. */
export type RenderBody = (item: ContentItem) => Promise<string> | string
|
|
10
|
+
|
|
11
|
+
/** Options accepted by `emitAeoTwins`. */
export interface EmitAeoTwinsOptions {
  /** Candidate items; members items and those rejected by `include` are skipped. */
  items: ContentItem[]
  publisherName: string
  schemaType: string
  /**
   * Returns the article body in markdown for an item. Required — the emitter
   * cannot synthesize content; consumers typically produce it from their CMS.
   */
  renderBody: RenderBody
  /** Maps `item.url` to the primary twin URL. Default: append '.md'. */
  twinUrl?: (articleUrl: string) => string
  /** Predicate applied after the default members filter. Default: always true. */
  include?: (item: ContentItem) => boolean
  /** Also emit a summary twin next to the primary one. Default true. */
  summaryTwin?: boolean
  /** Wrap semantic sections in <!-- aeo:section --> markers. Default true. */
  ragChunkMarkers?: boolean
  /** Staleness metadata mode. Default 'content-hash', the only mode that computes a content hash. */
  stalenessCheck?: 'content-hash' | 'dateModified' | 'none'
  /** Called when the summary generator falls back to tier 4. Used for build telemetry. */
  onSummaryMinimalFallback?: (item: ContentItem) => void
}
|
|
31
|
+
|
|
32
|
+
/** One twin file computed by `emitAeoTwins` (either a primary or a summary twin). */
export interface EmittedTwin {
  /** URL path the file will be served at; the writer derives the filesystem path from it. */
  urlPath: string
  /** Full URL (for frontmatter). */
  url: string
  /** File body. */
  content: string
  /** Content-type for HTTP response headers when this file is served. */
  contentType: string
  /** Set on primary twins: the item the twin was generated from. */
  item?: ContentItem
  /** Set on summary twins: the URL of the corresponding primary twin. */
  primaryUrl?: string
  /** Stable content hash for staleness validation; carried by primary twins, absent on summaries. */
  contentHash?: string
}
|
|
48
|
+
|
|
49
|
+
/** Result of `emitAeoTwins`. */
export interface EmitAeoTwinsResult {
  /** Every computed twin, primary twins followed by their summary twin. */
  twins: EmittedTwin[]
  /** Map of `item.url` → content hash. Persist to disk for staleness validation. */
  contentHashes: Map<string, string>
  /** Number of items filtered out, by the access rule or the consumer predicate. */
  skipped: number
}
|
|
56
|
+
|
|
57
|
+
/**
 * Compute every twin file for a set of content items without touching the
 * filesystem. The caller (index.ts at astro:build:done time) writes the
 * returned twins to disk and records the content hashes for later staleness
 * validation.
 *
 * Filtering:
 *   - Members items are excluded unconditionally (via forMarkdownTwin).
 *   - The consumer-supplied `include` predicate runs after the access filter.
 *
 * For each surviving item, in order:
 *   - a primary twin at twinUrl(item.url);
 *   - a summary twin at <primary>.summary.md (when summaryTwin is true).
 *
 * Aliases (spec twinAliases) are NOT emitted as static files in v7 — they're
 * middleware-only redirects. Static-mode twins live at the primary URL only.
 *
 * Items are processed one at a time, awaiting `renderBody` for each.
 */
export async function emitAeoTwins(options: EmitAeoTwinsOptions): Promise<EmitAeoTwinsResult> {
  const {
    items,
    publisherName,
    schemaType,
    renderBody,
    twinUrl: toTwinUrl = defaultTwinUrl,
    include = () => true,
    summaryTwin = true,
    ragChunkMarkers = true,
    stalenessCheck = 'content-hash',
    onSummaryMinimalFallback,
  } = options

  const markdownContentType = 'text/markdown; charset=utf-8'
  const wantsHash = stalenessCheck === 'content-hash'

  const eligible = forMarkdownTwin(items).filter(include)
  const twins: EmittedTwin[] = []
  const contentHashes = new Map<string, string>()

  for (const item of eligible) {
    const primaryUrl = toTwinUrl(item.url)
    const primaryPath = urlPath(primaryUrl)
    const body = await renderBody(item)

    const contentHash = wantsHash ? await computeContentHash(item, body) : undefined
    if (contentHash) contentHashes.set(item.url, contentHash)

    const summaryUrl = summaryTwin ? `${primaryUrl}.summary.md` : undefined

    // Primary twin
    const primaryOptions: GenerateAeoMarkdownOptions = {
      publisherName,
      schemaType,
      content: body,
      ragChunkMarkers,
      canonical: item.url,
      twinUrl: primaryUrl,
      summaryUrl,
      contentHash,
    }
    twins.push({
      urlPath: primaryPath,
      url: primaryUrl,
      content: generateAeoMarkdown(item, primaryOptions),
      contentType: markdownContentType,
      item,
      contentHash,
    })

    // Summary twin (only when enabled)
    if (summaryUrl === undefined) continue
    const { markdown: summaryMarkdown } = generateSummaryTwin(item, {
      publisherName,
      schemaType,
      content: body,
      fullUrl: primaryUrl,
      onMinimalFallback: onSummaryMinimalFallback,
    })
    twins.push({
      urlPath: urlPath(summaryUrl),
      url: summaryUrl,
      content: summaryMarkdown,
      contentType: markdownContentType,
      primaryUrl,
    })
  }

  return { twins, contentHashes, skipped: items.length - eligible.length }
}
|
|
145
|
+
|
|
146
|
+
// ─── Defaults ───
|
|
147
|
+
|
|
148
|
+
/**
 * Default twinUrl: strip trailing slashes and append '.md'.
 *   `/article/midway/` -> `/article/midway.md`
 *   `/article/midway`  -> `/article/midway.md`
 *
 * Edge cases:
 *   - A site root (`/`, `https://example.com/`, `https://example.com`) maps to
 *     `<root>/index.md`, so the extension is never glued onto the hostname and
 *     the path is never just `.md`.
 *   - A `?query` or `#fragment` is kept after the extension:
 *     `/a?x=1` -> `/a.md?x=1`.
 */
function defaultTwinUrl(articleUrl: string): string {
  // Separate ?query/#fragment so '.md' lands on the path component.
  const suffixAt = articleUrl.search(/[?#]/)
  const base = suffixAt === -1 ? articleUrl : articleUrl.slice(0, suffixAt)
  const suffix = suffixAt === -1 ? '' : articleUrl.slice(suffixAt)

  const trimmed = base.replace(/\/+$/, '')
  // Nothing left, or only `scheme://host` left, means the URL pointed at the site root.
  const isRoot = trimmed === '' || /^(?:[a-z][a-z\d+.-]*:)?\/\/[^/]+$/i.test(trimmed)
  return `${trimmed}${isRoot ? '/index' : ''}.md${suffix}`
}
|
|
156
|
+
|
|
157
|
+
/**
 * Extract the URL-path portion of an absolute or relative URL.
 * Used by the caller to derive the filesystem write path under dist/client/.
 *
 * Absolute URLs yield `URL#pathname`. Anything the `URL` constructor rejects is
 * treated as a relative path: its `?query` / `#fragment` is dropped (matching
 * what `pathname` does for absolute URLs) and a leading `/` is added if missing.
 */
function urlPath(url: string): string {
  try {
    return new URL(url).pathname
  } catch {
    // Relative URL — assume it's already a path, minus any query or fragment.
    const path = url.split(/[?#]/, 1)[0] ?? url
    return path.startsWith('/') ? path : `/${path}`
  }
}
|
|
169
|
+
|
|
170
|
+
/** Exposes the module-private URL helpers (presumably for unit tests — confirm with callers). */
export const _internals = { defaultTwinUrl: defaultTwinUrl, urlPath: urlPath }
|