@redpanda-data/docs-extensions-and-macros 4.15.1 → 4.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ 'use strict'
2
+
3
+ /**
4
+ * Adds Git commit dates to pages as attributes.
5
+ *
6
+ * This extension:
7
+ * 1. Gets the first commit date (when file was created) -> page-git-created-date
8
+ * 2. Gets the last commit date (when file was modified) -> page-git-modified-date
9
+ * 3. Adds these to page.asciidoc.attributes with page- prefix for UI template access
10
+ *
11
+ * Supports both local repos (with worktree) and remote repos (bare clones with gitdir).
12
+ * Antora caches remote repos as bare Git repos in ~/.cache/antora/content/
13
+ *
14
+ * Performance optimization: Uses isomorphic-git to walk the entire git log ONCE per
15
+ * repository, building a filepath→dates map. This is O(commits) instead of O(files * commits).
16
+ * For a repo with 1000 files and 5000 commits, this reduces operations from 5M to 5K.
17
+ *
18
+ * Attribute naming: Uses page- prefix so attributes appear in page.attributes
19
+ * in Handlebars templates (Antora strips the prefix when exposing to UI model).
20
+ *
21
+ * Only runs on pages that have origin info (skips virtual/generated pages).
22
+ */
23
+
24
+ const path = require('path')
25
+ const fs = require('fs')
26
+
27
/**
 * Resolve isomorphic-git from Antora's dependencies.
 *
 * Resolution happens in two steps: first locate @antora/content-aggregator
 * relative to the calling module, then resolve isomorphic-git from inside
 * that package's directory so we reuse the exact copy Antora ships with.
 *
 * @param {Object} context - Extension context with module info
 * @returns {Object} isomorphic-git module
 */
function requireGit (context) {
  const aggregatorEntry = require.resolve('@antora/content-aggregator', { paths: context.module.paths })
  const gitEntry = require.resolve('isomorphic-git', { paths: [aggregatorEntry + '/..'] })
  return require(gitEntry)
}
39
+
40
/**
 * Format a Unix timestamp as an ISO date string (YYYY-MM-DD).
 *
 * toISOString() always emits UTC in "YYYY-MM-DDTHH:mm:ss.sssZ" form,
 * so keeping everything before the 'T' yields the calendar date.
 *
 * @param {number} timestamp - Unix timestamp in seconds
 * @returns {string} ISO date string
 */
function formatDate (timestamp) {
  const iso = new Date(timestamp * 1000).toISOString()
  return iso.slice(0, iso.indexOf('T'))
}
48
+
49
/**
 * Build a map of filepath -> {created, modified} dates from git log.
 * Walks the entire log once, tracking first and last commit for each MODIFIED file.
 *
 * This compares each commit's tree with its parent to find which files actually changed,
 * rather than just looking at all files in the tree (which would give incorrect dates).
 *
 * NOTE(review): only the FIRST parent is compared, so for merge commits any file
 * that arrived via the second parent looks "modified" at the merge — dates for
 * merged-in files reflect the merge commit, not the original author commit.
 * Confirm this is acceptable for the docs use case.
 *
 * Errors are deliberately best-effort: an unreadable commit is skipped (debug log),
 * and a failed `git.log` yields an empty map (warn log) — pages simply get no dates.
 *
 * @param {Object} git - isomorphic-git module
 * @param {string} gitdir - Path to .git directory
 * @param {string} ref - Git ref (branch/tag/commit)
 * @param {Object} logger - Logger instance
 * @returns {Promise<Map<string, {created: string, modified: string}>>}
 */
async function buildFileDateMap (git, gitdir, ref, logger) {
  const fileDates = new Map()
  // Shared isomorphic-git object cache: avoids re-reading the same pack objects
  // across log/readCommit/readTree calls within this walk.
  const cache = {}

  try {
    // Get all commits - walking from newest to oldest
    const commits = await git.log({
      fs,
      gitdir,
      ref,
      cache,
    })

    logger.info(`Walking ${commits.length} commits for ${path.basename(gitdir)} (ref: ${ref})`)

    // Build tree cache to avoid re-reading trees (tree OID -> filepath map);
    // consecutive commits share most subtrees, so this is the main speedup.
    const treeCache = new Map()

    // Process commits from newest to oldest
    // First occurrence = modified date, last occurrence = created date
    for (let i = 0; i < commits.length; i++) {
      const commit = commits[i]
      const timestamp = commit.commit.committer.timestamp
      const date = formatDate(timestamp)

      try {
        const currentTreeOid = commit.commit.tree
        const parentCommits = commit.commit.parent || []

        // Get files in current commit's tree
        const currentFiles = await getTreeFiles(git, gitdir, currentTreeOid, '', cache, treeCache)

        // Get files in parent commit's tree (if parent exists).
        // Root commits have no parent: every file counts as modified (= added).
        let parentFiles = new Map()
        if (parentCommits.length > 0) {
          const parentCommit = await git.readCommit({ fs, gitdir, oid: parentCommits[0], cache })
          const parentTreeOid = parentCommit.commit.tree
          parentFiles = await getTreeFiles(git, gitdir, parentTreeOid, '', cache, treeCache)
        }

        // Find files that were added or modified (different OID from parent)
        for (const [filepath, oid] of currentFiles) {
          const parentOid = parentFiles.get(filepath)
          const isModified = !parentOid || parentOid !== oid

          if (isModified) {
            if (!fileDates.has(filepath)) {
              // First time seeing this file modified (from newest commit)
              fileDates.set(filepath, { created: date, modified: date })
            } else {
              // Update created date (older commit where file was modified)
              const entry = fileDates.get(filepath)
              entry.created = date
            }
          }
        }
      } catch (err) {
        // Skip commits that can't be read (e.g. shallow-clone boundary)
        logger.debug(`Skipping commit ${commit.oid.substring(0, 7)}: ${err.message}`)
      }
    }
  } catch (err) {
    logger.warn(`Failed to read git log for ${gitdir}: ${err.message}`)
  }

  return fileDates
}
129
+
130
/**
 * Recursively walk a git tree to get all file paths with their OIDs.
 * Returns a Map of filepath → blob OID for comparison between commits.
 *
 * Fully-walked trees are memoized in treeCache; trees that fail to read are
 * swallowed (best-effort) and yield whatever was collected so far, uncached.
 *
 * @param {Object} git - isomorphic-git module
 * @param {string} gitdir - Path to .git directory
 * @param {string} oid - Tree object ID
 * @param {string} prefix - Current path prefix
 * @param {Object} cache - Git object cache
 * @param {Map} treeCache - Cache of tree OID → files map
 * @returns {Promise<Map<string, string>>} Map of filepath → blob OID
 */
async function getTreeFiles (git, gitdir, oid, prefix, cache, treeCache) {
  // Memoized result? A tree OID uniquely determines its contents.
  const memoized = treeCache.get(oid)
  if (memoized) return memoized

  const result = new Map()

  try {
    const { tree } = await git.readTree({
      fs,
      gitdir,
      oid,
      cache,
    })

    for (const node of tree) {
      const fullPath = prefix ? `${prefix}/${node.path}` : node.path

      if (node.type === 'blob') {
        result.set(fullPath, node.oid)
      } else if (node.type === 'tree') {
        // Recurse into subdirectory and merge its blobs into our map
        const nested = await getTreeFiles(git, gitdir, node.oid, fullPath, cache, treeCache)
        nested.forEach((blobOid, blobPath) => result.set(blobPath, blobOid))
      }
    }

    // Only cache fully-walked trees
    treeCache.set(oid, result)
  } catch (err) {
    // Best-effort: skip trees that can't be read
  }

  return result
}
180
+
181
/**
 * Antora extension entry point.
 *
 * Listens for `documentsConverted` and, for every page that originated from a
 * real git checkout, stamps `page-git-created-date` and
 * `page-git-modified-date` into page.asciidoc.attributes. Pages are grouped by
 * (gitdir, ref) so each repo/branch's log is walked exactly once.
 */
module.exports.register = function () {
  const logger = this.getLogger('add-git-dates-extension')
  // Capture `this` so requireGit can resolve modules relative to this extension
  const context = this

  // Run on documentsConverted after Antora builds page.asciidoc.attributes
  this.on('documentsConverted', async ({ contentCatalog }) => {
    const startTime = Date.now()
    let processedCount = 0
    let skippedCount = 0

    // Load isomorphic-git; bail out of the whole extension if unavailable
    let git
    try {
      git = requireGit(context)
    } catch (err) {
      logger.error(`Failed to load isomorphic-git: ${err.message}`)
      return
    }

    // Group pages by BOTH gitdir AND ref (since same repo can have multiple branches/versions)
    const pagesByRepoAndRef = new Map()
    const skipLoggedRepos = new Set()

    contentCatalog.getPages().forEach((page) => {
      const origin = page.src?.origin
      // No origin URL => virtual/generated page; nothing to date
      if (!origin?.url) {
        skippedCount++
        return
      }

      // Need gitdir for isomorphic-git (works for both local and bare repos)
      const gitdir = origin.gitdir || (origin.worktree ? path.join(origin.worktree, '.git') : null)
      if (!gitdir) {
        // Debug: Log which repos don't have gitdir (once per repo URL)
        if (!skipLoggedRepos.has(origin.url)) {
          logger.info(`⚠️ Skipping repo without gitdir: ${origin.url} (has gitdir: ${!!origin.gitdir}, has worktree: ${!!origin.worktree})`)
          skipLoggedRepos.add(origin.url)
        }
        skippedCount++
        return
      }

      // Ensure asciidoc.attributes exists before we write into it later
      if (!page.asciidoc) page.asciidoc = {}
      if (!page.asciidoc.attributes) page.asciidoc.attributes = {}

      // startPath accounts for docs living in a subdirectory of the repo
      const startPath = origin.startPath || ''
      const relativeFilePath = startPath ? path.join(startPath, page.src.path) : page.src.path
      // Prefer the exact commit hash; fall back to branch name, then HEAD
      const ref = origin.refhash || origin.refname || 'HEAD'

      // Create composite key: gitdir + ref to handle multiple branches per repo
      const repoRefKey = `${gitdir}::${ref}`

      // Group by repo AND ref
      if (!pagesByRepoAndRef.has(repoRefKey)) {
        pagesByRepoAndRef.set(repoRefKey, {
          gitdir,
          ref,
          pages: []
        })
      }
      pagesByRepoAndRef.get(repoRefKey).pages.push({ page, relativeFilePath })
    })

    const totalPages = Array.from(pagesByRepoAndRef.values()).reduce((sum, r) => sum + r.pages.length, 0)
    const repoCount = new Set(Array.from(pagesByRepoAndRef.values()).map(r => r.gitdir)).size
    logger.info(`Processing ${totalPages} pages across ${repoCount} repos (${pagesByRepoAndRef.size} branches) for git dates (skipped ${skippedCount} virtual/generated)`)

    // Log which repos are being processed (deduped across refs)
    const reposBeingProcessed = new Set()
    pagesByRepoAndRef.forEach(({ gitdir }) => {
      if (!reposBeingProcessed.has(gitdir)) {
        logger.info(`✓ Will process git dates for: ${gitdir}`)
        reposBeingProcessed.add(gitdir)
      }
    })

    // Process each repository + ref combination (sequentially, to bound memory)
    for (const [repoRefKey, { gitdir, ref, pages }] of pagesByRepoAndRef) {
      const repoStartTime = Date.now()

      try {
        // Build the filepath -> dates map for this repo + ref
        const fileDateMap = await buildFileDateMap(git, gitdir, ref, logger)

        // Apply dates to pages; pages whose path isn't in the map get no attributes
        for (const { page, relativeFilePath } of pages) {
          const dates = fileDateMap.get(relativeFilePath)
          if (dates) {
            page.asciidoc.attributes['page-git-created-date'] = dates.created
            page.asciidoc.attributes['page-git-modified-date'] = dates.modified
            processedCount++
          }
        }

        const repoTime = Date.now() - repoStartTime
        logger.debug(`Processed ${pages.length} pages from ${path.basename(gitdir)}@${ref.substring(0,8)} in ${repoTime}ms (map size: ${fileDateMap.size})`)
      } catch (err) {
        // One bad repo must not abort the others
        logger.warn(`Failed to process repo ${gitdir}@${ref}: ${err.message}`)
      }
    }

    const duration = Date.now() - startTime
    const perPage = totalPages > 0 ? (duration / totalPages).toFixed(1) : 0
    logger.info(`Git dates added: processed=${processedCount}, skipped=${skippedCount}, duration=${duration}ms (${perPage}ms/page)`)
  })
}
@@ -1,7 +1,9 @@
1
1
  'use strict';
2
2
 
3
+ const { toMarkdownUrl } = require('../extension-utils/url-utils');
4
+
3
5
  /**
4
- * Extracts markdown from llms.adoc page and generates llms.txt and llms-full.txt.
6
+ * Extracts markdown from llms.adoc page and generates AI-friendly documentation exports.
5
7
  *
6
8
  * This extension:
7
9
  * 1. Adds site-url attribute to home component:
@@ -11,7 +13,8 @@
11
13
  * 3. Gets the markdown content from page.markdownContents (set by convert-to-markdown extension)
12
14
  * 4. Unpublishes the HTML page
13
15
  * 5. Places llms.txt (markdown) at site root
14
- * 6. Generates llms-full.txt with markdown from latest versions
16
+ * 6. Generates llms-full.txt with markdown from latest versions of all components
17
+ * 7. Generates component-specific full.txt files (e.g., redpanda-full.txt, cloud-full.txt)
15
18
  *
16
19
  * Must run after convert-to-markdown extension to access page.markdownContents.
17
20
  */
@@ -73,13 +76,16 @@ module.exports.register = function () {
73
76
  content = content.replace(/^<!--[\s\S]*?-->\s*/gm, '').trim();
74
77
  logger.debug(`Stripped HTML comments, now ${content.length} bytes`);
75
78
 
76
- // Fix URLs: convert em dashes back to double hyphens
79
+ // Fix URLs: convert em dashes back to double hyphens and remove invisible characters
77
80
  // The markdown converter applies smart typography that turns -- into — (em dash)
81
+ // and inserts zero-width spaces (U+200B) and other invisible Unicode characters
78
82
  // This breaks URLs like deploy-preview-159--redpanda-documentation.netlify.app
79
- content = content.replace(/\(https?:\/\/[^)]*—[^)]*\)/g, (match) => {
80
- return match.replace(/—/g, '--');
83
+ content = content.replace(/\(https?:\/\/[^)]*[—\u200B-\u200D\uFEFF][^)]*\)/g, (match) => {
84
+ return match
85
+ .replace(/—/g, '--')
86
+ .replace(/[\u200B-\u200D\uFEFF]/g, '');
81
87
  });
82
- logger.debug('Fixed em dashes in URLs');
88
+ logger.debug('Fixed em dashes and invisible characters in URLs');
83
89
 
84
90
  // Unpublish the HTML page FIRST (following unpublish-pages pattern)
85
91
  if (llmsPage.out) {
@@ -143,6 +149,15 @@ module.exports.register = function () {
143
149
  }
144
150
  });
145
151
  fullContent += `\n`;
152
+ fullContent += `### AI-Friendly Documentation Formats\n\n`;
153
+ fullContent += `We provide multiple formats optimized for AI consumption:\n\n`;
154
+ fullContent += `- **${siteUrl}/llms.txt**: Curated overview following the llms.txt standard - start here for a quick introduction\n`;
155
+ fullContent += `- **${siteUrl}/llms-full.txt**: Complete documentation export (this file) - comprehensive reference with all pages\n`;
156
+ fullContent += `- **Component-specific exports**: Focused documentation for individual products:\n`;
157
+ components.forEach(component => {
158
+ fullContent += ` - \`${siteUrl}/${component.name}-full.txt\`: ${component.title}\n`;
159
+ });
160
+ fullContent += `- **Individual markdown pages**: Each HTML page has a corresponding .md file (e.g., \`/docs/page.html\` → \`/docs/page.md\`)\n\n`;
146
161
  fullContent += `### Accessing Versioned Content\n\n`;
147
162
  fullContent += `For components with versioned documentation (like Redpanda Self-Managed), older versions can be accessed by replacing the version segment in the URL:\n`;
148
163
  fullContent += `- Latest: \`${siteUrl}/current/page-path\`\n`;
@@ -158,7 +173,8 @@ module.exports.register = function () {
158
173
  });
159
174
 
160
175
  pages.forEach((page, index) => {
161
- const pageUrl = page.pub?.url ? `${siteUrl}${page.pub.url}` : 'unknown';
176
+ const mdUrl = page.pub?.url ? toMarkdownUrl(page.pub.url) : '';
177
+ const pageUrl = mdUrl ? `${siteUrl}${mdUrl}` : 'unknown';
162
178
  const pageTitle = page.asciidoc?.doctitle || page.src?.stem || 'Untitled';
163
179
 
164
180
  fullContent += `# Page ${index + 1}: ${pageTitle}\n\n`;
@@ -175,16 +191,334 @@ module.exports.register = function () {
175
191
  });
176
192
  logger.info(`Generated llms-full.txt with ${pages.length} pages`);
177
193
 
194
+ // Generate component-specific full.txt files
195
+ logger.info('Generating component-specific full.txt files...');
196
+ const componentGroups = new Map();
197
+
198
+ // Group pages by component
199
+ pages.forEach(page => {
200
+ const componentName = page.src.component;
201
+ if (!componentGroups.has(componentName)) {
202
+ componentGroups.set(componentName, []);
203
+ }
204
+ componentGroups.get(componentName).push(page);
205
+ });
206
+
207
+ // Generate a full.txt file for each component
208
+ componentGroups.forEach((componentPages, componentName) => {
209
+ const component = components.find(c => c.name === componentName);
210
+ if (!component) return;
211
+
212
+ const latest = component.latest || component.versions[0];
213
+ if (!latest) return;
214
+
215
+ // Sort pages by URL for consistent ordering
216
+ componentPages.sort((a, b) => {
217
+ const urlA = a.pub?.url || '';
218
+ const urlB = b.pub?.url || '';
219
+ return urlA.localeCompare(urlB);
220
+ });
221
+
222
+ let componentContent = `# ${component.title} - Full Markdown Export\n\n`;
223
+ componentContent += `> This file contains all ${component.title} documentation pages in markdown format for AI agent consumption.\n`;
224
+ componentContent += `> Generated from ${componentPages.length} pages on ${new Date().toISOString()}\n`;
225
+ componentContent += `> Component: ${component.name} | Version: ${latest.version}\n`;
226
+ componentContent += `> Site: ${siteUrl}\n\n`;
227
+ componentContent += `## About This Export\n\n`;
228
+ componentContent += `This export includes the **latest version** (${latest.version}) of the ${component.title} documentation.\n\n`;
229
+ componentContent += `### AI-Friendly Documentation Formats\n\n`;
230
+ componentContent += `We provide multiple formats optimized for AI consumption:\n\n`;
231
+ componentContent += `- **${siteUrl}/llms.txt**: Curated overview of all Redpanda documentation\n`;
232
+ componentContent += `- **${siteUrl}/llms-full.txt**: Complete documentation export with all components\n`;
233
+ componentContent += `- **${siteUrl}/${componentName}-full.txt**: This file - ${component.title} documentation only\n`;
234
+ componentContent += `- **Individual markdown pages**: Each HTML page has a corresponding .md file\n\n`;
235
+
236
+ if (component.versions.length > 1) {
237
+ componentContent += `### Accessing Older Versions\n\n`;
238
+ componentContent += `This component has versioned documentation. Older versions can be accessed by replacing the version segment in the URL:\n`;
239
+ componentContent += `- Latest: \`${siteUrl}/current/page-path\`\n`;
240
+ componentContent += `- Specific version: \`${siteUrl}/24.3/page-path\`, \`${siteUrl}/25.1/page-path\`, etc.\n\n`;
241
+ }
242
+
243
+ componentContent += `---\n\n`;
244
+
245
+ // Add all pages
246
+ componentPages.forEach((page, index) => {
247
+ const mdUrl = page.pub?.url ? toMarkdownUrl(page.pub.url) : '';
248
+ const pageUrl = mdUrl ? `${siteUrl}${mdUrl}` : 'unknown';
249
+ const pageTitle = page.asciidoc?.doctitle || page.src?.stem || 'Untitled';
250
+
251
+ componentContent += `# Page ${index + 1}: ${pageTitle}\n\n`;
252
+ componentContent += `**URL**: ${pageUrl}\n\n`;
253
+ componentContent += `---\n\n`;
254
+ componentContent += page.markdownContents.toString('utf8');
255
+ componentContent += `\n\n---\n\n`;
256
+ });
257
+
258
+ // Add component-specific full.txt file to site root
259
+ siteCatalog.addFile({
260
+ contents: Buffer.from(componentContent, 'utf8'),
261
+ out: { path: `${componentName}-full.txt` },
262
+ });
263
+ logger.info(`Generated ${componentName}-full.txt with ${componentPages.length} pages`);
264
+ });
265
+
178
266
  // Add llms.txt to site root (using content extracted earlier)
179
267
  if (llmsPage && llmsPage.llmsTxtContent) {
180
268
  logger.info('Adding llms.txt to site root');
269
+
181
270
  siteCatalog.addFile({
182
271
  contents: Buffer.from(llmsPage.llmsTxtContent, 'utf8'),
183
272
  out: { path: 'llms.txt' },
184
273
  });
185
274
  logger.info('Successfully added llms.txt');
275
+
276
+ // Add llms.txt to sitemap with git dates
277
+ try {
278
+ // Build a map of filename -> most recent git modified date
279
+ const gitDates = new Map();
280
+
281
+ // llms.txt uses the llms page's git modified date
282
+ if (llmsPage.asciidoc?.attributes?.['page-git-modified-date']) {
283
+ gitDates.set('llms.txt', llmsPage.asciidoc.attributes['page-git-modified-date']);
284
+ }
285
+
286
+ // llms-full.txt uses the most recent modified date from all pages
287
+ if (pages.length > 0) {
288
+ const mostRecent = getMostRecentGitDate(pages);
289
+ if (mostRecent) {
290
+ gitDates.set('llms-full.txt', mostRecent);
291
+ }
292
+ }
293
+
294
+ // Component-specific files use most recent from that component
295
+ componentGroups.forEach((componentPages, componentName) => {
296
+ const mostRecent = getMostRecentGitDate(componentPages);
297
+ if (mostRecent) {
298
+ gitDates.set(`${componentName}-full.txt`, mostRecent);
299
+ }
300
+ });
301
+
302
+ addToSitemap(contentCatalog, siteCatalog, siteUrl, gitDates, logger);
303
+ } catch (err) {
304
+ logger.warn(`Failed to add llms.txt to sitemap: ${err.message}`);
305
+ }
186
306
  } else {
187
307
  logger.warn('llms.txt not generated - page not found or no content extracted');
188
308
  }
189
309
  });
190
310
  };
311
+
312
/**
 * Get the most recent git modified date from a collection of pages.
 *
 * Reads the `page-git-modified-date` attribute (set by the add-git-dates
 * extension) from each page and returns the latest one.
 *
 * @param {Object[]} pages - Pages, each optionally carrying asciidoc.attributes
 * @returns {string|null} ISO 8601 timestamp of the most recent valid date,
 *   or null when no page has a parseable git date
 */
function getMostRecentGitDate(pages) {
  let mostRecent = null;

  for (const page of pages) {
    const gitDate = page.asciidoc?.attributes?.['page-git-modified-date'];
    if (!gitDate) continue;

    const date = new Date(gitDate);
    // Skip malformed values: an Invalid Date compares as NaN, so it would
    // become mostRecent and make toISOString() throw a RangeError below.
    if (Number.isNaN(date.getTime())) continue;

    if (!mostRecent || date > mostRecent) {
      mostRecent = date;
    }
  }

  return mostRecent ? mostRecent.toISOString() : null;
}
330
+
331
/**
 * Add llms.txt and all -full.txt files to sitemap by creating a separate sitemap-llms.xml
 * and adding it to the main sitemap index.
 *
 * @param {Object} contentCatalog - Antora content catalog (source pages)
 * @param {Object} siteCatalog - Antora site catalog (output files)
 * @param {string} siteUrl - Base site URL
 * @param {Map<string, string>} gitDates - Map of filename -> ISO date string from git history
 * @param {Object} logger - Logger instance
 */
function addToSitemap(contentCatalog, siteCatalog, siteUrl, gitDates, logger) {
  const now = new Date().toISOString();

  // Find all llms .txt files in the site catalog
  const llmsFiles = siteCatalog.getFiles()
    .filter(file => {
      const filename = file.out.path;
      return filename === 'llms.txt' ||
        filename === 'llms-full.txt' ||
        filename.endsWith('-full.txt');
    })
    .map(file => file.out.path)
    .sort(); // Sort for consistent ordering

  logger.info(`Found ${llmsFiles.length} llms files to add to sitemap: ${llmsFiles.join(', ')}`);

  // Create sitemap-llms.xml with all llms files.
  // FIX: the <loc> value must interpolate the filename (was a broken placeholder).
  const urlEntries = llmsFiles.map(filename => {
    // Use git date if available, otherwise fall back to build time
    const lastmod = gitDates.get(filename) || now;
    return `  <url>
    <loc>${siteUrl}/${filename}</loc>
    <lastmod>${lastmod}</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>`;
  }).join('\n');

  const llmsSitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
${urlEntries}
</urlset>`;

  siteCatalog.addFile({
    contents: Buffer.from(llmsSitemapXml, 'utf8'),
    out: { path: 'sitemap-llms.xml' },
  });
  logger.info(`Created sitemap-llms.xml with ${llmsFiles.length} entries`);

  // Find and update the main sitemap index
  const sitemapIndex = siteCatalog.getFiles().find(file =>
    file.out.path === 'sitemap.xml'
  );

  if (!sitemapIndex) {
    logger.warn('Main sitemap.xml not found, cannot add llms sitemap to index');
    return;
  }

  // Parse and update the sitemap index
  let sitemapIndexXml = sitemapIndex.contents.toString('utf8');

  // Add lastmod to all existing component sitemaps for consistency
  sitemapIndexXml = addLastmodToComponentSitemaps(contentCatalog, siteCatalog, sitemapIndexXml, siteUrl, logger);

  // Check if sitemap-llms.xml is already in the index
  if (sitemapIndexXml.includes('sitemap-llms.xml')) {
    logger.debug('sitemap-llms.xml already in sitemap index');
    // Update the index with modified component sitemaps even if llms sitemap exists
    sitemapIndex.contents = Buffer.from(sitemapIndexXml, 'utf8');
    return;
  }

  // Find the most recent date from all llms files for the sitemap-llms.xml lastmod
  let sitemapLastmod = now;
  if (gitDates.size > 0) {
    const dates = Array.from(gitDates.values()).map(d => new Date(d));
    const mostRecent = new Date(Math.max(...dates));
    sitemapLastmod = mostRecent.toISOString();
  }

  // Add sitemap-llms.xml entry before the closing </sitemapindex> tag
  const llmsSitemapEntry = `  <sitemap>
    <loc>${siteUrl}/sitemap-llms.xml</loc>
    <lastmod>${sitemapLastmod}</lastmod>
  </sitemap>
</sitemapindex>`;

  sitemapIndexXml = sitemapIndexXml.replace('</sitemapindex>', llmsSitemapEntry);

  // Update the sitemap index in the catalog
  sitemapIndex.contents = Buffer.from(sitemapIndexXml, 'utf8');
  logger.info('Added sitemap-llms.xml to main sitemap index');
}
425
+
426
/**
 * Add lastmod to all component sitemaps in the sitemap index for consistency.
 * Also updates the component sitemaps themselves to use git dates instead of build time.
 *
 * @param {Object} contentCatalog - Antora content catalog (source pages with git dates)
 * @param {Object} siteCatalog - Antora site catalog (output files including sitemaps)
 * @param {string} sitemapIndexXml - The sitemap index XML content
 * @param {string} siteUrl - Base site URL for matching URLs
 * @param {Object} logger - Logger instance
 * @returns {string} The updated sitemap index XML
 */
function addLastmodToComponentSitemaps(contentCatalog, siteCatalog, sitemapIndexXml, siteUrl, logger) {
  // Build a map of URL -> git date from all pages
  const urlToGitDate = new Map();
  const allPages = contentCatalog.getPages();

  allPages.forEach(page => {
    if (page.pub?.url && page.asciidoc?.attributes?.['page-git-modified-date']) {
      const gitDate = page.asciidoc.attributes['page-git-modified-date'];
      const url = `${siteUrl}${page.pub.url}`;
      urlToGitDate.set(url, gitDate);
    }
  });

  logger.info(`Built URL -> git date map with ${urlToGitDate.size} entries`);
  if (urlToGitDate.size > 0 && urlToGitDate.size < 20) {
    urlToGitDate.forEach((date, url) => {
      logger.debug(`  ${url} -> ${date}`);
    });
  }

  // Find all component sitemap XML files (the llms sitemap is handled separately)
  const componentSitemaps = siteCatalog.getFiles()
    .filter(file => {
      const path = file.out.path;
      return path.startsWith('sitemap-') &&
        path.endsWith('.xml') &&
        path !== 'sitemap-llms.xml';
    });

  logger.debug(`Found ${componentSitemaps.length} component sitemaps to update with git dates`);

  // For each component sitemap, update URLs with git dates and find the most recent
  componentSitemaps.forEach(sitemapFile => {
    const filename = sitemapFile.out.path;
    let xml = sitemapFile.contents.toString('utf8');
    const dates = [];
    let updatedCount = 0;

    // Update each URL in the sitemap with its git date if available
    xml = xml.replace(
      /<url>\s*<loc>([^<]+)<\/loc>\s*<lastmod>([^<]+)<\/lastmod>\s*<\/url>/g,
      (match, url, oldDate) => {
        const gitDate = urlToGitDate.get(url);
        if (gitDate) {
          updatedCount++;
          dates.push(new Date(gitDate));
          return `<url>\n<loc>${url}</loc>\n<lastmod>${gitDate}</lastmod>\n</url>`;
        }
        // Keep original date. FIX: new Date() never throws, so the old
        // try/catch was dead code and Invalid Dates slipped into `dates`,
        // poisoning Math.max below; validate explicitly instead.
        const parsed = new Date(oldDate);
        if (!Number.isNaN(parsed.getTime())) {
          dates.push(parsed);
        }
        return match;
      }
    );

    // Update the sitemap file in the catalog
    // FIX: interpolate the filename in log messages (were broken placeholders)
    sitemapFile.contents = Buffer.from(xml, 'utf8');
    logger.info(`Updated sitemap ${filename}: ${updatedCount} URLs with git dates, ${dates.length} total dates`);

    if (dates.length === 0) {
      logger.debug(`No dates found in ${filename}`);
      return;
    }

    // Find the most recent date
    const mostRecent = new Date(Math.max(...dates));
    const lastmod = mostRecent.toISOString();

    // Update the sitemap index entry to include/update lastmod
    // Match various patterns since Antora might have different formatting
    const locPattern = new RegExp(
      `(<loc>[^<]*/${filename.replace(/\./g, '\\.')}</loc>)(?:\\s*<lastmod>[^<]*</lastmod>)?\\s*</sitemap>`,
      'g'
    );

    sitemapIndexXml = sitemapIndexXml.replace(
      locPattern,
      `$1\n    <lastmod>${lastmod}</lastmod>\n  </sitemap>`
    );

    logger.debug(`Updated lastmod to ${lastmod} for ${filename} in index`);
  });

  return sitemapIndexXml;
}