docusaurus-plugin-llms 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
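For orientation: the options this release adds (pathTransformation, includeOrder, includeUnmatchedLast, and customLLMFiles) are passed to the plugin through the site configuration. The following is a minimal sketch, not taken from the package docs: the option names mirror the destructured defaults visible in src/index.ts below, the pathTransformation fields mirror the parameter type in the new processing module, and customLLMFiles is left empty because its entry shape lives in src/types.ts, which is not part of this diff.

// docusaurus.config.ts: hypothetical configuration exercising the new options
export default {
  // ...rest of the site config...
  plugins: [
    [
      'docusaurus-plugin-llms',
      {
        docsDir: 'docs',
        includeBlog: false,
        // Strip the 'docs' segment from generated URLs and prepend 'reference'
        // (ignorePaths/addPaths come from the pathTransformation parameter below).
        pathTransformation: {
          ignorePaths: ['docs'],
          addPaths: ['reference'],
        },
        // Files matching these globs come first, in this order; everything else
        // is appended afterwards when includeUnmatchedLast is true.
        includeOrder: ['docs/getting-started/**', 'docs/api/**'],
        includeUnmatchedLast: true,
        customLLMFiles: [], // entry shape defined in src/types.ts (not shown in this diff)
      },
    ],
  ],
};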
package/src/index.ts CHANGED
@@ -8,221 +8,10 @@
   * The plugin runs during the Docusaurus build process and scans all Markdown files in the docs directory.
   */
 
- import * as fs from 'fs/promises';
  import * as path from 'path';
- import matter from 'gray-matter';
- import { minimatch } from 'minimatch';
  import type { LoadContext, Plugin } from '@docusaurus/types';
-
- /**
-  * Interface for processed document information
-  */
- interface DocInfo {
-   title: string;
-   path: string;
-   url: string;
-   content: string;
-   description: string;
- }
-
- /**
-  * Plugin options interface
-  */
- interface PluginOptions {
-   /** Whether to generate the llms.txt file (default: true) */
-   generateLLMsTxt?: boolean;
-
-   /** Whether to generate the llms-full.txt file (default: true) */
-   generateLLMsFullTxt?: boolean;
-
-   /** Base directory for documentation files (default: 'docs') */
-   docsDir?: string;
-
-   /** Array of glob patterns for files to ignore */
-   ignoreFiles?: string[];
-
-   /** Custom title to use in generated files (defaults to site title) */
-   title?: string;
-
-   /** Custom description to use in generated files (defaults to site tagline) */
-   description?: string;
-
-   /** Custom file name for the links file (default: 'llms.txt') */
-   llmsTxtFilename?: string;
-
-   /** Custom file name for the full content file (default: 'llms-full.txt') */
-   llmsFullTxtFilename?: string;
-
-   /** Whether to include blog content (default: false) */
-   includeBlog?: boolean;
- }
-
- /**
-  * Write content to a file
-  * @param filePath - Path to write the file to
-  * @param data - Content to write
-  */
- async function writeFile(filePath: string, data: string): Promise<void> {
-   return fs.writeFile(filePath, data, 'utf8');
- }
-
- /**
-  * Read content from a file
-  * @param filePath - Path of the file to read
-  * @returns Content of the file
-  */
- async function readFile(filePath: string): Promise<string> {
-   return fs.readFile(filePath, 'utf8');
- }
-
- /**
-  * Check if a file should be ignored based on glob patterns
-  * @param filePath - Path to the file
-  * @param baseDir - Base directory for relative paths
-  * @param ignorePatterns - Glob patterns for files to ignore
-  * @returns Whether the file should be ignored
-  */
- function shouldIgnoreFile(filePath: string, baseDir: string, ignorePatterns: string[]): boolean {
-   if (ignorePatterns.length === 0) {
-     return false;
-   }
-
-   const relativePath = path.relative(baseDir, filePath);
-
-   return ignorePatterns.some(pattern =>
-     minimatch(relativePath, pattern, { matchBase: true })
-   );
- }
-
- /**
-  * Recursively reads all Markdown files in a directory
-  * @param dir - Directory to scan
-  * @param baseDir - Base directory for relative paths
-  * @param ignorePatterns - Glob patterns for files to ignore
-  * @returns Array of file paths
-  */
- async function readMarkdownFiles(dir: string, baseDir: string, ignorePatterns: string[] = []): Promise<string[]> {
-   const files: string[] = [];
-   const entries = await fs.readdir(dir, { withFileTypes: true });
-
-   for (const entry of entries) {
-     const fullPath = path.join(dir, entry.name);
-
-     if (shouldIgnoreFile(fullPath, baseDir, ignorePatterns)) {
-       continue;
-     }
-
-     if (entry.isDirectory()) {
-       const subDirFiles = await readMarkdownFiles(fullPath, baseDir, ignorePatterns);
-       files.push(...subDirFiles);
-     } else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
-       files.push(fullPath);
-     }
-   }
-
-   return files;
- }
-
- /**
-  * Extract title from content or use the filename
-  * @param data - Frontmatter data
-  * @param content - Markdown content
-  * @param filePath - Path to the file
-  * @returns Extracted title
-  */
- function extractTitle(data: any, content: string, filePath: string): string {
-   // First try frontmatter
-   if (data.title) {
-     return data.title;
-   }
-
-   // Then try first heading
-   const headingMatch = content.match(/^#\s+(.*)/m);
-   if (headingMatch) {
-     return headingMatch[1].trim();
-   }
-
-   // Finally use filename
-   return path.basename(filePath, path.extname(filePath))
-     .replace(/-/g, ' ')
-     .replace(/\b\w/g, c => c.toUpperCase());
- }
-
- /**
-  * Clean markdown content for LLM consumption
-  * @param content - Raw markdown content
-  * @returns Cleaned content
-  */
- function cleanMarkdownContent(content: string): string {
-   // Remove HTML tags
-   let cleaned = content.replace(/<[^>]*>/g, '');
-
-   // Normalize whitespace
-   cleaned = cleaned.replace(/\r\n/g, '\n')
-     .replace(/\n{3,}/g, '\n\n')
-     .trim();
-
-   return cleaned;
- }
-
- /**
-  * Process a markdown file and extract its metadata and content
-  * @param filePath - Path to the markdown file
-  * @param baseDir - Base directory
-  * @param siteUrl - Base URL of the site
-  * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
-  * @returns Processed file data
-  */
- async function processMarkdownFile(
-   filePath: string,
-   baseDir: string,
-   siteUrl: string,
-   pathPrefix: string = 'docs'
- ): Promise<DocInfo> {
-   const content = await readFile(filePath);
-   const { data, content: markdownContent } = matter(content);
-
-   const relativePath = path.relative(baseDir, filePath);
-   // Convert to URL path format (replace backslashes with forward slashes on Windows)
-   const normalizedPath = relativePath.replace(/\\/g, '/');
-
-   // Convert .md extension to appropriate path
-   const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
-
-   // Handle index files specially
-   const linkPath = linkPathBase.endsWith('index')
-     ? linkPathBase.replace(/\/index$/, '')
-     : linkPathBase;
-
-   // Generate full URL
-   const fullUrl = new URL(`${pathPrefix}/${linkPath}`, siteUrl).toString();
-
-   // Extract title
-   const title = extractTitle(data, markdownContent, filePath);
-
-   // Get description from frontmatter or first paragraph
-   let description = data.description || '';
-   if (!description) {
-     const paragraphs = markdownContent.split('\n\n');
-     for (const para of paragraphs) {
-       if (para.trim() && !para.startsWith('#')) {
-         description = para.trim();
-         break;
-       }
-     }
-   }
-
-   // Clean and process content
-   const cleanedContent = cleanMarkdownContent(markdownContent);
-
-   return {
-     title,
-     path: normalizedPath,
-     url: fullUrl,
-     content: cleanedContent,
-     description: description || '',
-   };
- }
+ import { PluginOptions, PluginContext } from './types';
+ import { collectDocFiles, generateStandardLLMFiles, generateCustomLLMFiles } from './generator';
 
  /**
   * A Docusaurus plugin to generate LLM-friendly documentation following
@@ -247,6 +36,10 @@ export default function docusaurusPluginLLMs(
      llmsTxtFilename = 'llms.txt',
      llmsFullTxtFilename = 'llms-full.txt',
      includeBlog = false,
+     pathTransformation,
+     includeOrder = [],
+     includeUnmatchedLast = true,
+     customLLMFiles = [],
    } = options;
 
    const {
@@ -254,6 +47,38 @@
      siteConfig,
      outDir,
    } = context;
+
+   // Build the site URL with proper trailing slash
+   const siteUrl = siteConfig.url + (
+     siteConfig.baseUrl.endsWith('/')
+       ? siteConfig.baseUrl.slice(0, -1)
+       : siteConfig.baseUrl || ''
+   );
+
+   // Create a plugin context object with processed options
+   const pluginContext: PluginContext = {
+     siteDir,
+     outDir,
+     siteUrl,
+     docsDir,
+     docTitle: title || siteConfig.title,
+     docDescription: description || siteConfig.tagline || '',
+     options: {
+       generateLLMsTxt,
+       generateLLMsFullTxt,
+       docsDir,
+       ignoreFiles,
+       title,
+       description,
+       llmsTxtFilename,
+       llmsFullTxtFilename,
+       includeBlog,
+       pathTransformation,
+       includeOrder,
+       includeUnmatchedLast,
+       customLLMFiles,
+     }
+   };
 
    return {
      name: 'docusaurus-plugin-llms',
@@ -263,150 +88,25 @@
       */
      async postBuild(): Promise<void> {
        console.log('Generating LLM-friendly documentation...');
-
-       // Custom title and description or fallback to site values
-       const docTitle = title || siteConfig.title;
-       const docDescription = description || siteConfig.tagline || '';
-
-       // Build the site URL with proper trailing slash
-       const siteUrl = siteConfig.url + (
-         siteConfig.baseUrl.endsWith('/')
-           ? siteConfig.baseUrl.slice(0, -1)
-           : siteConfig.baseUrl || ''
-       );
-
-       // Initialize docs collection
-       const allDocs: DocInfo[] = [];
-
+
        try {
-         // Process docs directory
-         const fullDocsDir = path.join(siteDir, docsDir);
-
-         try {
-           await fs.access(fullDocsDir);
-
-           // Collect all markdown files from docs directory
-           const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
-
-           if (docFiles.length > 0) {
-             // Process each file
-             for (const filePath of docFiles) {
-               try {
-                 const docInfo = await processMarkdownFile(
-                   filePath,
-                   fullDocsDir,
-                   siteUrl,
-                   'docs'
-                 );
-                 allDocs.push(docInfo);
-               } catch (err: any) {
-                 console.warn(`Error processing ${filePath}: ${err.message}`);
-               }
-             }
-             console.log(`Processed ${docFiles.length} documentation files`);
-           } else {
-             console.warn('No markdown files found in docs directory.');
-           }
-         } catch (err) {
-           console.warn(`Docs directory not found: ${fullDocsDir}`);
-         }
-
-         // Process blog if enabled
-         if (includeBlog) {
-           const blogDir = path.join(siteDir, 'blog');
-
-           try {
-             await fs.access(blogDir);
-
-             // Collect all markdown files from blog directory
-             const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
-
-             if (blogFiles.length > 0) {
-               // Process each file
-               for (const filePath of blogFiles) {
-                 try {
-                   const docInfo = await processMarkdownFile(
-                     filePath,
-                     blogDir,
-                     siteUrl,
-                     'blog'
-                   );
-                   allDocs.push(docInfo);
-                 } catch (err: any) {
-                   console.warn(`Error processing ${filePath}: ${err.message}`);
-                 }
-               }
-               console.log(`Processed ${blogFiles.length} blog files`);
-             } else {
-               console.warn('No markdown files found in blog directory.');
-             }
-           } catch (err) {
-             console.warn(`Blog directory not found: ${blogDir}`);
-           }
-         }
+         // Collect all document files
+         const allDocFiles = await collectDocFiles(pluginContext);
 
          // Skip further processing if no documents were found
-         if (allDocs.length === 0) {
+         if (allDocFiles.length === 0) {
            console.warn('No documents found to process.');
            return;
          }
 
-         // Sort files to ensure consistent ordering
-         allDocs.sort((a, b) => a.title.localeCompare(b.title));
-
-         // Generate llms.txt
-         if (generateLLMsTxt) {
-           const llmsTxtPath = path.join(outDir, llmsTxtFilename);
-           const tocItems = allDocs.map(doc => {
-             return `- [${doc.title}](${doc.url})${doc.description ? `: ${doc.description.split('\n')[0]}` : ''}`;
-           });
-
-           const llmsTxtContent = `# ${docTitle}
-
- > ${docDescription}
-
- This file contains links to all documentation sections following the llmtxt.org standard.
-
- ## Table of Contents
-
- ${tocItems.join('\n')}
- `;
-
-           await writeFile(llmsTxtPath, llmsTxtContent);
-           console.log(`Generated ${llmsTxtFilename}: ${llmsTxtPath}`);
-         }
-
-         // Generate llms-full.txt with all content
-         if (generateLLMsFullTxt) {
-           const llmsFullTxtPath = path.join(outDir, llmsFullTxtFilename);
-
-           const fullContentSections = allDocs.map(doc => {
-             return `## ${doc.title}
-
- ${doc.content}`;
-           });
-
-           const llmsFullTxtContent = `# ${docTitle}
-
- > ${docDescription}
-
- This file contains all documentation content in a single document following the llmtxt.org standard.
-
- ${fullContentSections.join('\n\n---\n\n')}
- `;
-
-           await writeFile(llmsFullTxtPath, llmsFullTxtContent);
-           console.log(`Generated ${llmsFullTxtFilename}: ${llmsFullTxtPath}`);
-         }
+         // Process standard LLM files (llms.txt and llms-full.txt)
+         await generateStandardLLMFiles(pluginContext, allDocFiles);
 
-         // Output statistics
-         const stats = {
-           totalDocuments: allDocs.length,
-           totalBytes: allDocs.reduce((sum, doc) => sum + doc.content.length, 0),
-           approxTokens: Math.round(allDocs.reduce((sum, doc) => sum + doc.content.length, 0) / 4), // Rough token estimate
-         };
+         // Process custom LLM files
+         await generateCustomLLMFiles(pluginContext, allDocFiles);
 
-         console.log(`Stats: ${stats.totalDocuments} documents, ${Math.round(stats.totalBytes / 1024)}KB, ~${stats.approxTokens} tokens`);
+         // Output overall statistics
+         console.log(`Stats: ${allDocFiles.length} total available documents processed`);
        } catch (err: any) {
          console.error('Error generating LLM documentation:', err);
        }
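The siteUrl normalization hoisted out of postBuild above trims a trailing slash from baseUrl before relative paths are resolved against it. A standalone sketch of the same expression, with hypothetical siteConfig values:

// Hypothetical inputs; the real values come from siteConfig during the build.
const siteConfig = { url: 'https://example.com', baseUrl: '/my-site/' };

const siteUrl = siteConfig.url + (
  siteConfig.baseUrl.endsWith('/')
    ? siteConfig.baseUrl.slice(0, -1) // '/my-site/' -> '/my-site'
    : siteConfig.baseUrl || ''
);

console.log(siteUrl); // https://example.com/my-site
// With the Docusaurus default baseUrl of '/', the slice yields '' and
// siteUrl is just the bare origin, 'https://example.com'.

The remaining hunk adds a new module containing the document-processing functions that index.ts now delegates to.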
@@ -0,0 +1,236 @@
+ /**
+  * Document processing functions for the docusaurus-plugin-llms plugin
+  */
+
+ import * as path from 'path';
+ import matter from 'gray-matter';
+ import { minimatch } from 'minimatch';
+ import { DocInfo, PluginContext } from './types';
+ import {
+   readFile,
+   extractTitle,
+   cleanMarkdownContent,
+   applyPathTransformations
+ } from './utils';
+
+ /**
+  * Process a markdown file and extract its metadata and content
+  * @param filePath - Path to the markdown file
+  * @param baseDir - Base directory
+  * @param siteUrl - Base URL of the site
+  * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
+  * @param pathTransformation - Path transformation configuration
+  * @returns Processed file data
+  */
+ export async function processMarkdownFile(
+   filePath: string,
+   baseDir: string,
+   siteUrl: string,
+   pathPrefix: string = 'docs',
+   pathTransformation?: {
+     ignorePaths?: string[];
+     addPaths?: string[];
+   }
+ ): Promise<DocInfo> {
+   const content = await readFile(filePath);
+   const { data, content: markdownContent } = matter(content);
+
+   const relativePath = path.relative(baseDir, filePath);
+   // Convert to URL path format (replace backslashes with forward slashes on Windows)
+   const normalizedPath = relativePath.replace(/\\/g, '/');
+
+   // Convert .md extension to appropriate path
+   const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
+
+   // Handle index files specially
+   const linkPath = linkPathBase.endsWith('index')
+     ? linkPathBase.replace(/\/index$/, '')
+     : linkPathBase;
+
+   // Apply path transformations to the link path
+   const transformedLinkPath = applyPathTransformations(linkPath, pathTransformation);
+
+   // Also apply path transformations to the pathPrefix if it's not empty
+   // This allows removing 'docs' from the path when specified in ignorePaths
+   let transformedPathPrefix = pathPrefix;
+   if (pathPrefix && pathTransformation?.ignorePaths?.includes(pathPrefix)) {
+     transformedPathPrefix = '';
+   }
+
+   // Generate full URL with transformed path and path prefix
+   const fullUrl = new URL(
+     `${transformedPathPrefix ? `${transformedPathPrefix}/` : ''}${transformedLinkPath}`,
+     siteUrl
+   ).toString();
+
+   // Extract title
+   const title = extractTitle(data, markdownContent, filePath);
+
+   // Get description from frontmatter or first paragraph
+   let description = '';
+
+   // First priority: Use frontmatter description if available
+   if (data.description) {
+     description = data.description;
+   } else {
+     // Second priority: Find the first non-heading paragraph
+     const paragraphs = markdownContent.split('\n\n');
+     for (const para of paragraphs) {
+       const trimmedPara = para.trim();
+       // Skip empty paragraphs and headings
+       if (trimmedPara && !trimmedPara.startsWith('#')) {
+         description = trimmedPara;
+         break;
+       }
+     }
+
+     // Third priority: If still no description, use the first heading's content
+     if (!description) {
+       const firstHeadingMatch = markdownContent.match(/^#\s+(.*?)$/m);
+       if (firstHeadingMatch && firstHeadingMatch[1]) {
+         description = firstHeadingMatch[1].trim();
+       }
+     }
+   }
+
+   // Only remove heading markers at the beginning of descriptions or lines
+   // This preserves # characters that are part of the content
+   if (description) {
+     // Original approach had issues with hashtags inside content
+     // Fix: Only remove # symbols at the beginning of lines or description
+     // that are followed by a space (actual heading markers)
+     description = description.replace(/^(#+)\s+/gm, '');
+
+     // Special handling for description frontmatter with heading markers
+     if (data.description && data.description.startsWith('#')) {
+       // If the description in frontmatter starts with a heading marker,
+       // we should preserve it in the extracted description
+       description = description.replace(/^#+\s+/, '');
+     }
+
+     // Preserve inline hashtags (not heading markers)
+     // We don't want to treat hashtags in the middle of content as headings
+
+     // Validate that the description doesn't contain markdown headings
+     if (description.match(/^#+\s+/m)) {
+       console.warn(`Warning: Description for "${title}" may still contain heading markers`);
+     }
+
+     // Warn if the description contains HTML tags
+     if (/<[^>]+>/g.test(description)) {
+       console.warn(`Warning: Description for "${title}" contains HTML tags`);
+     }
+
+     // Warn if the description is very long
+     if (description.length > 500) {
+       console.warn(`Warning: Description for "${title}" is very long (${description.length} characters)`);
+     }
+   }
+
+   // Clean and process content
+   const cleanedContent = cleanMarkdownContent(markdownContent);
+
+   return {
+     title,
+     path: normalizedPath,
+     url: fullUrl,
+     content: cleanedContent,
+     description: description || '',
+   };
+ }
+
+ /**
+  * Process files based on include patterns, ignore patterns, and ordering
+  * @param context - Plugin context
+  * @param allFiles - All available files
+  * @param includePatterns - Patterns for files to include
+  * @param ignorePatterns - Patterns for files to ignore
+  * @param orderPatterns - Patterns for ordering files
+  * @param includeUnmatched - Whether to include unmatched files
+  * @returns Processed files
+  */
+ export async function processFilesWithPatterns(
+   context: PluginContext,
+   allFiles: string[],
+   includePatterns: string[] = [],
+   ignorePatterns: string[] = [],
+   orderPatterns: string[] = [],
+   includeUnmatched: boolean = false
+ ): Promise<DocInfo[]> {
+   const { siteDir, siteUrl, docsDir } = context;
+
+   // Filter files based on include patterns
+   let filteredFiles = allFiles;
+
+   if (includePatterns.length > 0) {
+     filteredFiles = allFiles.filter(file => {
+       const relativePath = path.relative(siteDir, file);
+       return includePatterns.some(pattern =>
+         minimatch(relativePath, pattern, { matchBase: true })
+       );
+     });
+   }
+
+   // Apply ignore patterns
+   if (ignorePatterns.length > 0) {
+     filteredFiles = filteredFiles.filter(file => {
+       const relativePath = path.relative(siteDir, file);
+       return !ignorePatterns.some(pattern =>
+         minimatch(relativePath, pattern, { matchBase: true })
+       );
+     });
+   }
+
+   // Order files according to orderPatterns
+   let filesToProcess: string[] = [];
+
+   if (orderPatterns.length > 0) {
+     const matchedFiles = new Set<string>();
+
+     // Process files according to orderPatterns
+     for (const pattern of orderPatterns) {
+       const matchingFiles = filteredFiles.filter(file => {
+         const relativePath = path.relative(siteDir, file);
+         return minimatch(relativePath, pattern, { matchBase: true }) && !matchedFiles.has(file);
+       });
+
+       for (const file of matchingFiles) {
+         filesToProcess.push(file);
+         matchedFiles.add(file);
+       }
+     }
+
+     // Add remaining files if includeUnmatched is true
+     if (includeUnmatched) {
+       const remainingFiles = filteredFiles.filter(file => !matchedFiles.has(file));
+       filesToProcess.push(...remainingFiles);
+     }
+   } else {
+     filesToProcess = filteredFiles;
+   }
+
+   // Process each file to generate DocInfo
+   const processedDocs: DocInfo[] = [];
+
+   for (const filePath of filesToProcess) {
+     try {
+       // Determine if this is a blog or docs file
+       const isBlogFile = filePath.includes(path.join(siteDir, 'blog'));
+       const baseDir = isBlogFile ? path.join(siteDir, 'blog') : path.join(siteDir, docsDir);
+       const pathPrefix = isBlogFile ? 'blog' : 'docs';
+
+       const docInfo = await processMarkdownFile(
+         filePath,
+         baseDir,
+         siteUrl,
+         pathPrefix,
+         context.options.pathTransformation
+       );
+       processedDocs.push(docInfo);
+     } catch (err: any) {
+       console.warn(`Error processing ${filePath}: ${err.message}`);
+     }
+   }
+
+   return processedDocs;
+ }
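Taken together, the pattern handling above works in three passes: includePatterns narrows the candidate list, ignorePatterns filters it, and orderPatterns fixes the output order, with files matching no order pattern appended only when includeUnmatched is true. A hedged usage sketch against the exported signature; the module path, the ctx value, and the file paths are all hypothetical, and the files would need to exist on disk since processMarkdownFile reads each one:

import { processFilesWithPatterns } from './processor'; // hypothetical path; the new file's name is not shown in this diff
import type { PluginContext } from './types';

async function demo(ctx: PluginContext): Promise<void> {
  // Hypothetical absolute paths; assumes ctx.siteDir === '/site'.
  const allFiles = [
    '/site/docs/api/overview.md',
    '/site/docs/getting-started/install.md',
    '/site/docs/changelog.md',
  ];

  const docs = await processFilesWithPatterns(
    ctx,
    allFiles,
    ['docs/**/*.md'],            // includePatterns: keep docs markdown only
    ['**/changelog.md'],         // ignorePatterns: drop the changelog
    ['docs/getting-started/**'], // orderPatterns: getting-started pages first
    true                         // includeUnmatched: append the rest afterwards
  );

  // Expected order: install.md (matched by the order pattern), then overview.md.
  console.log(docs.map(d => d.url));
}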