docusaurus-plugin-llms 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -8,293 +8,10 @@
8
8
  * The plugin runs during the Docusaurus build process and scans all Markdown files in the docs directory.
9
9
  */
10
10
 
11
- import * as fs from 'fs/promises';
12
11
  import * as path from 'path';
13
- import matter from 'gray-matter';
14
- import { minimatch } from 'minimatch';
15
12
  import type { LoadContext, Plugin } from '@docusaurus/types';
16
-
17
- /**
18
- * Interface for processed document information
19
- */
20
- interface DocInfo {
21
- title: string;
22
- path: string;
23
- url: string;
24
- content: string;
25
- description: string;
26
- }
27
-
28
- /**
29
- * Plugin options interface
30
- */
31
- interface PluginOptions {
32
- /** Whether to generate the llms.txt file (default: true) */
33
- generateLLMsTxt?: boolean;
34
-
35
- /** Whether to generate the llms-full.txt file (default: true) */
36
- generateLLMsFullTxt?: boolean;
37
-
38
- /** Base directory for documentation files (default: 'docs') */
39
- docsDir?: string;
40
-
41
- /** Array of glob patterns for files to ignore */
42
- ignoreFiles?: string[];
43
-
44
- /** Custom title to use in generated files (defaults to site title) */
45
- title?: string;
46
-
47
- /** Custom description to use in generated files (defaults to site tagline) */
48
- description?: string;
49
-
50
- /** Custom file name for the links file (default: 'llms.txt') */
51
- llmsTxtFilename?: string;
52
-
53
- /** Custom file name for the full content file (default: 'llms-full.txt') */
54
- llmsFullTxtFilename?: string;
55
-
56
- /** Whether to include blog content (default: false) */
57
- includeBlog?: boolean;
58
-
59
- /** Path transformation options for URL construction */
60
- pathTransformation?: {
61
- /** Path segments to ignore when constructing URLs (will be removed if found) */
62
- ignorePaths?: string[];
63
- /** Path segments to add when constructing URLs (will be prepended if not already present) */
64
- addPaths?: string[];
65
- };
66
- }
67
-
68
- /**
69
- * Write content to a file
70
- * @param filePath - Path to write the file to
71
- * @param data - Content to write
72
- */
73
- async function writeFile(filePath: string, data: string): Promise<void> {
74
- return fs.writeFile(filePath, data, 'utf8');
75
- }
76
-
77
- /**
78
- * Read content from a file
79
- * @param filePath - Path of the file to read
80
- * @returns Content of the file
81
- */
82
- async function readFile(filePath: string): Promise<string> {
83
- return fs.readFile(filePath, 'utf8');
84
- }
85
-
86
- /**
87
- * Check if a file should be ignored based on glob patterns
88
- * @param filePath - Path to the file
89
- * @param baseDir - Base directory for relative paths
90
- * @param ignorePatterns - Glob patterns for files to ignore
91
- * @returns Whether the file should be ignored
92
- */
93
- function shouldIgnoreFile(filePath: string, baseDir: string, ignorePatterns: string[]): boolean {
94
- if (ignorePatterns.length === 0) {
95
- return false;
96
- }
97
-
98
- const relativePath = path.relative(baseDir, filePath);
99
-
100
- return ignorePatterns.some(pattern =>
101
- minimatch(relativePath, pattern, { matchBase: true })
102
- );
103
- }
104
-
105
- /**
106
- * Recursively reads all Markdown files in a directory
107
- * @param dir - Directory to scan
108
- * @param baseDir - Base directory for relative paths
109
- * @param ignorePatterns - Glob patterns for files to ignore
110
- * @returns Array of file paths
111
- */
112
- async function readMarkdownFiles(dir: string, baseDir: string, ignorePatterns: string[] = []): Promise<string[]> {
113
- const files: string[] = [];
114
- const entries = await fs.readdir(dir, { withFileTypes: true });
115
-
116
- for (const entry of entries) {
117
- const fullPath = path.join(dir, entry.name);
118
-
119
- if (shouldIgnoreFile(fullPath, baseDir, ignorePatterns)) {
120
- continue;
121
- }
122
-
123
- if (entry.isDirectory()) {
124
- const subDirFiles = await readMarkdownFiles(fullPath, baseDir, ignorePatterns);
125
- files.push(...subDirFiles);
126
- } else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
127
- files.push(fullPath);
128
- }
129
- }
130
-
131
- return files;
132
- }
133
-
134
- /**
135
- * Extract title from content or use the filename
136
- * @param data - Frontmatter data
137
- * @param content - Markdown content
138
- * @param filePath - Path to the file
139
- * @returns Extracted title
140
- */
141
- function extractTitle(data: any, content: string, filePath: string): string {
142
- // First try frontmatter
143
- if (data.title) {
144
- return data.title;
145
- }
146
-
147
- // Then try first heading
148
- const headingMatch = content.match(/^#\s+(.*)/m);
149
- if (headingMatch) {
150
- return headingMatch[1].trim();
151
- }
152
-
153
- // Finally use filename
154
- return path.basename(filePath, path.extname(filePath))
155
- .replace(/-/g, ' ')
156
- .replace(/\b\w/g, c => c.toUpperCase());
157
- }
158
-
159
- /**
160
- * Clean markdown content for LLM consumption
161
- * @param content - Raw markdown content
162
- * @returns Cleaned content
163
- */
164
- function cleanMarkdownContent(content: string): string {
165
- // Remove HTML tags
166
- let cleaned = content.replace(/<[^>]*>/g, '');
167
-
168
- // Normalize whitespace
169
- cleaned = cleaned.replace(/\r\n/g, '\n')
170
- .replace(/\n{3,}/g, '\n\n')
171
- .trim();
172
-
173
- return cleaned;
174
- }
175
-
176
- /**
177
- * Apply path transformations according to configuration
178
- * @param urlPath - Original URL path
179
- * @param pathTransformation - Path transformation configuration
180
- * @returns Transformed URL path
181
- */
182
- function applyPathTransformations(
183
- urlPath: string,
184
- pathTransformation?: PluginOptions['pathTransformation']
185
- ): string {
186
- if (!pathTransformation) {
187
- return urlPath;
188
- }
189
-
190
- let transformedPath = urlPath;
191
-
192
- // Remove ignored path segments
193
- if (pathTransformation.ignorePaths?.length) {
194
- for (const ignorePath of pathTransformation.ignorePaths) {
195
- // Create a regex that matches the ignore path at the beginning, middle, or end of the path
196
- // Anchoring on a slash or the start/end of the string ensures we match complete path segments
197
- const ignoreRegex = new RegExp(`(^|/)(${ignorePath})(/|$)`, 'g');
198
- transformedPath = transformedPath.replace(ignoreRegex, '$1$3');
199
- }
200
-
201
- // Clean up any double slashes that might have been created
202
- transformedPath = transformedPath.replace(/\/+/g, '/');
203
-
204
- // Remove leading slash if present
205
- transformedPath = transformedPath.replace(/^\//, '');
206
- }
207
-
208
- // Add path segments if they're not already present
209
- if (pathTransformation.addPaths?.length) {
210
- // Process in reverse order to maintain the specified order in the final path
211
- // This is because each path is prepended to the front
212
- const pathsToAdd = [...pathTransformation.addPaths].reverse();
213
-
214
- for (const addPath of pathsToAdd) {
215
- // Only add if not already present at the beginning
216
- if (!transformedPath.startsWith(addPath + '/') && transformedPath !== addPath) {
217
- transformedPath = `${addPath}/${transformedPath}`;
218
- }
219
- }
220
- }
221
-
222
- return transformedPath;
223
- }
224
-
225
- /**
226
- * Process a markdown file and extract its metadata and content
227
- * @param filePath - Path to the markdown file
228
- * @param baseDir - Base directory
229
- * @param siteUrl - Base URL of the site
230
- * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
231
- * @param pathTransformation - Path transformation configuration
232
- * @returns Processed file data
233
- */
234
- async function processMarkdownFile(
235
- filePath: string,
236
- baseDir: string,
237
- siteUrl: string,
238
- pathPrefix: string = 'docs',
239
- pathTransformation?: PluginOptions['pathTransformation']
240
- ): Promise<DocInfo> {
241
- const content = await readFile(filePath);
242
- const { data, content: markdownContent } = matter(content);
243
-
244
- const relativePath = path.relative(baseDir, filePath);
245
- // Convert to URL path format (replace backslashes with forward slashes on Windows)
246
- const normalizedPath = relativePath.replace(/\\/g, '/');
247
-
248
- // Convert .md extension to appropriate path
249
- const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
250
-
251
- // Handle index files specially
252
- const linkPath = linkPathBase.endsWith('index')
253
- ? linkPathBase.replace(/\/index$/, '')
254
- : linkPathBase;
255
-
256
- // Apply path transformations to the link path
257
- const transformedLinkPath = applyPathTransformations(linkPath, pathTransformation);
258
-
259
- // Also apply path transformations to the pathPrefix if it's not empty
260
- // This allows removing 'docs' from the path when specified in ignorePaths
261
- let transformedPathPrefix = pathPrefix;
262
- if (pathPrefix && pathTransformation?.ignorePaths?.includes(pathPrefix)) {
263
- transformedPathPrefix = '';
264
- }
265
-
266
- // Generate full URL with transformed path and path prefix
267
- const fullUrl = new URL(
268
- `${transformedPathPrefix ? `${transformedPathPrefix}/` : ''}${transformedLinkPath}`,
269
- siteUrl
270
- ).toString();
271
-
272
- // Extract title
273
- const title = extractTitle(data, markdownContent, filePath);
274
-
275
- // Get description from frontmatter or first paragraph
276
- let description = data.description || '';
277
- if (!description) {
278
- const paragraphs = markdownContent.split('\n\n');
279
- for (const para of paragraphs) {
280
- if (para.trim() && !para.startsWith('#')) {
281
- description = para.trim();
282
- break;
283
- }
284
- }
285
- }
286
-
287
- // Clean and process content
288
- const cleanedContent = cleanMarkdownContent(markdownContent);
289
-
290
- return {
291
- title,
292
- path: normalizedPath,
293
- url: fullUrl,
294
- content: cleanedContent,
295
- description: description || '',
296
- };
297
- }
13
+ import { PluginOptions, PluginContext } from './types';
14
+ import { collectDocFiles, generateStandardLLMFiles, generateCustomLLMFiles } from './generator';
298
15
 
299
16
  /**
300
17
  * A Docusaurus plugin to generate LLM-friendly documentation following
@@ -320,6 +37,9 @@ export default function docusaurusPluginLLMs(
320
37
  llmsFullTxtFilename = 'llms-full.txt',
321
38
  includeBlog = false,
322
39
  pathTransformation,
40
+ includeOrder = [],
41
+ includeUnmatchedLast = true,
42
+ customLLMFiles = [],
323
43
  } = options;
324
44
 
325
45
  const {
@@ -327,6 +47,38 @@ export default function docusaurusPluginLLMs(
327
47
  siteConfig,
328
48
  outDir,
329
49
  } = context;
50
+
51
+ // Build the site URL, stripping the trailing slash from baseUrl if it has one
52
+ const siteUrl = siteConfig.url + (
53
+ siteConfig.baseUrl.endsWith('/')
54
+ ? siteConfig.baseUrl.slice(0, -1)
55
+ : siteConfig.baseUrl || ''
56
+ );
57
+
58
+ // Create a plugin context object with processed options
59
+ const pluginContext: PluginContext = {
60
+ siteDir,
61
+ outDir,
62
+ siteUrl,
63
+ docsDir,
64
+ docTitle: title || siteConfig.title,
65
+ docDescription: description || siteConfig.tagline || '',
66
+ options: {
67
+ generateLLMsTxt,
68
+ generateLLMsFullTxt,
69
+ docsDir,
70
+ ignoreFiles,
71
+ title,
72
+ description,
73
+ llmsTxtFilename,
74
+ llmsFullTxtFilename,
75
+ includeBlog,
76
+ pathTransformation,
77
+ includeOrder,
78
+ includeUnmatchedLast,
79
+ customLLMFiles,
80
+ }
81
+ };
330
82
 
331
83
  return {
332
84
  name: 'docusaurus-plugin-llms',
@@ -336,152 +88,25 @@ export default function docusaurusPluginLLMs(
336
88
  */
337
89
  async postBuild(): Promise<void> {
338
90
  console.log('Generating LLM-friendly documentation...');
339
-
340
- // Custom title and description or fallback to site values
341
- const docTitle = title || siteConfig.title;
342
- const docDescription = description || siteConfig.tagline || '';
343
-
344
- // Build the site URL, stripping the trailing slash from baseUrl if it has one
345
- const siteUrl = siteConfig.url + (
346
- siteConfig.baseUrl.endsWith('/')
347
- ? siteConfig.baseUrl.slice(0, -1)
348
- : siteConfig.baseUrl || ''
349
- );
350
-
351
- // Initialize docs collection
352
- const allDocs: DocInfo[] = [];
353
-
91
+
354
92
  try {
355
- // Process docs directory
356
- const fullDocsDir = path.join(siteDir, docsDir);
357
-
358
- try {
359
- await fs.access(fullDocsDir);
360
-
361
- // Collect all markdown files from docs directory
362
- const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
363
-
364
- if (docFiles.length > 0) {
365
- // Process each file
366
- for (const filePath of docFiles) {
367
- try {
368
- const docInfo = await processMarkdownFile(
369
- filePath,
370
- fullDocsDir,
371
- siteUrl,
372
- 'docs',
373
- pathTransformation
374
- );
375
- allDocs.push(docInfo);
376
- } catch (err: any) {
377
- console.warn(`Error processing ${filePath}: ${err.message}`);
378
- }
379
- }
380
- console.log(`Processed ${docFiles.length} documentation files`);
381
- } else {
382
- console.warn('No markdown files found in docs directory.');
383
- }
384
- } catch (err) {
385
- console.warn(`Docs directory not found: ${fullDocsDir}`);
386
- }
387
-
388
- // Process blog if enabled
389
- if (includeBlog) {
390
- const blogDir = path.join(siteDir, 'blog');
391
-
392
- try {
393
- await fs.access(blogDir);
394
-
395
- // Collect all markdown files from blog directory
396
- const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
397
-
398
- if (blogFiles.length > 0) {
399
- // Process each file
400
- for (const filePath of blogFiles) {
401
- try {
402
- const docInfo = await processMarkdownFile(
403
- filePath,
404
- blogDir,
405
- siteUrl,
406
- 'blog',
407
- pathTransformation
408
- );
409
- allDocs.push(docInfo);
410
- } catch (err: any) {
411
- console.warn(`Error processing ${filePath}: ${err.message}`);
412
- }
413
- }
414
- console.log(`Processed ${blogFiles.length} blog files`);
415
- } else {
416
- console.warn('No markdown files found in blog directory.');
417
- }
418
- } catch (err) {
419
- console.warn(`Blog directory not found: ${blogDir}`);
420
- }
421
- }
93
+ // Collect all document files
94
+ const allDocFiles = await collectDocFiles(pluginContext);
422
95
 
423
96
  // Skip further processing if no documents were found
424
- if (allDocs.length === 0) {
97
+ if (allDocFiles.length === 0) {
425
98
  console.warn('No documents found to process.');
426
99
  return;
427
100
  }
428
101
 
429
- // Sort files to ensure consistent ordering
430
- allDocs.sort((a, b) => a.title.localeCompare(b.title));
431
-
432
- // Generate llms.txt
433
- if (generateLLMsTxt) {
434
- const llmsTxtPath = path.join(outDir, llmsTxtFilename);
435
- const tocItems = allDocs.map(doc => {
436
- return `- [${doc.title}](${doc.url})${doc.description ? `: ${doc.description.split('\n')[0]}` : ''}`;
437
- });
438
-
439
- const llmsTxtContent = `# ${docTitle}
440
-
441
- > ${docDescription}
442
-
443
- This file contains links to all documentation sections following the llmtxt.org standard.
444
-
445
- ## Table of Contents
446
-
447
- ${tocItems.join('\n')}
448
- `;
449
-
450
- await writeFile(llmsTxtPath, llmsTxtContent);
451
- console.log(`Generated ${llmsTxtFilename}: ${llmsTxtPath}`);
452
- }
453
-
454
- // Generate llms-full.txt with all content
455
- if (generateLLMsFullTxt) {
456
- const llmsFullTxtPath = path.join(outDir, llmsFullTxtFilename);
457
-
458
- const fullContentSections = allDocs.map(doc => {
459
- return `## ${doc.title}
460
-
461
- ${doc.content}`;
462
- });
463
-
464
- const llmsFullTxtContent = `# ${docTitle}
465
-
466
- > ${docDescription}
467
-
468
- This file contains all documentation content in a single document following the llmtxt.org standard.
469
-
470
- ${fullContentSections.join('\n\n---\n\n')}
471
- `;
472
-
473
- await writeFile(llmsFullTxtPath, llmsFullTxtContent);
474
- console.log(`Generated ${llmsFullTxtFilename}: ${llmsFullTxtPath}`);
475
- }
102
+ // Process standard LLM files (llms.txt and llms-full.txt)
103
+ await generateStandardLLMFiles(pluginContext, allDocFiles);
476
104
 
477
- // Output statistics
478
- const stats = {
479
- totalDocuments: allDocs.length,
480
- totalBytes: allDocs.reduce((sum, doc) => sum + doc.content.length, 0),
481
- approxTokens: Math.round(allDocs.reduce((sum, doc) => sum + doc.content.length, 0) / 4), // Rough token estimate
482
- };
105
+ // Process custom LLM files
106
+ await generateCustomLLMFiles(pluginContext, allDocFiles);
483
107
 
484
- console.log(`Stats: ${stats.totalDocuments} documents, ${Math.round(stats.totalBytes / 1024)}KB, ~${stats.approxTokens} tokens`);
108
+ // Output overall statistics
109
+ console.log(`Stats: ${allDocFiles.length} total available documents processed`);
485
110
  } catch (err: any) {
486
111
  console.error('Error generating LLM documentation:', err);
487
112
  }