docusaurus-plugin-llms 0.2.2 → 0.3.0

This diff shows the changes between two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/src/generator.ts CHANGED
@@ -5,12 +5,20 @@
5
5
  import * as path from 'path';
6
6
  import * as fs from 'fs/promises';
7
7
  import { DocInfo, PluginContext, CustomLLMFile } from './types';
8
- import {
9
- writeFile,
10
- readMarkdownFiles,
11
- sanitizeForFilename,
12
- ensureUniqueIdentifier,
13
- createMarkdownContent
8
+ import {
9
+ writeFile,
10
+ readMarkdownFiles,
11
+ sanitizeForFilename,
12
+ ensureUniqueIdentifier,
13
+ createMarkdownContent,
14
+ normalizePath,
15
+ validatePathLength,
16
+ shortenPathIfNeeded,
17
+ logger,
18
+ getErrorMessage,
19
+ isNonEmptyString,
20
+ isNonEmptyArray,
21
+ isDefined
14
22
  } from './utils';
15
23
  import { processFilesWithPatterns } from './processor';
16
24
 
@@ -20,11 +28,12 @@ import { processFilesWithPatterns } from './processor';
20
28
  * @returns Cleaned description suitable for TOC
21
29
  */
22
30
  function cleanDescriptionForToc(description: string): string {
23
- if (!description) return '';
24
-
31
+ if (!isNonEmptyString(description)) return '';
32
+
25
33
  // Get just the first line for TOC display
26
- const firstLine = description.split('\n')[0];
27
-
34
+ const lines = description.split('\n');
35
+ const firstLine = lines.length > 0 ? lines[0] : '';
36
+
28
37
  // Remove heading markers only at the beginning of the line
29
38
  // Be careful to only remove actual heading markers (# followed by space at beginning)
30
39
  // and not hashtag symbols that are part of the content (inline hashtags)
@@ -43,6 +52,7 @@ function cleanDescriptionForToc(description: string): string {
43
52
  * @param includeFullContent - Whether to include full content or just links
44
53
  * @param version - Version of the file
45
54
  * @param customRootContent - Optional custom content to include at the root level
55
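+ * @param batchSize - Number of documents processed per batch (default: 100). Must be a positive integer: the batching loop advances by this value, so 0 or a negative number would never terminate.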
46
56
  */
47
57
  export async function generateLLMFile(
48
58
  docs: DocInfo[],
@@ -51,19 +61,39 @@ export async function generateLLMFile(
51
61
  fileDescription: string,
52
62
  includeFullContent: boolean,
53
63
  version?: string,
54
- customRootContent?: string
64
+ customRootContent?: string,
65
+ batchSize: number = 100
55
66
  ): Promise<void> {
56
- console.log(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
67
+ // Validate path length before proceeding
68
+ if (!validatePathLength(outputPath)) {
69
+ throw new Error(`Output path exceeds maximum length: ${outputPath}`);
70
+ }
71
+
72
+ logger.verbose(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
57
73
  const versionInfo = version ? `\n\nVersion: ${version}` : '';
58
74
 
59
75
  if (includeFullContent) {
60
76
  // Generate full content file with header deduplication
77
+ // Process documents in batches to prevent memory issues on large sites
61
78
  const usedHeaders = new Set<string>();
62
- const fullContentSections = docs.map(doc => {
79
+ const fullContentSections: string[] = [];
80
+
81
+ // Process documents in batches
82
+ for (let i = 0; i < docs.length; i += batchSize) {
83
+ const batch = docs.slice(i, i + batchSize);
84
+ const batchNumber = Math.floor(i / batchSize) + 1;
85
+ const totalBatches = Math.ceil(docs.length / batchSize);
86
+
87
+ if (totalBatches > 1) {
88
+ logger.verbose(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} documents)`);
89
+ }
90
+
91
+ const batchSections = batch.map(doc => {
63
92
  // Check if content already starts with the same heading to avoid duplication
64
93
  const trimmedContent = doc.content.trim();
65
- const firstLine = trimmedContent.split('\n')[0];
66
-
94
+ const contentLines = trimmedContent.split('\n');
95
+ const firstLine = contentLines.length > 0 ? contentLines[0] : '';
96
+
67
97
  // Check if the first line is a heading that matches our title
68
98
  const headingMatch = firstLine.match(/^#+\s+(.+)$/);
69
99
  const firstHeadingText = headingMatch ? headingMatch[1].trim() : null;
@@ -74,10 +104,10 @@ export async function generateLLMFile(
74
104
  usedHeaders,
75
105
  (counter, base) => {
76
106
  // Try to make it more descriptive by adding the file path info if available
77
- if (doc.path && counter === 2) {
107
+ if (isNonEmptyString(doc.path) && counter === 2) {
78
108
  const pathParts = doc.path.split('/');
79
- const folderName = pathParts.length > 1 ? pathParts[pathParts.length - 2] : '';
80
- if (folderName) {
109
+ const folderName = pathParts.length >= 2 ? pathParts[pathParts.length - 2] : '';
110
+ if (isNonEmptyString(folderName)) {
81
111
  return `(${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
82
112
  }
83
113
  }
@@ -86,19 +116,11 @@ export async function generateLLMFile(
86
116
  );
87
117
 
88
118
  if (firstHeadingText === doc.title) {
89
- // Content already has the same heading, replace it with our unique header if needed
90
- if (uniqueHeader !== doc.title) {
91
- const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
92
- return `## ${uniqueHeader}
93
-
94
- ${restOfContent}`;
95
- } else {
96
- // Replace the existing H1 with H2 to comply with llmstxt.org standard
97
- const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
98
- return `## ${uniqueHeader}
119
+ // Content already has the same heading, replace it with our unique header
120
+ const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
121
+ return `## ${uniqueHeader}
99
122
 
100
123
  ${restOfContent}`;
101
- }
102
124
  } else {
103
125
  // Content doesn't have the same heading, add our unique H2 header
104
126
  return `## ${uniqueHeader}
@@ -107,6 +129,9 @@ ${doc.content}`;
107
129
  }
108
130
  });
109
131
 
132
+ fullContentSections.push(...batchSections);
133
+ }
134
+
110
135
  // Use custom root content or default message
111
136
  const rootContent = customRootContent || 'This file contains all documentation content in a single document following the llmstxt.org standard.';
112
137
 
@@ -117,7 +142,11 @@ ${doc.content}`;
117
142
  true // include metadata (description)
118
143
  );
119
144
 
120
- await writeFile(outputPath, llmFileContent);
145
+ try {
146
+ await writeFile(outputPath, llmFileContent);
147
+ } catch (error: unknown) {
148
+ throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
149
+ }
121
150
  } else {
122
151
  // Generate links-only file
123
152
  const tocItems = docs.map(doc => {
@@ -137,19 +166,24 @@ ${doc.content}`;
137
166
  true // include metadata (description)
138
167
  );
139
168
 
140
- await writeFile(outputPath, llmFileContent);
169
+ try {
170
+ await writeFile(outputPath, llmFileContent);
171
+ } catch (error: unknown) {
172
+ throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
173
+ }
141
174
  }
142
-
143
- console.log(`Generated: ${outputPath}`);
175
+
176
+ logger.info(`Generated: ${outputPath}`);
144
177
  }
145
178
 
146
179
  /**
147
180
  * Generate individual markdown files for each document
148
- * @param docs - Processed document information
181
+ * @param docs - Processed document information
149
182
  * @param outputDir - Directory to write the markdown files
150
183
  * @param siteUrl - Base site URL
151
184
  * @param docsDir - The configured docs directory name (e.g., 'docs', 'documentation', etc.)
152
185
  * @param keepFrontMatter - Array of frontmatter keys to preserve in generated files
186
+ * @param preserveDirectoryStructure - Whether to preserve the full directory structure (default: true)
153
187
  * @returns Updated docs with new URLs pointing to generated markdown files
154
188
  */
155
189
  export async function generateIndividualMarkdownFiles(
@@ -157,24 +191,64 @@ export async function generateIndividualMarkdownFiles(
157
191
  outputDir: string,
158
192
  siteUrl: string,
159
193
  docsDir: string = 'docs',
160
- keepFrontMatter: string[] = []
194
+ keepFrontMatter: string[] = [],
195
+ preserveDirectoryStructure: boolean = true
161
196
  ): Promise<DocInfo[]> {
162
197
  const updatedDocs: DocInfo[] = [];
163
198
  const usedPaths = new Set<string>();
164
-
165
-
199
+
200
+
166
201
  for (const doc of docs) {
167
- // Use the original path structure, cleaning it up for file system use
202
+ // Use the original path structure as default filename.
168
203
  let relativePath = doc.path
169
204
  .replace(/^\/+/, '') // Remove leading slashes
170
205
  .replace(/\.mdx?$/, '.md'); // Ensure .md extension
171
-
172
-
173
- relativePath = relativePath
174
- .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), '');// Remove configured docs dir prefix
175
-
206
+
207
+
208
+ // Strip the docsDir prefix only if preserveDirectoryStructure is false
209
+ if (!preserveDirectoryStructure) {
210
+ relativePath = relativePath
211
+ .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), '');// Remove configured docs dir prefix
212
+ }
213
+
214
+ // If frontmatter has slug, use that.
215
+ if (isNonEmptyString(doc.frontMatter?.slug)) {
216
+ const slug = doc.frontMatter.slug.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
217
+
218
+ if (isNonEmptyString(slug)) { // Only process if slug is not empty after trimming
219
+ if (slug.includes('/')) {
220
+ // Nested slug: create directory structure
221
+ relativePath = slug + '.md';
222
+ } else {
223
+ // Simple slug: replace just the filename
224
+ const pathParts = relativePath.replace(/\.md$/, '').split('/');
225
+ pathParts[pathParts.length - 1] = slug;
226
+ relativePath = pathParts.join('/') + '.md';
227
+ }
228
+ }
229
+ }
230
+ // Otherwise, if frontmatter has id, use that.
231
+ else if (isNonEmptyString(doc.frontMatter?.id)) {
232
+ const id = doc.frontMatter.id.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
233
+
234
+ if (isNonEmptyString(id)) { // Only process if id is not empty after trimming
235
+ if (id.includes('/')) {
236
+ // Nested id: create directory structure
237
+ relativePath = id + '.md';
238
+ } else {
239
+ // Simple id: replace just the filename
240
+ const pathParts = relativePath.replace(/\.md$/, '').split('/');
241
+ pathParts[pathParts.length - 1] = id;
242
+ relativePath = pathParts.join('/') + '.md';
243
+ }
244
+ }
245
+ }
246
+
247
+ // Trim any leading/trailing whitespace from the path
248
+ relativePath = relativePath.trim();
249
+
176
250
  // If path is empty or invalid, create a fallback path
177
- if (!relativePath || relativePath === '.md') {
251
+ if (!isNonEmptyString(relativePath) || relativePath === '.md') {
178
252
  const sanitizedTitle = sanitizeForFilename(doc.title, 'untitled');
179
253
  relativePath = `${sanitizedTitle}.md`;
180
254
  }
@@ -182,25 +256,48 @@ export async function generateIndividualMarkdownFiles(
182
256
  // Ensure path uniqueness
183
257
  let uniquePath = relativePath;
184
258
  let counter = 1;
259
+ const MAX_PATH_ITERATIONS = 10000;
260
+ let pathIterations = 0;
261
+
185
262
  while (usedPaths.has(uniquePath.toLowerCase())) {
186
263
  counter++;
187
264
  const pathParts = relativePath.split('.');
188
265
  const extension = pathParts.pop() || 'md';
189
266
  const basePath = pathParts.join('.');
190
267
  uniquePath = `${basePath}-${counter}.${extension}`;
268
+
269
+ pathIterations++;
270
+ if (pathIterations >= MAX_PATH_ITERATIONS) {
271
+ // Fallback to timestamp
272
+ const timestamp = Date.now();
273
+ uniquePath = `${basePath}-${timestamp}.${extension}`;
274
+ logger.warn(`Maximum iterations reached for unique path. Using timestamp: ${uniquePath}`);
275
+ break;
276
+ }
191
277
  }
192
278
  usedPaths.add(uniquePath.toLowerCase());
193
-
194
- // Create the full file path and ensure directory exists
195
- const fullPath = path.join(outputDir, uniquePath);
279
+
280
+ // Create the full file path and validate/shorten if needed
281
+ let fullPath = path.join(outputDir, uniquePath);
282
+ fullPath = shortenPathIfNeeded(fullPath, outputDir, uniquePath);
283
+
284
+ // Update uniquePath to reflect the shortened path if it was changed
285
+ if (fullPath !== path.join(outputDir, uniquePath)) {
286
+ uniquePath = path.relative(outputDir, fullPath);
287
+ }
288
+
196
289
  const directory = path.dirname(fullPath);
197
-
290
+
198
291
  // Create directory structure if it doesn't exist
199
- await fs.mkdir(directory, { recursive: true });
292
+ try {
293
+ await fs.mkdir(directory, { recursive: true });
294
+ } catch (error: unknown) {
295
+ throw new Error(`Failed to create directory ${directory}: ${getErrorMessage(error)}`);
296
+ }
200
297
 
201
298
  // Extract preserved frontmatter if specified
202
299
  let preservedFrontMatter: Record<string, any> = {};
203
- if (keepFrontMatter.length > 0 && doc.frontMatter) {
300
+ if (isNonEmptyArray(keepFrontMatter) && isDefined(doc.frontMatter)) {
204
301
  for (const key of keepFrontMatter) {
205
302
  if (key in doc.frontMatter) {
206
303
  preservedFrontMatter[key] = doc.frontMatter[key];
@@ -210,19 +307,23 @@ export async function generateIndividualMarkdownFiles(
210
307
 
211
308
  // Create markdown content using the utility function
212
309
  const markdownContent = createMarkdownContent(
213
- doc.title,
214
- doc.description,
215
- doc.content,
310
+ doc.title,
311
+ doc.description,
312
+ doc.content,
216
313
  true, // includeMetadata
217
314
  Object.keys(preservedFrontMatter).length > 0 ? preservedFrontMatter : undefined
218
315
  );
219
-
316
+
220
317
  // Write the markdown file
221
- await writeFile(fullPath, markdownContent);
318
+ try {
319
+ await writeFile(fullPath, markdownContent);
320
+ } catch (error: unknown) {
321
+ throw new Error(`Failed to write file ${fullPath}: ${getErrorMessage(error)}`);
322
+ }
222
323
 
223
324
  // Create updated DocInfo with new URL pointing to the generated markdown file
224
325
  // Convert file path to URL path (use forward slashes)
225
- const urlPath = uniquePath.replace(/\\/g, '/');
326
+ const urlPath = normalizePath(uniquePath);
226
327
  const newUrl = `${siteUrl}/${urlPath}`;
227
328
 
228
329
  updatedDocs.push({
@@ -231,7 +332,7 @@ export async function generateIndividualMarkdownFiles(
231
332
  path: `/${urlPath}` // Update path to the new markdown file
232
333
  });
233
334
 
234
- console.log(`Generated markdown file: ${uniquePath}`);
335
+ logger.verbose(`Generated markdown file: ${uniquePath}`);
235
336
  }
236
337
 
237
338
  return updatedDocs;
@@ -254,8 +355,8 @@ export async function generateStandardLLMFiles(
254
355
  options
255
356
  } = context;
256
357
 
257
- const {
258
- generateLLMsTxt,
358
+ const {
359
+ generateLLMsTxt,
259
360
  generateLLMsFullTxt,
260
361
  llmsTxtFilename = 'llms.txt',
261
362
  llmsFullTxtFilename = 'llms-full.txt',
@@ -264,10 +365,12 @@ export async function generateStandardLLMFiles(
264
365
  version,
265
366
  generateMarkdownFiles = false,
266
367
  rootContent,
267
- fullRootContent
368
+ fullRootContent,
369
+ processingBatchSize = 100
268
370
  } = options;
269
371
 
270
372
  if (!generateLLMsTxt && !generateLLMsFullTxt) {
373
+ logger.warn('No standard LLM files configured for generation. Skipping.');
271
374
  return;
272
375
  }
273
376
 
@@ -281,17 +384,24 @@ export async function generateStandardLLMFiles(
281
384
  includeUnmatchedLast
282
385
  );
283
386
 
284
- console.log(`Processed ${processedDocs.length} documentation files for standard LLM files`);
285
-
387
+ logger.verbose(`Processed ${processedDocs.length} documentation files for standard LLM files`);
388
+
389
+ // Check if we have documents to process
390
+ if (!isNonEmptyArray(processedDocs)) {
391
+ logger.warn('No documents found matching patterns for standard LLM files. Skipping.');
392
+ return;
393
+ }
394
+
286
395
  // Generate individual markdown files if requested
287
- if (generateMarkdownFiles && processedDocs.length > 0) {
288
- console.log('Generating individual markdown files...');
396
+ if (generateMarkdownFiles) {
397
+ logger.info('Generating individual markdown files...');
289
398
  processedDocs = await generateIndividualMarkdownFiles(
290
399
  processedDocs,
291
400
  outDir,
292
401
  siteUrl,
293
402
  context.docsDir,
294
- context.options.keepFrontMatter || []
403
+ context.options.keepFrontMatter || [],
404
+ context.options.preserveDirectoryStructure !== false // Default to true
295
405
  );
296
406
  }
297
407
 
@@ -305,7 +415,8 @@ export async function generateStandardLLMFiles(
305
415
  docDescription,
306
416
  false, // links only
307
417
  version,
308
- rootContent
418
+ rootContent,
419
+ processingBatchSize
309
420
  );
310
421
  }
311
422
 
@@ -319,7 +430,8 @@ export async function generateStandardLLMFiles(
319
430
  docDescription,
320
431
  true, // full content
321
432
  version,
322
- fullRootContent
433
+ fullRootContent,
434
+ processingBatchSize
323
435
  );
324
436
  }
325
437
  }
@@ -334,16 +446,22 @@ export async function generateCustomLLMFiles(
334
446
  allDocFiles: string[]
335
447
  ): Promise<void> {
336
448
  const { outDir, siteUrl, docTitle, docDescription, options } = context;
337
- const { customLLMFiles = [], ignoreFiles = [], generateMarkdownFiles = false } = options;
449
+ const {
450
+ customLLMFiles = [],
451
+ ignoreFiles = [],
452
+ generateMarkdownFiles = false,
453
+ processingBatchSize = 100
454
+ } = options;
338
455
 
339
456
  if (customLLMFiles.length === 0) {
457
+ logger.warn('No custom LLM files configured. Skipping.');
340
458
  return;
341
459
  }
342
460
 
343
- console.log(`Generating ${customLLMFiles.length} custom LLM files...`);
461
+ logger.info(`Generating ${customLLMFiles.length} custom LLM files...`);
344
462
 
345
463
  for (const customFile of customLLMFiles) {
346
- console.log(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
464
+ logger.verbose(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
347
465
 
348
466
  // Combine global ignores with custom ignores
349
467
  const combinedIgnores = [...ignoreFiles];
@@ -364,13 +482,14 @@ export async function generateCustomLLMFiles(
364
482
  if (customDocs.length > 0) {
365
483
  // Generate individual markdown files if requested
366
484
  if (generateMarkdownFiles) {
367
- console.log(`Generating individual markdown files for custom file: ${customFile.filename}...`);
485
+ logger.info(`Generating individual markdown files for custom file: ${customFile.filename}...`);
368
486
  customDocs = await generateIndividualMarkdownFiles(
369
487
  customDocs,
370
488
  outDir,
371
489
  siteUrl,
372
490
  context.docsDir,
373
- context.options.keepFrontMatter || []
491
+ context.options.keepFrontMatter || [],
492
+ context.options.preserveDirectoryStructure !== false // Default to true
374
493
  );
375
494
  }
376
495
 
@@ -387,12 +506,13 @@ export async function generateCustomLLMFiles(
387
506
  customDescription,
388
507
  customFile.fullContent,
389
508
  customFile.version,
390
- customFile.rootContent
509
+ customFile.rootContent,
510
+ processingBatchSize
391
511
  );
392
512
 
393
- console.log(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
513
+ logger.info(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
394
514
  } else {
395
- console.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
515
+ logger.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
396
516
  }
397
517
  }
398
518
  }
@@ -404,7 +524,7 @@ export async function generateCustomLLMFiles(
404
524
  */
405
525
  export async function collectDocFiles(context: PluginContext): Promise<string[]> {
406
526
  const { siteDir, docsDir, options } = context;
407
- const { ignoreFiles = [], includeBlog = false } = options;
527
+ const { ignoreFiles = [], includeBlog = false, warnOnIgnoredFiles = false } = options;
408
528
 
409
529
  const allDocFiles: string[] = [];
410
530
 
@@ -413,13 +533,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
413
533
 
414
534
  try {
415
535
  await fs.access(fullDocsDir);
416
-
536
+
417
537
  // Collect all markdown files from docs directory
418
- const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
538
+ const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
419
539
  allDocFiles.push(...docFiles);
420
-
421
- } catch (err) {
422
- console.warn(`Docs directory not found: ${fullDocsDir}`);
540
+
541
+ } catch (err: unknown) {
542
+ logger.warn(`Docs directory not found: ${fullDocsDir}`);
423
543
  }
424
544
 
425
545
  // Process blog if enabled
@@ -428,13 +548,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
428
548
 
429
549
  try {
430
550
  await fs.access(blogDir);
431
-
551
+
432
552
  // Collect all markdown files from blog directory
433
- const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
553
+ const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
434
554
  allDocFiles.push(...blogFiles);
435
-
436
- } catch (err) {
437
- console.warn(`Blog directory not found: ${blogDir}`);
555
+
556
+ } catch (err: unknown) {
557
+ logger.warn(`Blog directory not found: ${blogDir}`);
438
558
  }
439
559
  }
440
560