npm - docusaurus-plugin-llms - Versions diffs - 0.2.2 → 0.3.0 - Mend

docusaurus-plugin-llms 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +246 -16
package/lib/generator-current.d.ts +44 -0
package/lib/generator-current.js +398 -0
package/lib/generator.d.ts +4 -2
package/lib/generator.js +163 -71
package/lib/index.js +174 -10
package/lib/null-handling-guide.d.ts +47 -0
package/lib/null-handling-guide.js +290 -0
package/lib/processor.d.ts +0 -10
package/lib/processor.js +217 -80
package/lib/types.d.ts +10 -0
package/lib/utils.d.ts +141 -7
package/lib/utils.js +429 -34
package/package.json +2 -2
package/src/generator.ts +206 -86
package/src/index.ts +202 -14
package/src/null-handling-guide.ts +321 -0
package/src/processor.ts +303 -126
package/src/types.ts +15 -0
package/src/utils.ts +530 -59

package/src/generator.ts CHANGED

@@ -5,12 +5,20 @@
 import * as path from 'path';
 import * as fs from 'fs/promises';
 import { DocInfo, PluginContext, CustomLLMFile } from './types';
-import {
-  writeFile,
-  readMarkdownFiles,
-  sanitizeForFilename,
-  ensureUniqueIdentifier,
-  createMarkdownContent
+import {
+  writeFile,
+  readMarkdownFiles,
+  sanitizeForFilename,
+  ensureUniqueIdentifier,
+  createMarkdownContent,
+  normalizePath,
+  validatePathLength,
+  shortenPathIfNeeded,
+  logger,
+  getErrorMessage,
+  isNonEmptyString,
+  isNonEmptyArray,
+  isDefined
 } from './utils';
 import { processFilesWithPatterns } from './processor';
@@ -20,11 +28,12 @@ import { processFilesWithPatterns } from './processor';
  * @returns Cleaned description suitable for TOC
  */
 function cleanDescriptionForToc(description: string): string {
-  if (!description) return '';
+  if (!isNonEmptyString(description)) return '';
   // Get just the first line for TOC display
-  const firstLine = description.split('\n')[0];
+  const lines = description.split('\n');
+  const firstLine = lines.length > 0 ? lines[0] : '';
   // Remove heading markers only at the beginning of the line
   // Be careful to only remove actual heading markers (# followed by space at beginning)
   // and not hashtag symbols that are part of the content (inline hashtags)
@@ -43,6 +52,7 @@ function cleanDescriptionForToc(description: string): string {
  * @param includeFullContent - Whether to include full content or just links
  * @param version - Version of the file
  * @param customRootContent - Optional custom content to include at the root level
+ * @param batchSize - Batch size for processing documents (default: 100)
  */
 export async function generateLLMFile(
   docs: DocInfo[],
@@ -51,19 +61,39 @@ export async function generateLLMFile(
   fileDescription: string,
   includeFullContent: boolean,
   version?: string,
-  customRootContent?: string
+  customRootContent?: string,
+  batchSize: number = 100
 ): Promise<void> {
-  console.log(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
+  // Validate path length before proceeding
+  if (!validatePathLength(outputPath)) {
+    throw new Error(`Output path exceeds maximum length: ${outputPath}`);
+  }
+  logger.verbose(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
   const versionInfo = version ? `\n\nVersion: ${version}` : '';
   if (includeFullContent) {
     // Generate full content file with header deduplication
+    // Process documents in batches to prevent memory issues on large sites
     const usedHeaders = new Set<string>();
-    const fullContentSections = docs.map(doc => {
+    const fullContentSections: string[] = [];
+    // Process documents in batches
+    for (let i = 0; i < docs.length; i += batchSize) {
+      const batch = docs.slice(i, i + batchSize);
+      const batchNumber = Math.floor(i / batchSize) + 1;
+      const totalBatches = Math.ceil(docs.length / batchSize);
+      if (totalBatches > 1) {
+        logger.verbose(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} documents)`);
+      }
+      const batchSections = batch.map(doc => {
       // Check if content already starts with the same heading to avoid duplication
       const trimmedContent = doc.content.trim();
-      const firstLine = trimmedContent.split('\n')[0];
+      const contentLines = trimmedContent.split('\n');
+      const firstLine = contentLines.length > 0 ? contentLines[0] : '';
       // Check if the first line is a heading that matches our title
       const headingMatch = firstLine.match(/^#+\s+(.+)$/);
       const firstHeadingText = headingMatch ? headingMatch[1].trim() : null;
@@ -74,10 +104,10 @@ export async function generateLLMFile(
         usedHeaders,
         (counter, base) => {
           // Try to make it more descriptive by adding the file path info if available
-          if (doc.path && counter === 2) {
+          if (isNonEmptyString(doc.path) && counter === 2) {
             const pathParts = doc.path.split('/');
-            const folderName = pathParts.length > 1 ? pathParts[pathParts.length - 2] : '';
-            if (folderName) {
+            const folderName = pathParts.length >= 2 ? pathParts[pathParts.length - 2] : '';
+            if (isNonEmptyString(folderName)) {
               return `(${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
             }
           }
@@ -86,19 +116,11 @@ export async function generateLLMFile(
       );
       if (firstHeadingText === doc.title) {
-        // Content already has the same heading, replace it with our unique header if needed
-        if (uniqueHeader !== doc.title) {
-          const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
-          return `## ${uniqueHeader}
-${restOfContent}`;
-        } else {
-          // Replace the existing H1 with H2 to comply with llmstxt.org standard
-          const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
-          return `## ${uniqueHeader}
+        // Content already has the same heading, replace it with our unique header
+        const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
+        return `## ${uniqueHeader}
 ${restOfContent}`;
-        }
       } else {
         // Content doesn't have the same heading, add our unique H2 header
         return `## ${uniqueHeader}
@@ -107,6 +129,9 @@ ${doc.content}`;
       }
     });
+      fullContentSections.push(...batchSections);
+    }
     // Use custom root content or default message
     const rootContent = customRootContent || 'This file contains all documentation content in a single document following the llmstxt.org standard.';
@@ -117,7 +142,11 @@ ${doc.content}`;
       true // include metadata (description)
     );
-    await writeFile(outputPath, llmFileContent);
+    try {
+      await writeFile(outputPath, llmFileContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
+    }
   } else {
     // Generate links-only file
     const tocItems = docs.map(doc => {
@@ -137,19 +166,24 @@ ${doc.content}`;
       true // include metadata (description)
     );
-    await writeFile(outputPath, llmFileContent);
+    try {
+      await writeFile(outputPath, llmFileContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
+    }
   }
-  console.log(`Generated: ${outputPath}`);
+  logger.info(`Generated: ${outputPath}`);
 }
 /**
  * Generate individual markdown files for each document
- * @param docs - Processed document information
+ * @param docs - Processed document information
  * @param outputDir - Directory to write the markdown files
  * @param siteUrl - Base site URL
  * @param docsDir - The configured docs directory name (e.g., 'docs', 'documentation', etc.)
  * @param keepFrontMatter - Array of frontmatter keys to preserve in generated files
+ * @param preserveDirectoryStructure - Whether to preserve the full directory structure (default: true)
  * @returns Updated docs with new URLs pointing to generated markdown files
  */
 export async function generateIndividualMarkdownFiles(
@@ -157,24 +191,64 @@ export async function generateIndividualMarkdownFiles(
   outputDir: string,
   siteUrl: string,
   docsDir: string = 'docs',
-  keepFrontMatter: string[] = []
+  keepFrontMatter: string[] = [],
+  preserveDirectoryStructure: boolean = true
 ): Promise<DocInfo[]> {
   const updatedDocs: DocInfo[] = [];
   const usedPaths = new Set<string>();
   for (const doc of docs) {
-    // Use the original path structure, cleaning it up for file system use
+    // Use the original path structure as default filename.
     let relativePath = doc.path
       .replace(/^\/+/, '') // Remove leading slashes
       .replace(/\.mdx?$/, '.md'); // Ensure .md extension
-    relativePath = relativePath
-      .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), '');// Remove configured docs dir prefix
+    // Strip the docsDir prefix only if preserveDirectoryStructure is false
+    if (!preserveDirectoryStructure) {
+      relativePath = relativePath
+        .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), '');// Remove configured docs dir prefix
+    }
+    // If frontmatter has slug, use that.
+    if (isNonEmptyString(doc.frontMatter?.slug)) {
+      const slug = doc.frontMatter.slug.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+      if (isNonEmptyString(slug)) { // Only process if slug is not empty after trimming
+        if (slug.includes('/')) {
+          // Nested slug: create directory structure
+          relativePath = slug + '.md';
+        } else {
+          // Simple slug: replace just the filename
+          const pathParts = relativePath.replace(/\.md$/, '').split('/');
+          pathParts[pathParts.length - 1] = slug;
+          relativePath = pathParts.join('/') + '.md';
+        }
+      }
+    }
+    // Otherwise, if frontmatter has id, use that.
+    else if (isNonEmptyString(doc.frontMatter?.id)) {
+      const id = doc.frontMatter.id.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+      if (isNonEmptyString(id)) { // Only process if id is not empty after trimming
+        if (id.includes('/')) {
+          // Nested id: create directory structure
+          relativePath = id + '.md';
+        } else {
+          // Simple id: replace just the filename
+          const pathParts = relativePath.replace(/\.md$/, '').split('/');
+          pathParts[pathParts.length - 1] = id;
+          relativePath = pathParts.join('/') + '.md';
+        }
+      }
+    }
+    // Trim any leading/trailing whitespace from the path
+    relativePath = relativePath.trim();
     // If path is empty or invalid, create a fallback path
-    if (!relativePath || relativePath === '.md') {
+    if (!isNonEmptyString(relativePath) || relativePath === '.md') {
       const sanitizedTitle = sanitizeForFilename(doc.title, 'untitled');
       relativePath = `${sanitizedTitle}.md`;
     }
@@ -182,25 +256,48 @@ export async function generateIndividualMarkdownFiles(
     // Ensure path uniqueness
     let uniquePath = relativePath;
     let counter = 1;
+    const MAX_PATH_ITERATIONS = 10000;
+    let pathIterations = 0;
     while (usedPaths.has(uniquePath.toLowerCase())) {
       counter++;
       const pathParts = relativePath.split('.');
       const extension = pathParts.pop() || 'md';
       const basePath = pathParts.join('.');
       uniquePath = `${basePath}-${counter}.${extension}`;
+      pathIterations++;
+      if (pathIterations >= MAX_PATH_ITERATIONS) {
+        // Fallback to timestamp
+        const timestamp = Date.now();
+        uniquePath = `${basePath}-${timestamp}.${extension}`;
+        logger.warn(`Maximum iterations reached for unique path. Using timestamp: ${uniquePath}`);
+        break;
+      }
     }
     usedPaths.add(uniquePath.toLowerCase());
-    // Create the full file path and ensure directory exists
-    const fullPath = path.join(outputDir, uniquePath);
+    // Create the full file path and validate/shorten if needed
+    let fullPath = path.join(outputDir, uniquePath);
+    fullPath = shortenPathIfNeeded(fullPath, outputDir, uniquePath);
+    // Update uniquePath to reflect the shortened path if it was changed
+    if (fullPath !== path.join(outputDir, uniquePath)) {
+      uniquePath = path.relative(outputDir, fullPath);
+    }
     const directory = path.dirname(fullPath);
     // Create directory structure if it doesn't exist
-    await fs.mkdir(directory, { recursive: true });
+    try {
+      await fs.mkdir(directory, { recursive: true });
+    } catch (error: unknown) {
+      throw new Error(`Failed to create directory ${directory}: ${getErrorMessage(error)}`);
+    }
     // Extract preserved frontmatter if specified
     let preservedFrontMatter: Record<string, any> = {};
-    if (keepFrontMatter.length > 0 && doc.frontMatter) {
+    if (isNonEmptyArray(keepFrontMatter) && isDefined(doc.frontMatter)) {
       for (const key of keepFrontMatter) {
         if (key in doc.frontMatter) {
           preservedFrontMatter[key] = doc.frontMatter[key];
@@ -210,19 +307,23 @@ export async function generateIndividualMarkdownFiles(
     // Create markdown content using the utility function
     const markdownContent = createMarkdownContent(
-      doc.title,
-      doc.description,
-      doc.content,
+      doc.title,
+      doc.description,
+      doc.content,
       true, // includeMetadata
       Object.keys(preservedFrontMatter).length > 0 ? preservedFrontMatter : undefined
     );
     // Write the markdown file
-    await writeFile(fullPath, markdownContent);
+    try {
+      await writeFile(fullPath, markdownContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${fullPath}: ${getErrorMessage(error)}`);
+    }
     // Create updated DocInfo with new URL pointing to the generated markdown file
     // Convert file path to URL path (use forward slashes)
-    const urlPath = uniquePath.replace(/\\/g, '/');
+    const urlPath = normalizePath(uniquePath);
     const newUrl = `${siteUrl}/${urlPath}`;
     updatedDocs.push({
@@ -231,7 +332,7 @@ export async function generateIndividualMarkdownFiles(
       path: `/${urlPath}` // Update path to the new markdown file
     });
-    console.log(`Generated markdown file: ${uniquePath}`);
+    logger.verbose(`Generated markdown file: ${uniquePath}`);
   }
   return updatedDocs;
@@ -254,8 +355,8 @@ export async function generateStandardLLMFiles(
     options
   } = context;
-  const {
-    generateLLMsTxt,
+  const {
+    generateLLMsTxt,
     generateLLMsFullTxt,
     llmsTxtFilename = 'llms.txt',
     llmsFullTxtFilename = 'llms-full.txt',
@@ -264,10 +365,12 @@ export async function generateStandardLLMFiles(
     version,
     generateMarkdownFiles = false,
     rootContent,
-    fullRootContent
+    fullRootContent,
+    processingBatchSize = 100
   } = options;
   if (!generateLLMsTxt && !generateLLMsFullTxt) {
+    logger.warn('No standard LLM files configured for generation. Skipping.');
     return;
   }
@@ -281,17 +384,24 @@ export async function generateStandardLLMFiles(
     includeUnmatchedLast
   );
-  console.log(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+  logger.verbose(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+  // Check if we have documents to process
+  if (!isNonEmptyArray(processedDocs)) {
+    logger.warn('No documents found matching patterns for standard LLM files. Skipping.');
+    return;
+  }
   // Generate individual markdown files if requested
-  if (generateMarkdownFiles && processedDocs.length > 0) {
-    console.log('Generating individual markdown files...');
+  if (generateMarkdownFiles) {
+    logger.info('Generating individual markdown files...');
     processedDocs = await generateIndividualMarkdownFiles(
       processedDocs,
       outDir,
       siteUrl,
       context.docsDir,
-      context.options.keepFrontMatter || []
+      context.options.keepFrontMatter || [],
+      context.options.preserveDirectoryStructure !== false // Default to true
     );
   }
@@ -305,7 +415,8 @@ export async function generateStandardLLMFiles(
       docDescription,
       false, // links only
       version,
-      rootContent
+      rootContent,
+      processingBatchSize
     );
   }
@@ -319,7 +430,8 @@ export async function generateStandardLLMFiles(
       docDescription,
       true, // full content
       version,
-      fullRootContent
+      fullRootContent,
+      processingBatchSize
     );
   }
 }
@@ -334,16 +446,22 @@ export async function generateCustomLLMFiles(
   allDocFiles: string[]
 ): Promise<void> {
   const { outDir, siteUrl, docTitle, docDescription, options } = context;
-  const { customLLMFiles = [], ignoreFiles = [], generateMarkdownFiles = false } = options;
+  const {
+    customLLMFiles = [],
+    ignoreFiles = [],
+    generateMarkdownFiles = false,
+    processingBatchSize = 100
+  } = options;
   if (customLLMFiles.length === 0) {
+    logger.warn('No custom LLM files configured. Skipping.');
     return;
   }
-  console.log(`Generating ${customLLMFiles.length} custom LLM files...`);
+  logger.info(`Generating ${customLLMFiles.length} custom LLM files...`);
   for (const customFile of customLLMFiles) {
-    console.log(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
+    logger.verbose(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
     // Combine global ignores with custom ignores
     const combinedIgnores = [...ignoreFiles];
@@ -364,13 +482,14 @@ export async function generateCustomLLMFiles(
     if (customDocs.length > 0) {
       // Generate individual markdown files if requested
       if (generateMarkdownFiles) {
-        console.log(`Generating individual markdown files for custom file: ${customFile.filename}...`);
+        logger.info(`Generating individual markdown files for custom file: ${customFile.filename}...`);
         customDocs = await generateIndividualMarkdownFiles(
           customDocs,
           outDir,
           siteUrl,
           context.docsDir,
-          context.options.keepFrontMatter || []
+          context.options.keepFrontMatter || [],
+          context.options.preserveDirectoryStructure !== false // Default to true
         );
       }
@@ -387,12 +506,13 @@ export async function generateCustomLLMFiles(
         customDescription,
         customFile.fullContent,
         customFile.version,
-        customFile.rootContent
+        customFile.rootContent,
+        processingBatchSize
       );
-      console.log(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
+      logger.info(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
     } else {
-      console.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
+      logger.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
     }
   }
 }
@@ -404,7 +524,7 @@ export async function generateCustomLLMFiles(
  */
 export async function collectDocFiles(context: PluginContext): Promise<string[]> {
   const { siteDir, docsDir, options } = context;
-  const { ignoreFiles = [], includeBlog = false } = options;
+  const { ignoreFiles = [], includeBlog = false, warnOnIgnoredFiles = false } = options;
   const allDocFiles: string[] = [];
@@ -413,13 +533,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
   try {
     await fs.access(fullDocsDir);
     // Collect all markdown files from docs directory
-    const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
+    const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
     allDocFiles.push(...docFiles);
-  } catch (err) {
-    console.warn(`Docs directory not found: ${fullDocsDir}`);
+  } catch (err: unknown) {
+    logger.warn(`Docs directory not found: ${fullDocsDir}`);
   }
   // Process blog if enabled
@@ -428,13 +548,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
     try {
       await fs.access(blogDir);
       // Collect all markdown files from blog directory
-      const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
+      const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
       allDocFiles.push(...blogFiles);
-    } catch (err) {
-      console.warn(`Blog directory not found: ${blogDir}`);
+    } catch (err: unknown) {
+      logger.warn(`Blog directory not found: ${blogDir}`);
     }
   }