npm - docusaurus-plugin-llms - Versions diffs - 0.2.0 → 0.3.0 - Mend

docusaurus-plugin-llms 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +246 -15
package/lib/generator-current.d.ts +44 -0
package/lib/generator-current.js +398 -0
package/lib/generator.d.ts +6 -2
package/lib/generator.js +200 -120
package/lib/index.js +175 -10
package/lib/null-handling-guide.d.ts +47 -0
package/lib/null-handling-guide.js +290 -0
package/lib/processor.d.ts +0 -10
package/lib/processor.js +230 -83
package/lib/types.d.ts +13 -0
package/lib/utils.d.ts +165 -6
package/lib/utils.js +481 -28
package/package.json +5 -3
package/src/generator.ts +270 -128
package/src/index.ts +204 -14
package/src/null-handling-guide.ts +321 -0
package/src/processor.ts +314 -127
package/src/types.ts +20 -1
package/src/utils.ts +594 -48

package/src/generator.ts CHANGED

@@ -5,7 +5,21 @@
 import * as path from 'path';
 import * as fs from 'fs/promises';
 import { DocInfo, PluginContext, CustomLLMFile } from './types';
-import { writeFile, readMarkdownFiles } from './utils';
+import {
+  writeFile,
+  readMarkdownFiles,
+  sanitizeForFilename,
+  ensureUniqueIdentifier,
+  createMarkdownContent,
+  normalizePath,
+  validatePathLength,
+  shortenPathIfNeeded,
+  logger,
+  getErrorMessage,
+  isNonEmptyString,
+  isNonEmptyArray,
+  isDefined
+} from './utils';
 import { processFilesWithPatterns } from './processor';
 /**
@@ -14,11 +28,12 @@ import { processFilesWithPatterns } from './processor';
  * @returns Cleaned description suitable for TOC
  */
 function cleanDescriptionForToc(description: string): string {
-  if (!description) return '';
+  if (!isNonEmptyString(description)) return '';
   // Get just the first line for TOC display
-  const firstLine = description.split('\n')[0];
+  const lines = description.split('\n');
+  const firstLine = lines.length > 0 ? lines[0] : '';
   // Remove heading markers only at the beginning of the line
   // Be careful to only remove actual heading markers (# followed by space at beginning)
   // and not hashtag symbols that are part of the content (inline hashtags)
@@ -37,6 +52,7 @@ function cleanDescriptionForToc(description: string): string {
  * @param includeFullContent - Whether to include full content or just links
  * @param version - Version of the file
  * @param customRootContent - Optional custom content to include at the root level
+ * @param batchSize - Batch size for processing documents (default: 100)
  */
 export async function generateLLMFile(
   docs: DocInfo[],
@@ -45,61 +61,66 @@ export async function generateLLMFile(
   fileDescription: string,
   includeFullContent: boolean,
   version?: string,
-  customRootContent?: string
+  customRootContent?: string,
+  batchSize: number = 100
 ): Promise<void> {
-  console.log(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
+  // Validate path length before proceeding
+  if (!validatePathLength(outputPath)) {
+    throw new Error(`Output path exceeds maximum length: ${outputPath}`);
+  }
+  logger.verbose(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
   const versionInfo = version ? `\n\nVersion: ${version}` : '';
   if (includeFullContent) {
     // Generate full content file with header deduplication
+    // Process documents in batches to prevent memory issues on large sites
     const usedHeaders = new Set<string>();
-    const fullContentSections = docs.map(doc => {
+    const fullContentSections: string[] = [];
+    // Process documents in batches
+    for (let i = 0; i < docs.length; i += batchSize) {
+      const batch = docs.slice(i, i + batchSize);
+      const batchNumber = Math.floor(i / batchSize) + 1;
+      const totalBatches = Math.ceil(docs.length / batchSize);
+      if (totalBatches > 1) {
+        logger.verbose(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} documents)`);
+      }
+      const batchSections = batch.map(doc => {
       // Check if content already starts with the same heading to avoid duplication
       const trimmedContent = doc.content.trim();
-      const firstLine = trimmedContent.split('\n')[0];
+      const contentLines = trimmedContent.split('\n');
+      const firstLine = contentLines.length > 0 ? contentLines[0] : '';
       // Check if the first line is a heading that matches our title
       const headingMatch = firstLine.match(/^#+\s+(.+)$/);
       const firstHeadingText = headingMatch ? headingMatch[1].trim() : null;
-      // Determine the header text to use (original title or make it unique)
-      let headerText = doc.title;
-      let uniqueHeader = headerText;
-      let counter = 1;
-      // If this header has been used before, make it unique by adding a suffix
-      while (usedHeaders.has(uniqueHeader.toLowerCase())) {
-        counter++;
-        // Try to make it more descriptive by adding the file path info if available
-        if (doc.path && counter === 2) {
-          const pathParts = doc.path.split('/');
-          const folderName = pathParts.length > 1 ? pathParts[pathParts.length - 2] : '';
-          if (folderName) {
-            uniqueHeader = `${headerText} (${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
-          } else {
-            uniqueHeader = `${headerText} (${counter})`;
+      // Generate unique header using the utility function
+      const uniqueHeader = ensureUniqueIdentifier(
+        doc.title,
+        usedHeaders,
+        (counter, base) => {
+          // Try to make it more descriptive by adding the file path info if available
+          if (isNonEmptyString(doc.path) && counter === 2) {
+            const pathParts = doc.path.split('/');
+            const folderName = pathParts.length >= 2 ? pathParts[pathParts.length - 2] : '';
+            if (isNonEmptyString(folderName)) {
+              return `(${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
+            }
           }
-        } else {
-          uniqueHeader = `${headerText} (${counter})`;
+          return `(${counter})`;
         }
-      }
-      usedHeaders.add(uniqueHeader.toLowerCase());
+      );
       if (firstHeadingText === doc.title) {
-        // Content already has the same heading, replace it with our unique header if needed
-        if (uniqueHeader !== doc.title) {
-          const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
-          return `## ${uniqueHeader}
-${restOfContent}`;
-        } else {
-          // Replace the existing H1 with H2 to comply with llmstxt.org standard
-          const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
-          return `## ${uniqueHeader}
+        // Content already has the same heading, replace it with our unique header
+        const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
+        return `## ${uniqueHeader}
 ${restOfContent}`;
-        }
       } else {
         // Content doesn't have the same heading, add our unique H2 header
         return `## ${uniqueHeader}
@@ -108,19 +129,24 @@ ${doc.content}`;
       }
     });
+      fullContentSections.push(...batchSections);
+    }
     // Use custom root content or default message
     const rootContent = customRootContent || 'This file contains all documentation content in a single document following the llmstxt.org standard.';
-    const llmFileContent = `# ${fileTitle}
-> ${fileDescription}${versionInfo}
-${rootContent}
-${fullContentSections.join('\n\n---\n\n')}
-`;
+    const llmFileContent = createMarkdownContent(
+      fileTitle,
+      `${fileDescription}${versionInfo}`,
+      `${rootContent}\n\n${fullContentSections.join('\n\n---\n\n')}`,
+      true // include metadata (description)
+    );
-    await writeFile(outputPath, llmFileContent);
+    try {
+      await writeFile(outputPath, llmFileContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
+    }
   } else {
     // Generate links-only file
     const tocItems = docs.map(doc => {
@@ -133,87 +159,180 @@ ${fullContentSections.join('\n\n---\n\n')}
     // Use custom root content or default message
     const rootContent = customRootContent || 'This file contains links to documentation sections following the llmstxt.org standard.';
-    const llmFileContent = `# ${fileTitle}
-> ${fileDescription}${versionInfo}
-${rootContent}
-## Table of Contents
-${tocItems.join('\n')}
-`;
+    const llmFileContent = createMarkdownContent(
+      fileTitle,
+      `${fileDescription}${versionInfo}`,
+      `${rootContent}\n\n## Table of Contents\n\n${tocItems.join('\n')}`,
+      true // include metadata (description)
+    );
-    await writeFile(outputPath, llmFileContent);
+    try {
+      await writeFile(outputPath, llmFileContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${outputPath}: ${getErrorMessage(error)}`);
+    }
   }
-  console.log(`Generated: ${outputPath}`);
+  logger.info(`Generated: ${outputPath}`);
 }
 /**
  * Generate individual markdown files for each document
- * @param docs - Processed document information
+ * @param docs - Processed document information
  * @param outputDir - Directory to write the markdown files
  * @param siteUrl - Base site URL
+ * @param docsDir - The configured docs directory name (e.g., 'docs', 'documentation', etc.)
+ * @param keepFrontMatter - Array of frontmatter keys to preserve in generated files
+ * @param preserveDirectoryStructure - Whether to preserve the full directory structure (default: true)
  * @returns Updated docs with new URLs pointing to generated markdown files
  */
 export async function generateIndividualMarkdownFiles(
   docs: DocInfo[],
   outputDir: string,
-  siteUrl: string
+  siteUrl: string,
+  docsDir: string = 'docs',
+  keepFrontMatter: string[] = [],
+  preserveDirectoryStructure: boolean = true
 ): Promise<DocInfo[]> {
   const updatedDocs: DocInfo[] = [];
-  // Create a map to ensure unique filenames
-  const usedFilenames = new Set<string>();
+  const usedPaths = new Set<string>();
   for (const doc of docs) {
-    // Generate a filename from the document title or URL path
-    let baseFilename = doc.title
-      .toLowerCase()
-      .replace(/[^a-z0-9]+/g, '-')
-      .replace(/^-+|-+$/g, '');
-    // Fallback to URL path if title generates empty filename
-    if (!baseFilename) {
-      baseFilename = doc.path
-        .replace(/^\/+|\/+$/g, '') // Remove leading/trailing slashes
-        .replace(/\//g, '-')
-        .replace(/[^a-z0-9-]/gi, '-')
-        .toLowerCase();
+    // Use the original path structure as default filename.
+    let relativePath = doc.path
+      .replace(/^\/+/, '') // Remove leading slashes
+      .replace(/\.mdx?$/, '.md'); // Ensure .md extension
+    // Strip the docsDir prefix only if preserveDirectoryStructure is false
+    if (!preserveDirectoryStructure) {
+      relativePath = relativePath
+        .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), '');// Remove configured docs dir prefix
+    }
+    // If frontmatter has slug, use that.
+    if (isNonEmptyString(doc.frontMatter?.slug)) {
+      const slug = doc.frontMatter.slug.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+      if (isNonEmptyString(slug)) { // Only process if slug is not empty after trimming
+        if (slug.includes('/')) {
+          // Nested slug: create directory structure
+          relativePath = slug + '.md';
+        } else {
+          // Simple slug: replace just the filename
+          const pathParts = relativePath.replace(/\.md$/, '').split('/');
+          pathParts[pathParts.length - 1] = slug;
+          relativePath = pathParts.join('/') + '.md';
+        }
+      }
+    }
+    // Otherwise, if frontmatter has id, use that.
+    else if (isNonEmptyString(doc.frontMatter?.id)) {
+      const id = doc.frontMatter.id.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+      if (isNonEmptyString(id)) { // Only process if id is not empty after trimming
+        if (id.includes('/')) {
+          // Nested id: create directory structure
+          relativePath = id + '.md';
+        } else {
+          // Simple id: replace just the filename
+          const pathParts = relativePath.replace(/\.md$/, '').split('/');
+          pathParts[pathParts.length - 1] = id;
+          relativePath = pathParts.join('/') + '.md';
+        }
+      }
+    }
+    // Trim any leading/trailing whitespace from the path
+    relativePath = relativePath.trim();
+    // If path is empty or invalid, create a fallback path
+    if (!isNonEmptyString(relativePath) || relativePath === '.md') {
+      const sanitizedTitle = sanitizeForFilename(doc.title, 'untitled');
+      relativePath = `${sanitizedTitle}.md`;
     }
-    // Ensure filename uniqueness
-    let filename = `${baseFilename}.md`;
+    // Ensure path uniqueness
+    let uniquePath = relativePath;
     let counter = 1;
-    while (usedFilenames.has(filename)) {
-      filename = `${baseFilename}-${counter}.md`;
+    const MAX_PATH_ITERATIONS = 10000;
+    let pathIterations = 0;
+    while (usedPaths.has(uniquePath.toLowerCase())) {
       counter++;
+      const pathParts = relativePath.split('.');
+      const extension = pathParts.pop() || 'md';
+      const basePath = pathParts.join('.');
+      uniquePath = `${basePath}-${counter}.${extension}`;
+      pathIterations++;
+      if (pathIterations >= MAX_PATH_ITERATIONS) {
+        // Fallback to timestamp
+        const timestamp = Date.now();
+        uniquePath = `${basePath}-${timestamp}.${extension}`;
+        logger.warn(`Maximum iterations reached for unique path. Using timestamp: ${uniquePath}`);
+        break;
+      }
     }
-    usedFilenames.add(filename);
-    // Create markdown content following llmstxt.org standard
-    const markdownContent = `# ${doc.title}
+    usedPaths.add(uniquePath.toLowerCase());
-> ${doc.description}
+    // Create the full file path and validate/shorten if needed
+    let fullPath = path.join(outputDir, uniquePath);
+    fullPath = shortenPathIfNeeded(fullPath, outputDir, uniquePath);
-${doc.content}
-`;
+    // Update uniquePath to reflect the shortened path if it was changed
+    if (fullPath !== path.join(outputDir, uniquePath)) {
+      uniquePath = path.relative(outputDir, fullPath);
+    }
+    const directory = path.dirname(fullPath);
+    // Create directory structure if it doesn't exist
+    try {
+      await fs.mkdir(directory, { recursive: true });
+    } catch (error: unknown) {
+      throw new Error(`Failed to create directory ${directory}: ${getErrorMessage(error)}`);
+    }
+    // Extract preserved frontmatter if specified
+    let preservedFrontMatter: Record<string, any> = {};
+    if (isNonEmptyArray(keepFrontMatter) && isDefined(doc.frontMatter)) {
+      for (const key of keepFrontMatter) {
+        if (key in doc.frontMatter) {
+          preservedFrontMatter[key] = doc.frontMatter[key];
+        }
+      }
+    }
+    // Create markdown content using the utility function
+    const markdownContent = createMarkdownContent(
+      doc.title,
+      doc.description,
+      doc.content,
+      true, // includeMetadata
+      Object.keys(preservedFrontMatter).length > 0 ? preservedFrontMatter : undefined
+    );
     // Write the markdown file
-    const markdownPath = path.join(outputDir, filename);
-    await writeFile(markdownPath, markdownContent);
+    try {
+      await writeFile(fullPath, markdownContent);
+    } catch (error: unknown) {
+      throw new Error(`Failed to write file ${fullPath}: ${getErrorMessage(error)}`);
+    }
     // Create updated DocInfo with new URL pointing to the generated markdown file
-    const newUrl = `${siteUrl}/${filename}`;
+    // Convert file path to URL path (use forward slashes)
+    const urlPath = normalizePath(uniquePath);
+    const newUrl = `${siteUrl}/${urlPath}`;
     updatedDocs.push({
       ...doc,
       url: newUrl,
-      path: `/${filename}` // Update path to the new markdown file
+      path: `/${urlPath}` // Update path to the new markdown file
     });
-    console.log(`Generated markdown file: ${filename}`);
+    logger.verbose(`Generated markdown file: ${uniquePath}`);
   }
   return updatedDocs;
@@ -236,8 +355,8 @@ export async function generateStandardLLMFiles(
     options
   } = context;
-  const {
-    generateLLMsTxt,
+  const {
+    generateLLMsTxt,
     generateLLMsFullTxt,
     llmsTxtFilename = 'llms.txt',
     llmsFullTxtFilename = 'llms-full.txt',
@@ -246,10 +365,12 @@ export async function generateStandardLLMFiles(
     version,
     generateMarkdownFiles = false,
     rootContent,
-    fullRootContent
+    fullRootContent,
+    processingBatchSize = 100
   } = options;
   if (!generateLLMsTxt && !generateLLMsFullTxt) {
+    logger.warn('No standard LLM files configured for generation. Skipping.');
     return;
   }
@@ -263,15 +384,24 @@ export async function generateStandardLLMFiles(
     includeUnmatchedLast
   );
-  console.log(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+  logger.verbose(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+  // Check if we have documents to process
+  if (!isNonEmptyArray(processedDocs)) {
+    logger.warn('No documents found matching patterns for standard LLM files. Skipping.');
+    return;
+  }
   // Generate individual markdown files if requested
-  if (generateMarkdownFiles && processedDocs.length > 0) {
-    console.log('Generating individual markdown files...');
+  if (generateMarkdownFiles) {
+    logger.info('Generating individual markdown files...');
     processedDocs = await generateIndividualMarkdownFiles(
       processedDocs,
       outDir,
-      siteUrl
+      siteUrl,
+      context.docsDir,
+      context.options.keepFrontMatter || [],
+      context.options.preserveDirectoryStructure !== false // Default to true
     );
   }
@@ -285,7 +415,8 @@ export async function generateStandardLLMFiles(
       docDescription,
       false, // links only
       version,
-      rootContent
+      rootContent,
+      processingBatchSize
     );
   }
@@ -299,7 +430,8 @@ export async function generateStandardLLMFiles(
       docDescription,
       true, // full content
       version,
-      fullRootContent
+      fullRootContent,
+      processingBatchSize
     );
   }
 }
@@ -314,16 +446,22 @@ export async function generateCustomLLMFiles(
   allDocFiles: string[]
 ): Promise<void> {
   const { outDir, siteUrl, docTitle, docDescription, options } = context;
-  const { customLLMFiles = [], ignoreFiles = [], generateMarkdownFiles = false } = options;
+  const {
+    customLLMFiles = [],
+    ignoreFiles = [],
+    generateMarkdownFiles = false,
+    processingBatchSize = 100
+  } = options;
   if (customLLMFiles.length === 0) {
+    logger.warn('No custom LLM files configured. Skipping.');
     return;
   }
-  console.log(`Generating ${customLLMFiles.length} custom LLM files...`);
+  logger.info(`Generating ${customLLMFiles.length} custom LLM files...`);
   for (const customFile of customLLMFiles) {
-    console.log(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
+    logger.verbose(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
     // Combine global ignores with custom ignores
     const combinedIgnores = [...ignoreFiles];
@@ -344,11 +482,14 @@ export async function generateCustomLLMFiles(
     if (customDocs.length > 0) {
       // Generate individual markdown files if requested
       if (generateMarkdownFiles) {
-        console.log(`Generating individual markdown files for custom file: ${customFile.filename}...`);
+        logger.info(`Generating individual markdown files for custom file: ${customFile.filename}...`);
         customDocs = await generateIndividualMarkdownFiles(
           customDocs,
           outDir,
-          siteUrl
+          siteUrl,
+          context.docsDir,
+          context.options.keepFrontMatter || [],
+          context.options.preserveDirectoryStructure !== false // Default to true
         );
       }
@@ -365,12 +506,13 @@ export async function generateCustomLLMFiles(
         customDescription,
         customFile.fullContent,
         customFile.version,
-        customFile.rootContent
+        customFile.rootContent,
+        processingBatchSize
       );
-      console.log(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
+      logger.info(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
     } else {
-      console.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
+      logger.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
     }
   }
 }
@@ -382,7 +524,7 @@ export async function generateCustomLLMFiles(
  */
 export async function collectDocFiles(context: PluginContext): Promise<string[]> {
   const { siteDir, docsDir, options } = context;
-  const { ignoreFiles = [], includeBlog = false } = options;
+  const { ignoreFiles = [], includeBlog = false, warnOnIgnoredFiles = false } = options;
   const allDocFiles: string[] = [];
@@ -391,13 +533,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
   try {
     await fs.access(fullDocsDir);
     // Collect all markdown files from docs directory
-    const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
+    const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
     allDocFiles.push(...docFiles);
-  } catch (err) {
-    console.warn(`Docs directory not found: ${fullDocsDir}`);
+  } catch (err: unknown) {
+    logger.warn(`Docs directory not found: ${fullDocsDir}`);
   }
   // Process blog if enabled
@@ -406,13 +548,13 @@ export async function collectDocFiles(context: PluginContext): Promise<string[]>
     try {
       await fs.access(blogDir);
       // Collect all markdown files from blog directory
-      const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
+      const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
       allDocFiles.push(...blogFiles);
-    } catch (err) {
-      console.warn(`Blog directory not found: ${blogDir}`);
+    } catch (err: unknown) {
+      logger.warn(`Blog directory not found: ${blogDir}`);
     }
   }