npm - docusaurus-plugin-llms - Version diff - 0.2.0 → 0.3.0 - Mend

docusaurus-plugin-llms 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +246 -15
package/lib/generator-current.d.ts +44 -0
package/lib/generator-current.js +398 -0
package/lib/generator.d.ts +6 -2
package/lib/generator.js +200 -120
package/lib/index.js +175 -10
package/lib/null-handling-guide.d.ts +47 -0
package/lib/null-handling-guide.js +290 -0
package/lib/processor.d.ts +0 -10
package/lib/processor.js +230 -83
package/lib/types.d.ts +13 -0
package/lib/utils.d.ts +165 -6
package/lib/utils.js +481 -28
package/package.json +5 -3
package/src/generator.ts +270 -128
package/src/index.ts +204 -14
package/src/null-handling-guide.ts +321 -0
package/src/processor.ts +314 -127
package/src/types.ts +20 -1
package/src/utils.ts +594 -48

package/lib/generator.js CHANGED Viewed

@@ -51,10 +51,11 @@ const processor_1 = require("./processor");
  * @returns Cleaned description suitable for TOC
  */
 function cleanDescriptionForToc(description) {
-    if (!description)
+    if (!(0, utils_1.isNonEmptyString)(description))
         return '';
     // Get just the first line for TOC display
-    const firstLine = description.split('\n')[0];
+    const lines = description.split('\n');
+    const firstLine = lines.length > 0 ? lines[0] : '';
     // Remove heading markers only at the beginning of the line
     // Be careful to only remove actual heading markers (# followed by space at beginning)
     // and not hashtag symbols that are part of the content (inline hashtags)
@@ -71,77 +72,74 @@ function cleanDescriptionForToc(description) {
  * @param includeFullContent - Whether to include full content or just links
  * @param version - Version of the file
  * @param customRootContent - Optional custom content to include at the root level
+ * @param batchSize - Batch size for processing documents (default: 100)
  */
-async function generateLLMFile(docs, outputPath, fileTitle, fileDescription, includeFullContent, version, customRootContent) {
-    console.log(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
+async function generateLLMFile(docs, outputPath, fileTitle, fileDescription, includeFullContent, version, customRootContent, batchSize = 100) {
+    // Validate path length before proceeding
+    if (!(0, utils_1.validatePathLength)(outputPath)) {
+        throw new Error(`Output path exceeds maximum length: ${outputPath}`);
+    }
+    utils_1.logger.verbose(`Generating file: ${outputPath}, version: ${version || 'undefined'}`);
     const versionInfo = version ? `\n\nVersion: ${version}` : '';
     if (includeFullContent) {
         // Generate full content file with header deduplication
+        // Process documents in batches to prevent memory issues on large sites
         const usedHeaders = new Set();
-        const fullContentSections = docs.map(doc => {
-            // Check if content already starts with the same heading to avoid duplication
-            const trimmedContent = doc.content.trim();
-            const firstLine = trimmedContent.split('\n')[0];
-            // Check if the first line is a heading that matches our title
-            const headingMatch = firstLine.match(/^#+\s+(.+)$/);
-            const firstHeadingText = headingMatch ? headingMatch[1].trim() : null;
-            // Determine the header text to use (original title or make it unique)
-            let headerText = doc.title;
-            let uniqueHeader = headerText;
-            let counter = 1;
-            // If this header has been used before, make it unique by adding a suffix
-            while (usedHeaders.has(uniqueHeader.toLowerCase())) {
-                counter++;
-                // Try to make it more descriptive by adding the file path info if available
-                if (doc.path && counter === 2) {
-                    const pathParts = doc.path.split('/');
-                    const folderName = pathParts.length > 1 ? pathParts[pathParts.length - 2] : '';
-                    if (folderName) {
-                        uniqueHeader = `${headerText} (${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
-                    }
-                    else {
-                        uniqueHeader = `${headerText} (${counter})`;
-                    }
-                }
-                else {
-                    uniqueHeader = `${headerText} (${counter})`;
-                }
+        const fullContentSections = [];
+        // Process documents in batches
+        for (let i = 0; i < docs.length; i += batchSize) {
+            const batch = docs.slice(i, i + batchSize);
+            const batchNumber = Math.floor(i / batchSize) + 1;
+            const totalBatches = Math.ceil(docs.length / batchSize);
+            if (totalBatches > 1) {
+                utils_1.logger.verbose(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} documents)`);
             }
-            usedHeaders.add(uniqueHeader.toLowerCase());
-            if (firstHeadingText === doc.title) {
-                // Content already has the same heading, replace it with our unique header if needed
-                if (uniqueHeader !== doc.title) {
+            const batchSections = batch.map(doc => {
+                // Check if content already starts with the same heading to avoid duplication
+                const trimmedContent = doc.content.trim();
+                const contentLines = trimmedContent.split('\n');
+                const firstLine = contentLines.length > 0 ? contentLines[0] : '';
+                // Check if the first line is a heading that matches our title
+                const headingMatch = firstLine.match(/^#+\s+(.+)$/);
+                const firstHeadingText = headingMatch ? headingMatch[1].trim() : null;
+                // Generate unique header using the utility function
+                const uniqueHeader = (0, utils_1.ensureUniqueIdentifier)(doc.title, usedHeaders, (counter, base) => {
+                    // Try to make it more descriptive by adding the file path info if available
+                    if ((0, utils_1.isNonEmptyString)(doc.path) && counter === 2) {
+                        const pathParts = doc.path.split('/');
+                        const folderName = pathParts.length >= 2 ? pathParts[pathParts.length - 2] : '';
+                        if ((0, utils_1.isNonEmptyString)(folderName)) {
+                            return `(${folderName.charAt(0).toUpperCase() + folderName.slice(1)})`;
+                        }
+                    }
+                    return `(${counter})`;
+                });
+                if (firstHeadingText === doc.title) {
+                    // Content already has the same heading, replace it with our unique header
                     const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
                     return `## ${uniqueHeader}
 ${restOfContent}`;
                 }
                 else {
-                    // Replace the existing H1 with H2 to comply with llmstxt.org standard
-                    const restOfContent = trimmedContent.split('\n').slice(1).join('\n');
+                    // Content doesn't have the same heading, add our unique H2 header
                     return `## ${uniqueHeader}
-${restOfContent}`;
-                }
-            }
-            else {
-                // Content doesn't have the same heading, add our unique H2 header
-                return `## ${uniqueHeader}
 ${doc.content}`;
-            }
-        });
+                }
+            });
+            fullContentSections.push(...batchSections);
+        }
         // Use custom root content or default message
         const rootContent = customRootContent || 'This file contains all documentation content in a single document following the llmstxt.org standard.';
-        const llmFileContent = `# ${fileTitle}
-> ${fileDescription}${versionInfo}
-${rootContent}
-${fullContentSections.join('\n\n---\n\n')}
-`;
-        await (0, utils_1.writeFile)(outputPath, llmFileContent);
+        const llmFileContent = (0, utils_1.createMarkdownContent)(fileTitle, `${fileDescription}${versionInfo}`, `${rootContent}\n\n${fullContentSections.join('\n\n---\n\n')}`, true // include metadata (description)
+        );
+        try {
+            await (0, utils_1.writeFile)(outputPath, llmFileContent);
+        }
+        catch (error) {
+            throw new Error(`Failed to write file ${outputPath}: ${(0, utils_1.getErrorMessage)(error)}`);
+        }
     }
     else {
         // Generate links-only file
@@ -152,71 +150,144 @@ ${fullContentSections.join('\n\n---\n\n')}
         });
         // Use custom root content or default message
         const rootContent = customRootContent || 'This file contains links to documentation sections following the llmstxt.org standard.';
-        const llmFileContent = `# ${fileTitle}
-> ${fileDescription}${versionInfo}
-${rootContent}
-## Table of Contents
-${tocItems.join('\n')}
-`;
-        await (0, utils_1.writeFile)(outputPath, llmFileContent);
+        const llmFileContent = (0, utils_1.createMarkdownContent)(fileTitle, `${fileDescription}${versionInfo}`, `${rootContent}\n\n## Table of Contents\n\n${tocItems.join('\n')}`, true // include metadata (description)
+        );
+        try {
+            await (0, utils_1.writeFile)(outputPath, llmFileContent);
+        }
+        catch (error) {
+            throw new Error(`Failed to write file ${outputPath}: ${(0, utils_1.getErrorMessage)(error)}`);
+        }
     }
-    console.log(`Generated: ${outputPath}`);
+    utils_1.logger.info(`Generated: ${outputPath}`);
 }
 /**
  * Generate individual markdown files for each document
  * @param docs - Processed document information
  * @param outputDir - Directory to write the markdown files
  * @param siteUrl - Base site URL
+ * @param docsDir - The configured docs directory name (e.g., 'docs', 'documentation', etc.)
+ * @param keepFrontMatter - Array of frontmatter keys to preserve in generated files
+ * @param preserveDirectoryStructure - Whether to preserve the full directory structure (default: true)
  * @returns Updated docs with new URLs pointing to generated markdown files
  */
-async function generateIndividualMarkdownFiles(docs, outputDir, siteUrl) {
+async function generateIndividualMarkdownFiles(docs, outputDir, siteUrl, docsDir = 'docs', keepFrontMatter = [], preserveDirectoryStructure = true) {
     const updatedDocs = [];
-    // Create a map to ensure unique filenames
-    const usedFilenames = new Set();
+    const usedPaths = new Set();
     for (const doc of docs) {
-        // Generate a filename from the document title or URL path
-        let baseFilename = doc.title
-            .toLowerCase()
-            .replace(/[^a-z0-9]+/g, '-')
-            .replace(/^-+|-+$/g, '');
-        // Fallback to URL path if title generates empty filename
-        if (!baseFilename) {
-            baseFilename = doc.path
-                .replace(/^\/+|\/+$/g, '') // Remove leading/trailing slashes
-                .replace(/\//g, '-')
-                .replace(/[^a-z0-9-]/gi, '-')
-                .toLowerCase();
+        // Use the original path structure as default filename.
+        let relativePath = doc.path
+            .replace(/^\/+/, '') // Remove leading slashes
+            .replace(/\.mdx?$/, '.md'); // Ensure .md extension
+        // Strip the docsDir prefix only if preserveDirectoryStructure is false
+        if (!preserveDirectoryStructure) {
+            relativePath = relativePath
+                .replace(new RegExp(`^${docsDir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), ''); // Remove configured docs dir prefix
+        }
+        // If frontmatter has slug, use that.
+        if ((0, utils_1.isNonEmptyString)(doc.frontMatter?.slug)) {
+            const slug = doc.frontMatter.slug.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+            if ((0, utils_1.isNonEmptyString)(slug)) { // Only process if slug is not empty after trimming
+                if (slug.includes('/')) {
+                    // Nested slug: create directory structure
+                    relativePath = slug + '.md';
+                }
+                else {
+                    // Simple slug: replace just the filename
+                    const pathParts = relativePath.replace(/\.md$/, '').split('/');
+                    pathParts[pathParts.length - 1] = slug;
+                    relativePath = pathParts.join('/') + '.md';
+                }
+            }
+        }
+        // Otherwise, if frontmatter has id, use that.
+        else if ((0, utils_1.isNonEmptyString)(doc.frontMatter?.id)) {
+            const id = doc.frontMatter.id.trim().replace(/^\/+|\/+$/g, ''); // Trim whitespace and slashes
+            if ((0, utils_1.isNonEmptyString)(id)) { // Only process if id is not empty after trimming
+                if (id.includes('/')) {
+                    // Nested id: create directory structure
+                    relativePath = id + '.md';
+                }
+                else {
+                    // Simple id: replace just the filename
+                    const pathParts = relativePath.replace(/\.md$/, '').split('/');
+                    pathParts[pathParts.length - 1] = id;
+                    relativePath = pathParts.join('/') + '.md';
+                }
+            }
         }
-        // Ensure filename uniqueness
-        let filename = `${baseFilename}.md`;
+        // Trim any leading/trailing whitespace from the path
+        relativePath = relativePath.trim();
+        // If path is empty or invalid, create a fallback path
+        if (!(0, utils_1.isNonEmptyString)(relativePath) || relativePath === '.md') {
+            const sanitizedTitle = (0, utils_1.sanitizeForFilename)(doc.title, 'untitled');
+            relativePath = `${sanitizedTitle}.md`;
+        }
+        // Ensure path uniqueness
+        let uniquePath = relativePath;
         let counter = 1;
-        while (usedFilenames.has(filename)) {
-            filename = `${baseFilename}-${counter}.md`;
+        const MAX_PATH_ITERATIONS = 10000;
+        let pathIterations = 0;
+        while (usedPaths.has(uniquePath.toLowerCase())) {
             counter++;
+            const pathParts = relativePath.split('.');
+            const extension = pathParts.pop() || 'md';
+            const basePath = pathParts.join('.');
+            uniquePath = `${basePath}-${counter}.${extension}`;
+            pathIterations++;
+            if (pathIterations >= MAX_PATH_ITERATIONS) {
+                // Fallback to timestamp
+                const timestamp = Date.now();
+                uniquePath = `${basePath}-${timestamp}.${extension}`;
+                utils_1.logger.warn(`Maximum iterations reached for unique path. Using timestamp: ${uniquePath}`);
+                break;
+            }
         }
-        usedFilenames.add(filename);
-        // Create markdown content following llmstxt.org standard
-        const markdownContent = `# ${doc.title}
-> ${doc.description}
-${doc.content}
-`;
+        usedPaths.add(uniquePath.toLowerCase());
+        // Create the full file path and validate/shorten if needed
+        let fullPath = path.join(outputDir, uniquePath);
+        fullPath = (0, utils_1.shortenPathIfNeeded)(fullPath, outputDir, uniquePath);
+        // Update uniquePath to reflect the shortened path if it was changed
+        if (fullPath !== path.join(outputDir, uniquePath)) {
+            uniquePath = path.relative(outputDir, fullPath);
+        }
+        const directory = path.dirname(fullPath);
+        // Create directory structure if it doesn't exist
+        try {
+            await fs.mkdir(directory, { recursive: true });
+        }
+        catch (error) {
+            throw new Error(`Failed to create directory ${directory}: ${(0, utils_1.getErrorMessage)(error)}`);
+        }
+        // Extract preserved frontmatter if specified
+        let preservedFrontMatter = {};
+        if ((0, utils_1.isNonEmptyArray)(keepFrontMatter) && (0, utils_1.isDefined)(doc.frontMatter)) {
+            for (const key of keepFrontMatter) {
+                if (key in doc.frontMatter) {
+                    preservedFrontMatter[key] = doc.frontMatter[key];
+                }
+            }
+        }
+        // Create markdown content using the utility function
+        const markdownContent = (0, utils_1.createMarkdownContent)(doc.title, doc.description, doc.content, true, // includeMetadata
+        Object.keys(preservedFrontMatter).length > 0 ? preservedFrontMatter : undefined);
         // Write the markdown file
-        const markdownPath = path.join(outputDir, filename);
-        await (0, utils_1.writeFile)(markdownPath, markdownContent);
+        try {
+            await (0, utils_1.writeFile)(fullPath, markdownContent);
+        }
+        catch (error) {
+            throw new Error(`Failed to write file ${fullPath}: ${(0, utils_1.getErrorMessage)(error)}`);
+        }
         // Create updated DocInfo with new URL pointing to the generated markdown file
-        const newUrl = `${siteUrl}/${filename}`;
+        // Convert file path to URL path (use forward slashes)
+        const urlPath = (0, utils_1.normalizePath)(uniquePath);
+        const newUrl = `${siteUrl}/${urlPath}`;
         updatedDocs.push({
             ...doc,
             url: newUrl,
-            path: `/${filename}` // Update path to the new markdown file
+            path: `/${urlPath}` // Update path to the new markdown file
         });
-        console.log(`Generated markdown file: ${filename}`);
+        utils_1.logger.verbose(`Generated markdown file: ${uniquePath}`);
     }
     return updatedDocs;
 }
@@ -227,31 +298,38 @@ ${doc.content}
  */
 async function generateStandardLLMFiles(context, allDocFiles) {
     const { outDir, siteUrl, docTitle, docDescription, options } = context;
-    const { generateLLMsTxt, generateLLMsFullTxt, llmsTxtFilename = 'llms.txt', llmsFullTxtFilename = 'llms-full.txt', includeOrder = [], includeUnmatchedLast = true, version, generateMarkdownFiles = false, rootContent, fullRootContent } = options;
+    const { generateLLMsTxt, generateLLMsFullTxt, llmsTxtFilename = 'llms.txt', llmsFullTxtFilename = 'llms-full.txt', includeOrder = [], includeUnmatchedLast = true, version, generateMarkdownFiles = false, rootContent, fullRootContent, processingBatchSize = 100 } = options;
     if (!generateLLMsTxt && !generateLLMsFullTxt) {
+        utils_1.logger.warn('No standard LLM files configured for generation. Skipping.');
         return;
     }
     // Process files for the standard outputs
     let processedDocs = await (0, processor_1.processFilesWithPatterns)(context, allDocFiles, [], // No specific include patterns - include all
     [], // No additional ignore patterns beyond global ignoreFiles
     includeOrder, includeUnmatchedLast);
-    console.log(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+    utils_1.logger.verbose(`Processed ${processedDocs.length} documentation files for standard LLM files`);
+    // Check if we have documents to process
+    if (!(0, utils_1.isNonEmptyArray)(processedDocs)) {
+        utils_1.logger.warn('No documents found matching patterns for standard LLM files. Skipping.');
+        return;
+    }
     // Generate individual markdown files if requested
-    if (generateMarkdownFiles && processedDocs.length > 0) {
-        console.log('Generating individual markdown files...');
-        processedDocs = await generateIndividualMarkdownFiles(processedDocs, outDir, siteUrl);
+    if (generateMarkdownFiles) {
+        utils_1.logger.info('Generating individual markdown files...');
+        processedDocs = await generateIndividualMarkdownFiles(processedDocs, outDir, siteUrl, context.docsDir, context.options.keepFrontMatter || [], context.options.preserveDirectoryStructure !== false // Default to true
+        );
     }
     // Generate llms.txt
     if (generateLLMsTxt) {
         const llmsTxtPath = path.join(outDir, llmsTxtFilename);
         await generateLLMFile(processedDocs, llmsTxtPath, docTitle, docDescription, false, // links only
-        version, rootContent);
+        version, rootContent, processingBatchSize);
     }
     // Generate llms-full.txt
     if (generateLLMsFullTxt) {
         const llmsFullTxtPath = path.join(outDir, llmsFullTxtFilename);
         await generateLLMFile(processedDocs, llmsFullTxtPath, docTitle, docDescription, true, // full content
-        version, fullRootContent);
+        version, fullRootContent, processingBatchSize);
     }
 }
 /**
@@ -261,13 +339,14 @@ async function generateStandardLLMFiles(context, allDocFiles) {
  */
 async function generateCustomLLMFiles(context, allDocFiles) {
     const { outDir, siteUrl, docTitle, docDescription, options } = context;
-    const { customLLMFiles = [], ignoreFiles = [], generateMarkdownFiles = false } = options;
+    const { customLLMFiles = [], ignoreFiles = [], generateMarkdownFiles = false, processingBatchSize = 100 } = options;
     if (customLLMFiles.length === 0) {
+        utils_1.logger.warn('No custom LLM files configured. Skipping.');
         return;
     }
-    console.log(`Generating ${customLLMFiles.length} custom LLM files...`);
+    utils_1.logger.info(`Generating ${customLLMFiles.length} custom LLM files...`);
     for (const customFile of customLLMFiles) {
-        console.log(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
+        utils_1.logger.verbose(`Processing custom file: ${customFile.filename}, version: ${customFile.version || 'undefined'}`);
         // Combine global ignores with custom ignores
         const combinedIgnores = [...ignoreFiles];
         if (customFile.ignorePatterns) {
@@ -278,19 +357,20 @@ async function generateCustomLLMFiles(context, allDocFiles) {
         if (customDocs.length > 0) {
             // Generate individual markdown files if requested
             if (generateMarkdownFiles) {
-                console.log(`Generating individual markdown files for custom file: ${customFile.filename}...`);
-                customDocs = await generateIndividualMarkdownFiles(customDocs, outDir, siteUrl);
+                utils_1.logger.info(`Generating individual markdown files for custom file: ${customFile.filename}...`);
+                customDocs = await generateIndividualMarkdownFiles(customDocs, outDir, siteUrl, context.docsDir, context.options.keepFrontMatter || [], context.options.preserveDirectoryStructure !== false // Default to true
+                );
             }
             // Use custom title/description or fall back to defaults
             const customTitle = customFile.title || docTitle;
             const customDescription = customFile.description || docDescription;
             // Generate the custom LLM file
             const customFilePath = path.join(outDir, customFile.filename);
-            await generateLLMFile(customDocs, customFilePath, customTitle, customDescription, customFile.fullContent, customFile.version, customFile.rootContent);
-            console.log(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
+            await generateLLMFile(customDocs, customFilePath, customTitle, customDescription, customFile.fullContent, customFile.version, customFile.rootContent, processingBatchSize);
+            utils_1.logger.info(`Generated custom LLM file: ${customFile.filename} with ${customDocs.length} documents`);
         }
         else {
-            console.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
+            utils_1.logger.warn(`No matching documents found for custom LLM file: ${customFile.filename}`);
         }
     }
 }
@@ -301,18 +381,18 @@ async function generateCustomLLMFiles(context, allDocFiles) {
  */
 async function collectDocFiles(context) {
     const { siteDir, docsDir, options } = context;
-    const { ignoreFiles = [], includeBlog = false } = options;
+    const { ignoreFiles = [], includeBlog = false, warnOnIgnoredFiles = false } = options;
     const allDocFiles = [];
     // Process docs directory
     const fullDocsDir = path.join(siteDir, docsDir);
     try {
         await fs.access(fullDocsDir);
         // Collect all markdown files from docs directory
-        const docFiles = await (0, utils_1.readMarkdownFiles)(fullDocsDir, siteDir, ignoreFiles);
+        const docFiles = await (0, utils_1.readMarkdownFiles)(fullDocsDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
         allDocFiles.push(...docFiles);
     }
     catch (err) {
-        console.warn(`Docs directory not found: ${fullDocsDir}`);
+        utils_1.logger.warn(`Docs directory not found: ${fullDocsDir}`);
     }
     // Process blog if enabled
     if (includeBlog) {
@@ -320,11 +400,11 @@ async function collectDocFiles(context) {
         try {
             await fs.access(blogDir);
             // Collect all markdown files from blog directory
-            const blogFiles = await (0, utils_1.readMarkdownFiles)(blogDir, siteDir, ignoreFiles);
+            const blogFiles = await (0, utils_1.readMarkdownFiles)(blogDir, siteDir, ignoreFiles, docsDir, warnOnIgnoredFiles);
             allDocFiles.push(...blogFiles);
         }
         catch (err) {
-            console.warn(`Blog directory not found: ${blogDir}`);
+            utils_1.logger.warn(`Blog directory not found: ${blogDir}`);
         }
     }
     return allDocFiles;