docusaurus-plugin-llms 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
+ /**
2
+ * Document processing functions for the docusaurus-plugin-llms plugin
3
+ */
4
+
5
+ import * as path from 'path';
6
+ import matter from 'gray-matter';
7
+ import { minimatch } from 'minimatch';
8
+ import { DocInfo, PluginContext } from './types';
9
+ import {
10
+ readFile,
11
+ extractTitle,
12
+ cleanMarkdownContent,
13
+ applyPathTransformations
14
+ } from './utils';
15
+
16
+ /**
17
+ * Process a markdown file and extract its metadata and content
18
+ * @param filePath - Path to the markdown file
19
+ * @param baseDir - Base directory
20
+ * @param siteUrl - Base URL of the site
21
+ * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
22
+ * @param pathTransformation - Path transformation configuration
23
+ * @returns Processed file data
24
+ */
25
+ export async function processMarkdownFile(
26
+ filePath: string,
27
+ baseDir: string,
28
+ siteUrl: string,
29
+ pathPrefix: string = 'docs',
30
+ pathTransformation?: {
31
+ ignorePaths?: string[];
32
+ addPaths?: string[];
33
+ }
34
+ ): Promise<DocInfo> {
35
+ const content = await readFile(filePath);
36
+ const { data, content: markdownContent } = matter(content);
37
+
38
+ const relativePath = path.relative(baseDir, filePath);
39
+ // Convert to URL path format (replace backslashes with forward slashes on Windows)
40
+ const normalizedPath = relativePath.replace(/\\/g, '/');
41
+
42
+ // Convert .md extension to appropriate path
43
+ const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
44
+
45
+ // Handle index files specially
46
+ const linkPath = linkPathBase.endsWith('index')
47
+ ? linkPathBase.replace(/\/index$/, '')
48
+ : linkPathBase;
49
+
50
+ // Apply path transformations to the link path
51
+ const transformedLinkPath = applyPathTransformations(linkPath, pathTransformation);
52
+
53
+ // Also apply path transformations to the pathPrefix if it's not empty
54
+ // This allows removing 'docs' from the path when specified in ignorePaths
55
+ let transformedPathPrefix = pathPrefix;
56
+ if (pathPrefix && pathTransformation?.ignorePaths?.includes(pathPrefix)) {
57
+ transformedPathPrefix = '';
58
+ }
59
+
60
+ // Generate full URL with transformed path and path prefix
61
+ const fullUrl = new URL(
62
+ `${transformedPathPrefix ? `${transformedPathPrefix}/` : ''}${transformedLinkPath}`,
63
+ siteUrl
64
+ ).toString();
65
+
66
+ // Extract title
67
+ const title = extractTitle(data, markdownContent, filePath);
68
+
69
+ // Get description from frontmatter or first paragraph
70
+ let description = '';
71
+
72
+ // First priority: Use frontmatter description if available
73
+ if (data.description) {
74
+ description = data.description;
75
+ } else {
76
+ // Second priority: Find the first non-heading paragraph
77
+ const paragraphs = markdownContent.split('\n\n');
78
+ for (const para of paragraphs) {
79
+ const trimmedPara = para.trim();
80
+ // Skip empty paragraphs and headings
81
+ if (trimmedPara && !trimmedPara.startsWith('#')) {
82
+ description = trimmedPara;
83
+ break;
84
+ }
85
+ }
86
+
87
+ // Third priority: If still no description, use the first heading's content
88
+ if (!description) {
89
+ const firstHeadingMatch = markdownContent.match(/^#\s+(.*?)$/m);
90
+ if (firstHeadingMatch && firstHeadingMatch[1]) {
91
+ description = firstHeadingMatch[1].trim();
92
+ }
93
+ }
94
+ }
95
+
96
+ // Only remove heading markers at the beginning of descriptions or lines
97
+ // This preserves # characters that are part of the content
98
+ if (description) {
99
+ // Original approach had issues with hashtags inside content
100
+ // Fix: Only remove # symbols at the beginning of lines or description
101
+ // that are followed by a space (actual heading markers)
102
+ description = description.replace(/^(#+)\s+/gm, '');
103
+
104
+ // Special handling for description frontmatter with heading markers
105
+ if (data.description && data.description.startsWith('#')) {
106
+ // If the description in frontmatter starts with a heading marker,
107
+ // we should preserve it in the extracted description
108
+ description = description.replace(/^#+\s+/, '');
109
+ }
110
+
111
+ // Preserve inline hashtags (not heading markers)
112
+ // We don't want to treat hashtags in the middle of content as headings
113
+
114
+ // Validate that the description doesn't contain markdown headings
115
+ if (description.match(/^#+\s+/m)) {
116
+ console.warn(`Warning: Description for "${title}" may still contain heading markers`);
117
+ }
118
+
119
+ // Warn if the description contains HTML tags
120
+ if (/<[^>]+>/g.test(description)) {
121
+ console.warn(`Warning: Description for "${title}" contains HTML tags`);
122
+ }
123
+
124
+ // Warn if the description is very long
125
+ if (description.length > 500) {
126
+ console.warn(`Warning: Description for "${title}" is very long (${description.length} characters)`);
127
+ }
128
+ }
129
+
130
+ // Clean and process content
131
+ const cleanedContent = cleanMarkdownContent(markdownContent);
132
+
133
+ return {
134
+ title,
135
+ path: normalizedPath,
136
+ url: fullUrl,
137
+ content: cleanedContent,
138
+ description: description || '',
139
+ };
140
+ }
141
+
142
+ /**
143
+ * Process files based on include patterns, ignore patterns, and ordering
144
+ * @param context - Plugin context
145
+ * @param allFiles - All available files
146
+ * @param includePatterns - Patterns for files to include
147
+ * @param ignorePatterns - Patterns for files to ignore
148
+ * @param orderPatterns - Patterns for ordering files
149
+ * @param includeUnmatched - Whether to include unmatched files
150
+ * @returns Processed files
151
+ */
152
+ export async function processFilesWithPatterns(
153
+ context: PluginContext,
154
+ allFiles: string[],
155
+ includePatterns: string[] = [],
156
+ ignorePatterns: string[] = [],
157
+ orderPatterns: string[] = [],
158
+ includeUnmatched: boolean = false
159
+ ): Promise<DocInfo[]> {
160
+ const { siteDir, siteUrl, docsDir } = context;
161
+
162
+ // Filter files based on include patterns
163
+ let filteredFiles = allFiles;
164
+
165
+ if (includePatterns.length > 0) {
166
+ filteredFiles = allFiles.filter(file => {
167
+ const relativePath = path.relative(siteDir, file);
168
+ return includePatterns.some(pattern =>
169
+ minimatch(relativePath, pattern, { matchBase: true })
170
+ );
171
+ });
172
+ }
173
+
174
+ // Apply ignore patterns
175
+ if (ignorePatterns.length > 0) {
176
+ filteredFiles = filteredFiles.filter(file => {
177
+ const relativePath = path.relative(siteDir, file);
178
+ return !ignorePatterns.some(pattern =>
179
+ minimatch(relativePath, pattern, { matchBase: true })
180
+ );
181
+ });
182
+ }
183
+
184
+ // Order files according to orderPatterns
185
+ let filesToProcess: string[] = [];
186
+
187
+ if (orderPatterns.length > 0) {
188
+ const matchedFiles = new Set<string>();
189
+
190
+ // Process files according to orderPatterns
191
+ for (const pattern of orderPatterns) {
192
+ const matchingFiles = filteredFiles.filter(file => {
193
+ const relativePath = path.relative(siteDir, file);
194
+ return minimatch(relativePath, pattern, { matchBase: true }) && !matchedFiles.has(file);
195
+ });
196
+
197
+ for (const file of matchingFiles) {
198
+ filesToProcess.push(file);
199
+ matchedFiles.add(file);
200
+ }
201
+ }
202
+
203
+ // Add remaining files if includeUnmatched is true
204
+ if (includeUnmatched) {
205
+ const remainingFiles = filteredFiles.filter(file => !matchedFiles.has(file));
206
+ filesToProcess.push(...remainingFiles);
207
+ }
208
+ } else {
209
+ filesToProcess = filteredFiles;
210
+ }
211
+
212
+ // Process each file to generate DocInfo
213
+ const processedDocs: DocInfo[] = [];
214
+
215
+ for (const filePath of filesToProcess) {
216
+ try {
217
+ // Determine if this is a blog or docs file
218
+ const isBlogFile = filePath.includes(path.join(siteDir, 'blog'));
219
+ const baseDir = isBlogFile ? path.join(siteDir, 'blog') : path.join(siteDir, docsDir);
220
+ const pathPrefix = isBlogFile ? 'blog' : 'docs';
221
+
222
+ const docInfo = await processMarkdownFile(
223
+ filePath,
224
+ baseDir,
225
+ siteUrl,
226
+ pathPrefix,
227
+ context.options.pathTransformation
228
+ );
229
+ processedDocs.push(docInfo);
230
+ } catch (err: any) {
231
+ console.warn(`Error processing ${filePath}: ${err.message}`);
232
+ }
233
+ }
234
+
235
+ return processedDocs;
236
+ }
package/src/types.ts ADDED
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Type definitions for the docusaurus-plugin-llms plugin
3
+ */
4
+
5
+ import type { LoadContext } from '@docusaurus/types';
6
+
7
/**
 * Interface for processed document information
 */
export interface DocInfo {
  /** Document title: frontmatter `title`, first heading, or derived from the filename */
  title: string;
  /** File path relative to its base directory, using forward slashes */
  path: string;
  /** Absolute URL of the document on the site */
  url: string;
  /** Cleaned markdown content (HTML tags stripped, whitespace normalized) */
  content: string;
  /** Short description (frontmatter or first paragraph); may be empty */
  description: string;
}
17
+
18
/**
 * Interface for custom LLM file configuration
 */
export interface CustomLLMFile {
  /** Name of the output file (e.g., 'llms-python.txt') */
  filename: string;

  /** Glob patterns for files to include */
  includePatterns: string[];

  /** Whether to include full content (true) or just links (false) */
  fullContent: boolean;

  /** Custom title for this file (defaults to site title) */
  title?: string;

  /** Custom description for this file (defaults to site description) */
  description?: string;

  /** Additional glob patterns to exclude (combined with the global ignoreFiles) */
  ignorePatterns?: string[];

  /** Glob patterns controlling file ordering (same semantics as includeOrder) */
  orderPatterns?: string[];

  /** Whether to append files not matched by orderPatterns at the end (default: false) */
  includeUnmatchedLast?: boolean;

  /** Version information for this LLM file */
  version?: string;
}
49
+
50
/**
 * Plugin options interface
 */
export interface PluginOptions {
  /** Whether to generate the llms.txt file (default: true) */
  generateLLMsTxt?: boolean;

  /** Whether to generate the llms-full.txt file (default: true) */
  generateLLMsFullTxt?: boolean;

  /** Base directory for documentation files (default: 'docs') */
  docsDir?: string;

  /** Array of glob patterns for files to ignore */
  ignoreFiles?: string[];

  /** Custom title to use in generated files (defaults to site title) */
  title?: string;

  /** Custom description to use in generated files (defaults to site tagline) */
  description?: string;

  /** Custom file name for the links file (default: 'llms.txt') */
  llmsTxtFilename?: string;

  /** Custom file name for the full content file (default: 'llms-full.txt') */
  llmsFullTxtFilename?: string;

  /** Whether to include blog content (default: false) */
  includeBlog?: boolean;

  /** Path transformation options for URL construction */
  pathTransformation?: {
    /** Path segments to remove from URLs when found */
    ignorePaths?: string[];
    /** Path segments to prepend to URLs when not already present */
    addPaths?: string[];
  };

  /** Glob patterns controlling file order; files are emitted in the order of the first pattern that matches them */
  includeOrder?: string[];

  /** Whether to include files that don't match any pattern in includeOrder at the end (default: true) */
  includeUnmatchedLast?: boolean;

  /** Array of custom LLM file configurations */
  customLLMFiles?: CustomLLMFile[];

  /** Global version for all generated LLM files */
  version?: string;
}
101
+
102
/**
 * Plugin context with processed options
 */
export interface PluginContext {
  /** Absolute path of the Docusaurus site root; files are matched relative to it */
  siteDir: string;
  /** Build output directory — presumably where generated files are written; confirm against plugin entry point */
  outDir: string;
  /** Base URL of the site, used when constructing document URLs */
  siteUrl: string;
  /** Documentation directory (joined onto siteDir for non-blog files) */
  docsDir: string;
  /** Resolved title for generated files — NOTE(review): not referenced in this module; verify against generator */
  docTitle: string;
  /** Resolved description for generated files — NOTE(review): not referenced in this module; verify against generator */
  docDescription: string;
  /** User-supplied plugin options after defaulting */
  options: PluginOptions;
}
package/src/utils.ts ADDED
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Utility functions for the docusaurus-plugin-llms plugin
3
+ */
4
+
5
+ import * as fs from 'fs/promises';
6
+ import * as path from 'path';
7
+ import { minimatch } from 'minimatch';
8
+ import { PluginOptions } from './types';
9
+
10
+ /**
11
+ * Write content to a file
12
+ * @param filePath - Path to write the file to
13
+ * @param data - Content to write
14
+ */
15
+ export async function writeFile(filePath: string, data: string): Promise<void> {
16
+ return fs.writeFile(filePath, data, 'utf8');
17
+ }
18
+
19
+ /**
20
+ * Read content from a file
21
+ * @param filePath - Path of the file to read
22
+ * @returns Content of the file
23
+ */
24
+ export async function readFile(filePath: string): Promise<string> {
25
+ return fs.readFile(filePath, 'utf8');
26
+ }
27
+
28
+ /**
29
+ * Check if a file should be ignored based on glob patterns
30
+ * @param filePath - Path to the file
31
+ * @param baseDir - Base directory for relative paths
32
+ * @param ignorePatterns - Glob patterns for files to ignore
33
+ * @returns Whether the file should be ignored
34
+ */
35
+ export function shouldIgnoreFile(filePath: string, baseDir: string, ignorePatterns: string[]): boolean {
36
+ if (ignorePatterns.length === 0) {
37
+ return false;
38
+ }
39
+
40
+ const relativePath = path.relative(baseDir, filePath);
41
+
42
+ return ignorePatterns.some(pattern =>
43
+ minimatch(relativePath, pattern, { matchBase: true })
44
+ );
45
+ }
46
+
47
+ /**
48
+ * Recursively reads all Markdown files in a directory
49
+ * @param dir - Directory to scan
50
+ * @param baseDir - Base directory for relative paths
51
+ * @param ignorePatterns - Glob patterns for files to ignore
52
+ * @returns Array of file paths
53
+ */
54
+ export async function readMarkdownFiles(dir: string, baseDir: string, ignorePatterns: string[] = []): Promise<string[]> {
55
+ const files: string[] = [];
56
+ const entries = await fs.readdir(dir, { withFileTypes: true });
57
+
58
+ for (const entry of entries) {
59
+ const fullPath = path.join(dir, entry.name);
60
+
61
+ if (shouldIgnoreFile(fullPath, baseDir, ignorePatterns)) {
62
+ continue;
63
+ }
64
+
65
+ if (entry.isDirectory()) {
66
+ const subDirFiles = await readMarkdownFiles(fullPath, baseDir, ignorePatterns);
67
+ files.push(...subDirFiles);
68
+ } else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
69
+ files.push(fullPath);
70
+ }
71
+ }
72
+
73
+ return files;
74
+ }
75
+
76
+ /**
77
+ * Extract title from content or use the filename
78
+ * @param data - Frontmatter data
79
+ * @param content - Markdown content
80
+ * @param filePath - Path to the file
81
+ * @returns Extracted title
82
+ */
83
+ export function extractTitle(data: any, content: string, filePath: string): string {
84
+ // First try frontmatter
85
+ if (data.title) {
86
+ return data.title;
87
+ }
88
+
89
+ // Then try first heading
90
+ const headingMatch = content.match(/^#\s+(.*)/m);
91
+ if (headingMatch) {
92
+ return headingMatch[1].trim();
93
+ }
94
+
95
+ // Finally use filename
96
+ return path.basename(filePath, path.extname(filePath))
97
+ .replace(/-/g, ' ')
98
+ .replace(/\b\w/g, c => c.toUpperCase());
99
+ }
100
+
101
+ /**
102
+ * Clean markdown content for LLM consumption
103
+ * @param content - Raw markdown content
104
+ * @returns Cleaned content
105
+ */
106
+ export function cleanMarkdownContent(content: string): string {
107
+ // Remove HTML tags
108
+ let cleaned = content.replace(/<[^>]*>/g, '');
109
+
110
+ // Normalize whitespace
111
+ cleaned = cleaned.replace(/\r\n/g, '\n')
112
+ .replace(/\n{3,}/g, '\n\n')
113
+ .trim();
114
+
115
+ return cleaned;
116
+ }
117
+
118
+ /**
119
+ * Apply path transformations according to configuration
120
+ * @param urlPath - Original URL path
121
+ * @param pathTransformation - Path transformation configuration
122
+ * @returns Transformed URL path
123
+ */
124
+ export function applyPathTransformations(
125
+ urlPath: string,
126
+ pathTransformation?: PluginOptions['pathTransformation']
127
+ ): string {
128
+ if (!pathTransformation) {
129
+ return urlPath;
130
+ }
131
+
132
+ let transformedPath = urlPath;
133
+
134
+ // Remove ignored path segments
135
+ if (pathTransformation.ignorePaths?.length) {
136
+ for (const ignorePath of pathTransformation.ignorePaths) {
137
+ // Create a regex that matches the ignore path at the beginning, middle, or end of the path
138
+ // We use word boundaries to ensure we match complete path segments
139
+ const ignoreRegex = new RegExp(`(^|/)(${ignorePath})(/|$)`, 'g');
140
+ transformedPath = transformedPath.replace(ignoreRegex, '$1$3');
141
+ }
142
+
143
+ // Clean up any double slashes that might have been created
144
+ transformedPath = transformedPath.replace(/\/+/g, '/');
145
+
146
+ // Remove leading slash if present
147
+ transformedPath = transformedPath.replace(/^\//, '');
148
+ }
149
+
150
+ // Add path segments if they're not already present
151
+ if (pathTransformation.addPaths?.length) {
152
+ // Process in reverse order to maintain the specified order in the final path
153
+ // This is because each path is prepended to the front
154
+ const pathsToAdd = [...pathTransformation.addPaths].reverse();
155
+
156
+ for (const addPath of pathsToAdd) {
157
+ // Only add if not already present at the beginning
158
+ if (!transformedPath.startsWith(addPath + '/') && transformedPath !== addPath) {
159
+ transformedPath = `${addPath}/${transformedPath}`;
160
+ }
161
+ }
162
+ }
163
+
164
+ return transformedPath;
165
+ }