docusaurus-plugin-llms 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +306 -17
- package/lib/generator.d.ts +32 -0
- package/lib/generator.js +212 -0
- package/lib/index.d.ts +1 -24
- package/lib/index.js +39 -288
- package/lib/processor.d.ts +28 -0
- package/lib/processor.js +211 -0
- package/lib/utils.d.ts +53 -0
- package/lib/utils.js +177 -0
- package/package.json +4 -2
- package/src/generator.ts +266 -0
- package/src/index.ts +48 -348
- package/src/processor.ts +236 -0
- package/src/types.ts +113 -0
- package/src/utils.ts +165 -0
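For orientation before the file-by-file diffs: the visible API change in this release is a larger option surface. Every option name below appears verbatim in the lib/index.js diff (and the ignorePaths/addPaths shape in lib/processor.d.ts); the values are illustrative only, not package defaults beyond those shown in the diff.

// docusaurus.config.js — illustrative wiring of the options added in this release
module.exports = {
  plugins: [
    [
      'docusaurus-plugin-llms',
      {
        generateLLMsTxt: true,
        generateLLMsFullTxt: true,
        docsDir: 'docs',
        includeBlog: false,
        // New in this release:
        pathTransformation: { ignorePaths: ['docs'], addPaths: [] },
        includeOrder: ['getting-started/*', 'api/*'], // hypothetical glob patterns
        includeUnmatchedLast: true,
        customLLMFiles: [],
      },
    ],
  ],
};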
package/lib/index.js
CHANGED
@@ -8,184 +8,9 @@
  *
  * The plugin runs during the Docusaurus build process and scans all Markdown files in the docs directory.
  */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-        desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.default = docusaurusPluginLLMs;
-const
-const path = __importStar(require("path"));
-const gray_matter_1 = __importDefault(require("gray-matter"));
-const minimatch_1 = require("minimatch");
-/**
- * Write content to a file
- * @param filePath - Path to write the file to
- * @param data - Content to write
- */
-async function writeFile(filePath, data) {
-    return fs.writeFile(filePath, data, 'utf8');
-}
-/**
- * Read content from a file
- * @param filePath - Path of the file to read
- * @returns Content of the file
- */
-async function readFile(filePath) {
-    return fs.readFile(filePath, 'utf8');
-}
-/**
- * Check if a file should be ignored based on glob patterns
- * @param filePath - Path to the file
- * @param baseDir - Base directory for relative paths
- * @param ignorePatterns - Glob patterns for files to ignore
- * @returns Whether the file should be ignored
- */
-function shouldIgnoreFile(filePath, baseDir, ignorePatterns) {
-    if (ignorePatterns.length === 0) {
-        return false;
-    }
-    const relativePath = path.relative(baseDir, filePath);
-    return ignorePatterns.some(pattern => (0, minimatch_1.minimatch)(relativePath, pattern, { matchBase: true }));
-}
-/**
- * Recursively reads all Markdown files in a directory
- * @param dir - Directory to scan
- * @param baseDir - Base directory for relative paths
- * @param ignorePatterns - Glob patterns for files to ignore
- * @returns Array of file paths
- */
-async function readMarkdownFiles(dir, baseDir, ignorePatterns = []) {
-    const files = [];
-    const entries = await fs.readdir(dir, { withFileTypes: true });
-    for (const entry of entries) {
-        const fullPath = path.join(dir, entry.name);
-        if (shouldIgnoreFile(fullPath, baseDir, ignorePatterns)) {
-            continue;
-        }
-        if (entry.isDirectory()) {
-            const subDirFiles = await readMarkdownFiles(fullPath, baseDir, ignorePatterns);
-            files.push(...subDirFiles);
-        }
-        else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
-            files.push(fullPath);
-        }
-    }
-    return files;
-}
-/**
- * Extract title from content or use the filename
- * @param data - Frontmatter data
- * @param content - Markdown content
- * @param filePath - Path to the file
- * @returns Extracted title
- */
-function extractTitle(data, content, filePath) {
-    // First try frontmatter
-    if (data.title) {
-        return data.title;
-    }
-    // Then try first heading
-    const headingMatch = content.match(/^#\s+(.*)/m);
-    if (headingMatch) {
-        return headingMatch[1].trim();
-    }
-    // Finally use filename
-    return path.basename(filePath, path.extname(filePath))
-        .replace(/-/g, ' ')
-        .replace(/\b\w/g, c => c.toUpperCase());
-}
-/**
- * Clean markdown content for LLM consumption
- * @param content - Raw markdown content
- * @returns Cleaned content
- */
-function cleanMarkdownContent(content) {
-    // Remove HTML tags
-    let cleaned = content.replace(/<[^>]*>/g, '');
-    // Normalize whitespace
-    cleaned = cleaned.replace(/\r\n/g, '\n')
-        .replace(/\n{3,}/g, '\n\n')
-        .trim();
-    return cleaned;
-}
-/**
- * Process a markdown file and extract its metadata and content
- * @param filePath - Path to the markdown file
- * @param baseDir - Base directory
- * @param siteUrl - Base URL of the site
- * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
- * @returns Processed file data
- */
-async function processMarkdownFile(filePath, baseDir, siteUrl, pathPrefix = 'docs') {
-    const content = await readFile(filePath);
-    const { data, content: markdownContent } = (0, gray_matter_1.default)(content);
-    const relativePath = path.relative(baseDir, filePath);
-    // Convert to URL path format (replace backslashes with forward slashes on Windows)
-    const normalizedPath = relativePath.replace(/\\/g, '/');
-    // Convert .md extension to appropriate path
-    const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
-    // Handle index files specially
-    const linkPath = linkPathBase.endsWith('index')
-        ? linkPathBase.replace(/\/index$/, '')
-        : linkPathBase;
-    // Generate full URL
-    const fullUrl = new URL(`${pathPrefix}/${linkPath}`, siteUrl).toString();
-    // Extract title
-    const title = extractTitle(data, markdownContent, filePath);
-    // Get description from frontmatter or first paragraph
-    let description = data.description || '';
-    if (!description) {
-        const paragraphs = markdownContent.split('\n\n');
-        for (const para of paragraphs) {
-            if (para.trim() && !para.startsWith('#')) {
-                description = para.trim();
-                break;
-            }
-        }
-    }
-    // Clean and process content
-    const cleanedContent = cleanMarkdownContent(markdownContent);
-    return {
-        title,
-        path: normalizedPath,
-        url: fullUrl,
-        content: cleanedContent,
-        description: description || '',
-    };
-}
+const generator_1 = require("./generator");
 /**
  * A Docusaurus plugin to generate LLM-friendly documentation following
  * the llmtxt.org standard
@@ -196,8 +21,36 @@ async function processMarkdownFile(filePath, baseDir, siteUrl, pathPrefix = 'doc
  */
 function docusaurusPluginLLMs(context, options = {}) {
     // Set default options
-    const { generateLLMsTxt = true, generateLLMsFullTxt = true, docsDir = 'docs', ignoreFiles = [], title, description, llmsTxtFilename = 'llms.txt', llmsFullTxtFilename = 'llms-full.txt', includeBlog = false, } = options;
+    const { generateLLMsTxt = true, generateLLMsFullTxt = true, docsDir = 'docs', ignoreFiles = [], title, description, llmsTxtFilename = 'llms.txt', llmsFullTxtFilename = 'llms-full.txt', includeBlog = false, pathTransformation, includeOrder = [], includeUnmatchedLast = true, customLLMFiles = [], } = options;
     const { siteDir, siteConfig, outDir, } = context;
+    // Build the site URL with proper trailing slash
+    const siteUrl = siteConfig.url + (siteConfig.baseUrl.endsWith('/')
+        ? siteConfig.baseUrl.slice(0, -1)
+        : siteConfig.baseUrl || '');
+    // Create a plugin context object with processed options
+    const pluginContext = {
+        siteDir,
+        outDir,
+        siteUrl,
+        docsDir,
+        docTitle: title || siteConfig.title,
+        docDescription: description || siteConfig.tagline || '',
+        options: {
+            generateLLMsTxt,
+            generateLLMsFullTxt,
+            docsDir,
+            ignoreFiles,
+            title,
+            description,
+            llmsTxtFilename,
+            llmsFullTxtFilename,
+            includeBlog,
+            pathTransformation,
+            includeOrder,
+            includeUnmatchedLast,
+            customLLMFiles,
+        }
+    };
     return {
         name: 'docusaurus-plugin-llms',
         /**
@@ -205,122 +58,20 @@ function docusaurusPluginLLMs(context, options = {}) {
          */
         async postBuild() {
             console.log('Generating LLM-friendly documentation...');
-            // Custom title and description or fallback to site values
-            const docTitle = title || siteConfig.title;
-            const docDescription = description || siteConfig.tagline || '';
-            // Build the site URL with proper trailing slash
-            const siteUrl = siteConfig.url + (siteConfig.baseUrl.endsWith('/')
-                ? siteConfig.baseUrl.slice(0, -1)
-                : siteConfig.baseUrl || '');
-            // Initialize docs collection
-            const allDocs = [];
             try {
-                //
-                const
-                try {
-                    await fs.access(fullDocsDir);
-                    // Collect all markdown files from docs directory
-                    const docFiles = await readMarkdownFiles(fullDocsDir, siteDir, ignoreFiles);
-                    if (docFiles.length > 0) {
-                        // Process each file
-                        for (const filePath of docFiles) {
-                            try {
-                                const docInfo = await processMarkdownFile(filePath, fullDocsDir, siteUrl, 'docs');
-                                allDocs.push(docInfo);
-                            }
-                            catch (err) {
-                                console.warn(`Error processing ${filePath}: ${err.message}`);
-                            }
-                        }
-                        console.log(`Processed ${docFiles.length} documentation files`);
-                    }
-                    else {
-                        console.warn('No markdown files found in docs directory.');
-                    }
-                }
-                catch (err) {
-                    console.warn(`Docs directory not found: ${fullDocsDir}`);
-                }
-                // Process blog if enabled
-                if (includeBlog) {
-                    const blogDir = path.join(siteDir, 'blog');
-                    try {
-                        await fs.access(blogDir);
-                        // Collect all markdown files from blog directory
-                        const blogFiles = await readMarkdownFiles(blogDir, siteDir, ignoreFiles);
-                        if (blogFiles.length > 0) {
-                            // Process each file
-                            for (const filePath of blogFiles) {
-                                try {
-                                    const docInfo = await processMarkdownFile(filePath, blogDir, siteUrl, 'blog');
-                                    allDocs.push(docInfo);
-                                }
-                                catch (err) {
-                                    console.warn(`Error processing ${filePath}: ${err.message}`);
-                                }
-                            }
-                            console.log(`Processed ${blogFiles.length} blog files`);
-                        }
-                        else {
-                            console.warn('No markdown files found in blog directory.');
-                        }
-                    }
-                    catch (err) {
-                        console.warn(`Blog directory not found: ${blogDir}`);
-                    }
-                }
+                // Collect all document files
+                const allDocFiles = await (0, generator_1.collectDocFiles)(pluginContext);
                 // Skip further processing if no documents were found
-                if (
+                if (allDocFiles.length === 0) {
                     console.warn('No documents found to process.');
                     return;
                 }
-                //
-
-                //
-
-
-
-                        return `- [${doc.title}](${doc.url})${doc.description ? `: ${doc.description.split('\n')[0]}` : ''}`;
-                    });
-                    const llmsTxtContent = `# ${docTitle}
-
-> ${docDescription}
-
-This file contains links to all documentation sections following the llmtxt.org standard.
-
-## Table of Contents
-
-${tocItems.join('\n')}
-`;
-                    await writeFile(llmsTxtPath, llmsTxtContent);
-                    console.log(`Generated ${llmsTxtFilename}: ${llmsTxtPath}`);
-                }
-                // Generate llms-full.txt with all content
-                if (generateLLMsFullTxt) {
-                    const llmsFullTxtPath = path.join(outDir, llmsFullTxtFilename);
-                    const fullContentSections = allDocs.map(doc => {
-                        return `## ${doc.title}
-
-${doc.content}`;
-                    });
-                    const llmsFullTxtContent = `# ${docTitle}
-
-> ${docDescription}
-
-This file contains all documentation content in a single document following the llmtxt.org standard.
-
-${fullContentSections.join('\n\n---\n\n')}
-`;
-                    await writeFile(llmsFullTxtPath, llmsFullTxtContent);
-                    console.log(`Generated ${llmsFullTxtFilename}: ${llmsFullTxtPath}`);
-                }
-                // Output statistics
-                const stats = {
-                    totalDocuments: allDocs.length,
-                    totalBytes: allDocs.reduce((sum, doc) => sum + doc.content.length, 0),
-                    approxTokens: Math.round(allDocs.reduce((sum, doc) => sum + doc.content.length, 0) / 4), // Rough token estimate
-                };
-                console.log(`Stats: ${stats.totalDocuments} documents, ${Math.round(stats.totalBytes / 1024)}KB, ~${stats.approxTokens} tokens`);
+                // Process standard LLM files (llms.txt and llms-full.txt)
+                await (0, generator_1.generateStandardLLMFiles)(pluginContext, allDocFiles);
+                // Process custom LLM files
+                await (0, generator_1.generateCustomLLMFiles)(pluginContext, allDocFiles);
+                // Output overall statistics
+                console.log(`Stats: ${allDocFiles.length} total available documents processed`);
             }
             catch (err) {
                 console.error('Error generating LLM documentation:', err);
package/lib/processor.d.ts
ADDED
@@ -0,0 +1,28 @@
+/**
+ * Document processing functions for the docusaurus-plugin-llms plugin
+ */
+import { DocInfo, PluginContext } from './types';
+/**
+ * Process a markdown file and extract its metadata and content
+ * @param filePath - Path to the markdown file
+ * @param baseDir - Base directory
+ * @param siteUrl - Base URL of the site
+ * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
+ * @param pathTransformation - Path transformation configuration
+ * @returns Processed file data
+ */
+export declare function processMarkdownFile(filePath: string, baseDir: string, siteUrl: string, pathPrefix?: string, pathTransformation?: {
+    ignorePaths?: string[];
+    addPaths?: string[];
+}): Promise<DocInfo>;
+/**
+ * Process files based on include patterns, ignore patterns, and ordering
+ * @param context - Plugin context
+ * @param allFiles - All available files
+ * @param includePatterns - Patterns for files to include
+ * @param ignorePatterns - Patterns for files to ignore
+ * @param orderPatterns - Patterns for ordering files
+ * @param includeUnmatched - Whether to include unmatched files
+ * @returns Processed files
+ */
+export declare function processFilesWithPatterns(context: PluginContext, allFiles: string[], includePatterns?: string[], ignorePatterns?: string[], orderPatterns?: string[], includeUnmatched?: boolean): Promise<DocInfo[]>;
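An illustrative call matching the processMarkdownFile declaration above; every path and URL here is hypothetical:

const { processMarkdownFile } = require('docusaurus-plugin-llms/lib/processor');

async function example() {
  const doc = await processMarkdownFile(
    '/site/docs/guides/intro.md', // filePath
    '/site/docs',                 // baseDir
    'https://example.com/',       // siteUrl
    'docs',                       // pathPrefix (the default)
    { ignorePaths: ['docs'] }     // pathTransformation: strip 'docs' from generated URLs
  );
  // doc is a DocInfo: { title, path, url, content, description }
  return doc;
}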
package/lib/processor.js
ADDED
@@ -0,0 +1,211 @@
+"use strict";
+/**
+ * Document processing functions for the docusaurus-plugin-llms plugin
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.processMarkdownFile = processMarkdownFile;
+exports.processFilesWithPatterns = processFilesWithPatterns;
+const path = __importStar(require("path"));
+const gray_matter_1 = __importDefault(require("gray-matter"));
+const minimatch_1 = require("minimatch");
+const utils_1 = require("./utils");
+/**
+ * Process a markdown file and extract its metadata and content
+ * @param filePath - Path to the markdown file
+ * @param baseDir - Base directory
+ * @param siteUrl - Base URL of the site
+ * @param pathPrefix - Path prefix for URLs (e.g., 'docs' or 'blog')
+ * @param pathTransformation - Path transformation configuration
+ * @returns Processed file data
+ */
+async function processMarkdownFile(filePath, baseDir, siteUrl, pathPrefix = 'docs', pathTransformation) {
+    const content = await (0, utils_1.readFile)(filePath);
+    const { data, content: markdownContent } = (0, gray_matter_1.default)(content);
+    const relativePath = path.relative(baseDir, filePath);
+    // Convert to URL path format (replace backslashes with forward slashes on Windows)
+    const normalizedPath = relativePath.replace(/\\/g, '/');
+    // Convert .md extension to appropriate path
+    const linkPathBase = normalizedPath.replace(/\.mdx?$/, '');
+    // Handle index files specially
+    const linkPath = linkPathBase.endsWith('index')
+        ? linkPathBase.replace(/\/index$/, '')
+        : linkPathBase;
+    // Apply path transformations to the link path
+    const transformedLinkPath = (0, utils_1.applyPathTransformations)(linkPath, pathTransformation);
+    // Also apply path transformations to the pathPrefix if it's not empty
+    // This allows removing 'docs' from the path when specified in ignorePaths
+    let transformedPathPrefix = pathPrefix;
+    if (pathPrefix && pathTransformation?.ignorePaths?.includes(pathPrefix)) {
+        transformedPathPrefix = '';
+    }
+    // Generate full URL with transformed path and path prefix
+    const fullUrl = new URL(`${transformedPathPrefix ? `${transformedPathPrefix}/` : ''}${transformedLinkPath}`, siteUrl).toString();
+    // Extract title
+    const title = (0, utils_1.extractTitle)(data, markdownContent, filePath);
+    // Get description from frontmatter or first paragraph
+    let description = '';
+    // First priority: Use frontmatter description if available
+    if (data.description) {
+        description = data.description;
+    }
+    else {
+        // Second priority: Find the first non-heading paragraph
+        const paragraphs = markdownContent.split('\n\n');
+        for (const para of paragraphs) {
+            const trimmedPara = para.trim();
+            // Skip empty paragraphs and headings
+            if (trimmedPara && !trimmedPara.startsWith('#')) {
+                description = trimmedPara;
+                break;
+            }
+        }
+        // Third priority: If still no description, use the first heading's content
+        if (!description) {
+            const firstHeadingMatch = markdownContent.match(/^#\s+(.*?)$/m);
+            if (firstHeadingMatch && firstHeadingMatch[1]) {
+                description = firstHeadingMatch[1].trim();
+            }
+        }
+    }
+    // Only remove heading markers at the beginning of descriptions or lines
+    // This preserves # characters that are part of the content
+    if (description) {
+        // Original approach had issues with hashtags inside content
+        // Fix: Only remove # symbols at the beginning of lines or description
+        // that are followed by a space (actual heading markers)
+        description = description.replace(/^(#+)\s+/gm, '');
+        // Special handling for description frontmatter with heading markers
+        if (data.description && data.description.startsWith('#')) {
+            // If the description in frontmatter starts with a heading marker,
+            // we should preserve it in the extracted description
+            description = description.replace(/^#+\s+/, '');
+        }
+        // Preserve inline hashtags (not heading markers)
+        // We don't want to treat hashtags in the middle of content as headings
+        // Validate that the description doesn't contain markdown headings
+        if (description.match(/^#+\s+/m)) {
+            console.warn(`Warning: Description for "${title}" may still contain heading markers`);
+        }
+        // Warn if the description contains HTML tags
+        if (/<[^>]+>/g.test(description)) {
+            console.warn(`Warning: Description for "${title}" contains HTML tags`);
+        }
+        // Warn if the description is very long
+        if (description.length > 500) {
+            console.warn(`Warning: Description for "${title}" is very long (${description.length} characters)`);
+        }
+    }
+    // Clean and process content
+    const cleanedContent = (0, utils_1.cleanMarkdownContent)(markdownContent);
+    return {
+        title,
+        path: normalizedPath,
+        url: fullUrl,
+        content: cleanedContent,
+        description: description || '',
+    };
+}
+/**
+ * Process files based on include patterns, ignore patterns, and ordering
+ * @param context - Plugin context
+ * @param allFiles - All available files
+ * @param includePatterns - Patterns for files to include
+ * @param ignorePatterns - Patterns for files to ignore
+ * @param orderPatterns - Patterns for ordering files
+ * @param includeUnmatched - Whether to include unmatched files
+ * @returns Processed files
+ */
+async function processFilesWithPatterns(context, allFiles, includePatterns = [], ignorePatterns = [], orderPatterns = [], includeUnmatched = false) {
+    const { siteDir, siteUrl, docsDir } = context;
+    // Filter files based on include patterns
+    let filteredFiles = allFiles;
+    if (includePatterns.length > 0) {
+        filteredFiles = allFiles.filter(file => {
+            const relativePath = path.relative(siteDir, file);
+            return includePatterns.some(pattern => (0, minimatch_1.minimatch)(relativePath, pattern, { matchBase: true }));
+        });
+    }
+    // Apply ignore patterns
+    if (ignorePatterns.length > 0) {
+        filteredFiles = filteredFiles.filter(file => {
+            const relativePath = path.relative(siteDir, file);
+            return !ignorePatterns.some(pattern => (0, minimatch_1.minimatch)(relativePath, pattern, { matchBase: true }));
+        });
+    }
+    // Order files according to orderPatterns
+    let filesToProcess = [];
+    if (orderPatterns.length > 0) {
+        const matchedFiles = new Set();
+        // Process files according to orderPatterns
+        for (const pattern of orderPatterns) {
+            const matchingFiles = filteredFiles.filter(file => {
+                const relativePath = path.relative(siteDir, file);
+                return (0, minimatch_1.minimatch)(relativePath, pattern, { matchBase: true }) && !matchedFiles.has(file);
+            });
+            for (const file of matchingFiles) {
+                filesToProcess.push(file);
+                matchedFiles.add(file);
+            }
+        }
+        // Add remaining files if includeUnmatched is true
+        if (includeUnmatched) {
+            const remainingFiles = filteredFiles.filter(file => !matchedFiles.has(file));
+            filesToProcess.push(...remainingFiles);
+        }
+    }
+    else {
+        filesToProcess = filteredFiles;
+    }
+    // Process each file to generate DocInfo
+    const processedDocs = [];
+    for (const filePath of filesToProcess) {
+        try {
+            // Determine if this is a blog or docs file
+            const isBlogFile = filePath.includes(path.join(siteDir, 'blog'));
+            const baseDir = isBlogFile ? path.join(siteDir, 'blog') : path.join(siteDir, docsDir);
+            const pathPrefix = isBlogFile ? 'blog' : 'docs';
+            const docInfo = await processMarkdownFile(filePath, baseDir, siteUrl, pathPrefix, context.options.pathTransformation);
+            processedDocs.push(docInfo);
+        }
+        catch (err) {
+            console.warn(`Error processing ${filePath}: ${err.message}`);
+        }
+    }
+    return processedDocs;
+}
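The ordering semantics implemented above are worth spelling out: files matching earlier orderPatterns entries come first, each file is emitted at most once, and files matching no order pattern are appended only when includeUnmatched is true. An illustrative call (patterns are hypothetical; all matching uses minimatch with { matchBase: true }):

const { processFilesWithPatterns } = require('docusaurus-plugin-llms/lib/processor');

async function buildOrderedDocs(context, allFiles) {
  return processFilesWithPatterns(
    context,                             // PluginContext ({ siteDir, siteUrl, docsDir, options, ... })
    allFiles,                            // absolute file paths collected beforehand
    ['docs/**/*.md'],                    // includePatterns: keep only these
    ['docs/drafts/**'],                  // ignorePatterns: then drop these
    ['docs/intro.md', 'docs/guides/**'], // orderPatterns: intro first, then guides
    true                                 // includeUnmatched: append everything else
  );
}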
package/lib/utils.d.ts
ADDED
@@ -0,0 +1,53 @@
+/**
+ * Utility functions for the docusaurus-plugin-llms plugin
+ */
+import { PluginOptions } from './types';
+/**
+ * Write content to a file
+ * @param filePath - Path to write the file to
+ * @param data - Content to write
+ */
+export declare function writeFile(filePath: string, data: string): Promise<void>;
+/**
+ * Read content from a file
+ * @param filePath - Path of the file to read
+ * @returns Content of the file
+ */
+export declare function readFile(filePath: string): Promise<string>;
+/**
+ * Check if a file should be ignored based on glob patterns
+ * @param filePath - Path to the file
+ * @param baseDir - Base directory for relative paths
+ * @param ignorePatterns - Glob patterns for files to ignore
+ * @returns Whether the file should be ignored
+ */
+export declare function shouldIgnoreFile(filePath: string, baseDir: string, ignorePatterns: string[]): boolean;
+/**
+ * Recursively reads all Markdown files in a directory
+ * @param dir - Directory to scan
+ * @param baseDir - Base directory for relative paths
+ * @param ignorePatterns - Glob patterns for files to ignore
+ * @returns Array of file paths
+ */
+export declare function readMarkdownFiles(dir: string, baseDir: string, ignorePatterns?: string[]): Promise<string[]>;
+/**
+ * Extract title from content or use the filename
+ * @param data - Frontmatter data
+ * @param content - Markdown content
+ * @param filePath - Path to the file
+ * @returns Extracted title
+ */
+export declare function extractTitle(data: any, content: string, filePath: string): string;
+/**
+ * Clean markdown content for LLM consumption
+ * @param content - Raw markdown content
+ * @returns Cleaned content
+ */
+export declare function cleanMarkdownContent(content: string): string;
+/**
+ * Apply path transformations according to configuration
+ * @param urlPath - Original URL path
+ * @param pathTransformation - Path transformation configuration
+ * @returns Transformed URL path
+ */
+export declare function applyPathTransformations(urlPath: string, pathTransformation?: PluginOptions['pathTransformation']): string;
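lib/utils.js itself is not part of this excerpt, so the exact semantics of applyPathTransformations are not visible here; only the signature above and its use in lib/processor.js (where ignorePaths is used to drop the 'docs' URL prefix) are. A hypothetical call on that basis:

const { applyPathTransformations } = require('docusaurus-plugin-llms/lib/utils');

// Assumption based on the option names and the processor.js usage:
// ignorePaths strips matching segments, addPaths contributes new ones.
const transformed = applyPathTransformations('docs/guides/intro', {
  ignorePaths: ['docs'],
  addPaths: [],
});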