docusaurus-plugin-llms 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/processor.ts CHANGED
@@ -6,12 +6,16 @@ import * as path from 'path';
6
6
  import matter from 'gray-matter';
7
7
  import { minimatch } from 'minimatch';
8
8
  import { DocInfo, PluginContext } from './types';
9
- import {
10
- readFile,
11
- extractTitle,
12
- cleanMarkdownContent,
9
+ import {
10
+ readFile,
11
+ extractTitle,
12
+ cleanMarkdownContent,
13
13
  applyPathTransformations,
14
- resolvePartialImports
14
+ resolvePartialImports,
15
+ normalizePath,
16
+ logger,
17
+ getErrorMessage,
18
+ isNonEmptyString
15
19
  } from './utils';
16
20
 
17
21
  /**
@@ -24,8 +28,8 @@ import {
24
28
  * @returns Processed file data
25
29
  */
26
30
  export async function processMarkdownFile(
27
- filePath: string,
28
- baseDir: string,
31
+ filePath: string,
32
+ baseDir: string,
29
33
  siteUrl: string,
30
34
  pathPrefix: string = 'docs',
31
35
  pathTransformation?: {
@@ -38,24 +42,51 @@ export async function processMarkdownFile(
38
42
  ): Promise<DocInfo | null> {
39
43
  const content = await readFile(filePath);
40
44
  const { data, content: markdownContent } = matter(content);
41
-
45
+
42
46
  // Skip draft files
43
47
  if (data.draft === true) {
44
48
  return null;
45
49
  }
50
+
51
+ // Validate and clean empty frontmatter fields
52
+ // Empty strings should be treated as undefined to allow fallback logic
53
+ if (data.title !== undefined && !isNonEmptyString(data.title)) {
54
+ logger.warn(`Empty title in frontmatter for ${filePath}. Using fallback.`);
55
+ data.title = undefined;
56
+ }
57
+
58
+ if (data.description !== undefined && !isNonEmptyString(data.description)) {
59
+ data.description = undefined;
60
+ }
61
+
62
+ if (data.slug !== undefined && !isNonEmptyString(data.slug)) {
63
+ data.slug = undefined;
64
+ }
65
+
66
+ if (data.id !== undefined && !isNonEmptyString(data.id)) {
67
+ data.id = undefined;
68
+ }
46
69
 
47
70
  // Resolve partial imports before processing
48
71
  const resolvedContent = await resolvePartialImports(markdownContent, filePath);
49
72
 
50
73
  const relativePath = path.relative(baseDir, filePath);
51
74
  // Convert to URL path format (replace backslashes with forward slashes on Windows)
52
- const normalizedPath = relativePath.replace(/\\/g, '/');
75
+ const normalizedPath = normalizePath(relativePath);
53
76
 
54
77
  let fullUrl: string;
55
-
56
- if (resolvedUrl) {
78
+
79
+ if (isNonEmptyString(resolvedUrl)) {
57
80
  // Use the actual resolved URL from Docusaurus if provided
58
- fullUrl = new URL(resolvedUrl, siteUrl).toString();
81
+ try {
82
+ fullUrl = new URL(resolvedUrl, siteUrl).toString();
83
+ } catch (error: unknown) {
84
+ logger.warn(`Invalid URL construction: ${resolvedUrl} with base ${siteUrl}. Using fallback.`);
85
+ // Fallback to string concatenation with proper path joining
86
+ const baseUrl = siteUrl.endsWith('/') ? siteUrl.slice(0, -1) : siteUrl;
87
+ const urlPath = resolvedUrl.startsWith('/') ? resolvedUrl : `/${resolvedUrl}`;
88
+ fullUrl = baseUrl + urlPath;
89
+ }
59
90
  } else {
60
91
  // Fallback to the old path construction method
61
92
  // Convert .md extension to appropriate path
@@ -84,13 +115,44 @@ export async function processMarkdownFile(
84
115
  transformedPathPrefix = '';
85
116
  }
86
117
 
87
- // Generate full URL with transformed path and path prefix
88
- fullUrl = new URL(
89
- `${transformedPathPrefix ? `${transformedPathPrefix}/` : ''}${transformedLinkPath}`,
90
- siteUrl
91
- ).toString();
118
+ // Ensure path segments are URL-safe with sophisticated encoding detection
119
+ const encodedLinkPath = transformedLinkPath.split('/').map(segment => {
120
+ // Check if segment contains characters that need encoding
121
+ // Unreserved characters (per RFC 3986): A-Z a-z 0-9 - . _ ~
122
+ if (!/[^A-Za-z0-9\-._~]/.test(segment)) {
123
+ // Segment only contains unreserved characters, no encoding needed
124
+ return segment;
125
+ }
126
+
127
+ try {
128
+ // Try to decode - if it changes, it was already encoded
129
+ const decoded = decodeURIComponent(segment);
130
+ if (decoded !== segment) {
131
+ // Was already encoded, return as-is
132
+ return segment;
133
+ }
134
+ // Not encoded, encode it
135
+ return encodeURIComponent(segment);
136
+ } catch {
137
+ // Malformed encoding, re-encode
138
+ return encodeURIComponent(segment);
139
+ }
140
+ }).join('/');
141
+
142
+ // Construct URL by encoding path components, then combine with site URL
143
+ // We don't use URL constructor for the full path because it decodes some characters
144
+ const pathPart = transformedPathPrefix ? `${transformedPathPrefix}/${encodedLinkPath}` : encodedLinkPath;
145
+ try {
146
+ const baseUrl = new URL(siteUrl);
147
+ fullUrl = `${baseUrl.origin}/${pathPart}`;
148
+ } catch (error: unknown) {
149
+ logger.warn(`Invalid siteUrl: ${siteUrl}. Using fallback.`);
150
+ // Fallback to string concatenation with proper path joining
151
+ const baseUrl = siteUrl.endsWith('/') ? siteUrl.slice(0, -1) : siteUrl;
152
+ fullUrl = `${baseUrl}/${pathPart}`;
153
+ }
92
154
  }
93
-
155
+
94
156
  // Extract title
95
157
  const title = extractTitle(data, resolvedContent, filePath);
96
158
 
@@ -98,7 +160,7 @@ export async function processMarkdownFile(
98
160
  let description = '';
99
161
 
100
162
  // First priority: Use frontmatter description if available
101
- if (data.description) {
163
+ if (isNonEmptyString(data.description)) {
102
164
  description = data.description;
103
165
  } else {
104
166
  // Second priority: Find the first non-heading paragraph
@@ -123,14 +185,14 @@ export async function processMarkdownFile(
123
185
 
124
186
  // Only remove heading markers at the beginning of descriptions or lines
125
187
  // This preserves # characters that are part of the content
126
- if (description) {
188
+ if (isNonEmptyString(description)) {
127
189
  // Original approach had issues with hashtags inside content
128
190
  // Fix: Only remove # symbols at the beginning of lines or description
129
191
  // that are followed by a space (actual heading markers)
130
192
  description = description.replace(/^(#+)\s+/gm, '');
131
193
 
132
194
  // Special handling for description frontmatter with heading markers
133
- if (data.description && data.description.startsWith('#')) {
195
+ if (isNonEmptyString(data.description) && data.description.startsWith('#')) {
134
196
  // If the description in frontmatter starts with a heading marker,
135
197
  // we should preserve it in the extracted description
136
198
  description = description.replace(/^#+\s+/, '');
@@ -141,17 +203,17 @@ export async function processMarkdownFile(
141
203
 
142
204
  // Validate that the description doesn't contain markdown headings
143
205
  if (description.match(/^#+\s+/m)) {
144
- console.warn(`Warning: Description for "${title}" may still contain heading markers`);
206
+ logger.warn(`Warning: Description for "${title}" may still contain heading markers`);
145
207
  }
146
208
 
147
209
  // Warn if the description contains HTML tags
148
210
  if (/<[^>]+>/g.test(description)) {
149
- console.warn(`Warning: Description for "${title}" contains HTML tags`);
211
+ logger.warn(`Warning: Description for "${title}" contains HTML tags`);
150
212
  }
151
213
 
152
214
  // Warn if the description is very long
153
215
  if (description.length > 500) {
154
- console.warn(`Warning: Description for "${title}" is very long (${description.length} characters)`);
216
+ logger.warn(`Warning: Description for "${title}" is very long (${description.length} characters)`);
155
217
  }
156
218
  }
157
219
 
@@ -168,6 +230,143 @@ export async function processMarkdownFile(
168
230
  };
169
231
  }
170
232
 
233
+ /**
234
+ * Remove numbered prefixes from path segments (e.g., "01-intro" -> "intro")
235
+ */
236
+ function removeNumberedPrefixes(path: string): string {
237
+ return path.split('/').map(segment => {
238
+ // Remove numbered prefixes like "01-", "1-", "001-" from each segment
239
+ return segment.replace(/^\d+-/, '');
240
+ }).join('/');
241
+ }
242
+
243
+ /**
244
+ * Try to find a route in the route map from a list of possible paths
245
+ */
246
+ function findRouteInMap(routeMap: Map<string, string>, possiblePaths: string[]): string | undefined {
247
+ for (const possiblePath of possiblePaths) {
248
+ const route = routeMap.get(possiblePath) || routeMap.get(possiblePath + '/');
249
+ if (route) {
250
+ return route;
251
+ }
252
+ }
253
+ return undefined;
254
+ }
255
+
256
+ /**
257
+ * Try exact match for route resolution
258
+ */
259
+ function tryExactRouteMatch(
260
+ routeMap: Map<string, string>,
261
+ relativePath: string,
262
+ pathPrefix: string
263
+ ): string | undefined {
264
+ const possiblePaths = [
265
+ `/${pathPrefix}/${relativePath}`,
266
+ `/${relativePath}`,
267
+ ];
268
+ return findRouteInMap(routeMap, possiblePaths);
269
+ }
270
+
271
+ /**
272
+ * Try route resolution with numbered prefix removal
273
+ */
274
+ function tryNumberedPrefixResolution(
275
+ routeMap: Map<string, string>,
276
+ relativePath: string,
277
+ pathPrefix: string
278
+ ): string | undefined {
279
+ const cleanPath = removeNumberedPrefixes(relativePath);
280
+
281
+ // Try basic cleaned path
282
+ const basicPaths = [`/${pathPrefix}/${cleanPath}`, `/${cleanPath}`];
283
+ const basicMatch = findRouteInMap(routeMap, basicPaths);
284
+ if (basicMatch) {
285
+ return basicMatch;
286
+ }
287
+
288
+ // Try nested folder structures with numbered prefixes at different levels
289
+ const segments = relativePath.split('/');
290
+ if (segments.length > 1) {
291
+ for (let i = 0; i < segments.length; i++) {
292
+ const modifiedSegments = [...segments];
293
+ modifiedSegments[i] = modifiedSegments[i].replace(/^\d+-/, '');
294
+ const modifiedPath = modifiedSegments.join('/');
295
+ const pathsToTry = [`/${pathPrefix}/${modifiedPath}`, `/${modifiedPath}`];
296
+
297
+ const match = findRouteInMap(routeMap, pathsToTry);
298
+ if (match) {
299
+ return match;
300
+ }
301
+ }
302
+ }
303
+
304
+ return undefined;
305
+ }
306
+
307
+ /**
308
+ * Try finding best match using routes paths array
309
+ */
310
+ function tryRoutesPathsMatch(
311
+ routesPaths: string[],
312
+ relativePath: string,
313
+ pathPrefix: string
314
+ ): string | undefined {
315
+ const cleanPath = removeNumberedPrefixes(relativePath);
316
+ const normalizedCleanPath = cleanPath.toLowerCase();
317
+
318
+ return routesPaths.find(routePath => {
319
+ const normalizedRoute = routePath.toLowerCase();
320
+ return normalizedRoute.endsWith(`/${normalizedCleanPath}`) ||
321
+ normalizedRoute === `/${pathPrefix}/${normalizedCleanPath}` ||
322
+ normalizedRoute === `/${normalizedCleanPath}`;
323
+ });
324
+ }
325
+
326
+ /**
327
+ * Resolve the URL for a document using Docusaurus routes
328
+ * @param filePath - Full path to the file
329
+ * @param baseDir - Base directory (typically siteDir)
330
+ * @param pathPrefix - Path prefix ('docs' or 'blog')
331
+ * @param context - Plugin context with route map
332
+ * @returns Resolved URL or undefined if not found
333
+ */
334
+ function resolveDocumentUrl(
335
+ filePath: string,
336
+ baseDir: string,
337
+ pathPrefix: string,
338
+ context: PluginContext
339
+ ): string | undefined {
340
+ // Early return if no route map available
341
+ if (!context.routeMap) {
342
+ return undefined;
343
+ }
344
+
345
+ // Convert file path to a potential route path
346
+ const relativePath = normalizePath(path.relative(baseDir, filePath))
347
+ .replace(/\.mdx?$/, '')
348
+ .replace(/\/index$/, '');
349
+
350
+ // Try exact match first (respects Docusaurus's resolved routes)
351
+ const exactMatch = tryExactRouteMatch(context.routeMap, relativePath, pathPrefix);
352
+ if (exactMatch) {
353
+ return exactMatch;
354
+ }
355
+
356
+ // Try numbered prefix removal as fallback
357
+ const prefixMatch = tryNumberedPrefixResolution(context.routeMap, relativePath, pathPrefix);
358
+ if (prefixMatch) {
359
+ return prefixMatch;
360
+ }
361
+
362
+ // Try to find the best match using the routesPaths array
363
+ if (context.routesPaths) {
364
+ return tryRoutesPathsMatch(context.routesPaths, relativePath, pathPrefix);
365
+ }
366
+
367
+ return undefined;
368
+ }
369
+
171
370
  /**
172
371
  * Process files based on include patterns, ignore patterns, and ordering
173
372
  * @param context - Plugin context
@@ -178,6 +377,37 @@ export async function processMarkdownFile(
178
377
  * @param includeUnmatched - Whether to include unmatched files
179
378
  * @returns Processed files
180
379
  */
380
+ /**
381
+ * Helper function to check if a file matches a pattern
382
+ * Tries matching against multiple path variants for better usability
383
+ */
384
+ function matchesPattern(file: string, pattern: string, siteDir: string, docsDir: string): boolean {
385
+ const minimatchOptions = { matchBase: true };
386
+
387
+ // Get site-relative path (e.g., "docs/quickstart/file.md")
388
+ const siteRelativePath = normalizePath(path.relative(siteDir, file));
389
+
390
+ // Get docs-relative path (e.g., "quickstart/file.md")
391
+ // Normalize both paths to handle different path separators and resolve any .. or .
392
+ const docsBaseDir = path.resolve(path.join(siteDir, docsDir));
393
+ const resolvedFile = path.resolve(file);
394
+ const docsRelativePath = resolvedFile.startsWith(docsBaseDir)
395
+ ? normalizePath(path.relative(docsBaseDir, resolvedFile))
396
+ : null;
397
+
398
+ // Try matching against site-relative path
399
+ if (minimatch(siteRelativePath, pattern, minimatchOptions)) {
400
+ return true;
401
+ }
402
+
403
+ // Try matching against docs-relative path if available
404
+ if (docsRelativePath && minimatch(docsRelativePath, pattern, minimatchOptions)) {
405
+ return true;
406
+ }
407
+
408
+ return false;
409
+ }
410
+
181
411
  export async function processFilesWithPatterns(
182
412
  context: PluginContext,
183
413
  allFiles: string[],
@@ -193,9 +423,8 @@ export async function processFilesWithPatterns(
193
423
 
194
424
  if (includePatterns.length > 0) {
195
425
  filteredFiles = allFiles.filter(file => {
196
- const relativePath = path.relative(siteDir, file);
197
- return includePatterns.some(pattern =>
198
- minimatch(relativePath, pattern, { matchBase: true })
426
+ return includePatterns.some(pattern =>
427
+ matchesPattern(file, pattern, siteDir, docsDir)
199
428
  );
200
429
  });
201
430
  }
@@ -203,9 +432,8 @@ export async function processFilesWithPatterns(
203
432
  // Apply ignore patterns
204
433
  if (ignorePatterns.length > 0) {
205
434
  filteredFiles = filteredFiles.filter(file => {
206
- const relativePath = path.relative(siteDir, file);
207
- return !ignorePatterns.some(pattern =>
208
- minimatch(relativePath, pattern, { matchBase: true })
435
+ return !ignorePatterns.some(pattern =>
436
+ matchesPattern(file, pattern, siteDir, docsDir)
209
437
  );
210
438
  });
211
439
  }
@@ -219,8 +447,7 @@ export async function processFilesWithPatterns(
219
447
  // Process files according to orderPatterns
220
448
  for (const pattern of orderPatterns) {
221
449
  const matchingFiles = filteredFiles.filter(file => {
222
- const relativePath = path.relative(siteDir, file);
223
- return minimatch(relativePath, pattern, { matchBase: true }) && !matchedFiles.has(file);
450
+ return matchesPattern(file, pattern, siteDir, docsDir) && !matchedFiles.has(file);
224
451
  });
225
452
 
226
453
  for (const file of matchingFiles) {
@@ -238,101 +465,51 @@ export async function processFilesWithPatterns(
238
465
  filesToProcess = filteredFiles;
239
466
  }
240
467
 
241
- // Process each file to generate DocInfo
242
- const processedDocs: DocInfo[] = [];
243
-
244
- for (const filePath of filesToProcess) {
245
- try {
246
- // Determine if this is a blog or docs file
247
- const isBlogFile = filePath.includes(path.join(siteDir, 'blog'));
248
- // Use siteDir as baseDir to preserve full directory structure (docs/path/file.md instead of just path/file.md)
249
- const baseDir = siteDir;
250
- const pathPrefix = isBlogFile ? 'blog' : 'docs';
251
-
252
- // Try to find the resolved URL for this file from the route map
253
- let resolvedUrl: string | undefined;
254
- if (context.routeMap) {
255
- // Convert file path to a potential route path
256
- const relativePath = path.relative(baseDir, filePath)
257
- .replace(/\\/g, '/')
258
- .replace(/\.mdx?$/, '')
259
- .replace(/\/index$/, '');
260
-
261
- // Function to remove numbered prefixes from path segments
262
- const removeNumberedPrefixes = (path: string): string => {
263
- return path.split('/').map(segment => {
264
- // Remove numbered prefixes like "01-", "1-", "001-" from each segment
265
- return segment.replace(/^\d+-/, '');
266
- }).join('/');
267
- };
268
-
269
- // Check various possible route patterns
270
- const cleanPath = removeNumberedPrefixes(relativePath);
271
- const possiblePaths = [
272
- `/${pathPrefix}/${cleanPath}`,
273
- `/${cleanPath}`,
274
- `/${pathPrefix}/${relativePath}`, // Try with original path
275
- `/${relativePath}`, // Try without prefix
276
- ];
277
-
278
- // Also handle nested folder structures with numbered prefixes
279
- const segments = relativePath.split('/');
280
- if (segments.length > 1) {
281
- // Try removing numbered prefixes from different levels
282
- for (let i = 0; i < segments.length; i++) {
283
- const modifiedSegments = [...segments];
284
- modifiedSegments[i] = modifiedSegments[i].replace(/^\d+-/, '');
285
- const modifiedPath = modifiedSegments.join('/');
286
- possiblePaths.push(`/${pathPrefix}/${modifiedPath}`);
287
- possiblePaths.push(`/${modifiedPath}`);
288
- }
289
- }
290
-
291
- // Try to find a match in the route map
292
- for (const possiblePath of possiblePaths) {
293
- if (context.routeMap.has(possiblePath)) {
294
- resolvedUrl = context.routeMap.get(possiblePath);
295
- break;
296
- }
297
- }
298
-
299
- // If still not found, try to find the best match using the routesPaths array
300
- if (!resolvedUrl && context.routesPaths) {
301
- const normalizedCleanPath = cleanPath.toLowerCase();
302
- const matchingRoute = context.routesPaths.find(routePath => {
303
- const normalizedRoute = routePath.toLowerCase();
304
- return normalizedRoute.endsWith(`/${normalizedCleanPath}`) ||
305
- normalizedRoute === `/${pathPrefix}/${normalizedCleanPath}` ||
306
- normalizedRoute === `/${normalizedCleanPath}`;
307
- });
308
- if (matchingRoute) {
309
- resolvedUrl = matchingRoute;
310
- }
311
- }
312
-
468
+ // Process files in parallel using Promise.allSettled
469
+ const results = await Promise.allSettled(
470
+ filesToProcess.map(async (filePath) => {
471
+ try {
472
+ // Determine if this is a blog or docs file
473
+ const isBlogFile = filePath.includes(path.join(siteDir, 'blog'));
474
+ // Use siteDir as baseDir to preserve full directory structure (docs/path/file.md instead of just path/file.md)
475
+ const baseDir = siteDir;
476
+ const pathPrefix = isBlogFile ? 'blog' : 'docs';
477
+
478
+ // Try to find the resolved URL for this file from the route map
479
+ const resolvedUrl = resolveDocumentUrl(filePath, baseDir, pathPrefix, context);
480
+
313
481
  // Log when we successfully resolve a URL using Docusaurus routes
314
- if (resolvedUrl && resolvedUrl !== `/${pathPrefix}/${relativePath}`) {
315
- console.log(`Resolved URL for ${path.basename(filePath)}: ${resolvedUrl} (was: /${pathPrefix}/${relativePath})`);
482
+ if (resolvedUrl && context.routeMap) {
483
+ const relativePath = normalizePath(path.relative(baseDir, filePath))
484
+ .replace(/\.mdx?$/, '')
485
+ .replace(/\/index$/, '');
486
+ if (resolvedUrl !== `/${pathPrefix}/${relativePath}`) {
487
+ logger.verbose(`Resolved URL for ${path.basename(filePath)}: ${resolvedUrl} (was: /${pathPrefix}/${relativePath})`);
488
+ }
316
489
  }
490
+
491
+ const docInfo = await processMarkdownFile(
492
+ filePath,
493
+ baseDir,
494
+ siteUrl,
495
+ pathPrefix,
496
+ context.options.pathTransformation,
497
+ context.options.excludeImports || false,
498
+ context.options.removeDuplicateHeadings || false,
499
+ resolvedUrl
500
+ );
501
+ return docInfo;
502
+ } catch (err: unknown) {
503
+ logger.warn(`Error processing ${filePath}: ${getErrorMessage(err)}`);
504
+ return null;
317
505
  }
318
-
319
- const docInfo = await processMarkdownFile(
320
- filePath,
321
- baseDir,
322
- siteUrl,
323
- pathPrefix,
324
- context.options.pathTransformation,
325
- context.options.excludeImports || false,
326
- context.options.removeDuplicateHeadings || false,
327
- resolvedUrl
328
- );
329
- if (docInfo !== null) {
330
- processedDocs.push(docInfo);
331
- }
332
- } catch (err: any) {
333
- console.warn(`Error processing ${filePath}: ${err.message}`);
334
- }
335
- }
336
-
506
+ })
507
+ );
508
+
509
+ // Filter successful results and non-null DocInfo objects
510
+ const processedDocs = results
511
+ .filter((r): r is PromiseFulfilledResult<DocInfo | null> => r.status === 'fulfilled' && r.value !== null)
512
+ .map(r => r.value as DocInfo);
513
+
337
514
  return processedDocs;
338
- }
515
+ }
package/src/types.ts CHANGED
@@ -119,6 +119,21 @@ export interface PluginOptions {
119
119
 
120
120
  /** Custom content to include at the root level of llms-full.txt (after title/description, before content sections) */
121
121
  fullRootContent?: string;
122
+
123
+ /** Whether to preserve directory structure in generated markdown files (default: true) */
124
+ preserveDirectoryStructure?: boolean;
125
+
126
+ /** Batch size for processing large document sets to prevent memory issues (default: 100) */
127
+ processingBatchSize?: number;
128
+
129
+ /** Logging level for plugin output (default: 'normal'). Options: 'quiet', 'normal', 'verbose' */
130
+ logLevel?: 'quiet' | 'normal' | 'verbose';
131
+
132
+ /** Whether to warn about files that are ignored (no extension or unsupported extension) (default: false) */
133
+ warnOnIgnoredFiles?: boolean;
134
+
135
+ /** Index signature for Docusaurus plugin compatibility */
136
+ [key: string]: unknown;
122
137
  }
123
138
 
124
139
  /**