@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -5,6 +5,7 @@
  * - Configurable llms.txt paths with fallbacks
  * - Custom URL patterns for different doc sites
  * - Works with any markdown documentation site
+ * - Uses full URLs from llms.txt directly
  */

  // ============================================================================
@@ -32,6 +33,8 @@ export interface ScraperOptions {
  tryDocsSubdomain?: boolean;
  /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
  linkPattern?: RegExp;
+ /** Use full URLs from llms.txt directly (default: true for generic pattern) */
+ useDirectUrls?: boolean;
  }

  export interface ScraperResult {
@@ -40,11 +43,18 @@ export interface ScraperResult {
  duration: number;
  }

+ /** Discovered page with full URL */
+ interface DiscoveredPage {
+ category: string;
+ page: string;
+ fullUrl: string; // The complete URL from llms.txt
+ }
+
  /** Default pattern: matches /docs/en/ or /docs/ paths */
  const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;

- /** Generic pattern: matches any .md links in llms.txt */
- const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
+ /** Generic pattern: matches any .md links - captures full path after domain */
+ const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;

  // ============================================================================
  // UTILITY FUNCTIONS (Composable)
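The reworked GENERIC_LINK_PATTERN stops the host match at the first slash after the domain, so the third capture group now carries the full path rather than only the final segment. A minimal sketch of the three capture groups (title, fullUrl, path) run against a hypothetical llms.txt entry; only the regex itself comes from the diff above:

```typescript
// Copied from the diff above; the example line and domain are made up for illustration.
const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;

const line = "[Getting Started](https://docs.example.com/guides/getting-started.md)";
const match = GENERIC_LINK_PATTERN.exec(line);

if (match) {
  const [, title, fullUrl, pagePath] = match;
  console.log(title);    // "Getting Started"
  console.log(fullUrl);  // "https://docs.example.com/guides/getting-started.md"
  console.log(pagePath); // "guides/getting-started.md" (full path after the domain)
}
```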
@@ -116,11 +126,12 @@ export class MarkdownDocsScraper {
  llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
  tryDocsSubdomain: options.tryDocsSubdomain ?? true,
  linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
+ useDirectUrls: options.useDirectUrls ?? true,
  };
  }

  /**
- * Build URL for a documentation page
+ * Build URL for a documentation page (fallback when no direct URL)
  */
  buildUrl(category: string, page: string): string {
  if (category) {
@@ -128,16 +139,19 @@ export class MarkdownDocsScraper {
  } else if (this.options.docsPath) {
  return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
  } else {
- // No docsPath (like Polymarket) - direct path
  return `${this.options.baseUrl}/${page}.md`;
  }
  }

  /**
- * Download a single documentation page
+ * Download a page using either direct URL or built URL
  */
- async downloadPage(category: string, page: string): Promise<DocPage | null> {
- const url = this.buildUrl(category, page);
+ async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
+ // Use direct URL if available and useDirectUrls is enabled
+ const url = (this.options.useDirectUrls && pageInfo.fullUrl)
+ ? pageInfo.fullUrl
+ : this.buildUrl(pageInfo.category, pageInfo.page);
+
  const content = await fetchMarkdown(url);

  if (!content) {
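With downloadPage now taking a single discovered-page object, callers pass the full URL from llms.txt when they have one and leave fullUrl empty to fall back to buildUrl. A small sketch, assuming an already-constructed MarkdownDocsScraper instance; the category, page, and URL values are hypothetical:

```typescript
import { MarkdownDocsScraper } from "@ebowwa/markdown-docs-scraper";

// `scraper` stands in for an already-configured MarkdownDocsScraper instance.
async function fetchOne(scraper: MarkdownDocsScraper) {
  // Direct URL taken from llms.txt; used as-is when useDirectUrls is enabled (the default).
  const viaDirectUrl = await scraper.downloadPage({
    category: "guides",
    page: "getting-started",
    fullUrl: "https://docs.example.com/guides/getting-started.md", // hypothetical
  });

  // Empty fullUrl: downloadPage falls back to buildUrl(category, page).
  const viaBuiltUrl = await scraper.downloadPage({
    category: "guides",
    page: "getting-started",
    fullUrl: "",
  });

  return viaDirectUrl ?? viaBuiltUrl;
}
```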
@@ -148,8 +162,8 @@
  url,
  title: extractTitle(content),
  content,
- category,
- pageName: page,
+ category: pageInfo.category,
+ pageName: pageInfo.page,
  };
  }

@@ -173,7 +187,6 @@ export class MarkdownDocsScraper {

  // Skip if already on docs/doc subdomain
  if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
- // Try docs.{domain}
  const docsDomain = hostname.replace(/^www\./, "");
  urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
  urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
@@ -191,9 +204,11 @@
  */
  private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
  const urls = this.getLlmsUrls();
+ console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);

  for (const llmsUrl of urls) {
  try {
+ console.log(`DEBUG: Fetching ${llmsUrl}...`);
  const response = await fetch(llmsUrl, {
  headers: {
  Accept: "text/plain",
@@ -201,13 +216,14 @@
  },
  });

+ console.log(`DEBUG: Response status: ${response.status}`);
  if (response.ok) {
  const content = await response.text();
  console.log(`Found llms.txt at ${llmsUrl}`);
  return { content, url: llmsUrl };
  }
  } catch (error) {
- // Try next URL
+ console.log(`DEBUG: Error: ${error}`);
  continue;
  }
  }
@@ -218,8 +234,8 @@
  /**
  * Discover pages from llms.txt index
  */
- async discoverPages(): Promise<Array<{ category: string; page: string }>> {
- const pages: Array<{ category: string; page: string }> = [];
+ async discoverPages(): Promise<DiscoveredPage[]> {
+ const pages: DiscoveredPage[] = [];

  try {
  const llmsResult = await this.fetchLlmsTxt();
@@ -233,15 +249,20 @@
  const { content } = llmsResult;

  // Use provided pattern or default
- const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
+ const pattern = this.options.linkPattern;
+ const regex = new RegExp(pattern.source, pattern.flags);
  let match;

+ // Debug: log pattern being used
+ console.log(`DEBUG: Using pattern: ${pattern.source}`);
+ console.log(`DEBUG: Content length: ${content.length}`);
+
  while ((match = regex.exec(content)) !== null) {
- const url = match[2];
+ const fullUrl = match[2]; // The full URL from llms.txt
  const pagePath = match[3]; // The captured path group

  const { category, page } = parsePagePath(pagePath);
- pages.push({ category, page });
+ pages.push({ category, page, fullUrl });
  }

  console.log(`Discovered ${pages.length} pages from llms.txt`);
@@ -273,7 +294,7 @@
  for (let i = 0; i < pages.length; i += this.options.concurrency) {
  const batch = pages.slice(i, i + this.options.concurrency);
  const results = await Promise.allSettled(
- batch.map((page) => this.downloadPage(page.category, page.page))
+ batch.map((page) => this.downloadPage(page))
  );

  results.forEach((result, index) => {
@@ -281,8 +302,11 @@
  if (result.status === "fulfilled" && result.value) {
  downloaded.push(result.value);
  } else {
+ const url = (this.options.useDirectUrls && page.fullUrl)
+ ? page.fullUrl
+ : this.buildUrl(page.category, page.page);
  failed.push({
- url: this.buildUrl(page.category, page.page),
+ url,
  error: result.status === "rejected" ? (result.reason as string) : "Not found",
  });
  }
@@ -316,7 +340,7 @@
  for (let i = 0; i < pages.length; i += this.options.concurrency) {
  const batch = pages.slice(i, i + this.options.concurrency);
  const results = await Promise.allSettled(
- batch.map((page) => this.downloadPage(page.category, page.page))
+ batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
  );

  results.forEach((result, index) => {
@@ -350,7 +374,6 @@
  const path = await import("path");

  for (const page of pages) {
- // Use pageName if available, otherwise extract from URL
  const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";

  const dir = page.category
@@ -369,12 +392,12 @@
  /**
  * Get list of pages to scrape based on categories
  */
- private getPagesToScrape(): Array<{ category: string; page: string }> {
- const pages: Array<{ category: string; page: string }> = [];
+ private getPagesToScrape(): DiscoveredPage[] {
+ const pages: DiscoveredPage[] = [];

  for (const [category, pageList] of Object.entries(this.options.categories)) {
  for (const page of pageList) {
- pages.push({ category, page });
+ pages.push({ category, page, fullUrl: "" });
  }
  }

@@ -424,6 +447,7 @@ export function claudeCodeOptions(outputDir: string): ScraperOptions {
  outputDir,
  concurrency: 10,
  tryDocsSubdomain: false,
+ useDirectUrls: false, // Claude Code can use built URLs
  };
  }

@@ -437,9 +461,38 @@ export function polymarketOptions(outputDir: string): ScraperOptions {
  outputDir,
  concurrency: 10,
  tryDocsSubdomain: false,
+ useDirectUrls: true, // Polymarket needs direct URLs
  };
  }

+ // ============================================================================
+ // SCRAPERS MODULE
+ // ============================================================================
+
+ /**
+ * Re-export scrapers module for composable scraper architecture.
+ * This provides a registry-based system for different scraper implementations.
+ */
+ export {
+ // Types
+ type SourceType,
+ type SourceConfig,
+ type Scraper,
+ type ScrapeResult as ScraperModuleResult,
+ type DownloadResult,
+
+ // Scrapers
+ llmsTxtScraper,
+ githubRawScraper,
+ CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
+ GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
+
+ // Registry
+ registerScraper,
+ getScraper,
+ scrapeSource,
+ } from "./scrapers/index";
+
  // ============================================================================
  // EXPORTS
  // ============================================================================
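Taken together, the index.ts changes make useDirectUrls default to true, have the claudeCodeOptions and polymarketOptions presets set it explicitly, and re-export the new scrapers module from the package root. A usage sketch built only on the exported names shown above; the output directory is a placeholder:

```typescript
import { scrapeMarkdownDocs, polymarketOptions } from "@ebowwa/markdown-docs-scraper";

// Preset shown in this diff: sets useDirectUrls: true along with its other defaults.
const result = await scrapeMarkdownDocs(polymarketOptions("./downloads/polymarket"));

console.log(`downloaded ${result.downloaded.length} pages in ${result.duration}ms`);
for (const failure of result.failed) {
  console.warn(`failed: ${failure.url} (${failure.error})`);
}
```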
package/src/scrapers/github-raw.ts ADDED
@@ -0,0 +1,154 @@
+ /**
+ * GitHub Raw Scraper
+ *
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
+ */
+
+ import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+ // ============================================================================
+ // GITHUB API TYPES
+ // ============================================================================
+
+ interface GitHubContent {
+ name: string;
+ path: string;
+ download_url: string;
+ type: string;
+ }
+
+ // ============================================================================
+ // GITHUB RAW SCRAPER
+ // ============================================================================
+
+ export const githubRawScraper: Scraper = {
+ type: "github-raw",
+
+ async scrape(config: SourceConfig): Promise<ScrapeResult> {
+ const startTime = Date.now();
+ const downloaded: DownloadResult[] = [];
+ const failed: Array<{ url: string; error: string }> = [];
+
+ if (!config.github?.repo) {
+ throw new Error(`GitHub source "${config.name}" missing github.repo config`);
+ }
+
+ // Get list of markdown files from GitHub API
+ const files = await fetchGitHubMarkdownFiles(
+ config.github.repo,
+ config.docsPath.replace(/^\//, "")
+ );
+
+ // Download each file
+ for (const file of files) {
+ const content = await fetchGitHubRawContent(config.github.repo, file.path);
+
+ if (content) {
+ downloaded.push({
+ success: true,
+ path: file.name,
+ title: extractTitle(content) || file.name.replace(".md", ""),
+ });
+
+ // Save the file
+ await saveFile(config.outputDir, file.name, content);
+ } else {
+ failed.push({
+ url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
+ error: "Failed to fetch content",
+ });
+ }
+ }
+
+ return {
+ downloaded,
+ failed,
+ duration: Date.now() - startTime,
+ };
+ },
+ };
+
+ // ============================================================================
+ // GITHUB API FUNCTIONS
+ // ============================================================================
+
+ /**
+ * Fetch list of markdown files from GitHub repo directory
+ */
+ async function fetchGitHubMarkdownFiles(
+ repo: string,
+ path: string
+ ): Promise<GitHubContent[]> {
+ const url = `https://api.github.com/repos/${repo}/contents/${path}`;
+
+ const response = await fetch(url, {
+ headers: {
+ Accept: "application/vnd.github.v3+json",
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
+ },
+ });
+
+ if (!response.ok) {
+ throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
+ }
+
+ const contents: GitHubContent[] = await response.json();
+
+ // Filter for markdown files only
+ return contents.filter(
+ (item) => item.type === "file" && item.name.endsWith(".md")
+ );
+ }
+
+ /**
+ * Download markdown content from GitHub raw URL
+ */
+ async function fetchGitHubRawContent(
+ repo: string,
+ path: string
+ ): Promise<string | null> {
+ const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
+
+ try {
+ const response = await fetch(url, {
+ headers: {
+ Accept: "text/plain",
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
+ },
+ });
+
+ if (!response.ok) {
+ return null;
+ }
+
+ return await response.text();
+ } catch (error) {
+ console.error(`Error fetching ${url}:`, error);
+ return null;
+ }
+ }
+
+ /**
+ * Extract title from markdown content
+ */
+ function extractTitle(markdown: string): string | null {
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
+ return titleMatch ? titleMatch[1].trim() : null;
+ }
+
+ /**
+ * Save file to disk
+ */
+ async function saveFile(
+ outputDir: string,
+ filename: string,
+ content: string
+ ): Promise<void> {
+ const fs = await import("fs/promises");
+ const path = await import("path");
+
+ const outputPath = path.join(outputDir, filename);
+ await fs.mkdir(path.dirname(outputPath), { recursive: true });
+ await fs.writeFile(outputPath, content, "utf-8");
+ }
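The new github-raw scraper lists a repository directory through the GitHub contents API and then downloads each Markdown file from raw.githubusercontent.com, assuming a main branch. A minimal invocation sketch; the repository, site, and directories are hypothetical, and only the field names come from SourceConfig:

```typescript
import { scrapeSource, type SourceConfig } from "@ebowwa/markdown-docs-scraper";

// Hypothetical source; field names follow the SourceConfig type in this diff.
const exampleSource: SourceConfig = {
  name: "Example Project",
  sourceType: "github-raw",
  baseUrl: "https://github.com/example-org/example-repo",
  docsPath: "/docs", // listed via api.github.com/repos/example-org/example-repo/contents/docs
  outputDir: "./downloads/example",
  reportDir: "./reports/example",
  github: {
    repo: "example-org/example-repo",
    includeCommits: false,
    includeReleases: false,
    includePRs: false,
  },
};

const result = await scrapeSource(exampleSource);
console.log(`saved ${result.downloaded.length} files, ${result.failed.length} failures`);
```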
package/src/scrapers/index.ts ADDED
@@ -0,0 +1,16 @@
+ /**
+ * Scrapers Module
+ *
+ * Composable scraper architecture for multiple documentation source types.
+ * This module provides a registry-based system for different scraper implementations.
+ */
+
+ // Types
+ export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
+
+ // Scrapers
+ export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
+ export { githubRawScraper } from "./github-raw";
+
+ // Registry
+ export { registerScraper, getScraper, scrapeSource } from "./registry";
package/src/scrapers/llms-txt.ts ADDED
@@ -0,0 +1,101 @@
+ /**
+ * LLMS-TXT Scraper
+ *
+ * Scrapes documentation sites that provide llms.txt index files.
+ * Uses the core MarkdownDocsScraper under the hood.
+ */
+
+ import { scrapeMarkdownDocs, type DocPage } from "../index";
+ import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+ // ============================================================================
+ // URL PATTERNS
+ // ============================================================================
+
+ /** Pattern for Claude Code docs: /docs/en/page.md */
+ export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+
+ /** Pattern for generic docs: any domain/path.md */
+ export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
+
+ // ============================================================================
+ // LLMS-TXT SCRAPER
+ // ============================================================================
+
+ export const llmsTxtScraper: Scraper = {
+ type: "llms-txt",
+
+ async scrape(config: SourceConfig): Promise<ScrapeResult> {
+ const options = getScraperOptions(config);
+ const result = await scrapeMarkdownDocs(options);
+
+ // Convert DocPage[] to DownloadResult[]
+ const downloaded: DownloadResult[] = result.downloaded.map((page: DocPage) => {
+ const category = page.category || "";
+ const filename = `${page.pageName || "untitled"}.md`;
+ const path = category ? `${category}/${filename}` : filename;
+
+ return {
+ success: true,
+ path,
+ title: page.title,
+ };
+ });
+
+ return {
+ downloaded,
+ failed: result.failed,
+ duration: result.duration,
+ };
+ },
+ };
+
+ // ============================================================================
+ // OPTIONS BUILDER
+ // ============================================================================
+
+ /**
+ * Get scraper options based on source configuration
+ */
+ function getScraperOptions(config: SourceConfig) {
+ const baseOptions = {
+ baseUrl: config.baseUrl,
+ docsPath: config.docsPath,
+ outputDir: config.outputDir,
+ concurrency: 10,
+ useLlms: true,
+ tryDocsSubdomain: false,
+ };
+
+ // Source-specific options
+ if (config.name === "Claude Code") {
+ return {
+ ...baseOptions,
+ llmsPaths: ["/docs/llms.txt"],
+ linkPattern: CLAUDE_CODE_PATTERN,
+ };
+ }
+
+ if (config.name === "Polymarket") {
+ return {
+ ...baseOptions,
+ llmsPaths: ["/llms.txt"],
+ linkPattern: GENERIC_PATTERN,
+ };
+ }
+
+ if (config.name === "Bun") {
+ return {
+ ...baseOptions,
+ llmsPaths: ["/docs/llms.txt", "/llms.txt"],
+ linkPattern: GENERIC_PATTERN,
+ };
+ }
+
+ // Default: use provided llmsTxtPath or try common paths
+ return {
+ ...baseOptions,
+ llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
+ linkPattern: config.linkPattern || GENERIC_PATTERN,
+ };
+ }
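llms-txt.ts wraps the core scrapeMarkdownDocs call and chooses llms.txt paths and link patterns from the source name, falling back to the config's llmsTxtPath and linkPattern. A sketch that exercises the default branch of getScraperOptions; the site and directories are made up:

```typescript
import { llmsTxtScraper, type SourceConfig } from "@ebowwa/markdown-docs-scraper";

// Hypothetical site; names other than "Claude Code", "Polymarket", or "Bun"
// fall through to the default llmsTxtPath + GENERIC_PATTERN branch.
const exampleDocs: SourceConfig = {
  name: "Example Docs",
  sourceType: "llms-txt",
  baseUrl: "https://docs.example.com",
  docsPath: "/docs",
  outputDir: "./downloads/example-docs",
  reportDir: "./reports/example-docs",
  llmsTxtPath: "/llms.txt",
};

const { downloaded, failed, duration } = await llmsTxtScraper.scrape(exampleDocs);
console.log(`${downloaded.length} pages, ${failed.length} failures, ${duration ?? 0}ms`);
```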
package/src/scrapers/registry.ts ADDED
@@ -0,0 +1,55 @@
+ /**
+ * Scraper Registry
+ *
+ * Maps source types to scraper implementations.
+ * Allows registering new scrapers and looking them up by type.
+ */
+
+ import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
+ import { llmsTxtScraper } from "./llms-txt";
+ import { githubRawScraper } from "./github-raw";
+
+ // ============================================================================
+ // SCRAPER REGISTRY
+ // ============================================================================
+
+ /** Registry of all available scrapers keyed by type */
+ const scrapers: Map<SourceType, Scraper> = new Map();
+
+ /**
+ * Register a scraper implementation
+ */
+ export function registerScraper(scraper: Scraper): void {
+ scrapers.set(scraper.type, scraper);
+ }
+
+ /**
+ * Get a scraper by type
+ */
+ export function getScraper(type: SourceType): Scraper | undefined {
+ return scrapers.get(type);
+ }
+
+ /**
+ * Scrape a source using the appropriate scraper
+ */
+ export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
+ const scraper = scrapers.get(config.sourceType);
+
+ if (!scraper) {
+ throw new Error(`No scraper registered for type: ${config.sourceType}`);
+ }
+
+ return scraper.scrape(config);
+ }
+
+ // ============================================================================
+ // DEFAULT REGISTRATIONS
+ // ============================================================================
+
+ // Register built-in scrapers
+ registerScraper(llmsTxtScraper);
+ registerScraper(githubRawScraper);
+
+ // Export scrapers for direct access if needed
+ export { llmsTxtScraper, githubRawScraper };
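The registry dispatches on config.sourceType, so callers normally go through scrapeSource rather than invoking a scraper directly; registerScraper can also replace the built-in implementation for an existing type. Since SourceType is a closed union of "llms-txt" and "github-raw", a genuinely new source type would also require widening that union. A sketch with a purely illustrative replacement scraper:

```typescript
import {
  registerScraper,
  getScraper,
  scrapeSource,
  type Scraper,
  type SourceConfig,
} from "@ebowwa/markdown-docs-scraper";

// Illustrative only: swap the built-in github-raw scraper for a logging stub.
const loggingGithubScraper: Scraper = {
  type: "github-raw",
  async scrape(config: SourceConfig) {
    console.log(`scraping ${config.name} from ${config.github?.repo ?? "unknown repo"}`);
    return { downloaded: [], failed: [], duration: 0 };
  },
};

registerScraper(loggingGithubScraper); // overwrites the default registration for "github-raw"

console.log(getScraper("github-raw") === loggingGithubScraper); // true
// scrapeSource(config) now routes any sourceType: "github-raw" config to the stub.
```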
package/src/scrapers/types.ts ADDED
@@ -0,0 +1,79 @@
+ /**
+ * Scraper Types
+ *
+ * Core types for the composable scraper architecture.
+ * These types define the interface that all scrapers must implement.
+ */
+
+ // ============================================================================
+ // SOURCE TYPES
+ // ============================================================================
+
+ /** Supported documentation source types */
+ export type SourceType = "llms-txt" | "github-raw";
+
+ // ============================================================================
+ // SCRAPER INTERFACE
+ // ============================================================================
+
+ /** Result from scraping a source */
+ export interface ScrapeResult {
+ downloaded: DownloadResult[];
+ failed: Array<{ url: string; error: string }>;
+ duration?: number;
+ }
+
+ /** Individual download result */
+ export interface DownloadResult {
+ success: boolean;
+ path: string;
+ title?: string;
+ }
+
+ /** Scraper interface - all scrapers must implement this */
+ export interface Scraper {
+ /** Source type identifier */
+ type: SourceType;
+
+ /** Scrape documentation from a source */
+ scrape(config: SourceConfig): Promise<ScrapeResult>;
+ }
+
+ // ============================================================================
+ // SOURCE CONFIG
+ // ============================================================================
+
+ /** Source configuration */
+ export interface SourceConfig {
+ /** Display name */
+ name: string;
+
+ /** Source type - determines which scraper to use */
+ sourceType: SourceType;
+
+ /** Base URL for the documentation */
+ baseUrl: string;
+
+ /** Path to docs (e.g., /docs, /docs/en) */
+ docsPath: string;
+
+ /** Output directory for downloaded docs */
+ outputDir: string;
+
+ /** Output directory for daily reports */
+ reportDir: string;
+
+ /** llms.txt path (for llms-txt sources) */
+ llmsTxtPath?: string;
+
+ /** Custom link pattern for llms.txt parsing */
+ linkPattern?: RegExp;
+
+ /** GitHub config (for github-raw sources or GitHub API data) */
+ github?: {
+ repo: string;
+ includeCommits: boolean;
+ includeReleases: boolean;
+ includePRs: boolean;
+ };
+ }