@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -1,7 +1,11 @@
1
1
  /**
2
2
  * @ebowwa/markdown-docs-scraper
3
3
  *
4
- * Scrape and mirror markdown-based documentation sites
4
+ * Composable markdown documentation scraper.
5
+ * - Configurable llms.txt paths with fallbacks
6
+ * - Custom URL patterns for different doc sites
7
+ * - Works with any markdown documentation site
8
+ * - Uses full URLs from llms.txt directly
5
9
  */
6
10
 
7
11
  // ============================================================================
@@ -23,6 +27,14 @@ export interface ScraperOptions {
23
27
  outputDir?: string;
24
28
  concurrency?: number;
25
29
  onProgress?: (current: number, total: number) => void;
30
+ /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
31
+ llmsPaths?: string[];
32
+ /** Also try docs subdomain variants (e.g., docs.example.com) */
33
+ tryDocsSubdomain?: boolean;
34
+ /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
35
+ linkPattern?: RegExp;
36
+ /** Use full URLs from llms.txt directly (default: true for generic pattern) */
37
+ useDirectUrls?: boolean;
26
38
  }
27
39
 
28
40
  export interface ScraperResult {
@@ -31,8 +43,73 @@ export interface ScraperResult {
31
43
  duration: number;
32
44
  }
33
45
 
46
+ /** Discovered page with full URL */
47
+ interface DiscoveredPage {
48
+ category: string;
49
+ page: string;
50
+ fullUrl: string; // The complete URL from llms.txt
51
+ }
52
+
53
+ /** Default pattern: matches /docs/en/ or /docs/ paths */
54
+ const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
55
+
56
+ /** Generic pattern: matches any .md links - captures full path after domain */
57
+ const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
58
+
59
+ // ============================================================================
60
+ // UTILITY FUNCTIONS (Composable)
61
+ // ============================================================================
62
+
63
/**
 * Extract the document title from markdown content.
 *
 * The title is the text of the first line that starts with a single `#`
 * followed by at least one space or tab. Only horizontal whitespace is
 * accepted after the `#`, so an empty heading line (`#` on its own) can no
 * longer swallow the following line as its title.
 *
 * @param markdown - Raw markdown source.
 * @returns The trimmed heading text, or `"Untitled"` when no such heading exists.
 */
export function extractTitle(markdown: string): string {
  const heading = /^#[ \t]+(.+)$/m.exec(markdown);
  if (heading === null) {
    return "Untitled";
  }
  return heading[1].trim();
}
68
+
69
/**
 * Split a documentation path into a category and a page name.
 *
 * The `.md` extension is removed, then the path is split at its last `/`:
 * everything before it becomes the category (possibly several segments),
 * everything after it becomes the page.
 *
 * @param pagePath - Path such as `"page.md"`, `"guide/page.md"` or `"a/b/page.md"`.
 * @returns `category` is `""` when the path contains no `/`.
 */
export function parsePagePath(pagePath: string): { category: string; page: string } {
  // Remove only the extension: a ".md" that ends the path or sits directly
  // before a query/fragment. A plain string replace would strip the first
  // ".md" anywhere, e.g. inside a directory called "foo.mdfiles".
  const withoutExtension = pagePath.replace(/\.md(?=[?#]|$)/, "");

  const lastSlash = withoutExtension.lastIndexOf("/");
  if (lastSlash === -1) {
    return { category: "", page: withoutExtension };
  }

  return {
    category: withoutExtension.slice(0, lastSlash),
    page: withoutExtension.slice(lastSlash + 1),
  };
}
89
+
90
/**
 * Download a markdown document.
 *
 * @param url - Address to request.
 * @param userAgent - Value sent in the `User-Agent` header.
 * @returns The response body as text, or `null` when the response status is
 *   not OK or the request throws (the error is logged to `console.error`).
 */
export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
  const requestHeaders = {
    Accept: "text/markdown, text/plain",
    "User-Agent": userAgent,
  };

  try {
    const response = await fetch(url, { headers: requestHeaders });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
110
+
34
111
  // ============================================================================
35
- // SCRAPER
112
+ // SCRAPER CLASS
36
113
  // ============================================================================
37
114
 
38
115
  export class MarkdownDocsScraper {
@@ -46,85 +123,154 @@ export class MarkdownDocsScraper {
46
123
  outputDir: options.outputDir || "./docs",
47
124
  concurrency: options.concurrency || 5,
48
125
  onProgress: options.onProgress || (() => {}),
126
+ llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
127
+ tryDocsSubdomain: options.tryDocsSubdomain ?? true,
128
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
129
+ useDirectUrls: options.useDirectUrls ?? true,
49
130
  };
50
131
  }
51
132
 
52
133
  /**
53
- * Fetch markdown content from a URL
134
+ * Build URL for a documentation page (fallback when no direct URL)
54
135
  */
55
- async fetchMarkdown(url: string): Promise<string | null> {
56
- try {
57
- const response = await fetch(url, {
58
- headers: {
59
- Accept: "text/markdown, text/plain",
60
- "User-Agent": "@ebowwa/markdown-docs-scraper",
61
- },
62
- });
136
buildUrl(category: string, page: string): string {
  const { baseUrl, docsPath } = this.options;

  // Categorised page: <base><docsPath>/<category>/<page>.md
  if (category) {
    return `${baseUrl}${docsPath}/${category}/${page}.md`;
  }

  // Uncategorised page under a docs prefix.
  if (docsPath) {
    return `${baseUrl}${docsPath}/${page}.md`;
  }

  // No docs prefix configured: the page sits directly under the base URL.
  return `${baseUrl}/${page}.md`;
}
63
145
 
64
- if (!response.ok) {
65
- return null;
66
- }
146
+ /**
147
+ * Download a page using either direct URL or built URL
148
+ */
149
+ async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
150
+ // Use direct URL if available and useDirectUrls is enabled
151
+ const url = (this.options.useDirectUrls && pageInfo.fullUrl)
152
+ ? pageInfo.fullUrl
153
+ : this.buildUrl(pageInfo.category, pageInfo.page);
67
154
 
68
- const contentType = response.headers.get("content-type") || "";
69
- if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
70
- // Try to parse anyway - some sites return incorrect content-type
71
- }
155
+ const content = await fetchMarkdown(url);
72
156
 
73
- return await response.text();
74
- } catch (error) {
75
- console.error(`Error fetching ${url}:`, error);
157
+ if (!content) {
76
158
  return null;
77
159
  }
78
- }
79
160
 
80
- /**
81
- * Extract title from markdown content
82
- */
83
- extractTitle(markdown: string): string {
84
- const titleMatch = markdown.match(/^#\s+(.+)$/m);
85
- return titleMatch ? titleMatch[1].trim() : "Untitled";
161
+ return {
162
+ url,
163
+ title: extractTitle(content),
164
+ content,
165
+ category: pageInfo.category,
166
+ pageName: pageInfo.page,
167
+ };
86
168
  }
87
169
 
88
170
  /**
89
- * Sanitize filename from URL path
171
+ * Generate possible llms.txt URLs to try
90
172
  */
91
- sanitizeFilename(path: string): string {
92
- return path
93
- .toLowerCase()
94
- .replace(/[^a-z0-9/]+/g, "-")
95
- .replace(/^-|-$/g, "")
96
- .replace(/\//g, "/");
173
private getLlmsUrls(): string[] {
  // A Set keeps insertion order while dropping duplicates, so the same
  // candidate is never requested twice.
  const candidates = new Set<string>();

  // Strip trailing slashes so "https://host/" + "/llms.txt" does not become
  // "https://host//llms.txt".
  const baseUrl = this.options.baseUrl.replace(/\/+$/, "");

  // Configured paths come first; tolerate entries written without a leading "/".
  for (const llmsPath of this.options.llmsPaths) {
    const normalizedPath = llmsPath.startsWith("/") ? llmsPath : `/${llmsPath}`;
    candidates.add(`${baseUrl}${normalizedPath}`);
  }

  // Optionally add docs.<host> variants, unless the host is already a
  // docs./doc. subdomain. A leading "www." is dropped before prefixing.
  if (this.options.tryDocsSubdomain) {
    try {
      const { protocol, hostname } = new URL(baseUrl);
      const alreadyDocsHost = hostname.startsWith("docs.") || hostname.startsWith("doc.");

      if (!alreadyDocsHost) {
        const bareHost = hostname.replace(/^www\./, "");
        candidates.add(`${protocol}//docs.${bareHost}/llms.txt`);
        candidates.add(`${protocol}//docs.${bareHost}/docs/llms.txt`);
      }
    } catch {
      // baseUrl is not a parseable absolute URL; only the configured paths are tried.
    }
  }

  return [...candidates];
}
98
201
 
99
202
  /**
100
- * Build URL for a documentation page
203
+ * Fetch llms.txt from multiple possible URLs with fallback
101
204
  */
102
- buildUrl(category: string, page: string): string {
103
- if (category) {
104
- return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
105
- } else {
106
- return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
205
private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
  const candidates = this.getLlmsUrls();
  console.log(`DEBUG: Trying URLs: ${candidates.join(", ")}`);

  // The same request options are used for every candidate.
  const requestInit = {
    headers: {
      Accept: "text/plain",
      "User-Agent": "@ebowwa/markdown-docs-scraper",
    },
  };

  // Candidates are tried in order; the first OK response wins.
  for (const llmsUrl of candidates) {
    try {
      console.log(`DEBUG: Fetching ${llmsUrl}...`);
      const response = await fetch(llmsUrl, requestInit);
      console.log(`DEBUG: Response status: ${response.status}`);

      if (!response.ok) {
        continue;
      }

      const content = await response.text();
      console.log(`Found llms.txt at ${llmsUrl}`);
      return { content, url: llmsUrl };
    } catch (error) {
      // A failed request is logged and the next candidate is tried.
      console.log(`DEBUG: Error: ${error}`);
    }
  }

  // Every candidate either failed or returned a non-OK status.
  return null;
}
109
233
 
110
234
  /**
111
- * Download a single documentation page
235
+ * Discover pages from llms.txt index
112
236
  */
113
- async downloadPage(category: string, page: string): Promise<DocPage | null> {
114
- const url = this.buildUrl(category, page);
115
- const content = await this.fetchMarkdown(url);
237
+ async discoverPages(): Promise<DiscoveredPage[]> {
238
+ const pages: DiscoveredPage[] = [];
116
239
 
117
- if (!content) {
118
- return null;
240
+ try {
241
+ const llmsResult = await this.fetchLlmsTxt();
242
+
243
+ if (!llmsResult) {
244
+ const attemptedUrls = this.getLlmsUrls();
245
+ console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
246
+ return pages;
247
+ }
248
+
249
+ const { content } = llmsResult;
250
+
251
+ // Use provided pattern or default
252
+ const pattern = this.options.linkPattern;
253
+ const regex = new RegExp(pattern.source, pattern.flags);
254
+ let match;
255
+
256
+ // Debug: log pattern being used
257
+ console.log(`DEBUG: Using pattern: ${pattern.source}`);
258
+ console.log(`DEBUG: Content length: ${content.length}`);
259
+
260
+ while ((match = regex.exec(content)) !== null) {
261
+ const fullUrl = match[2]; // The full URL from llms.txt
262
+ const pagePath = match[3]; // The captured path group
263
+
264
+ const { category, page } = parsePagePath(pagePath);
265
+ pages.push({ category, page, fullUrl });
266
+ }
267
+
268
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
269
+ } catch (error) {
270
+ console.error("Error discovering pages:", error);
119
271
  }
120
272
 
121
- return {
122
- url,
123
- title: this.extractTitle(content),
124
- content,
125
- category,
126
- pageName: page, // Store the page name for saving
127
- };
273
+ return pages;
128
274
  }
129
275
 
130
276
  /**
@@ -148,7 +294,7 @@ export class MarkdownDocsScraper {
148
294
  for (let i = 0; i < pages.length; i += this.options.concurrency) {
149
295
  const batch = pages.slice(i, i + this.options.concurrency);
150
296
  const results = await Promise.allSettled(
151
- batch.map((page) => this.downloadPage(page.category, page.page))
297
+ batch.map((page) => this.downloadPage(page))
152
298
  );
153
299
 
154
300
  results.forEach((result, index) => {
@@ -156,8 +302,11 @@ export class MarkdownDocsScraper {
156
302
  if (result.status === "fulfilled" && result.value) {
157
303
  downloaded.push(result.value);
158
304
  } else {
305
+ const url = (this.options.useDirectUrls && page.fullUrl)
306
+ ? page.fullUrl
307
+ : this.buildUrl(page.category, page.page);
159
308
  failed.push({
160
- url: this.buildUrl(page.category, page.page),
309
+ url,
161
310
  error: result.status === "rejected" ? (result.reason as string) : "Not found",
162
311
  });
163
312
  }
@@ -175,7 +324,7 @@ export class MarkdownDocsScraper {
175
324
  }
176
325
 
177
326
  /**
178
- * Scrape all documentation pages
327
+ * Scrape all documentation pages (uses categories)
179
328
  */
180
329
  async scrape(): Promise<ScraperResult> {
181
330
  const startTime = Date.now();
@@ -191,7 +340,7 @@ export class MarkdownDocsScraper {
191
340
  for (let i = 0; i < pages.length; i += this.options.concurrency) {
192
341
  const batch = pages.slice(i, i + this.options.concurrency);
193
342
  const results = await Promise.allSettled(
194
- batch.map((page) => this.downloadPage(page.category, page.page))
343
+ batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
195
344
  );
196
345
 
197
346
  results.forEach((result, index) => {
@@ -225,7 +374,6 @@ export class MarkdownDocsScraper {
225
374
  const path = await import("path");
226
375
 
227
376
  for (const page of pages) {
228
- // Use pageName if available, otherwise extract from URL
229
377
  const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
230
378
 
231
379
  const dir = page.category
@@ -244,113 +392,17 @@ export class MarkdownDocsScraper {
244
392
  /**
245
393
  * Get list of pages to scrape based on categories
246
394
  */
247
- private getPagesToScrape(): Array<{ category: string; page: string }> {
248
- const pages: Array<{ category: string; page: string }> = [];
395
+ private getPagesToScrape(): DiscoveredPage[] {
396
+ const pages: DiscoveredPage[] = [];
249
397
 
250
398
  for (const [category, pageList] of Object.entries(this.options.categories)) {
251
399
  for (const page of pageList) {
252
- pages.push({ category, page });
400
+ pages.push({ category, page, fullUrl: "" });
253
401
  }
254
402
  }
255
403
 
256
404
  return pages;
257
405
  }
258
-
259
- /**
260
- * Discover pages from llms.txt index
261
- */
262
- async discoverPages(): Promise<Array<{ category: string; page: string }>> {
263
- const pages: Array<{ category: string; page: string }> = [];
264
-
265
- try {
266
- const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
267
- const response = await fetch(llmsUrl, {
268
- headers: {
269
- Accept: "text/plain",
270
- "User-Agent": "@ebowwa/markdown-docs-scraper",
271
- },
272
- });
273
-
274
- if (!response.ok) {
275
- console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
276
- return pages;
277
- }
278
-
279
- const content = await response.text();
280
-
281
- // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
282
- const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
283
- let match;
284
-
285
- while ((match = linkRegex.exec(content)) !== null) {
286
- const url = match[2];
287
- const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
288
-
289
- // Remove .md extension
290
- const pageName = pagePath.replace(".md", "");
291
-
292
- // Check if there's a category in the path
293
- const pathParts = pageName.split("/");
294
-
295
- if (pathParts.length === 1) {
296
- // No category: just "page-name"
297
- pages.push({ category: "", page: pathParts[0] });
298
- } else if (pathParts.length === 2) {
299
- // Has category: "category/page-name"
300
- pages.push({ category: pathParts[0], page: pathParts[1] });
301
- } else {
302
- // Deeper path: join everything except last as category
303
- const category = pathParts.slice(0, -1).join("/");
304
- const page = pathParts[pathParts.length - 1];
305
- pages.push({ category, page });
306
- }
307
- }
308
-
309
- console.log(`Discovered ${pages.length} pages from llms.txt`);
310
- } catch (error) {
311
- console.error("Error discovering pages:", error);
312
- }
313
-
314
- return pages;
315
- }
316
-
317
- /**
318
- * Discover additional pages by parsing the docs index (fallback)
319
- */
320
- async discoverPagesHtml(): Promise<string[]> {
321
- const discovered: string[] = [];
322
-
323
- try {
324
- const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
325
- const response = await fetch(indexUrl, {
326
- headers: {
327
- Accept: "text/html",
328
- "User-Agent": "@ebowwa/markdown-docs-scraper",
329
- },
330
- });
331
-
332
- if (!response.ok) {
333
- return discovered;
334
- }
335
-
336
- const html = await response.text();
337
- const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
338
- let match;
339
-
340
- while ((match = mdLinkRegex.exec(html)) !== null) {
341
- const path = match[1];
342
- if (!discovered.includes(path)) {
343
- discovered.push(path);
344
- }
345
- }
346
-
347
- console.log(`Discovered ${discovered.length} additional pages from HTML`);
348
- } catch (error) {
349
- console.error("Error discovering pages from HTML:", error);
350
- }
351
-
352
- return discovered;
353
- }
354
406
  }
355
407
 
356
408
  // ============================================================================
@@ -375,6 +427,72 @@ export async function scrapeMarkdownDocs(
375
427
  return result;
376
428
  }
377
429
 
430
+ // ============================================================================
431
+ // PRESET CONFIGURATIONS (Composable)
432
+ // ============================================================================
433
+
434
+ /** Pattern for Claude Code docs: /docs/en/page.md */
435
+ export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
436
+
437
+ /** Pattern for generic docs: any domain/path.md */
438
+ export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
439
+
440
/**
 * Preset scraper options for the Claude Code documentation (code.claude.com).
 *
 * Reads the index from `/docs/llms.txt`, extracts links with
 * `CLAUDE_CODE_PATTERN`, and sets `useDirectUrls` to `false` so page URLs are
 * built from `baseUrl` + `docsPath` instead of being taken from llms.txt.
 *
 * @param outputDir - Value for the `outputDir` option.
 * @returns A complete `ScraperOptions` object for this site.
 */
export function claudeCodeOptions(outputDir: string): ScraperOptions {
  const preset: ScraperOptions = {
    baseUrl: "https://code.claude.com",
    docsPath: "/docs/en",
    llmsPaths: ["/docs/llms.txt"],
    linkPattern: CLAUDE_CODE_PATTERN,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    // Claude Code can use built URLs
    useDirectUrls: false,
  };

  return preset;
}
453
+
454
/**
 * Preset scraper options for the Polymarket documentation (docs.polymarket.com).
 *
 * Reads the index from `/llms.txt`, extracts links with `GENERIC_PATTERN`,
 * and sets `useDirectUrls` to `true` so each page is fetched from the exact
 * URL listed in llms.txt.
 *
 * @param outputDir - Value for the `outputDir` option.
 * @returns A complete `ScraperOptions` object for this site.
 */
export function polymarketOptions(outputDir: string): ScraperOptions {
  const preset: ScraperOptions = {
    baseUrl: "https://docs.polymarket.com",
    docsPath: "",
    llmsPaths: ["/llms.txt"],
    linkPattern: GENERIC_PATTERN,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    // Polymarket needs direct URLs
    useDirectUrls: true,
  };

  return preset;
}
467
+
468
+ // ============================================================================
469
+ // SCRAPERS MODULE
470
+ // ============================================================================
471
+
472
+ /**
473
+ * Re-export scrapers module for composable scraper architecture.
474
+ * This provides a registry-based system for different scraper implementations.
475
+ */
476
+ export {
477
+ // Types
478
+ type SourceType,
479
+ type SourceConfig,
480
+ type Scraper,
481
+ type ScrapeResult as ScraperModuleResult,
482
+ type DownloadResult,
483
+
484
+ // Scrapers
485
+ llmsTxtScraper,
486
+ githubRawScraper,
487
+ CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
488
+ GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
489
+
490
+ // Registry
491
+ registerScraper,
492
+ getScraper,
493
+ scrapeSource,
494
+ } from "./scrapers/index";
495
+
378
496
  // ============================================================================
379
497
  // EXPORTS
380
498
  // ============================================================================
@@ -0,0 +1,154 @@
1
+ /**
2
+ * GitHub Raw Scraper
3
+ *
4
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
5
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
6
+ */
7
+
8
+ import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
9
+
10
+ // ============================================================================
11
+ // GITHUB API TYPES
12
+ // ============================================================================
13
+
14
+ interface GitHubContent {
15
+ name: string;
16
+ path: string;
17
+ download_url: string;
18
+ type: string;
19
+ }
20
+
21
+ // ============================================================================
22
+ // GITHUB RAW SCRAPER
23
+ // ============================================================================
24
+
25
/**
 * Scraper for the "github-raw" source type.
 *
 * Lists the markdown files in `config.docsPath` of `config.github.repo`,
 * fetches each one, and writes it to `config.outputDir` under its file name.
 * Files are processed one at a time, in the order the listing returns them.
 */
export const githubRawScraper: Scraper = {
  type: "github-raw",

  /**
   * @param config - Source description; `config.github.repo` is required.
   * @returns The saved files, the files whose content could not be fetched,
   *   and the elapsed time in milliseconds.
   * @throws Error when `config.github.repo` is missing. Errors from the file
   *   listing or from writing a file also propagate to the caller.
   */
  async scrape(config: SourceConfig): Promise<ScrapeResult> {
    const startTime = Date.now();
    const downloaded: DownloadResult[] = [];
    const failed: Array<{ url: string; error: string }> = [];

    const repo = config.github?.repo;
    if (!repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }

    // Get list of markdown files from GitHub API (directory path without a leading slash)
    const files = await fetchGitHubMarkdownFiles(repo, config.docsPath.replace(/^\//, ""));

    for (const file of files) {
      const content = await fetchGitHubRawContent(repo, file.path);

      // Only null signals a failed fetch; an empty string is a legitimately
      // empty file and must not be reported as a failure.
      if (content === null) {
        failed.push({
          url: `https://raw.githubusercontent.com/${repo}/main/${file.path}`,
          error: "Failed to fetch content",
        });
        continue;
      }

      await saveFile(config.outputDir, file.name, content);

      downloaded.push({
        success: true,
        path: file.name,
        // Fall back to the file name minus its trailing ".md" extension when
        // the document has no heading.
        title: extractTitle(content) || file.name.replace(/\.md$/, ""),
      });
    }

    return {
      downloaded,
      failed,
      duration: Date.now() - startTime,
    };
  },
};
71
+
72
+ // ============================================================================
73
+ // GITHUB API FUNCTIONS
74
+ // ============================================================================
75
+
76
/**
 * Fetch the list of markdown files at a path in a GitHub repository.
 *
 * Queries the repository contents endpoint and keeps entries whose `type` is
 * `"file"` and whose name ends in `.md`. Subdirectories are not descended into.
 *
 * @param repo - Repository in `owner/name` form.
 * @param path - Path inside the repository, without a leading slash.
 * @returns The matching entries; empty when nothing matches.
 * @throws Error when the API responds with a non-OK status.
 */
async function fetchGitHubMarkdownFiles(
  repo: string,
  path: string
): Promise<GitHubContent[]> {
  const url = `https://api.github.com/repos/${repo}/contents/${path}`;

  const response = await fetch(url, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper",
    },
  });

  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }

  // The endpoint returns an array for a directory but a single object when
  // the path names a file; normalise to an array so filtering never throws.
  const payload: unknown = await response.json();
  const entries = (Array.isArray(payload) ? payload : [payload]) as Array<GitHubContent | null>;

  // Filter for markdown files only, skipping anything that is not a well-formed entry
  return entries.filter(
    (item): item is GitHubContent =>
      item != null &&
      item.type === "file" &&
      typeof item.name === "string" &&
      item.name.endsWith(".md")
  );
}
103
+
104
/**
 * Fetch one file's text from raw.githubusercontent.com.
 *
 * The URL always targets the `main` branch of the repository.
 *
 * @param repo - Repository in `owner/name` form.
 * @param path - File path inside the repository.
 * @returns The file body, or `null` when the response status is not OK or
 *   the request throws (the error is logged to `console.error`).
 */
async function fetchGitHubRawContent(
  repo: string,
  path: string
): Promise<string | null> {
  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
  const requestHeaders = {
    Accept: "text/plain",
    "User-Agent": "@ebowwa/markdown-docs-scraper",
  };

  try {
    const response = await fetch(url, { headers: requestHeaders });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
131
+
132
/**
 * Extract the document title from markdown content.
 *
 * The title is the text of the first line that starts with a single `#`
 * followed by at least one space or tab. Only horizontal whitespace is
 * accepted after the `#`, so an empty heading line (`#` on its own) can no
 * longer swallow the following line as its title.
 *
 * @param markdown - Raw markdown source.
 * @returns The trimmed heading text, or `null` when no such heading exists.
 */
function extractTitle(markdown: string): string | null {
  const heading = /^#[ \t]+(.+)$/m.exec(markdown);
  if (heading === null) {
    return null;
  }
  return heading[1].trim();
}
139
+
140
/**
 * Write `content` to `<outputDir>/<filename>` as UTF-8.
 *
 * The target's parent directory is created first, including any missing
 * ancestors.
 *
 * @param outputDir - Directory to write into.
 * @param filename - File name (or relative path) under `outputDir`.
 * @param content - Text to write.
 */
async function saveFile(
  outputDir: string,
  filename: string,
  content: string
): Promise<void> {
  // Node built-ins are loaded lazily, only when a file is actually saved.
  const { mkdir, writeFile } = await import("fs/promises");
  const { dirname, join } = await import("path");

  const target = join(outputDir, filename);
  await mkdir(dirname(target), { recursive: true });
  await writeFile(target, content, "utf-8");
}