@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,199 @@ var __toESM = (mod, isNodeMode, target) => {
17
17
  };
18
18
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
19
19
 
20
// src/scrapers/llms-txt.ts
// Matches [title](https://host/docs/en/page.md) links as used by the
// Claude Code docs llms.txt index; capture 2 = full URL, capture 3 = page path.
var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
// Generic fallback: any absolute markdown link whose path ends in .md.
var GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
var llmsTxtScraper = {
  type: "llms-txt",
  /**
   * Scrape a documentation source via its llms.txt index.
   * Delegates to scrapeMarkdownDocs, then normalizes each downloaded
   * page into { success, path, title }.
   */
  async scrape(config) {
    const options = getScraperOptions(config);
    const result = await scrapeMarkdownDocs(options);
    const downloaded = result.downloaded.map((page) => {
      const category = page.category || "";
      const filename = `${page.pageName || "untitled"}.md`;
      // BUG FIX: the published code emitted the literal text "$(unknown)"
      // instead of interpolating the filename for categorized pages.
      const path = category ? `${category}/${filename}` : filename;
      return {
        success: true,
        path,
        title: page.title
      };
    });
    return {
      downloaded,
      failed: result.failed,
      duration: result.duration
    };
  }
};
45
/**
 * Build scrapeMarkdownDocs options from a source config.
 * Known sources (Claude Code, Polymarket, Bun) get pinned llms.txt
 * locations and link patterns; everything else falls back to the
 * config-supplied values or generic defaults.
 */
function getScraperOptions(config) {
  const baseOptions = {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false
  };
  switch (config.name) {
    case "Claude Code":
      return {
        ...baseOptions,
        llmsPaths: ["/docs/llms.txt"],
        linkPattern: CLAUDE_CODE_PATTERN
      };
    case "Polymarket":
      return {
        ...baseOptions,
        llmsPaths: ["/llms.txt"],
        linkPattern: GENERIC_PATTERN
      };
    case "Bun":
      return {
        ...baseOptions,
        llmsPaths: ["/docs/llms.txt", "/llms.txt"],
        linkPattern: GENERIC_PATTERN
      };
    default:
      return {
        ...baseOptions,
        llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
        linkPattern: config.linkPattern || GENERIC_PATTERN
      };
  }
}
81
// src/scrapers/github-raw.ts
var githubRawScraper = {
  type: "github-raw",
  /**
   * Download every markdown file in a GitHub repo directory.
   * Lists the directory via the GitHub API, then fetches each file
   * from raw.githubusercontent.com and writes it to config.outputDir.
   * @throws when config.github.repo is missing.
   */
  async scrape(config) {
    const startTime = Date.now();
    const downloaded = [];
    const failed = [];
    if (!config.github?.repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }
    const repo = config.github.repo;
    const dirPath = config.docsPath.replace(/^\//, "");
    const files = await fetchGitHubMarkdownFiles(repo, dirPath);
    for (const file of files) {
      const content = await fetchGitHubRawContent(repo, file.path);
      if (!content) {
        failed.push({
          url: `https://raw.githubusercontent.com/${repo}/main/${file.path}`,
          error: "Failed to fetch content"
        });
        continue;
      }
      downloaded.push({
        success: true,
        path: file.name,
        title: extractTitle(content) || file.name.replace(".md", "")
      });
      await saveFile(config.outputDir, file.name, content);
    }
    return {
      downloaded,
      failed,
      duration: Date.now() - startTime
    };
  }
};
115
/**
 * List the markdown files directly inside a repo directory via the
 * GitHub contents API.
 * @throws on a non-2xx API response.
 */
async function fetchGitHubMarkdownFiles(repo, path) {
  const apiUrl = `https://api.github.com/repos/${repo}/contents/${path}`;
  const response = await fetch(apiUrl, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper"
    }
  });
  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }
  const contents = await response.json();
  const isMarkdownFile = (item) => item.type === "file" && item.name.endsWith(".md");
  return contents.filter(isMarkdownFile);
}
129
/**
 * Fetch a file's raw text from the repo's main branch.
 * Returns null (rather than throwing) on HTTP or network failure.
 */
async function fetchGitHubRawContent(repo, path) {
  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/plain",
        "User-Agent": "@ebowwa/markdown-docs-scraper"
      }
    });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
147
/**
 * Extract the first level-1 ATX heading ("# Title") from markdown.
 * Returns the trimmed heading text, or null when no H1 is present.
 */
function extractTitle(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  return heading ? heading[1].trim() : null;
}
151
/**
 * Write content to outputDir/filename as UTF-8, creating parent
 * directories as needed. fs and path are imported lazily.
 */
async function saveFile(outputDir, filename, content) {
  const [fs, path] = await Promise.all([import("fs/promises"), import("path")]);
  const outputPath = path.join(outputDir, filename);
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, content, "utf-8");
}
158
// src/scrapers/registry.ts
// Registry mapping source type -> scraper implementation.
var scrapers = new Map();
/** Register (or replace) the scraper under its declared type. */
function registerScraper(scraper) {
  scrapers.set(scraper.type, scraper);
}
/** Look up a scraper by type; undefined when none is registered. */
function getScraper(type) {
  return scrapers.get(type);
}
/**
 * Run the scraper registered for config.sourceType.
 * @throws when no scraper is registered for that type.
 */
async function scrapeSource(config) {
  const scraper = scrapers.get(config.sourceType);
  if (scraper === undefined) {
    throw new Error(`No scraper registered for type: ${config.sourceType}`);
  }
  return scraper.scrape(config);
}
173
// Built-in scrapers are registered at module load so consumers can
// call scrapeSource() without any manual setup.
for (const builtin of [llmsTxtScraper, githubRawScraper]) {
  registerScraper(builtin);
}
20
175
// src/index.ts
// Matches [title](https://host/path/page.md) links; capture 2 is the
// full URL, capture 3 the path portion after the host.
var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
/** First H1 heading of a markdown document, or "Untitled" when absent. */
function extractTitle2(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  return heading ? heading[1].trim() : "Untitled";
}
181
/**
 * Split a documentation page path like "guide/intro.md" into
 * { category, page }. Category is "" for top-level pages and may
 * contain slashes for nested paths ("a/b/c.md" -> category "a/b").
 */
function parsePagePath(pagePath) {
  // BUG FIX: strip only a trailing ".md" — the original
  // replace(".md", "") removed the FIRST occurrence anywhere in the
  // path (e.g. "release.md-notes/x.md" was mangled).
  const pageName = pagePath.replace(/\.md$/, "");
  const pathParts = pageName.split("/");
  return {
    category: pathParts.slice(0, -1).join("/"),
    page: pathParts[pathParts.length - 1]
  };
}
195
/**
 * Fetch a URL expecting markdown or plain text.
 * Returns the response body, or null on HTTP error or network failure.
 */
async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/markdown, text/plain",
        "User-Agent": userAgent
      }
    });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
212
+
21
213
  class MarkdownDocsScraper {
22
214
  options;
23
215
  constructor(options) {
@@ -27,56 +219,107 @@ class MarkdownDocsScraper {
27
219
  categories: options.categories || {},
28
220
  outputDir: options.outputDir || "./docs",
29
221
  concurrency: options.concurrency || 5,
30
- onProgress: options.onProgress || (() => {})
222
+ onProgress: options.onProgress || (() => {}),
223
+ llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
224
+ tryDocsSubdomain: options.tryDocsSubdomain ?? true,
225
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
226
+ useDirectUrls: options.useDirectUrls ?? true
31
227
  };
32
228
  }
33
- async fetchMarkdown(url) {
34
- try {
35
- const response = await fetch(url, {
36
- headers: {
37
- Accept: "text/markdown, text/plain",
38
- "User-Agent": "@ebowwa/markdown-docs-scraper"
39
- }
40
- });
41
- if (!response.ok) {
42
- return null;
43
- }
44
- const contentType = response.headers.get("content-type") || "";
45
- if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
46
- return await response.text();
47
- } catch (error) {
48
- console.error(`Error fetching ${url}:`, error);
49
- return null;
50
- }
51
- }
52
- extractTitle(markdown) {
53
- const titleMatch = markdown.match(/^#\s+(.+)$/m);
54
- return titleMatch ? titleMatch[1].trim() : "Untitled";
55
- }
56
- sanitizeFilename(path) {
57
- return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
58
- }
59
229
  buildUrl(category, page) {
60
230
  if (category) {
61
231
  return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
62
- } else {
232
+ } else if (this.options.docsPath) {
63
233
  return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
234
+ } else {
235
+ return `${this.options.baseUrl}/${page}.md`;
64
236
  }
65
237
  }
66
- async downloadPage(category, page) {
67
- const url = this.buildUrl(category, page);
68
- const content = await this.fetchMarkdown(url);
238
+ async downloadPage(pageInfo) {
239
+ const url = this.options.useDirectUrls && pageInfo.fullUrl ? pageInfo.fullUrl : this.buildUrl(pageInfo.category, pageInfo.page);
240
+ const content = await fetchMarkdown(url);
69
241
  if (!content) {
70
242
  return null;
71
243
  }
72
244
  return {
73
245
  url,
74
- title: this.extractTitle(content),
246
+ title: extractTitle2(content),
75
247
  content,
76
- category,
77
- pageName: page
248
+ category: pageInfo.category,
249
+ pageName: pageInfo.page
78
250
  };
79
251
  }
252
+ getLlmsUrls() {
253
+ const urls = [];
254
+ const baseUrl = this.options.baseUrl;
255
+ for (const path of this.options.llmsPaths) {
256
+ urls.push(`${baseUrl}${path}`);
257
+ }
258
+ if (this.options.tryDocsSubdomain) {
259
+ try {
260
+ const url = new URL(baseUrl);
261
+ const hostname = url.hostname;
262
+ if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
263
+ const docsDomain = hostname.replace(/^www\./, "");
264
+ urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
265
+ urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
266
+ }
267
+ } catch {}
268
+ }
269
+ return urls;
270
+ }
271
+ async fetchLlmsTxt() {
272
+ const urls = this.getLlmsUrls();
273
+ console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
274
+ for (const llmsUrl of urls) {
275
+ try {
276
+ console.log(`DEBUG: Fetching ${llmsUrl}...`);
277
+ const response = await fetch(llmsUrl, {
278
+ headers: {
279
+ Accept: "text/plain",
280
+ "User-Agent": "@ebowwa/markdown-docs-scraper"
281
+ }
282
+ });
283
+ console.log(`DEBUG: Response status: ${response.status}`);
284
+ if (response.ok) {
285
+ const content = await response.text();
286
+ console.log(`Found llms.txt at ${llmsUrl}`);
287
+ return { content, url: llmsUrl };
288
+ }
289
+ } catch (error) {
290
+ console.log(`DEBUG: Error: ${error}`);
291
+ continue;
292
+ }
293
+ }
294
+ return null;
295
+ }
296
+ async discoverPages() {
297
+ const pages = [];
298
+ try {
299
+ const llmsResult = await this.fetchLlmsTxt();
300
+ if (!llmsResult) {
301
+ const attemptedUrls = this.getLlmsUrls();
302
+ console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
303
+ return pages;
304
+ }
305
+ const { content } = llmsResult;
306
+ const pattern = this.options.linkPattern;
307
+ const regex = new RegExp(pattern.source, pattern.flags);
308
+ let match;
309
+ console.log(`DEBUG: Using pattern: ${pattern.source}`);
310
+ console.log(`DEBUG: Content length: ${content.length}`);
311
+ while ((match = regex.exec(content)) !== null) {
312
+ const fullUrl = match[2];
313
+ const pagePath = match[3];
314
+ const { category, page } = parsePagePath(pagePath);
315
+ pages.push({ category, page, fullUrl });
316
+ }
317
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
318
+ } catch (error) {
319
+ console.error("Error discovering pages:", error);
320
+ }
321
+ return pages;
322
+ }
80
323
  async scrapeFromLlms() {
81
324
  const startTime = Date.now();
82
325
  const downloaded = [];
@@ -89,14 +332,15 @@ class MarkdownDocsScraper {
89
332
  console.log(`Scraping ${pages.length} discovered pages...`);
90
333
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
91
334
  const batch = pages.slice(i, i + this.options.concurrency);
92
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
335
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page)));
93
336
  results.forEach((result, index) => {
94
337
  const page = batch[index];
95
338
  if (result.status === "fulfilled" && result.value) {
96
339
  downloaded.push(result.value);
97
340
  } else {
341
+ const url = this.options.useDirectUrls && page.fullUrl ? page.fullUrl : this.buildUrl(page.category, page.page);
98
342
  failed.push({
99
- url: this.buildUrl(page.category, page.page),
343
+ url,
100
344
  error: result.status === "rejected" ? result.reason : "Not found"
101
345
  });
102
346
  }
@@ -118,7 +362,7 @@ class MarkdownDocsScraper {
118
362
  console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
119
363
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
120
364
  const batch = pages.slice(i, i + this.options.concurrency);
121
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
365
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage({ ...page, fullUrl: "" })));
122
366
  results.forEach((result, index) => {
123
367
  const page = batch[index];
124
368
  if (result.status === "fulfilled" && result.value) {
@@ -159,77 +403,11 @@ Downloaded: ${new Date().toISOString()}
159
403
  const pages = [];
160
404
  for (const [category, pageList] of Object.entries(this.options.categories)) {
161
405
  for (const page of pageList) {
162
- pages.push({ category, page });
406
+ pages.push({ category, page, fullUrl: "" });
163
407
  }
164
408
  }
165
409
  return pages;
166
410
  }
167
- async discoverPages() {
168
- const pages = [];
169
- try {
170
- const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
171
- const response = await fetch(llmsUrl, {
172
- headers: {
173
- Accept: "text/plain",
174
- "User-Agent": "@ebowwa/markdown-docs-scraper"
175
- }
176
- });
177
- if (!response.ok) {
178
- console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
179
- return pages;
180
- }
181
- const content = await response.text();
182
- const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
183
- let match;
184
- while ((match = linkRegex.exec(content)) !== null) {
185
- const url = match[2];
186
- const pagePath = match[3];
187
- const pageName = pagePath.replace(".md", "");
188
- const pathParts = pageName.split("/");
189
- if (pathParts.length === 1) {
190
- pages.push({ category: "", page: pathParts[0] });
191
- } else if (pathParts.length === 2) {
192
- pages.push({ category: pathParts[0], page: pathParts[1] });
193
- } else {
194
- const category = pathParts.slice(0, -1).join("/");
195
- const page = pathParts[pathParts.length - 1];
196
- pages.push({ category, page });
197
- }
198
- }
199
- console.log(`Discovered ${pages.length} pages from llms.txt`);
200
- } catch (error) {
201
- console.error("Error discovering pages:", error);
202
- }
203
- return pages;
204
- }
205
- async discoverPagesHtml() {
206
- const discovered = [];
207
- try {
208
- const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
209
- const response = await fetch(indexUrl, {
210
- headers: {
211
- Accept: "text/html",
212
- "User-Agent": "@ebowwa/markdown-docs-scraper"
213
- }
214
- });
215
- if (!response.ok) {
216
- return discovered;
217
- }
218
- const html = await response.text();
219
- const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
220
- let match;
221
- while ((match = mdLinkRegex.exec(html)) !== null) {
222
- const path = match[1];
223
- if (!discovered.includes(path)) {
224
- discovered.push(path);
225
- }
226
- }
227
- console.log(`Discovered ${discovered.length} additional pages from HTML`);
228
- } catch (error) {
229
- console.error("Error discovering pages from HTML:", error);
230
- }
231
- return discovered;
232
- }
233
411
  }
234
412
  async function scrapeMarkdownDocs(options) {
235
413
  const scraper = new MarkdownDocsScraper(options);
@@ -239,9 +417,49 @@ async function scrapeMarkdownDocs(options) {
239
417
  }
240
418
  return result;
241
419
  }
420
// Link patterns for the convenience option builders below; duplicated
// here because the scraper-module patterns are bundled separately.
var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
/**
 * Ready-made scrapeMarkdownDocs options for the Claude Code docs
 * (code.claude.com). useDirectUrls is false: pages are fetched via
 * URLs rebuilt from baseUrl + docsPath rather than llms.txt links.
 */
function claudeCodeOptions(outputDir) {
  return {
    baseUrl: "https://code.claude.com",
    docsPath: "/docs/en",
    llmsPaths: ["/docs/llms.txt"],
    linkPattern: CLAUDE_CODE_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: false
  };
}
/**
 * Ready-made scrapeMarkdownDocs options for the Polymarket docs
 * (docs.polymarket.com). useDirectUrls is true: the exact URLs found
 * in llms.txt are fetched directly.
 */
function polymarketOptions(outputDir) {
  return {
    baseUrl: "https://docs.polymarket.com",
    docsPath: "",
    llmsPaths: ["/llms.txt"],
    linkPattern: GENERIC_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: true
  };
}
242
446
  var src_default = MarkdownDocsScraper;
243
447
  export {
448
+ scrapeSource,
244
449
  scrapeMarkdownDocs,
450
+ registerScraper,
451
+ polymarketOptions,
452
+ parsePagePath,
453
+ llmsTxtScraper,
454
+ githubRawScraper,
455
+ getScraper,
456
+ fetchMarkdown,
457
+ extractTitle2 as extractTitle,
245
458
  src_default as default,
246
- MarkdownDocsScraper
459
+ claudeCodeOptions,
460
+ GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
461
+ CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
462
+ MarkdownDocsScraper,
463
+ GENERIC_PATTERN2 as GENERIC_PATTERN,
464
+ CLAUDE_CODE_PATTERN2 as CLAUDE_CODE_PATTERN
247
465
  };
@@ -0,0 +1,9 @@
1
+ /**
2
+ * GitHub Raw Scraper
3
+ *
4
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
5
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
6
+ */
7
+ import type { Scraper } from "./types";
8
+ export declare const githubRawScraper: Scraper;
9
+ //# sourceMappingURL=github-raw.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-raw.d.ts","sourceRoot":"","sources":["../../src/scrapers/github-raw.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,OAAO,EAA8C,MAAM,SAAS,CAAC;AAiBnF,eAAO,MAAM,gBAAgB,EAAE,OA6C9B,CAAC"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Scrapers Module
3
+ *
4
+ * Composable scraper architecture for multiple documentation source types.
5
+ * This module provides a registry-based system for different scraper implementations.
6
+ */
7
+ export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
8
+ export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
9
+ export { githubRawScraper } from "./github-raw";
10
+ export { registerScraper, getScraper, scrapeSource } from "./registry";
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scrapers/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/F,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClF,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGhD,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}