@ebowwa/markdown-docs-scraper 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -52,6 +52,26 @@ Options:
52
52
  -o, --output <dir> Output directory (default: "./docs")
53
53
  --docs-path <path> Docs path (default: "/docs/en")
54
54
  -c, --concurrency <num> Concurrency level (default: "5")
55
+ --llms-paths <paths> Comma-separated llms.txt paths (default: "/llms.txt,/docs/llms.txt")
56
+ --no-subdomain Disable docs/doc subdomain fallback
57
+ ```
58
+
59
+ ### llms.txt Discovery
60
+
61
+ The scraper automatically tries multiple paths to find `llms.txt`:
62
+
63
+ 1. **Configured paths** (default: `/llms.txt`, `/docs/llms.txt`)
64
+ 2. **Docs subdomain** (e.g., `https://docs.example.com/llms.txt`)
65
+ 3. **Docs subdomain, `/docs` path** (e.g., `https://docs.example.com/docs/llms.txt`)
66
+
67
+ Example with custom paths:
68
+ ```bash
69
+ markdown-docs-scraper scrape -u https://example.com --llms-paths "/llms.txt,/api/llms.txt"
70
+ ```
71
+
72
+ Disable subdomain fallback:
73
+ ```bash
74
+ markdown-docs-scraper scrape -u https://example.com --no-subdomain
55
75
  ```
56
76
 
57
77
  ## Programmatic Usage
@@ -103,6 +123,8 @@ interface ScraperOptions {
103
123
  outputDir?: string; // Output directory (default: "./docs")
104
124
  concurrency?: number; // Concurrent downloads (default: 5)
105
125
  onProgress?: (current: number, total: number) => void;
126
+ llmsPaths?: string[]; // llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"])
127
+ tryDocsSubdomain?: boolean; // Also try the docs.<domain> subdomain (default: true)
106
128
  }
107
129
  ```
108
130
 
package/dist/cli.js CHANGED
@@ -20,7 +20,7 @@ var __toESM = (mod, isNodeMode, target) => {
20
20
  var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
21
21
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
22
22
 
23
- // node_modules/commander/lib/error.js
23
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/error.js
24
24
  var require_error = __commonJS((exports) => {
25
25
  class CommanderError extends Error {
26
26
  constructor(exitCode, code, message) {
@@ -44,7 +44,7 @@ var require_error = __commonJS((exports) => {
44
44
  exports.InvalidArgumentError = InvalidArgumentError;
45
45
  });
46
46
 
47
- // node_modules/commander/lib/argument.js
47
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/argument.js
48
48
  var require_argument = __commonJS((exports) => {
49
49
  var { InvalidArgumentError } = require_error();
50
50
 
@@ -123,7 +123,7 @@ var require_argument = __commonJS((exports) => {
123
123
  exports.humanReadableArgName = humanReadableArgName;
124
124
  });
125
125
 
126
- // node_modules/commander/lib/help.js
126
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/help.js
127
127
  var require_help = __commonJS((exports) => {
128
128
  var { humanReadableArgName } = require_argument();
129
129
 
@@ -372,7 +372,7 @@ var require_help = __commonJS((exports) => {
372
372
  exports.Help = Help;
373
373
  });
374
374
 
375
- // node_modules/commander/lib/option.js
375
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/option.js
376
376
  var require_option = __commonJS((exports) => {
377
377
  var { InvalidArgumentError } = require_error();
378
378
 
@@ -523,7 +523,7 @@ var require_option = __commonJS((exports) => {
523
523
  exports.DualOptions = DualOptions;
524
524
  });
525
525
 
526
- // node_modules/commander/lib/suggestSimilar.js
526
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/suggestSimilar.js
527
527
  var require_suggestSimilar = __commonJS((exports) => {
528
528
  var maxDistance = 3;
529
529
  function editDistance(a, b) {
@@ -596,7 +596,7 @@ var require_suggestSimilar = __commonJS((exports) => {
596
596
  exports.suggestSimilar = suggestSimilar;
597
597
  });
598
598
 
599
- // node_modules/commander/lib/command.js
599
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/command.js
600
600
  var require_command = __commonJS((exports) => {
601
601
  var EventEmitter = __require("node:events").EventEmitter;
602
602
  var childProcess = __require("node:child_process");
@@ -1839,7 +1839,7 @@ Expecting one of '${allowedValues.join("', '")}'`);
1839
1839
  exports.Command = Command;
1840
1840
  });
1841
1841
 
1842
- // node_modules/commander/index.js
1842
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/index.js
1843
1843
  var require_commander = __commonJS((exports) => {
1844
1844
  var { Argument } = require_argument();
1845
1845
  var { Command } = require_command();
@@ -2377,7 +2377,7 @@ Downloaded: `).concat(new Date().toISOString(), `
2377
2377
  exports.default = MarkdownDocsScraper;
2378
2378
  });
2379
2379
 
2380
- // node_modules/commander/esm.mjs
2380
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/esm.mjs
2381
2381
  var import__ = __toESM(require_commander(), 1);
2382
2382
  var {
2383
2383
  program,
@@ -2396,12 +2396,14 @@ var {
2396
2396
  // src/cli.ts
2397
2397
  var import__2 = __toESM(require_src(), 1);
2398
2398
  program.name("markdown-docs-scraper").description("Scrape and mirror markdown-based documentation sites").version("1.0.0");
2399
- program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).action(async (options) => {
2399
+ program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
2400
2400
  const scraperOptions = {
2401
2401
  baseUrl: options.url,
2402
2402
  docsPath: options.docsPath,
2403
2403
  outputDir: options.output,
2404
- concurrency: parseInt(options.concurrency)
2404
+ concurrency: parseInt(options.concurrency),
2405
+ llmsPaths: options.llmsPaths.split(","),
2406
+ tryDocsSubdomain: !options.noSubdomain
2405
2407
  };
2406
2408
  console.log(`\uD83D\uDD0D Scraping ${options.url}...`);
2407
2409
  console.log(`\uD83D\uDCC1 Output: ${options.output}`);
@@ -2423,10 +2425,12 @@ program.command("scrape").description("Scrape documentation from a URL").require
2423
2425
  }
2424
2426
  }
2425
2427
  });
2426
- program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").action(async (options) => {
2428
+ program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
2427
2429
  const scraper = new import__2.MarkdownDocsScraper({
2428
2430
  baseUrl: options.url,
2429
- docsPath: options.docsPath
2431
+ docsPath: options.docsPath,
2432
+ llmsPaths: options.llmsPaths.split(","),
2433
+ tryDocsSubdomain: !options.noSubdomain
2430
2434
  });
2431
2435
  console.log(`\uD83D\uDD0D Discovering pages from ${options.url}...`);
2432
2436
  const pages = await scraper.discoverPages();
@@ -2434,7 +2438,8 @@ program.command("discover").description("Discover all available documentation pa
2434
2438
  Found ${pages.length} pages:
2435
2439
  `);
2436
2440
  pages.forEach((page) => {
2437
- console.log(` - ${page}`);
2441
+ const path = page.category ? `${page.category}/${page.page}` : page.page;
2442
+ console.log(` - ${path}`);
2438
2443
  });
2439
2444
  });
2440
2445
  program.command("anthropic").description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)").option("-o, --output <dir>", "Output directory", "./docs").action(async (options) => {
package/dist/index.js CHANGED
@@ -18,6 +18,43 @@ var __toESM = (mod, isNodeMode, target) => {
18
18
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
19
19
 
20
20
  // src/index.ts
21
+ var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
22
+ function extractTitle(markdown) {
23
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
24
+ return titleMatch ? titleMatch[1].trim() : "Untitled";
25
+ }
26
+ function parsePagePath(pagePath) {
27
+ const pageName = pagePath.replace(".md", "");
28
+ const pathParts = pageName.split("/");
29
+ if (pathParts.length === 1) {
30
+ return { category: "", page: pathParts[0] };
31
+ } else if (pathParts.length === 2) {
32
+ return { category: pathParts[0], page: pathParts[1] };
33
+ } else {
34
+ return {
35
+ category: pathParts.slice(0, -1).join("/"),
36
+ page: pathParts[pathParts.length - 1]
37
+ };
38
+ }
39
+ }
40
+ async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
41
+ try {
42
+ const response = await fetch(url, {
43
+ headers: {
44
+ Accept: "text/markdown, text/plain",
45
+ "User-Agent": userAgent
46
+ }
47
+ });
48
+ if (!response.ok) {
49
+ return null;
50
+ }
51
+ return await response.text();
52
+ } catch (error) {
53
+ console.error(`Error fetching ${url}:`, error);
54
+ return null;
55
+ }
56
+ }
57
+
21
58
  class MarkdownDocsScraper {
22
59
  options;
23
60
  constructor(options) {
@@ -27,56 +64,99 @@ class MarkdownDocsScraper {
27
64
  categories: options.categories || {},
28
65
  outputDir: options.outputDir || "./docs",
29
66
  concurrency: options.concurrency || 5,
30
- onProgress: options.onProgress || (() => {})
67
+ onProgress: options.onProgress || (() => {}),
68
+ llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
69
+ tryDocsSubdomain: options.tryDocsSubdomain ?? true,
70
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN
31
71
  };
32
72
  }
33
- async fetchMarkdown(url) {
34
- try {
35
- const response = await fetch(url, {
36
- headers: {
37
- Accept: "text/markdown, text/plain",
38
- "User-Agent": "@ebowwa/markdown-docs-scraper"
39
- }
40
- });
41
- if (!response.ok) {
42
- return null;
43
- }
44
- const contentType = response.headers.get("content-type") || "";
45
- if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
46
- return await response.text();
47
- } catch (error) {
48
- console.error(`Error fetching ${url}:`, error);
49
- return null;
50
- }
51
- }
52
- extractTitle(markdown) {
53
- const titleMatch = markdown.match(/^#\s+(.+)$/m);
54
- return titleMatch ? titleMatch[1].trim() : "Untitled";
55
- }
56
- sanitizeFilename(path) {
57
- return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
58
- }
59
73
  buildUrl(category, page) {
60
74
  if (category) {
61
75
  return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
62
- } else {
76
+ } else if (this.options.docsPath) {
63
77
  return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
78
+ } else {
79
+ return `${this.options.baseUrl}/${page}.md`;
64
80
  }
65
81
  }
66
82
  async downloadPage(category, page) {
67
83
  const url = this.buildUrl(category, page);
68
- const content = await this.fetchMarkdown(url);
84
+ const content = await fetchMarkdown(url);
69
85
  if (!content) {
70
86
  return null;
71
87
  }
72
88
  return {
73
89
  url,
74
- title: this.extractTitle(content),
90
+ title: extractTitle(content),
75
91
  content,
76
92
  category,
77
93
  pageName: page
78
94
  };
79
95
  }
96
+ getLlmsUrls() {
97
+ const urls = [];
98
+ const baseUrl = this.options.baseUrl;
99
+ for (const path of this.options.llmsPaths) {
100
+ urls.push(`${baseUrl}${path}`);
101
+ }
102
+ if (this.options.tryDocsSubdomain) {
103
+ try {
104
+ const url = new URL(baseUrl);
105
+ const hostname = url.hostname;
106
+ if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
107
+ const docsDomain = hostname.replace(/^www\./, "");
108
+ urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
109
+ urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
110
+ }
111
+ } catch {}
112
+ }
113
+ return urls;
114
+ }
115
+ async fetchLlmsTxt() {
116
+ const urls = this.getLlmsUrls();
117
+ for (const llmsUrl of urls) {
118
+ try {
119
+ const response = await fetch(llmsUrl, {
120
+ headers: {
121
+ Accept: "text/plain",
122
+ "User-Agent": "@ebowwa/markdown-docs-scraper"
123
+ }
124
+ });
125
+ if (response.ok) {
126
+ const content = await response.text();
127
+ console.log(`Found llms.txt at ${llmsUrl}`);
128
+ return { content, url: llmsUrl };
129
+ }
130
+ } catch (error) {
131
+ continue;
132
+ }
133
+ }
134
+ return null;
135
+ }
136
+ async discoverPages() {
137
+ const pages = [];
138
+ try {
139
+ const llmsResult = await this.fetchLlmsTxt();
140
+ if (!llmsResult) {
141
+ const attemptedUrls = this.getLlmsUrls();
142
+ console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
143
+ return pages;
144
+ }
145
+ const { content } = llmsResult;
146
+ const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
147
+ let match;
148
+ while ((match = regex.exec(content)) !== null) {
149
+ const url = match[2];
150
+ const pagePath = match[3];
151
+ const { category, page } = parsePagePath(pagePath);
152
+ pages.push({ category, page });
153
+ }
154
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
155
+ } catch (error) {
156
+ console.error("Error discovering pages:", error);
157
+ }
158
+ return pages;
159
+ }
80
160
  async scrapeFromLlms() {
81
161
  const startTime = Date.now();
82
162
  const downloaded = [];
@@ -164,72 +244,6 @@ Downloaded: ${new Date().toISOString()}
164
244
  }
165
245
  return pages;
166
246
  }
167
- async discoverPages() {
168
- const pages = [];
169
- try {
170
- const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
171
- const response = await fetch(llmsUrl, {
172
- headers: {
173
- Accept: "text/plain",
174
- "User-Agent": "@ebowwa/markdown-docs-scraper"
175
- }
176
- });
177
- if (!response.ok) {
178
- console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
179
- return pages;
180
- }
181
- const content = await response.text();
182
- const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
183
- let match;
184
- while ((match = linkRegex.exec(content)) !== null) {
185
- const url = match[2];
186
- const pagePath = match[3];
187
- const pageName = pagePath.replace(".md", "");
188
- const pathParts = pageName.split("/");
189
- if (pathParts.length === 1) {
190
- pages.push({ category: "", page: pathParts[0] });
191
- } else if (pathParts.length === 2) {
192
- pages.push({ category: pathParts[0], page: pathParts[1] });
193
- } else {
194
- const category = pathParts.slice(0, -1).join("/");
195
- const page = pathParts[pathParts.length - 1];
196
- pages.push({ category, page });
197
- }
198
- }
199
- console.log(`Discovered ${pages.length} pages from llms.txt`);
200
- } catch (error) {
201
- console.error("Error discovering pages:", error);
202
- }
203
- return pages;
204
- }
205
- async discoverPagesHtml() {
206
- const discovered = [];
207
- try {
208
- const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
209
- const response = await fetch(indexUrl, {
210
- headers: {
211
- Accept: "text/html",
212
- "User-Agent": "@ebowwa/markdown-docs-scraper"
213
- }
214
- });
215
- if (!response.ok) {
216
- return discovered;
217
- }
218
- const html = await response.text();
219
- const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
220
- let match;
221
- while ((match = mdLinkRegex.exec(html)) !== null) {
222
- const path = match[1];
223
- if (!discovered.includes(path)) {
224
- discovered.push(path);
225
- }
226
- }
227
- console.log(`Discovered ${discovered.length} additional pages from HTML`);
228
- } catch (error) {
229
- console.error("Error discovering pages from HTML:", error);
230
- }
231
- return discovered;
232
- }
233
247
  }
234
248
  async function scrapeMarkdownDocs(options) {
235
249
  const scraper = new MarkdownDocsScraper(options);
@@ -239,9 +253,40 @@ async function scrapeMarkdownDocs(options) {
239
253
  }
240
254
  return result;
241
255
  }
256
+ var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
257
+ var GENERIC_PATTERN = GENERIC_LINK_PATTERN;
258
+ function claudeCodeOptions(outputDir) {
259
+ return {
260
+ baseUrl: "https://code.claude.com",
261
+ docsPath: "/docs/en",
262
+ llmsPaths: ["/docs/llms.txt"],
263
+ linkPattern: CLAUDE_CODE_PATTERN,
264
+ outputDir,
265
+ concurrency: 10,
266
+ tryDocsSubdomain: false
267
+ };
268
+ }
269
+ function polymarketOptions(outputDir) {
270
+ return {
271
+ baseUrl: "https://docs.polymarket.com",
272
+ docsPath: "",
273
+ llmsPaths: ["/llms.txt"],
274
+ linkPattern: GENERIC_PATTERN,
275
+ outputDir,
276
+ concurrency: 10,
277
+ tryDocsSubdomain: false
278
+ };
279
+ }
242
280
  var src_default = MarkdownDocsScraper;
243
281
  export {
244
282
  scrapeMarkdownDocs,
283
+ polymarketOptions,
284
+ parsePagePath,
285
+ fetchMarkdown,
286
+ extractTitle,
245
287
  src_default as default,
246
- MarkdownDocsScraper
288
+ claudeCodeOptions,
289
+ MarkdownDocsScraper,
290
+ GENERIC_PATTERN,
291
+ CLAUDE_CODE_PATTERN
247
292
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ebowwa/markdown-docs-scraper",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Scrape and mirror markdown-based documentation sites",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
package/src/cli.ts CHANGED
@@ -19,12 +19,16 @@ program
19
19
  .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
20
20
  .option("-c, --concurrency <num>", "Concurrency level", "5")
21
21
  .option("--discover", "Discover pages before scraping", false)
22
+ .option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
23
+ .option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
22
24
  .action(async (options) => {
23
25
  const scraperOptions: ScraperOptions = {
24
26
  baseUrl: options.url,
25
27
  docsPath: options.docsPath,
26
28
  outputDir: options.output,
27
29
  concurrency: parseInt(options.concurrency),
30
+ llmsPaths: options.llmsPaths.split(","),
31
+ tryDocsSubdomain: !options.noSubdomain,
28
32
  };
29
33
 
30
34
  console.log(`🔍 Scraping ${options.url}...`);
@@ -56,10 +60,14 @@ program
56
60
  .description("Discover all available documentation pages")
57
61
  .requiredOption("-u, --url <url>", "Base URL of the documentation site")
58
62
  .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
63
+ .option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
64
+ .option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
59
65
  .action(async (options) => {
60
66
  const scraper = new MarkdownDocsScraper({
61
67
  baseUrl: options.url,
62
68
  docsPath: options.docsPath,
69
+ llmsPaths: options.llmsPaths.split(","),
70
+ tryDocsSubdomain: !options.noSubdomain,
63
71
  });
64
72
 
65
73
  console.log(`🔍 Discovering pages from ${options.url}...`);
@@ -67,7 +75,8 @@ program
67
75
 
68
76
  console.log(`\nFound ${pages.length} pages:\n`);
69
77
  pages.forEach((page) => {
70
- console.log(` - ${page}`);
78
+ const path = page.category ? `${page.category}/${page.page}` : page.page;
79
+ console.log(` - ${path}`);
71
80
  });
72
81
  });
73
82
 
package/src/index.ts CHANGED
@@ -1,7 +1,10 @@
1
1
  /**
2
2
  * @ebowwa/markdown-docs-scraper
3
3
  *
4
- * Scrape and mirror markdown-based documentation sites
4
+ * Composable markdown documentation scraper.
5
+ * - Configurable llms.txt paths with fallbacks
6
+ * - Custom URL patterns for different doc sites
7
+ * - Works with any markdown documentation site
5
8
  */
6
9
 
7
10
  // ============================================================================
@@ -23,6 +26,12 @@ export interface ScraperOptions {
23
26
  outputDir?: string;
24
27
  concurrency?: number;
25
28
  onProgress?: (current: number, total: number) => void;
29
+ /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
30
+ llmsPaths?: string[];
31
+ /** Also try docs subdomain variants (e.g., docs.example.com) */
32
+ tryDocsSubdomain?: boolean;
33
+ /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
34
+ linkPattern?: RegExp;
26
35
  }
27
36
 
28
37
  export interface ScraperResult {
@@ -31,8 +40,66 @@ export interface ScraperResult {
31
40
  duration: number;
32
41
  }
33
42
 
43
+ /** Default pattern: matches /docs/en/ or /docs/ paths */
44
+ const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
45
+
46
+ /** Generic pattern: matches any .md links in llms.txt */
47
+ const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
48
+
34
49
  // ============================================================================
35
- // SCRAPER
50
+ // UTILITY FUNCTIONS (Composable)
51
+ // ============================================================================
52
+
53
+ /** Extract title from markdown content */
54
+ export function extractTitle(markdown: string): string {
55
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
56
+ return titleMatch ? titleMatch[1].trim() : "Untitled";
57
+ }
58
+
59
+ /** Parse page path into category and page name */
60
+ export function parsePagePath(pagePath: string): { category: string; page: string } {
61
+ // Remove .md extension
62
+ const pageName = pagePath.replace(".md", "");
63
+
64
+ // Check if there's a category in the path
65
+ const pathParts = pageName.split("/");
66
+
67
+ if (pathParts.length === 1) {
68
+ return { category: "", page: pathParts[0] };
69
+ } else if (pathParts.length === 2) {
70
+ return { category: pathParts[0], page: pathParts[1] };
71
+ } else {
72
+ // Deeper path: join everything except last as category
73
+ return {
74
+ category: pathParts.slice(0, -1).join("/"),
75
+ page: pathParts[pathParts.length - 1],
76
+ };
77
+ }
78
+ }
79
+
80
+ /** Fetch markdown content from URL */
81
+ export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
82
+ try {
83
+ const response = await fetch(url, {
84
+ headers: {
85
+ Accept: "text/markdown, text/plain",
86
+ "User-Agent": userAgent,
87
+ },
88
+ });
89
+
90
+ if (!response.ok) {
91
+ return null;
92
+ }
93
+
94
+ return await response.text();
95
+ } catch (error) {
96
+ console.error(`Error fetching ${url}:`, error);
97
+ return null;
98
+ }
99
+ }
100
+
101
+ // ============================================================================
102
+ // SCRAPER CLASS
36
103
  // ============================================================================
37
104
 
38
105
  export class MarkdownDocsScraper {
@@ -46,64 +113,23 @@ export class MarkdownDocsScraper {
46
113
  outputDir: options.outputDir || "./docs",
47
114
  concurrency: options.concurrency || 5,
48
115
  onProgress: options.onProgress || (() => {}),
116
+ llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
117
+ tryDocsSubdomain: options.tryDocsSubdomain ?? true,
118
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
49
119
  };
50
120
  }
51
121
 
52
- /**
53
- * Fetch markdown content from a URL
54
- */
55
- async fetchMarkdown(url: string): Promise<string | null> {
56
- try {
57
- const response = await fetch(url, {
58
- headers: {
59
- Accept: "text/markdown, text/plain",
60
- "User-Agent": "@ebowwa/markdown-docs-scraper",
61
- },
62
- });
63
-
64
- if (!response.ok) {
65
- return null;
66
- }
67
-
68
- const contentType = response.headers.get("content-type") || "";
69
- if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
70
- // Try to parse anyway - some sites return incorrect content-type
71
- }
72
-
73
- return await response.text();
74
- } catch (error) {
75
- console.error(`Error fetching ${url}:`, error);
76
- return null;
77
- }
78
- }
79
-
80
- /**
81
- * Extract title from markdown content
82
- */
83
- extractTitle(markdown: string): string {
84
- const titleMatch = markdown.match(/^#\s+(.+)$/m);
85
- return titleMatch ? titleMatch[1].trim() : "Untitled";
86
- }
87
-
88
- /**
89
- * Sanitize filename from URL path
90
- */
91
- sanitizeFilename(path: string): string {
92
- return path
93
- .toLowerCase()
94
- .replace(/[^a-z0-9/]+/g, "-")
95
- .replace(/^-|-$/g, "")
96
- .replace(/\//g, "/");
97
- }
98
-
99
122
  /**
100
123
  * Build URL for a documentation page
101
124
  */
102
125
  buildUrl(category: string, page: string): string {
103
126
  if (category) {
104
127
  return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
105
- } else {
128
+ } else if (this.options.docsPath) {
106
129
  return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
130
+ } else {
131
+ // No docsPath (like Polymarket) - direct path
132
+ return `${this.options.baseUrl}/${page}.md`;
107
133
  }
108
134
  }
109
135
 
@@ -112,7 +138,7 @@ export class MarkdownDocsScraper {
112
138
  */
113
139
  async downloadPage(category: string, page: string): Promise<DocPage | null> {
114
140
  const url = this.buildUrl(category, page);
115
- const content = await this.fetchMarkdown(url);
141
+ const content = await fetchMarkdown(url);
116
142
 
117
143
  if (!content) {
118
144
  return null;
@@ -120,13 +146,112 @@ export class MarkdownDocsScraper {
120
146
 
121
147
  return {
122
148
  url,
123
- title: this.extractTitle(content),
149
+ title: extractTitle(content),
124
150
  content,
125
151
  category,
126
- pageName: page, // Store the page name for saving
152
+ pageName: page,
127
153
  };
128
154
  }
129
155
 
156
+ /**
157
+ * Generate possible llms.txt URLs to try
158
+ */
159
+ private getLlmsUrls(): string[] {
160
+ const urls: string[] = [];
161
+ const baseUrl = this.options.baseUrl;
162
+
163
+ // Try configured/custom paths first
164
+ for (const path of this.options.llmsPaths) {
165
+ urls.push(`${baseUrl}${path}`);
166
+ }
167
+
168
+ // Also try docs/doc subdomain variants if enabled
169
+ if (this.options.tryDocsSubdomain) {
170
+ try {
171
+ const url = new URL(baseUrl);
172
+ const hostname = url.hostname;
173
+
174
+ // Skip if already on docs/doc subdomain
175
+ if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
176
+ // Try docs.{domain}
177
+ const docsDomain = hostname.replace(/^www\./, "");
178
+ urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
179
+ urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
180
+ }
181
+ } catch {
182
+ // Invalid URL, skip subdomain variants
183
+ }
184
+ }
185
+
186
+ return urls;
187
+ }
188
+
189
+ /**
190
+ * Fetch llms.txt from multiple possible URLs with fallback
191
+ */
192
+ private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
193
+ const urls = this.getLlmsUrls();
194
+
195
+ for (const llmsUrl of urls) {
196
+ try {
197
+ const response = await fetch(llmsUrl, {
198
+ headers: {
199
+ Accept: "text/plain",
200
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
201
+ },
202
+ });
203
+
204
+ if (response.ok) {
205
+ const content = await response.text();
206
+ console.log(`Found llms.txt at ${llmsUrl}`);
207
+ return { content, url: llmsUrl };
208
+ }
209
+ } catch (error) {
210
+ // Try next URL
211
+ continue;
212
+ }
213
+ }
214
+
215
+ return null;
216
+ }
217
+
218
+ /**
219
+ * Discover pages from llms.txt index
220
+ */
221
+ async discoverPages(): Promise<Array<{ category: string; page: string }>> {
222
+ const pages: Array<{ category: string; page: string }> = [];
223
+
224
+ try {
225
+ const llmsResult = await this.fetchLlmsTxt();
226
+
227
+ if (!llmsResult) {
228
+ const attemptedUrls = this.getLlmsUrls();
229
+ console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
230
+ return pages;
231
+ }
232
+
233
+ const { content } = llmsResult;
234
+
235
+ // Use provided pattern or default
236
+ const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
237
+ let match;
238
+
239
+ while ((match = regex.exec(content)) !== null) {
240
+ const url = match[2];
241
+ const pagePath = match[3]; // The captured path group
242
+
243
+ const { category, page } = parsePagePath(pagePath);
244
+ pages.push({ category, page });
245
+ }
246
+
247
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
248
+ } catch (error) {
249
+ console.error("Error discovering pages:", error);
250
+ }
251
+
252
+ return pages;
253
+ }
254
+
130
255
  /**
131
256
  * Scrape pages discovered from llms.txt
132
257
  */
@@ -175,7 +300,7 @@ export class MarkdownDocsScraper {
175
300
  }
176
301
 
177
302
  /**
178
- * Scrape all documentation pages
303
+ * Scrape all documentation pages (uses categories)
179
304
  */
180
305
  async scrape(): Promise<ScraperResult> {
181
306
  const startTime = Date.now();
@@ -255,102 +380,6 @@ export class MarkdownDocsScraper {
255
380
 
256
381
  return pages;
257
382
  }
258
-
259
- /**
260
- * Discover pages from llms.txt index
261
- */
262
- async discoverPages(): Promise<Array<{ category: string; page: string }>> {
263
- const pages: Array<{ category: string; page: string }> = [];
264
-
265
- try {
266
- const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
267
- const response = await fetch(llmsUrl, {
268
- headers: {
269
- Accept: "text/plain",
270
- "User-Agent": "@ebowwa/markdown-docs-scraper",
271
- },
272
- });
273
-
274
- if (!response.ok) {
275
- console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
276
- return pages;
277
- }
278
-
279
- const content = await response.text();
280
-
281
- // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
282
- const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
283
- let match;
284
-
285
- while ((match = linkRegex.exec(content)) !== null) {
286
- const url = match[2];
287
- const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
288
-
289
- // Remove .md extension
290
- const pageName = pagePath.replace(".md", "");
291
-
292
- // Check if there's a category in the path
293
- const pathParts = pageName.split("/");
294
-
295
- if (pathParts.length === 1) {
296
- // No category: just "page-name"
297
- pages.push({ category: "", page: pathParts[0] });
298
- } else if (pathParts.length === 2) {
299
- // Has category: "category/page-name"
300
- pages.push({ category: pathParts[0], page: pathParts[1] });
301
- } else {
302
- // Deeper path: join everything except last as category
303
- const category = pathParts.slice(0, -1).join("/");
304
- const page = pathParts[pathParts.length - 1];
305
- pages.push({ category, page });
306
- }
307
- }
308
-
309
- console.log(`Discovered ${pages.length} pages from llms.txt`);
310
- } catch (error) {
311
- console.error("Error discovering pages:", error);
312
- }
313
-
314
- return pages;
315
- }
316
-
317
- /**
318
- * Discover additional pages by parsing the docs index (fallback)
319
- */
320
- async discoverPagesHtml(): Promise<string[]> {
321
- const discovered: string[] = [];
322
-
323
- try {
324
- const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
325
- const response = await fetch(indexUrl, {
326
- headers: {
327
- Accept: "text/html",
328
- "User-Agent": "@ebowwa/markdown-docs-scraper",
329
- },
330
- });
331
-
332
- if (!response.ok) {
333
- return discovered;
334
- }
335
-
336
- const html = await response.text();
337
- const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
338
- let match;
339
-
340
- while ((match = mdLinkRegex.exec(html)) !== null) {
341
- const path = match[1];
342
- if (!discovered.includes(path)) {
343
- discovered.push(path);
344
- }
345
- }
346
-
347
- console.log(`Discovered ${discovered.length} additional pages from HTML`);
348
- } catch (error) {
349
- console.error("Error discovering pages from HTML:", error);
350
- }
351
-
352
- return discovered;
353
- }
354
383
  }
355
384
 
356
385
  // ============================================================================
@@ -375,6 +404,42 @@ export async function scrapeMarkdownDocs(
375
404
  return result;
376
405
  }
377
406
 
407
+ // ============================================================================
408
+ // PRESET CONFIGURATIONS (Composable)
409
+ // ============================================================================
410
+
411
+ /** Pattern for Claude Code docs: /docs/en/page.md */
412
+ export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
413
+
414
+ /** Pattern for generic docs: any domain/path.md */
415
+ export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
416
+
417
+ /** Create scraper options for Claude Code docs */
418
+ export function claudeCodeOptions(outputDir: string): ScraperOptions {
419
+ return {
420
+ baseUrl: "https://code.claude.com",
421
+ docsPath: "/docs/en",
422
+ llmsPaths: ["/docs/llms.txt"],
423
+ linkPattern: CLAUDE_CODE_PATTERN,
424
+ outputDir,
425
+ concurrency: 10,
426
+ tryDocsSubdomain: false,
427
+ };
428
+ }
429
+
430
+ /** Create scraper options for Polymarket docs */
431
+ export function polymarketOptions(outputDir: string): ScraperOptions {
432
+ return {
433
+ baseUrl: "https://docs.polymarket.com",
434
+ docsPath: "",
435
+ llmsPaths: ["/llms.txt"],
436
+ linkPattern: GENERIC_PATTERN,
437
+ outputDir,
438
+ concurrency: 10,
439
+ tryDocsSubdomain: false,
440
+ };
441
+ }
442
+
378
443
  // ============================================================================
379
444
  // EXPORTS
380
445
  // ============================================================================