@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -52,6 +52,26 @@ Options:
52
52
  -o, --output <dir> Output directory (default: "./docs")
53
53
  --docs-path <path> Docs path (default: "/docs/en")
54
54
  -c, --concurrency <num> Concurrency level (default: "5")
55
+ --llms-paths <paths> Comma-separated llms.txt paths (default: "/llms.txt,/docs/llms.txt")
56
+ --no-subdomain Disable docs/doc subdomain fallback
57
+ ```
58
+
59
+ ### llms.txt Discovery
60
+
61
+ The scraper automatically looks for `llms.txt` in several locations: the configured paths and, unless `--no-subdomain` is given, the `docs.` and `doc.` subdomains:
62
+
63
+ 1. **Configured paths** (default: `/llms.txt`, `/docs/llms.txt`)
64
+ 2. **Docs subdomain** (e.g., `https://docs.example.com/llms.txt`)
65
+ 3. **Doc subdomain** (e.g., `https://doc.example.com/llms.txt`)
66
+
67
+ Example with custom paths:
68
+ ```bash
69
+ markdown-docs-scraper scrape -u https://example.com --llms-paths "/llms.txt,/api/llms.txt"
70
+ ```
71
+
72
+ Disable subdomain fallback:
73
+ ```bash
74
+ markdown-docs-scraper scrape -u https://example.com --no-subdomain
55
75
  ```
56
76
 
57
77
  ## Programmatic Usage
@@ -103,6 +123,8 @@ interface ScraperOptions {
103
123
  outputDir?: string; // Output directory (default: "./docs")
104
124
  concurrency?: number; // Concurrent downloads (default: 5)
105
125
  onProgress?: (current: number, total: number) => void;
126
+ llmsPaths?: string[]; // llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"])
127
+ tryDocsSubdomain?: boolean; // Also try docs/doc subdomains (default: true)
106
128
  }
107
129
  ```
108
130
 
@@ -139,6 +161,88 @@ Downloaded: 2026-02-06T00:00:00.000Z
139
161
  Original markdown content...
140
162
  ```
141
163
 
164
+ ## Composable Scrapers Module
165
+
166
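+ The package also includes a composable scrapers module: a registry of scrapers keyed by `sourceType`, so different kinds of documentation sources (llms.txt sites, GitHub repositories) can be scraped through a single `scrapeSource` call.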
167
+
168
+ ### Usage
169
+
170
+ ```typescript
171
+ import {
172
+ scrapeSource,
173
+ registerScraper,
174
+ llmsTxtScraper,
175
+ githubRawScraper,
176
+ type SourceConfig,
177
+ } from "@ebowwa/markdown-docs-scraper/scrapers";
178
+
179
+ // Configure a source
180
+ const config: SourceConfig = {
181
+ name: "My Docs",
182
+ sourceType: "llms-txt",
183
+ baseUrl: "https://docs.example.com",
184
+ docsPath: "/docs",
185
+ outputDir: "./docs/my-docs",
186
+ reportDir: "./reports/my-docs",
187
+ };
188
+
189
+ // Scrape using the registry (auto-selects scraper by sourceType)
190
+ const result = await scrapeSource(config);
191
+ ```
192
+
193
+ ### Built-in Scrapers
194
+
195
+ - **llms-txt**: Scrapes docs sites with llms.txt index files
196
+ - **github-raw**: Downloads markdown directly from GitHub repos
197
+
198
+ ### Custom Scrapers
199
+
200
+ ```typescript
201
+ import { registerScraper, type Scraper, type SourceType } from "@ebowwa/markdown-docs-scraper/scrapers";
202
+
203
+ const myScraper: Scraper = {
204
+ type: "my-type" as SourceType,
205
+ async scrape(config) {
206
+ // Custom scraping logic
207
+ return {
208
+ downloaded: [],
209
+ failed: [],
210
+ duration: 0,
211
+ };
212
+ },
213
+ };
214
+
215
+ registerScraper(myScraper);
216
+ ```
217
+
218
+ ### Types
219
+
220
+ ```typescript
221
+ type SourceType = "llms-txt" | "github-raw";
222
+
223
+ interface SourceConfig {
224
+ name: string;
225
+ sourceType: SourceType;
226
+ baseUrl: string;
227
+ docsPath: string;
228
+ outputDir: string;
229
+ reportDir: string;
230
+ llmsTxtPath?: string;
231
+ linkPattern?: RegExp;
232
+ github?: {
233
+ repo: string;
234
+ includeCommits: boolean;
235
+ includeReleases: boolean;
236
+ includePRs: boolean;
237
+ };
238
+ }
239
+
240
+ interface Scraper {
241
+ type: SourceType;
242
+ scrape(config: SourceConfig): Promise<ScrapeResult>;
243
+ }
244
+ ```
245
+
142
246
  ## License
143
247
 
144
248
  MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * CLI for @ebowwa/markdown-docs-scraper
4
+ */
5
+ export {};
6
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;GAEG"}
package/dist/cli.js CHANGED
@@ -20,7 +20,7 @@ var __toESM = (mod, isNodeMode, target) => {
20
20
  var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
21
21
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
22
22
 
23
- // node_modules/commander/lib/error.js
23
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/error.js
24
24
  var require_error = __commonJS((exports) => {
25
25
  class CommanderError extends Error {
26
26
  constructor(exitCode, code, message) {
@@ -44,7 +44,7 @@ var require_error = __commonJS((exports) => {
44
44
  exports.InvalidArgumentError = InvalidArgumentError;
45
45
  });
46
46
 
47
- // node_modules/commander/lib/argument.js
47
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/argument.js
48
48
  var require_argument = __commonJS((exports) => {
49
49
  var { InvalidArgumentError } = require_error();
50
50
 
@@ -123,7 +123,7 @@ var require_argument = __commonJS((exports) => {
123
123
  exports.humanReadableArgName = humanReadableArgName;
124
124
  });
125
125
 
126
- // node_modules/commander/lib/help.js
126
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/help.js
127
127
  var require_help = __commonJS((exports) => {
128
128
  var { humanReadableArgName } = require_argument();
129
129
 
@@ -372,7 +372,7 @@ var require_help = __commonJS((exports) => {
372
372
  exports.Help = Help;
373
373
  });
374
374
 
375
- // node_modules/commander/lib/option.js
375
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/option.js
376
376
  var require_option = __commonJS((exports) => {
377
377
  var { InvalidArgumentError } = require_error();
378
378
 
@@ -523,7 +523,7 @@ var require_option = __commonJS((exports) => {
523
523
  exports.DualOptions = DualOptions;
524
524
  });
525
525
 
526
- // node_modules/commander/lib/suggestSimilar.js
526
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/suggestSimilar.js
527
527
  var require_suggestSimilar = __commonJS((exports) => {
528
528
  var maxDistance = 3;
529
529
  function editDistance(a, b) {
@@ -596,7 +596,7 @@ var require_suggestSimilar = __commonJS((exports) => {
596
596
  exports.suggestSimilar = suggestSimilar;
597
597
  });
598
598
 
599
- // node_modules/commander/lib/command.js
599
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/command.js
600
600
  var require_command = __commonJS((exports) => {
601
601
  var EventEmitter = __require("node:events").EventEmitter;
602
602
  var childProcess = __require("node:child_process");
@@ -1839,7 +1839,7 @@ Expecting one of '${allowedValues.join("', '")}'`);
1839
1839
  exports.Command = Command;
1840
1840
  });
1841
1841
 
1842
- // node_modules/commander/index.js
1842
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/index.js
1843
1843
  var require_commander = __commonJS((exports) => {
1844
1844
  var { Argument } = require_argument();
1845
1845
  var { Command } = require_command();
@@ -2377,7 +2377,7 @@ Downloaded: `).concat(new Date().toISOString(), `
2377
2377
  exports.default = MarkdownDocsScraper;
2378
2378
  });
2379
2379
 
2380
- // node_modules/commander/esm.mjs
2380
+ // ../../node_modules/.bun/commander@12.1.0/node_modules/commander/esm.mjs
2381
2381
  var import__ = __toESM(require_commander(), 1);
2382
2382
  var {
2383
2383
  program,
@@ -2396,12 +2396,14 @@ var {
2396
2396
  // src/cli.ts
2397
2397
  var import__2 = __toESM(require_src(), 1);
2398
2398
  program.name("markdown-docs-scraper").description("Scrape and mirror markdown-based documentation sites").version("1.0.0");
2399
- program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).action(async (options) => {
2399
+ program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
2400
2400
  const scraperOptions = {
2401
2401
  baseUrl: options.url,
2402
2402
  docsPath: options.docsPath,
2403
2403
  outputDir: options.output,
2404
- concurrency: parseInt(options.concurrency)
2404
+ concurrency: parseInt(options.concurrency),
2405
+ llmsPaths: options.llmsPaths.split(","),
2406
+ tryDocsSubdomain: !options.noSubdomain
2405
2407
  };
2406
2408
  console.log(`\uD83D\uDD0D Scraping ${options.url}...`);
2407
2409
  console.log(`\uD83D\uDCC1 Output: ${options.output}`);
@@ -2423,10 +2425,13 @@ program.command("scrape").description("Scrape documentation from a URL").require
2423
2425
  }
2424
2426
  }
2425
2427
  });
2426
- program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").action(async (options) => {
2428
+ program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
2429
+ console.log("DEBUG CLI: Options received:", options);
2427
2430
  const scraper = new import__2.MarkdownDocsScraper({
2428
2431
  baseUrl: options.url,
2429
- docsPath: options.docsPath
2432
+ docsPath: options.docsPath,
2433
+ llmsPaths: options.llmsPaths.split(","),
2434
+ tryDocsSubdomain: !options.noSubdomain
2430
2435
  });
2431
2436
  console.log(`\uD83D\uDD0D Discovering pages from ${options.url}...`);
2432
2437
  const pages = await scraper.discoverPages();
@@ -2434,7 +2439,8 @@ program.command("discover").description("Discover all available documentation pa
2434
2439
  Found ${pages.length} pages:
2435
2440
  `);
2436
2441
  pages.forEach((page) => {
2437
- console.log(` - ${page}`);
2442
+ const path = page.category ? `${page.category}/${page.page}` : page.page;
2443
+ console.log(` - ${path}`);
2438
2444
  });
2439
2445
  });
2440
2446
  program.command("anthropic").description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)").option("-o, --output <dir>", "Output directory", "./docs").action(async (options) => {
@@ -0,0 +1,116 @@
1
+ /**
2
+ * @ebowwa/markdown-docs-scraper
3
+ *
4
+ * Composable markdown documentation scraper.
5
+ * - Configurable llms.txt paths with fallbacks
6
+ * - Custom URL patterns for different doc sites
7
+ * - Works with any markdown documentation site
8
+ * - Uses full URLs from llms.txt directly
9
+ */
10
+ export interface DocPage {
11
+ url: string;
12
+ title: string;
13
+ content: string;
14
+ category?: string;
15
+ pageName?: string;
16
+ }
17
+ export interface ScraperOptions {
18
+ baseUrl: string;
19
+ docsPath?: string;
20
+ categories?: Record<string, string[]>;
21
+ outputDir?: string;
22
+ concurrency?: number;
23
+ onProgress?: (current: number, total: number) => void;
24
+ /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
25
+ llmsPaths?: string[];
26
+ /** Also try docs/doc subdomain variants (e.g., docs.example.com, doc.example.com) (default: true) */
27
+ tryDocsSubdomain?: boolean;
28
+ /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
29
+ linkPattern?: RegExp;
30
+ /** Use full URLs from llms.txt directly (default: true for generic pattern) */
31
+ useDirectUrls?: boolean;
32
+ }
33
+ export interface ScraperResult {
34
+ downloaded: DocPage[];
35
+ failed: Array<{
36
+ url: string;
37
+ error: string;
38
+ }>;
39
+ duration: number;
40
+ }
41
+ /** Discovered page with full URL */
42
+ interface DiscoveredPage {
43
+ category: string;
44
+ page: string;
45
+ fullUrl: string;
46
+ }
47
+ /** Extract title from markdown content */
48
+ export declare function extractTitle(markdown: string): string;
49
+ /** Parse page path into category and page name */
50
+ export declare function parsePagePath(pagePath: string): {
51
+ category: string;
52
+ page: string;
53
+ };
54
+ /** Fetch markdown content from URL */
55
+ export declare function fetchMarkdown(url: string, userAgent?: string): Promise<string | null>;
56
+ export declare class MarkdownDocsScraper {
57
+ private options;
58
+ constructor(options: ScraperOptions);
59
+ /**
60
+ * Build URL for a documentation page (fallback when no direct URL)
61
+ */
62
+ buildUrl(category: string, page: string): string;
63
+ /**
64
+ * Download a page using either direct URL or built URL
65
+ */
66
+ downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null>;
67
+ /**
68
+ * Generate possible llms.txt URLs to try
69
+ */
70
+ private getLlmsUrls;
71
+ /**
72
+ * Fetch llms.txt from multiple possible URLs with fallback
73
+ */
74
+ private fetchLlmsTxt;
75
+ /**
76
+ * Discover pages from llms.txt index
77
+ */
78
+ discoverPages(): Promise<DiscoveredPage[]>;
79
+ /**
80
+ * Scrape pages discovered from llms.txt
81
+ */
82
+ scrapeFromLlms(): Promise<ScraperResult>;
83
+ /**
84
+ * Scrape all documentation pages (uses categories)
85
+ */
86
+ scrape(): Promise<ScraperResult>;
87
+ /**
88
+ * Save scraped pages to disk
89
+ */
90
+ savePages(pages: DocPage[]): Promise<void>;
91
+ /**
92
+ * Get list of pages to scrape based on categories
93
+ */
94
+ private getPagesToScrape;
95
+ }
96
+ /**
97
+ * Scrape markdown documentation with a single function call
98
+ */
99
+ export declare function scrapeMarkdownDocs(options: ScraperOptions & {
100
+ useLlms?: boolean;
101
+ }): Promise<ScraperResult>;
102
+ /** Pattern for Claude Code docs: /docs/en/page.md */
103
+ export declare const CLAUDE_CODE_PATTERN: RegExp;
104
+ /** Pattern for generic docs: any domain/path.md */
105
+ export declare const GENERIC_PATTERN: RegExp;
106
+ /** Create scraper options for Claude Code docs */
107
+ export declare function claudeCodeOptions(outputDir: string): ScraperOptions;
108
+ /** Create scraper options for Polymarket docs */
109
+ export declare function polymarketOptions(outputDir: string): ScraperOptions;
110
+ /**
111
+ * Re-export scrapers module for composable scraper architecture.
112
+ * This provides a registry-based system for different scraper implementations.
113
+ */
114
+ export { type SourceType, type SourceConfig, type Scraper, type ScrapeResult as ScraperModuleResult, type DownloadResult, llmsTxtScraper, githubRawScraper, CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN, GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN, registerScraper, getScraper, scrapeSource, } from "./scrapers/index";
115
+ export default MarkdownDocsScraper;
116
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBhD;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;IAAE,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,CAAC,CAWxB;AAMD,qDAAqD;AACr
D,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}