@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cli.js ADDED
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env bun
2
+ "use strict";
3
+ /**
4
+ * CLI for @ebowwa/markdown-docs-scraper
5
+ */
6
+ // --- TypeScript-compiler (tslib-style) helper: Object.assign-based emulation of object spread. Generated code; do not edit by hand. ---
+ var __assign = (this && this.__assign) || function () {
7
+ __assign = Object.assign || function(t) {
8
+ for (var s, i = 1, n = arguments.length; i < n; i++) {
9
+ s = arguments[i];
10
+ for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
11
+ t[p] = s[p];
12
+ }
13
+ return t;
14
+ };
15
+ return __assign.apply(this, arguments);
16
+ };
17
+ // --- TypeScript-compiler helper: drives a lowered async function by stepping its generator inside a Promise. Generated code. ---
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
18
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
19
+ return new (P || (P = Promise))(function (resolve, reject) {
20
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
21
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
22
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
23
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
24
+ });
25
+ };
26
+ // --- TypeScript-compiler helper: state machine that lowers generator/await syntax for ES5 targets. Generated code; do not edit by hand. ---
+ var __generator = (this && this.__generator) || function (thisArg, body) {
27
+ var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
28
+ return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
29
+ function verb(n) { return function (v) { return step([n, v]); }; }
30
+ function step(op) {
31
+ if (f) throw new TypeError("Generator is already executing.");
32
+ while (g && (g = 0, op[0] && (_ = 0)), _) try {
33
+ if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
34
+ if (y = 0, t) op = [op[0] & 2, t.value];
35
+ switch (op[0]) {
36
+ case 0: case 1: t = op; break;
37
+ case 4: _.label++; return { value: op[1], done: false };
38
+ case 5: _.label++; y = op[1]; op = [0]; continue;
39
+ case 7: op = _.ops.pop(); _.trys.pop(); continue;
40
+ default:
41
+ if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
42
+ if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
43
+ if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
44
+ if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
45
+ if (t[2]) _.ops.pop();
46
+ _.trys.pop(); continue;
47
+ }
48
+ op = body.call(thisArg, _);
49
+ } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
50
+ if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
51
+ }
52
+ };
53
Object.defineProperty(exports, "__esModule", { value: true });
var commander_1 = require("commander");
var index_js_1 = require("./index.js");
// CLI metadata.
// FIX: the reported version was hard-coded to "1.0.0" while the package
// shipped as 1.2.0. Keep this string in sync with package.json (or read it
// via require("../package.json").version).
commander_1.program
    .name("markdown-docs-scraper")
    .description("Scrape and mirror markdown-based documentation sites")
    .version("1.2.0");
60
// "scrape" command: mirror a documentation site's markdown pages to disk.
commander_1.program
    .command("scrape")
    .description("Scrape documentation from a URL")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .option("-c, --concurrency <num>", "Concurrency level", "5")
    .option("--discover", "Discover pages before scraping", false)
    .action(async (options) => {
        const scraperOptions = {
            baseUrl: options.url,
            docsPath: options.docsPath,
            outputDir: options.output,
            // Commander delivers option values as strings; always parse with
            // an explicit radix.
            concurrency: parseInt(options.concurrency, 10),
            // FIX: --discover was declared but never consumed (a no-op flag).
            // Route it to the llms.txt discovery path so it takes effect;
            // it defaults to false, so existing invocations are unchanged.
            useLlms: Boolean(options.discover),
        };
        console.log(`\uD83D\uDD0D Scraping ${options.url}...`);
        console.log(`\uD83D\uDCC1 Output: ${options.output}`);
        console.log();
        const result = await (0, index_js_1.scrapeMarkdownDocs)(scraperOptions);
        console.log();
        console.log("Summary:");
        console.log(` Downloaded: ${result.downloaded.length}`);
        console.log(` Failed: ${result.failed.length}`);
        console.log(` Duration: ${(result.duration / 1000).toFixed(2)}s`);
        if (result.failed.length > 0) {
            console.log();
            console.log("Failed pages:");
            // Cap the failure listing at 10 entries to keep output readable.
            result.failed.slice(0, 10).forEach((f) => {
                console.log(` \u274C ${f.url}: ${f.error}`);
            });
            if (result.failed.length > 10) {
                console.log(` ... and ${result.failed.length - 10} more`);
            }
        }
    });
104
// "discover" command: list every page advertised by the site's llms.txt
// without downloading any of them.
commander_1.program
    .command("discover")
    .description("Discover all available documentation pages")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .action(async (options) => {
        const scraper = new index_js_1.MarkdownDocsScraper({
            baseUrl: options.url,
            docsPath: options.docsPath,
        });
        console.log(`\uD83D\uDD0D Discovering pages from ${options.url}...`);
        const pages = await scraper.discoverPages();
        console.log(`\nFound ${pages.length} pages:\n`);
        pages.forEach((page) => {
            // FIX: discoverPages() returns {category, page} objects; the old
            // code concatenated the object directly, printing
            // " - [object Object]" for every entry.
            console.log(` - ${page.category ? `${page.category}/${page.page}` : page.page}`);
        });
    });
130
// "anthropic" command: preset that mirrors the Claude Code docs from
// https://code.claude.com using its llms.txt index.
commander_1.program
    .command("anthropic")
    .description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .action(async (options) => {
        const scraperOptions = {
            baseUrl: "https://code.claude.com",
            docsPath: "/docs/en",
            outputDir: options.output,
        };
        console.log("🔍 Scraping Anthropic Claude Code documentation...");
        console.log("📋 Using llms.txt index for complete page list");
        console.log(`\uD83D\uDCC1 Output: ${options.output}`);
        console.log();
        const result = await (0, index_js_1.scrapeMarkdownDocs)({ ...scraperOptions, useLlms: true });
        console.log();
        console.log("Summary:");
        console.log(` Downloaded: ${result.downloaded.length}`);
        console.log(` Failed: ${result.failed.length}`);
        console.log(` Duration: ${(result.duration / 1000).toFixed(2)}s`);
    });
commander_1.program.parse();
package/src/cli.ts CHANGED
@@ -63,6 +63,8 @@ program
63
63
  .option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
64
64
  .option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
65
65
  .action(async (options) => {
66
+ console.log("DEBUG CLI: Options received:", options); // FIXME(review): leftover debug logging shipped in 1.2.0 — dumps every CLI option to stdout on each run; remove before the next release
67
+
66
68
  const scraper = new MarkdownDocsScraper({
67
69
  baseUrl: options.url,
68
70
  docsPath: options.docsPath,
package/src/index.js ADDED
@@ -0,0 +1,487 @@
1
+ "use strict";
2
+ /**
3
+ * @ebowwa/markdown-docs-scraper
4
+ *
5
+ * Scrape and mirror markdown-based documentation sites
6
+ */
7
+ // --- TypeScript-compiler helper: drives a lowered async function by stepping its generator inside a Promise. Generated code; do not edit by hand. ---
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
8
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
9
+ return new (P || (P = Promise))(function (resolve, reject) {
10
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
11
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
12
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
13
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
14
+ });
15
+ };
16
+ // --- TypeScript-compiler helper: state machine that lowers generator/await syntax for ES5 targets. Generated code; do not edit by hand. ---
+ var __generator = (this && this.__generator) || function (thisArg, body) {
17
+ var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
18
+ return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
19
+ function verb(n) { return function (v) { return step([n, v]); }; }
20
+ function step(op) {
21
+ if (f) throw new TypeError("Generator is already executing.");
22
+ while (g && (g = 0, op[0] && (_ = 0)), _) try {
23
+ if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
24
+ if (y = 0, t) op = [op[0] & 2, t.value];
25
+ switch (op[0]) {
26
+ case 0: case 1: t = op; break;
27
+ case 4: _.label++; return { value: op[1], done: false };
28
+ case 5: _.label++; y = op[1]; op = [0]; continue;
29
+ case 7: op = _.ops.pop(); _.trys.pop(); continue;
30
+ default:
31
+ if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
32
+ if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
33
+ if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
34
+ if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
35
+ if (t[2]) _.ops.pop();
36
+ _.trys.pop(); continue;
37
+ }
38
+ op = body.call(thisArg, _);
39
+ } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
40
+ if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
41
+ }
42
+ };
43
+ // CommonJS module bootstrap emitted by tsc: flag ES-module interop and
+ // pre-declare the exports. scrapeMarkdownDocs is a hoisted function
+ // declaration defined further down; the class binding is assigned after
+ // its definition.
+ Object.defineProperty(exports, "__esModule", { value: true });
44
+ exports.MarkdownDocsScraper = void 0;
45
+ exports.scrapeMarkdownDocs = scrapeMarkdownDocs;
46
// ============================================================================
// SCRAPER
// ============================================================================
/**
 * Scrapes markdown-based documentation sites.
 *
 * Pages come either from an explicit category map (`options.categories`) or
 * from the site's llms.txt index; downloads run in fixed-size concurrent
 * batches and can be mirrored to disk with savePages().
 */
class MarkdownDocsScraper {
    /**
     * @param {object} options
     * @param {string} options.baseUrl - Site origin, e.g. "https://code.claude.com".
     * @param {string} [options.docsPath="/docs/en"] - URL path prefix for doc pages.
     * @param {Object<string, string[]>} [options.categories] - Map of category -> page names.
     * @param {string} [options.outputDir="./docs"] - Directory used by savePages().
     * @param {number} [options.concurrency=5] - Max simultaneous downloads per batch.
     * @param {function(number, number): void} [options.onProgress] - Called with (done, total).
     */
    constructor(options) {
        this.options = {
            baseUrl: options.baseUrl,
            docsPath: options.docsPath || "/docs/en",
            categories: options.categories || {},
            outputDir: options.outputDir || "./docs",
            concurrency: options.concurrency || 5,
            onProgress: options.onProgress || (function () { }),
        };
    }
    /**
     * Fetch markdown content from a URL.
     * @returns {Promise<string|null>} Body text, or null on HTTP/network failure.
     */
    async fetchMarkdown(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    Accept: "text/markdown, text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return null;
            }
            // Some sites serve markdown with a wrong content-type, so the body
            // is returned regardless of the header. (The original read the
            // content-type into a dead, empty `if` branch; that no-op was removed.)
            return await response.text();
        }
        catch (error) {
            console.error(`Error fetching ${url}:`, error);
            return null;
        }
    }
    /** Extract the first "# " heading as the page title, or "Untitled". */
    extractTitle(markdown) {
        const titleMatch = markdown.match(/^#\s+(.+)$/m);
        return titleMatch ? titleMatch[1].trim() : "Untitled";
    }
    /**
     * Sanitize a URL path into a filesystem-friendly name ("/" separators kept).
     * FIX: dropped the original trailing `.replace(/\//g, "/")`, which was a no-op.
     */
    sanitizeFilename(path) {
        return path
            .toLowerCase()
            .replace(/[^a-z0-9/]+/g, "-")
            .replace(/^-|-$/g, "");
    }
    /** Build the raw-markdown URL for a page, with or without a category segment. */
    buildUrl(category, page) {
        const { baseUrl, docsPath } = this.options;
        return category
            ? `${baseUrl}${docsPath}/${category}/${page}.md`
            : `${baseUrl}${docsPath}/${page}.md`;
    }
    /**
     * Download one documentation page.
     * @returns {Promise<object|null>} Page record, or null when unavailable.
     */
    async downloadPage(category, page) {
        const url = this.buildUrl(category, page);
        const content = await this.fetchMarkdown(url);
        if (!content) {
            return null;
        }
        return {
            url,
            title: this.extractTitle(content),
            content,
            category,
            pageName: page, // retained so savePages() can name the output file
        };
    }
    /**
     * Download {category, page} entries in batches of `concurrency`, reporting
     * progress after each page settles. Extracted helper: scrape() and
     * scrapeFromLlms() previously duplicated this loop verbatim.
     * @returns {Promise<{downloaded: object[], failed: {url: string, error: *}[]}>}
     */
    async _downloadInBatches(pages) {
        const downloaded = [];
        const failed = [];
        for (let i = 0; i < pages.length; i += this.options.concurrency) {
            const batch = pages.slice(i, i + this.options.concurrency);
            const results = await Promise.allSettled(batch.map((p) => this.downloadPage(p.category, p.page)));
            results.forEach((result, index) => {
                const page = batch[index];
                if (result.status === "fulfilled" && result.value) {
                    downloaded.push(result.value);
                }
                else {
                    failed.push({
                        url: this.buildUrl(page.category, page.page),
                        error: result.status === "rejected" ? result.reason : "Not found",
                    });
                }
                this.options.onProgress(downloaded.length + failed.length, pages.length);
            });
        }
        return { downloaded, failed };
    }
    /**
     * Scrape pages discovered from llms.txt; falls back to scrape() (category
     * map) when discovery yields nothing.
     */
    async scrapeFromLlms() {
        const startTime = Date.now();
        const pages = await this.discoverPages();
        if (pages.length === 0) {
            console.log("No pages discovered, falling back to categories");
            return this.scrape();
        }
        console.log(`Scraping ${pages.length} discovered pages...`);
        const { downloaded, failed } = await this._downloadInBatches(pages);
        const duration = Date.now() - startTime;
        console.log(`\u2705 Downloaded: ${downloaded.length} pages`);
        console.log(`\u274C Failed: ${failed.length} pages`);
        console.log(`\u23F1\uFE0F Duration: ${(duration / 1000).toFixed(2)}s`);
        return { downloaded, failed, duration };
    }
    /** Scrape every page listed in options.categories. */
    async scrape() {
        const startTime = Date.now();
        const pages = this.getPagesToScrape();
        console.log(`Scraping ${pages.length} pages from ${this.options.baseUrl}...`);
        const { downloaded, failed } = await this._downloadInBatches(pages);
        const duration = Date.now() - startTime;
        console.log(`\u2705 Downloaded: ${downloaded.length} pages`);
        console.log(`\u274C Failed: ${failed.length} pages`);
        console.log(`\u23F1\uFE0F Duration: ${(duration / 1000).toFixed(2)}s`);
        return { downloaded, failed, duration };
    }
    /**
     * Write pages to disk as outputDir[/category]/<pageName>.md, each prefixed
     * with an HTML-comment provenance header (source URL + download timestamp).
     */
    async savePages(pages) {
        const fs = await import("fs/promises");
        const path = await import("path");
        for (const page of pages) {
            const nameToUse = page.pageName ||
                page.url.split("/").pop()?.replace(".md", "") ||
                "untitled";
            const dir = page.category
                ? path.join(this.options.outputDir, page.category)
                : this.options.outputDir;
            await fs.mkdir(dir, { recursive: true });
            const filepath = path.join(dir, `${nameToUse}.md`);
            const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
            await fs.writeFile(filepath, header + page.content, "utf-8");
        }
    }
    /** Flatten options.categories into a list of {category, page} pairs. */
    getPagesToScrape() {
        const pages = [];
        for (const [category, pageList] of Object.entries(this.options.categories)) {
            for (const page of pageList) {
                pages.push({ category, page });
            }
        }
        return pages;
    }
    /**
     * Discover pages by parsing markdown links in the site's llms.txt index.
     * FIX: the link pattern previously hard-coded "/docs/en/", so discovery
     * silently found nothing whenever options.docsPath differed; the pattern
     * is now built from docsPath (regex-escaped).
     * NOTE(review): the index location itself is still fixed at
     * "<baseUrl>/docs/llms.txt" — confirm whether it should track docsPath.
     * @returns {Promise<{category: string, page: string}[]>}
     */
    async discoverPages() {
        const pages = [];
        try {
            const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
            const response = await fetch(llmsUrl, {
                headers: {
                    Accept: "text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
                return pages;
            }
            const content = await response.text();
            const escapedDocsPath = this.options.docsPath.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            const linkRegex = new RegExp(`\\[([^\\]]+)\\]\\((https?://[^\\s)]+${escapedDocsPath}/([^)]+\\.md))\\)`, "g");
            let match;
            while ((match = linkRegex.exec(content)) !== null) {
                const pageName = match[3].replace(".md", "");
                const pathParts = pageName.split("/");
                if (pathParts.length === 1) {
                    // No category: just "page-name".
                    pages.push({ category: "", page: pathParts[0] });
                }
                else {
                    // "category/page" or deeper: everything before the last
                    // segment is the category path.
                    pages.push({
                        category: pathParts.slice(0, -1).join("/"),
                        page: pathParts[pathParts.length - 1],
                    });
                }
            }
            console.log(`Discovered ${pages.length} pages from llms.txt`);
        }
        catch (error) {
            console.error("Error discovering pages:", error);
        }
        return pages;
    }
    /**
     * Fallback discovery: scan the docs index HTML for hrefs ending in ".md".
     * NOTE(review): this href pattern still hard-codes "/docs/en/" — it should
     * probably follow options.docsPath like discoverPages() now does.
     * @returns {Promise<string[]>} De-duplicated page paths relative to docsPath.
     */
    async discoverPagesHtml() {
        const discovered = [];
        try {
            const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
            const response = await fetch(indexUrl, {
                headers: {
                    Accept: "text/html",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return discovered;
            }
            const html = await response.text();
            const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
            let match;
            while ((match = mdLinkRegex.exec(html)) !== null) {
                if (!discovered.includes(match[1])) {
                    discovered.push(match[1]);
                }
            }
            console.log(`Discovered ${discovered.length} additional pages from HTML`);
        }
        catch (error) {
            console.error("Error discovering pages from HTML:", error);
        }
        return discovered;
    }
}
449
+ exports.MarkdownDocsScraper = MarkdownDocsScraper; // re-assign the named export now that the class binding exists (it was pre-declared as void 0 above)
450
// ============================================================================
// CONVENIENCE FUNCTION
// ============================================================================
/**
 * Scrape markdown documentation with a single function call.
 *
 * @param {object} options - MarkdownDocsScraper options; set options.useLlms
 *   to discover pages from the site's llms.txt instead of options.categories.
 * @returns {Promise<{downloaded: Array, failed: Array, duration: number}>}
 *   The scrape result; pages are also written to disk when options.outputDir
 *   is truthy.
 */
async function scrapeMarkdownDocs(options) {
    const scraper = new MarkdownDocsScraper(options);
    const result = options.useLlms
        ? await scraper.scrapeFromLlms()
        : await scraper.scrape();
    if (options.outputDir) {
        await scraper.savePages(result.downloaded);
    }
    return result;
}
484
+ // ============================================================================
485
+ // EXPORTS
486
+ // ============================================================================
487
+ exports.default = MarkdownDocsScraper; // default export mirrors the named MarkdownDocsScraper export for ESM interop