@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/index.js
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @ebowwa/markdown-docs-scraper
|
|
4
|
+
*
|
|
5
|
+
* Scrape and mirror markdown-based documentation sites
|
|
6
|
+
*/
|
|
7
|
+
// TypeScript-emitted helper (tslib `__awaiter`): runs a generator-based state
// machine as an async function — the downlevel compilation of `async/await`.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values in the Promise constructor `P` so every yielded value
    // can be chained with `.then`.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the awaited result, or throw into it on rejection.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Drive the generator: settle the outer promise when done, otherwise
        // chain onto the value yielded by the current `await` point.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
16
|
+
// TypeScript-emitted helper (tslib `__generator`): interprets the compiled
// function-body state machine produced for downleveled generators/async bodies.
// Opcodes in `op[0]` (per tslib): 0=next, 1=throw, 2=return, 3=break(label),
// 4=yield, 5=yield*, 6=catch, 7=endfinally.
var __generator = (this && this.__generator) || function (thisArg, body) {
    // `_` is the machine state: current label, last sent value, and try/finally stacks.
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
    return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // Delegate to an inner iterator (`yield*`) when one is active.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    // Dispatch break/return/throw against the protected-region stack.
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
|
|
43
|
+
// Mark this CommonJS module as an ES-module transpilation for interop tooling.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export; it is assigned its real value after the class
// definition below (standard tsc emit ordering).
exports.MarkdownDocsScraper = void 0;
exports.scrapeMarkdownDocs = scrapeMarkdownDocs;
|
|
46
|
+
// ============================================================================
|
|
47
|
+
// SCRAPER
|
|
48
|
+
// ============================================================================
|
|
49
|
+
// ============================================================================
// SCRAPER
// ============================================================================
/**
 * Scrapes markdown-based documentation sites.
 *
 * Pages are either enumerated from the `categories` option or discovered from
 * the site's `llms.txt` index, fetched in concurrent batches, and optionally
 * mirrored to disk via {@link MarkdownDocsScraper#savePages}.
 */
class MarkdownDocsScraper {
    /**
     * @param {Object} options
     * @param {string} options.baseUrl - Site origin, e.g. "https://example.com".
     * @param {string} [options.docsPath="/docs/en"] - Path prefix for doc pages.
     * @param {Object<string, string[]>} [options.categories={}] - Map of category -> page names.
     * @param {string} [options.outputDir="./docs"] - Directory used by savePages().
     * @param {number} [options.concurrency=5] - Pages fetched per batch.
     * @param {Function} [options.onProgress] - Called with (completed, total) after each page.
     */
    constructor(options) {
        this.options = {
            baseUrl: options.baseUrl,
            docsPath: options.docsPath || "/docs/en",
            categories: options.categories || {},
            outputDir: options.outputDir || "./docs",
            concurrency: options.concurrency || 5,
            onProgress: options.onProgress || (function () { }),
        };
    }
    /**
     * Fetch markdown content from a URL.
     * @returns {Promise<string|null>} Body text, or null on HTTP error / network failure.
     */
    async fetchMarkdown(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    Accept: "text/markdown, text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return null;
            }
            // Some sites return an incorrect content-type for markdown, so the
            // body is read regardless of the header (the original empty
            // content-type branch was dead code and has been removed).
            return await response.text();
        }
        catch (error) {
            console.error(`Error fetching ${url}:`, error);
            return null;
        }
    }
    /**
     * Extract the first H1 heading from markdown content.
     * @returns {string} Heading text, or "Untitled" when no `# ` heading exists.
     */
    extractTitle(markdown) {
        const titleMatch = markdown.match(/^#\s+(.+)$/m);
        return titleMatch ? titleMatch[1].trim() : "Untitled";
    }
    /**
     * Sanitize a URL path into a filesystem-safe, lowercase, dash-separated name.
     * Slashes are preserved so category subdirectories survive.
     */
    sanitizeFilename(path) {
        // Note: the original chain ended with `.replace(/\//g, "/")`, a no-op
        // (slash replaced by slash); it has been dropped.
        return path
            .toLowerCase()
            .replace(/[^a-z0-9/]+/g, "-")
            .replace(/^-|-$/g, "");
    }
    /**
     * Build the raw-markdown URL for a documentation page.
     * An empty/falsy category produces a top-level page URL.
     */
    buildUrl(category, page) {
        if (category) {
            return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
        }
        return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
    }
    /**
     * Download a single documentation page.
     * @returns {Promise<Object|null>} Page record, or null when the fetch failed.
     */
    async downloadPage(category, page) {
        const url = this.buildUrl(category, page);
        const content = await this.fetchMarkdown(url);
        if (!content) {
            return null;
        }
        return {
            url,
            title: this.extractTitle(content),
            content,
            category,
            pageName: page, // kept so savePages() can name the output file
        };
    }
    /**
     * Fetch `pages` ({category, page}) in batches of `concurrency`, pushing
     * results into `downloaded`/`failed` and reporting progress.
     * Extracted because scrape() and scrapeFromLlms() previously duplicated
     * this loop verbatim.
     */
    async _downloadInBatches(pages, downloaded, failed) {
        const total = pages.length;
        for (let i = 0; i < pages.length; i += this.options.concurrency) {
            const batch = pages.slice(i, i + this.options.concurrency);
            const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
            results.forEach((result, index) => {
                const page = batch[index];
                if (result.status === "fulfilled" && result.value) {
                    downloaded.push(result.value);
                }
                else {
                    failed.push({
                        url: this.buildUrl(page.category, page.page),
                        error: result.status === "rejected" ? result.reason : "Not found",
                    });
                }
                this.options.onProgress(downloaded.length + failed.length, total);
            });
        }
    }
    /** Log the standard end-of-run summary. */
    _logSummary(downloaded, failed, duration) {
        console.log(`\u2705 Downloaded: ${downloaded.length} pages`);
        console.log(`\u274C Failed: ${failed.length} pages`);
        console.log(`\u23F1\uFE0F Duration: ${(duration / 1000).toFixed(2)}s`);
    }
    /**
     * Scrape pages discovered from llms.txt; falls back to category-based
     * scrape() when discovery yields nothing.
     * @returns {Promise<{downloaded: Object[], failed: Object[], duration: number}>}
     */
    async scrapeFromLlms() {
        const startTime = Date.now();
        const downloaded = [];
        const failed = [];
        const pages = await this.discoverPages();
        if (pages.length === 0) {
            console.log("No pages discovered, falling back to categories");
            return this.scrape();
        }
        console.log(`Scraping ${pages.length} discovered pages...`);
        await this._downloadInBatches(pages, downloaded, failed);
        const duration = Date.now() - startTime;
        this._logSummary(downloaded, failed, duration);
        return { downloaded, failed, duration };
    }
    /**
     * Scrape all pages enumerated in the `categories` option.
     * @returns {Promise<{downloaded: Object[], failed: Object[], duration: number}>}
     */
    async scrape() {
        const startTime = Date.now();
        const downloaded = [];
        const failed = [];
        const pages = this.getPagesToScrape();
        console.log(`Scraping ${pages.length} pages from ${this.options.baseUrl}...`);
        await this._downloadInBatches(pages, downloaded, failed);
        const duration = Date.now() - startTime;
        this._logSummary(downloaded, failed, duration);
        return { downloaded, failed, duration };
    }
    /**
     * Save scraped pages to disk under outputDir/<category>/<pageName>.md,
     * prefixing each file with a source/timestamp HTML comment.
     */
    async savePages(pages) {
        const fs = await import("fs/promises");
        const path = await import("path");
        for (const page of pages) {
            const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
            const dir = page.category
                ? path.join(this.options.outputDir, page.category)
                : this.options.outputDir;
            await fs.mkdir(dir, { recursive: true });
            const filepath = path.join(dir, `${nameToUse}.md`);
            const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
            await fs.writeFile(filepath, header + page.content, "utf-8");
        }
    }
    /**
     * Flatten the `categories` option into a list of {category, page} pairs.
     */
    getPagesToScrape() {
        const pages = [];
        for (const [category, pageList] of Object.entries(this.options.categories)) {
            for (const page of pageList) {
                pages.push({ category, page });
            }
        }
        return pages;
    }
    /**
     * Discover pages by parsing markdown links out of the site's llms.txt index.
     * NOTE(review): the llms.txt location and the /docs/en/ link pattern are
     * hard-coded and do NOT follow this.options.docsPath — confirm intended.
     * @returns {Promise<Array<{category: string, page: string}>>}
     */
    async discoverPages() {
        const pages = [];
        try {
            const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
            const response = await fetch(llmsUrl, {
                headers: {
                    Accept: "text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
                return pages;
            }
            const content = await response.text();
            const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
            let match;
            while ((match = linkRegex.exec(content)) !== null) {
                const pageName = match[3].replace(".md", "");
                const pathParts = pageName.split("/");
                if (pathParts.length === 1) {
                    // No category: just "page-name"
                    pages.push({ category: "", page: pathParts[0] });
                }
                else {
                    // "category/page" or deeper: everything but the last segment
                    // is the category path.
                    pages.push({
                        category: pathParts.slice(0, -1).join("/"),
                        page: pathParts[pathParts.length - 1],
                    });
                }
            }
            console.log(`Discovered ${pages.length} pages from llms.txt`);
        }
        catch (error) {
            console.error("Error discovering pages:", error);
        }
        return pages;
    }
    /**
     * Fallback discovery: scan the HTML docs index for .md hrefs.
     * NOTE(review): like discoverPages(), the href pattern hard-codes /docs/en/.
     * @returns {Promise<string[]>} Unique page paths relative to /docs/en/.
     */
    async discoverPagesHtml() {
        const discovered = [];
        try {
            const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
            const response = await fetch(indexUrl, {
                headers: {
                    Accept: "text/html",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return discovered;
            }
            const html = await response.text();
            const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
            let match;
            while ((match = mdLinkRegex.exec(html)) !== null) {
                const path = match[1];
                if (!discovered.includes(path)) {
                    discovered.push(path);
                }
            }
            console.log(`Discovered ${discovered.length} additional pages from HTML`);
        }
        catch (error) {
            console.error("Error discovering pages from HTML:", error);
        }
        return discovered;
    }
}
|
|
449
|
+
// Assign the real class to the named export pre-declared near the top of the file.
exports.MarkdownDocsScraper = MarkdownDocsScraper;
|
|
450
|
+
// ============================================================================
|
|
451
|
+
// CONVENIENCE FUNCTION
|
|
452
|
+
// ============================================================================
|
|
453
|
+
/**
 * Scrape markdown documentation with a single function call.
 *
 * Creates a MarkdownDocsScraper, runs either llms.txt-based discovery
 * (options.useLlms) or category-based scraping, writes the downloaded pages
 * to disk when options.outputDir is set, and returns the scrape result.
 *
 * @param {Object} options - Scraper options plus `useLlms` and `outputDir` flags.
 * @returns {Promise<{downloaded: Object[], failed: Object[], duration: number}>}
 */
async function scrapeMarkdownDocs(options) {
    const scraper = new MarkdownDocsScraper(options);
    const result = options.useLlms
        ? await scraper.scrapeFromLlms()
        : await scraper.scrape();
    if (options.outputDir) {
        await scraper.savePages(result.downloaded);
    }
    return result;
}
|
|
484
|
+
// ============================================================================
|
|
485
|
+
// EXPORTS
|
|
486
|
+
// ============================================================================
|
|
487
|
+
// Default export mirrors the named MarkdownDocsScraper export for ESM interop.
exports.default = MarkdownDocsScraper;
|