@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/index.d.ts +128 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +219 -26
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +448 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +2 -0
- package/src/index.js +487 -0
- package/src/index.ts +115 -28
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/cli.js
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* CLI for @ebowwa/markdown-docs-scraper
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Shallow-merge helper: copies the own enumerable properties of every
 * argument after the first onto the first argument and returns it.
 * On first use it rebinds itself to `Object.assign` when that exists,
 * otherwise to the manual copy loop below.
 */
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (target) {
        for (var index = 1; index < arguments.length; index++) {
            var source = arguments[index];
            for (var key in source) {
                // for...in also walks inherited keys; copy own ones only.
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
|
|
17
|
+
/**
 * Runs a generator function as an asynchronous routine.
 *
 * @param thisArg    `this` value for the generator function.
 * @param _arguments Arguments for the generator function (may be omitted).
 * @param P          Promise constructor to use; falls back to the global Promise.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A promise for the generator's return value; it rejects when the
 *          generator throws or a rejection is not caught inside the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so every yielded value can be chained with .then().
    function toPromise(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    }
    if (!P) {
        P = Promise;
    }
    return new P(function (resolve, reject) {
        function advance(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        }
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        }
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        }
        // Instantiate the generator inside the executor so a synchronous
        // throw becomes a rejection of the returned promise.
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
|
|
26
|
+
/**
 * Drives `body` as a label-based state machine and exposes it through an
 * iterator-like object with `next`, `throw` and `return`.
 * NOTE(review): this looks like the standard down-level generator helper
 * emitted by the TypeScript compiler - confirm before hand-editing; the
 * statement order below is significant.
 *
 * `body` is called with the state object `_` and returns an instruction
 * `[opcode, value]`. Opcodes used by callers in this file: 2 = return,
 * 3 = break (jump to label `value`), 4 = yield, 5 = yield* (delegate).
 * Opcode 6 is produced internally when `body` throws.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
    // _: machine state (`label` = next case to run, `sent()` = last value sent
    //    in, or rethrows it when the resuming op was a throw, `trys` = active
    //    try regions, `ops` = pending instructions).
    // f: re-entrancy flag, y: iterator being delegated to, t: scratch slot,
    // g: the object handed back to the caller.
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
    return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    // verb(n) builds the public method that resumes the machine with opcode n.
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // While delegating (y is set), forward the call to the inner iterator
            // and hand its result straight back until it reports done.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    // t becomes the innermost entry of _.trys (or false when none).
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        // A final op with bit 0 or bit 2 set (e.g. 1 = throw, 6 = caught error)
        // rethrows its payload; otherwise the machine finishes with done: true.
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
|
|
53
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
54
|
+
var commander_1 = require("commander");
|
|
55
|
+
var index_js_1 = require("./index.js");
|
|
56
|
+
// Top-level program metadata shown by `--help` and `--version`.
commander_1.program
    .name("markdown-docs-scraper")
    .description("Scrape and mirror markdown-based documentation sites")
    // Must match the "version" field of package.json for this release;
    // the previous literal ("1.0.0") lagged behind the published package.
    .version("1.2.1");
|
|
60
|
+
// `scrape` command: download a documentation tree and print a summary.
// With --discover the page list is taken from the site's llms.txt index
// (via the `useLlms` option of scrapeMarkdownDocs) instead of the
// configured categories.
commander_1.program
    .command("scrape")
    .description("Scrape documentation from a URL")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .option("-c, --concurrency <num>", "Concurrency level", "5")
    .option("--discover", "Discover pages before scraping", false)
    .action(async function (options) {
        // Parse base 10 explicitly and reject values the batching loop cannot
        // use (NaN, zero, negatives) instead of passing them through.
        let concurrency = Number.parseInt(options.concurrency, 10);
        if (!(concurrency >= 1)) {
            console.warn("Ignoring invalid concurrency \"".concat(options.concurrency, "\"; using 5"));
            concurrency = 5;
        }
        const scraperOptions = {
            baseUrl: options.url,
            docsPath: options.docsPath,
            outputDir: options.output,
            concurrency: concurrency,
            // Previously the flag was parsed but never forwarded.
            useLlms: options.discover,
        };
        console.log("\uD83D\uDD0D Scraping ".concat(options.url, "..."));
        console.log("\uD83D\uDCC1 Output: ".concat(options.output));
        console.log();
        const result = await (0, index_js_1.scrapeMarkdownDocs)(scraperOptions);
        console.log();
        console.log("Summary:");
        console.log(" Downloaded: ".concat(result.downloaded.length));
        console.log(" Failed: ".concat(result.failed.length));
        console.log(" Duration: ".concat((result.duration / 1000).toFixed(2), "s"));
        if (result.failed.length > 0) {
            console.log();
            console.log("Failed pages:");
            // Show at most ten failures so the summary stays readable.
            result.failed.slice(0, 10).forEach(function (f) {
                console.log(" \u274C ".concat(f.url, ": ").concat(f.error));
            });
            if (result.failed.length > 10) {
                console.log(" ... and ".concat(result.failed.length - 10, " more"));
            }
        }
    });
|
|
104
|
+
// `discover` command: list the pages found in the site's llms.txt index
// without downloading them.
commander_1.program
    .command("discover")
    .description("Discover all available documentation pages")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .action(async function (options) {
        const scraper = new index_js_1.MarkdownDocsScraper({
            baseUrl: options.url,
            docsPath: options.docsPath,
        });
        console.log("\uD83D\uDD0D Discovering pages from ".concat(options.url, "..."));
        const pages = await scraper.discoverPages();
        console.log("\nFound ".concat(pages.length, " pages:\n"));
        pages.forEach(function (entry) {
            // discoverPages() yields { category, page } objects; concatenating
            // one directly printed "[object Object]". Render a readable path.
            let label;
            if (typeof entry === "string") {
                label = entry;
            }
            else {
                label = entry.category ? "".concat(entry.category, "/").concat(entry.page) : entry.page;
            }
            console.log(" - ".concat(label));
        });
    });
|
|
130
|
+
// `anthropic` command: preset that mirrors the Claude Code docs at
// https://code.claude.com/docs/en, using the llms.txt index for the page list.
commander_1.program
    .command("anthropic")
    .description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .action(async function (options) {
        const outputDir = options.output;
        console.log("🔍 Scraping Anthropic Claude Code documentation...");
        console.log("📋 Using llms.txt index for complete page list");
        console.log("\uD83D\uDCC1 Output: ".concat(outputDir));
        console.log();
        const summary = await (0, index_js_1.scrapeMarkdownDocs)({
            baseUrl: "https://code.claude.com",
            docsPath: "/docs/en",
            outputDir: outputDir,
            useLlms: true,
        });
        const seconds = (summary.duration / 1000).toFixed(2);
        console.log();
        console.log("Summary:");
        console.log(" Downloaded: ".concat(summary.downloaded.length));
        console.log(" Failed: ".concat(summary.failed.length));
        console.log(" Duration: ".concat(seconds, "s"));
    });
|
|
160
|
+
// The command actions return promises, so parse with parseAsync() and
// observe the result: a failed scrape is reported and sets a non-zero exit
// code instead of surfacing as an unhandled rejection.
commander_1.program.parseAsync().catch(function (error) {
    console.error(error);
    process.exitCode = 1;
});
|
package/src/cli.ts
CHANGED
|
@@ -63,6 +63,8 @@ program
|
|
|
63
63
|
.option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
|
|
64
64
|
.option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
|
|
65
65
|
.action(async (options) => {
|
|
66
|
+
console.log("DEBUG CLI: Options received:", options);
|
|
67
|
+
|
|
66
68
|
const scraper = new MarkdownDocsScraper({
|
|
67
69
|
baseUrl: options.url,
|
|
68
70
|
docsPath: options.docsPath,
|
package/src/index.js
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @ebowwa/markdown-docs-scraper
|
|
4
|
+
*
|
|
5
|
+
* Scrape and mirror markdown-based documentation sites
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Runs a generator function as an asynchronous routine.
 *
 * @param thisArg    `this` value for the generator function.
 * @param _arguments Arguments for the generator function (may be omitted).
 * @param P          Promise constructor to use; falls back to the global Promise.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A promise for the generator's return value; it rejects when the
 *          generator throws or a rejection is not caught inside the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so every yielded value can be chained with .then().
    function toPromise(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    }
    if (!P) {
        P = Promise;
    }
    return new P(function (resolve, reject) {
        function advance(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        }
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        }
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        }
        // Instantiate the generator inside the executor so a synchronous
        // throw becomes a rejection of the returned promise.
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
|
|
16
|
+
/**
 * Drives `body` as a label-based state machine and exposes it through an
 * iterator-like object with `next`, `throw` and `return`.
 * NOTE(review): this looks like the standard down-level generator helper
 * emitted by the TypeScript compiler - confirm before hand-editing; the
 * statement order below is significant.
 *
 * `body` is called with the state object `_` and returns an instruction
 * `[opcode, value]`. Opcodes used by callers in this file: 2 = return,
 * 3 = break (jump to label `value`), 4 = yield, 5 = yield* (delegate).
 * Opcode 6 is produced internally when `body` throws.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
    // _: machine state (`label` = next case to run, `sent()` = last value sent
    //    in, or rethrows it when the resuming op was a throw, `trys` = active
    //    try regions, `ops` = pending instructions).
    // f: re-entrancy flag, y: iterator being delegated to, t: scratch slot,
    // g: the object handed back to the caller.
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
    return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    // verb(n) builds the public method that resumes the machine with opcode n.
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // While delegating (y is set), forward the call to the inner iterator
            // and hand its result straight back until it reports done.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    // t becomes the innermost entry of _.trys (or false when none).
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        // A final op with bit 0 or bit 2 set (e.g. 1 = throw, 6 = caught error)
        // rethrows its payload; otherwise the machine finishes with done: true.
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
|
|
43
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
+
exports.MarkdownDocsScraper = void 0;
|
|
45
|
+
exports.scrapeMarkdownDocs = scrapeMarkdownDocs;
|
|
46
|
+
// ============================================================================
|
|
47
|
+
// SCRAPER
|
|
48
|
+
// ============================================================================
|
|
49
|
+
/**
 * Downloads markdown documentation pages from a site and can mirror them to
 * disk.
 *
 * A page is addressed by a `{ category, page }` pair that maps to
 * `<baseUrl><docsPath>/<category>/<page>.md`, or to
 * `<baseUrl><docsPath>/<page>.md` when the category is empty.
 */
class MarkdownDocsScraper {
    /**
     * @param {object} options
     * @param {string} options.baseUrl - Site origin, e.g. "https://example.com".
     * @param {string} [options.docsPath="/docs/en"] - Path prefix of the docs tree.
     * @param {Object<string, string[]>} [options.categories={}] - Page names per category.
     * @param {string} [options.outputDir="./docs"] - Directory used by savePages().
     * @param {number} [options.concurrency=5] - Pages fetched per batch.
     * @param {function(number, number): void} [options.onProgress] - Called with
     *        (processed, total) after every page.
     */
    constructor(options) {
        this.options = {
            baseUrl: options.baseUrl,
            docsPath: options.docsPath || "/docs/en",
            categories: options.categories || {},
            outputDir: options.outputDir || "./docs",
            concurrency: MarkdownDocsScraper.#normalizeConcurrency(options.concurrency),
            onProgress: options.onProgress || (function () { }),
        };
    }

    /**
     * Coerce a requested batch size to a usable one. Anything that is not a
     * number >= 1 falls back to 5; fractions are rounded down, because a
     * negative or fractional step makes the batching loop hang or overlap.
     * Infinity is kept and means "everything in one batch".
     */
    static #normalizeConcurrency(value) {
        const requested = Number(value);
        if (requested === Infinity) {
            return requested;
        }
        const whole = Math.floor(requested);
        return whole >= 1 ? whole : 5;
    }

    /** Escape `text` so it matches literally inside a RegExp source. */
    static #escapeRegExp(text) {
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    }

    /** RegExp source matching the configured docs path, without trailing slashes. */
    #docsPathPattern() {
        return MarkdownDocsScraper.#escapeRegExp(String(this.options.docsPath).replace(/\/+$/, ""));
    }

    /**
     * Fetch markdown content from a URL.
     *
     * @param {string} url
     * @returns {Promise<string|null>} The response body, or null when the
     *          response is not OK or the request fails (the error is logged).
     */
    async fetchMarkdown(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    Accept: "text/markdown, text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return null;
            }
            // The content-type is deliberately not checked: some sites serve
            // markdown with an incorrect type.
            return await response.text();
        }
        catch (error) {
            console.error(`Error fetching ${url}:`, error);
            return null;
        }
    }

    /**
     * Extract the title from markdown content.
     *
     * @param {string} markdown
     * @returns {string} Text of the first `# ` heading, or "Untitled".
     */
    extractTitle(markdown) {
        const titleMatch = markdown.match(/^#\s+(.+)$/m);
        return titleMatch ? titleMatch[1].trim() : "Untitled";
    }

    /**
     * Sanitize a URL path for use as a filename: lower-case it, collapse
     * every run of characters other than a-z, 0-9 and "/" into "-", and drop
     * a leading or trailing "-".
     *
     * @param {string} path
     * @returns {string}
     */
    sanitizeFilename(path) {
        return path
            .toLowerCase()
            .replace(/[^a-z0-9/]+/g, "-")
            .replace(/^-|-$/g, "");
    }

    /**
     * Build the URL of a documentation page.
     *
     * @param {string} category - Category path; empty for top-level pages.
     * @param {string} page - Page name without the ".md" suffix.
     * @returns {string}
     */
    buildUrl(category, page) {
        const { baseUrl, docsPath } = this.options;
        return category
            ? `${baseUrl}${docsPath}/${category}/${page}.md`
            : `${baseUrl}${docsPath}/${page}.md`;
    }

    /**
     * Download a single documentation page.
     *
     * @param {string} category
     * @param {string} page
     * @returns {Promise<object|null>} `{ url, title, content, category, pageName }`,
     *          or null when the page could not be fetched or is empty.
     */
    async downloadPage(category, page) {
        const url = this.buildUrl(category, page);
        const content = await this.fetchMarkdown(url);
        if (!content) {
            return null;
        }
        return {
            url,
            title: this.extractTitle(content),
            content,
            category,
            // Kept so savePages() can name the file after the page.
            pageName: page,
        };
    }

    /**
     * Download `pages` in batches of `options.concurrency`, report progress
     * after every page and log a summary.
     *
     * @param {Array<{category: string, page: string}>} pages
     * @param {number} startTime - Date.now() value the duration is measured from.
     * @returns {Promise<{downloaded: object[], failed: object[], duration: number}>}
     */
    async #scrapePages(pages, startTime) {
        const downloaded = [];
        const failed = [];
        // Re-validate here because `options` is a public, mutable property.
        const batchSize = MarkdownDocsScraper.#normalizeConcurrency(this.options.concurrency);
        for (let start = 0; start < pages.length; start += batchSize) {
            const batch = pages.slice(start, start + batchSize);
            const results = await Promise.allSettled(
                batch.map((entry) => this.downloadPage(entry.category, entry.page)),
            );
            results.forEach((result, index) => {
                if (result.status === "fulfilled" && result.value) {
                    downloaded.push(result.value);
                }
                else {
                    const entry = batch[index];
                    failed.push({
                        url: this.buildUrl(entry.category, entry.page),
                        error: result.status === "rejected" ? result.reason : "Not found",
                    });
                }
                this.options.onProgress(downloaded.length + failed.length, pages.length);
            });
        }
        const duration = Date.now() - startTime;
        console.log(`\u2705 Downloaded: ${downloaded.length} pages`);
        console.log(`\u274C Failed: ${failed.length} pages`);
        console.log(`\u23F1\uFE0F Duration: ${(duration / 1000).toFixed(2)}s`);
        return { downloaded, failed, duration };
    }

    /**
     * Scrape the pages discovered from llms.txt. Falls back to scrape()
     * (the configured categories) when nothing is discovered.
     *
     * @returns {Promise<{downloaded: object[], failed: object[], duration: number}>}
     */
    async scrapeFromLlms() {
        // Taken before discovery so the reported duration includes it.
        const startTime = Date.now();
        const pages = await this.discoverPages();
        if (pages.length === 0) {
            console.log("No pages discovered, falling back to categories");
            return this.scrape();
        }
        console.log(`Scraping ${pages.length} discovered pages...`);
        return this.#scrapePages(pages, startTime);
    }

    /**
     * Scrape every page listed in `options.categories`.
     *
     * @returns {Promise<{downloaded: object[], failed: object[], duration: number}>}
     */
    async scrape() {
        const startTime = Date.now();
        const pages = this.getPagesToScrape();
        console.log(`Scraping ${pages.length} pages from ${this.options.baseUrl}...`);
        return this.#scrapePages(pages, startTime);
    }

    /**
     * Save scraped pages below `options.outputDir`, one `<name>.md` file per
     * page inside its category directory, each prefixed with an HTML comment
     * recording the source URL and download time.
     *
     * @param {object[]} pages - Results of downloadPage().
     * @returns {Promise<void>}
     */
    async savePages(pages) {
        // Loaded lazily, as before, so the module can be loaded without them.
        const fs = require("fs/promises");
        const path = require("path");
        const outputRoot = path.resolve(this.options.outputDir);
        for (const page of pages) {
            const nameToUse = page.pageName
                || page.url.split("/").pop()?.replace(/\.md$/, "")
                || "untitled";
            const dir = page.category
                ? path.join(this.options.outputDir, page.category)
                : this.options.outputDir;
            const filepath = path.join(dir, `${nameToUse}.md`);
            // Category and page names can come from a remote llms.txt, so
            // never write outside the output directory (e.g. via ".." segments).
            const relative = path.relative(outputRoot, path.resolve(filepath));
            if (relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
                console.warn(`Skipping ${page.url}: target path escapes the output directory`);
                continue;
            }
            // dirname() rather than `dir` so a page name containing "/" still
            // gets its parent directory created.
            await fs.mkdir(path.dirname(filepath), { recursive: true });
            const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
            await fs.writeFile(filepath, header + page.content, "utf-8");
        }
    }

    /**
     * Get the list of pages to scrape based on `options.categories`.
     *
     * @returns {Array<{category: string, page: string}>}
     */
    getPagesToScrape() {
        return Object.entries(this.options.categories).flatMap(
            ([category, pageList]) => pageList.map((page) => ({ category, page })),
        );
    }

    /**
     * Discover pages from the `<baseUrl>/docs/llms.txt` index.
     *
     * Every markdown link whose URL contains `<docsPath>/…​.md` becomes a
     * `{ category, page }` pair: the last path segment is the page, the
     * segments before it form the category. Links with ".." segments are
     * ignored.
     *
     * @returns {Promise<Array<{category: string, page: string}>>} Empty when
     *          the index cannot be fetched (the problem is logged).
     */
    async discoverPages() {
        const pages = [];
        try {
            const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
            const response = await fetch(llmsUrl, {
                headers: {
                    Accept: "text/plain",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
                return pages;
            }
            const content = await response.text();
            // Built from docsPath (not a fixed "/docs/en") because buildUrl()
            // re-creates each URL relative to docsPath.
            const linkRegex = new RegExp(
                "\\[([^\\]]+)\\]\\((https?:\\/\\/[^\\s)]+" + this.#docsPathPattern() + "\\/([^)]+\\.md))\\)",
                "g",
            );
            for (const match of content.matchAll(linkRegex)) {
                // Strip only the trailing ".md"; buildUrl() appends it again.
                const segments = match[3].replace(/\.md$/, "").split("/");
                if (segments.includes("..")) {
                    continue;
                }
                const page = segments.pop();
                pages.push({ category: segments.join("/"), page });
            }
            console.log(`Discovered ${pages.length} pages from llms.txt`);
        }
        catch (error) {
            console.error("Error discovering pages:", error);
        }
        return pages;
    }

    /**
     * Discover additional pages by parsing the docs index page (fallback).
     *
     * @returns {Promise<string[]>} Unique paths, relative to docsPath and
     *          including the ".md" suffix, in order of first appearance.
     */
    async discoverPagesHtml() {
        // A Set dedupes while preserving insertion order.
        const discovered = new Set();
        try {
            const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
            const response = await fetch(indexUrl, {
                headers: {
                    Accept: "text/html",
                    "User-Agent": "@ebowwa/markdown-docs-scraper",
                },
            });
            if (!response.ok) {
                return [];
            }
            const html = await response.text();
            const mdLinkRegex = new RegExp(
                "href=\"" + this.#docsPathPattern() + "\\/([^\"]+\\.md)\"",
                "g",
            );
            for (const match of html.matchAll(mdLinkRegex)) {
                discovered.add(match[1]);
            }
            console.log(`Discovered ${discovered.size} additional pages from HTML`);
        }
        catch (error) {
            console.error("Error discovering pages from HTML:", error);
        }
        return [...discovered];
    }
}
|
|
449
|
+
exports.MarkdownDocsScraper = MarkdownDocsScraper;
|
|
450
|
+
// ============================================================================
|
|
451
|
+
// CONVENIENCE FUNCTION
|
|
452
|
+
// ============================================================================
|
|
453
|
+
/**
 * One-call convenience wrapper around MarkdownDocsScraper.
 *
 * Builds a scraper from `options`, scrapes either the pages discovered via
 * llms.txt (`options.useLlms` truthy) or the configured categories, and
 * writes the downloaded pages to disk when `options.outputDir` is set.
 *
 * @param {object} options - Scraper options, plus the optional `useLlms` flag.
 * @returns {Promise<{downloaded: object[], failed: object[], duration: number}>}
 */
async function scrapeMarkdownDocs(options) {
    const scraper = new MarkdownDocsScraper(options);
    let result;
    if (options.useLlms) {
        result = await scraper.scrapeFromLlms();
    }
    else {
        result = await scraper.scrape();
    }
    // Saving is opt-in: without an explicit outputDir nothing is written.
    if (options.outputDir) {
        await scraper.savePages(result.downloaded);
    }
    return result;
}
|
|
484
|
+
// ============================================================================
|
|
485
|
+
// EXPORTS
|
|
486
|
+
// ============================================================================
|
|
487
|
+
exports.default = MarkdownDocsScraper;
|