@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/cli.js
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* CLI for @ebowwa/markdown-docs-scraper
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Shallow-merge helper: copies the own enumerable properties of every
 * argument after the first onto the first argument and returns it.
 * Reuses an already-installed `this.__assign` when one exists; otherwise the
 * first call swaps in `Object.assign` (or the manual fallback below).
 */
var __assign = (this && this.__assign) || function () {
    // Resolve the real implementation lazily, on first use.
    __assign = Object.assign || function (target) {
        var argCount = arguments.length;
        var index = 1;
        while (index < argCount) {
            var source = arguments[index];
            for (var key in source) {
                // Skip anything inherited through the prototype chain.
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
            index++;
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
|
|
17
|
+
/**
 * Drives a generator function as if it were an async function.
 * Reuses `this.__awaiter` when one is already installed.
 *
 * thisArg     - `this` value the generator function is applied with.
 * _arguments  - argument list forwarded to the generator (defaults to []).
 * P           - promise constructor to use; falls back to the global Promise.
 * generator   - generator function whose yielded values are awaited.
 *
 * Returns a promise that resolves with the generator's final value and
 * rejects with anything thrown while stepping it. Each yielded value is
 * wrapped in a P instance (unless it already is one) before being chained.
 * NOTE(review): this has the shape of a compiler-emitted down-level helper —
 * presumably generated output, so confirm before hand-editing.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
18
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
19
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
20
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
21
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
22
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
23
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
24
|
+
});
|
|
25
|
+
};
|
|
26
|
+
/**
 * Emulates a generator with a label-driven state machine.
 * Reuses `this.__generator` when one is already installed.
 *
 * thisArg - `this` value used when invoking `body`.
 * body    - function called repeatedly with the state object `_`; it is
 *           expected to switch on `_.label` and return an instruction
 *           array `[opcode, operand]`.
 *
 * Returns an object exposing `next`, `throw` and `return` (and
 * `Symbol.iterator`, returning itself, when Symbol is available).
 *
 * State object `_`:
 *   label - current position, read by `body`
 *   sent  - returns the last value passed in, or rethrows it when the last
 *           instruction had its low bit set
 *   trys  - stack of active try regions
 *   ops   - stack of pending instructions
 *
 * Instruction handling visible below:
 *   4 - advance the label and hand `{ value, done: false }` to the caller
 *   5 - advance the label and delegate to the iterator in the operand
 *   7 - pop one pending instruction and one try region
 *   other opcodes are resolved against the innermost try region.
 * Exceptions thrown by `body` are converted into instruction `[6, error]`.
 * Re-entering while a step is running throws a TypeError.
 * NOTE(review): this has the shape of a compiler-emitted down-level helper —
 * presumably generated output, so confirm before hand-editing.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
|
|
27
|
+
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
|
|
28
|
+
return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
|
|
29
|
+
function verb(n) { return function (v) { return step([n, v]); }; }
|
|
30
|
+
function step(op) {
|
|
31
|
+
if (f) throw new TypeError("Generator is already executing.");
|
|
32
|
+
while (g && (g = 0, op[0] && (_ = 0)), _) try {
|
|
33
|
+
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
|
|
34
|
+
if (y = 0, t) op = [op[0] & 2, t.value];
|
|
35
|
+
switch (op[0]) {
|
|
36
|
+
case 0: case 1: t = op; break;
|
|
37
|
+
case 4: _.label++; return { value: op[1], done: false };
|
|
38
|
+
case 5: _.label++; y = op[1]; op = [0]; continue;
|
|
39
|
+
case 7: op = _.ops.pop(); _.trys.pop(); continue;
|
|
40
|
+
default:
|
|
41
|
+
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
|
|
42
|
+
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
|
|
43
|
+
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
|
|
44
|
+
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
|
|
45
|
+
if (t[2]) _.ops.pop();
|
|
46
|
+
_.trys.pop(); continue;
|
|
47
|
+
}
|
|
48
|
+
op = body.call(thisArg, _);
|
|
49
|
+
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
|
|
50
|
+
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
|
|
51
|
+
}
|
|
52
|
+
};
|
|
53
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
54
|
+
var commander_1 = require("commander");
|
|
55
|
+
var index_js_1 = require("./index.js");
|
|
56
|
+
/**
 * Resolves the version string reported by `--version`.
 * Reads it from the package manifest so the CLI cannot drift from the
 * published release (this file previously reported "1.0.0" while shipping
 * in 1.2.0). Falls back to the release this file shipped with when the
 * manifest cannot be loaded.
 */
function resolveCliVersion() {
    try {
        var manifest = require("../package.json");
        if (manifest && typeof manifest.version === "string") {
            return manifest.version;
        }
    }
    catch (_err) {
        // Deliberate best-effort: a missing manifest must not break the CLI.
    }
    return "1.2.0";
}
/**
 * Root command: name, description and version shown in `--help` / `--version`.
 */
commander_1.program
    .name("markdown-docs-scraper")
    .description("Scrape and mirror markdown-based documentation sites")
    .version(resolveCliVersion());
|
|
60
|
+
/**
 * `scrape` sub-command: builds scraper options from the CLI flags, runs
 * `scrapeMarkdownDocs`, then prints a summary and up to ten failed pages.
 *
 * Flags: --url (required), --output, --docs-path, --concurrency, --discover.
 * NOTE(review): --discover is accepted but never forwarded to the scraper in
 * this file — confirm whether it should be wired up or removed.
 */
commander_1.program
    .command("scrape")
    .description("Scrape documentation from a URL")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .option("-c, --concurrency <num>", "Concurrency level", "5")
    .option("--discover", "Discover pages before scraping", false)
    .action(function (options) { return __awaiter(void 0, void 0, void 0, function () {
    var concurrency, scraperOptions, result;
    return __generator(this, function (_a) {
        switch (_a.label) {
            case 0:
                // Always parse as base 10, and never hand NaN / 0 / a negative
                // number to the scraper: fall back to the documented default.
                concurrency = parseInt(options.concurrency, 10);
                if (!(concurrency >= 1)) {
                    console.warn("Invalid --concurrency value \"".concat(options.concurrency, "\"; falling back to 5"));
                    concurrency = 5;
                }
                scraperOptions = {
                    baseUrl: options.url,
                    docsPath: options.docsPath,
                    outputDir: options.output,
                    concurrency: concurrency,
                };
                console.log("\uD83D\uDD0D Scraping ".concat(options.url, "..."));
                console.log("\uD83D\uDCC1 Output: ".concat(options.output));
                console.log();
                return [4 /*yield*/, (0, index_js_1.scrapeMarkdownDocs)(scraperOptions)];
            case 1:
                result = _a.sent();
                console.log();
                console.log("Summary:");
                console.log(" Downloaded: ".concat(result.downloaded.length));
                console.log(" Failed: ".concat(result.failed.length));
                console.log(" Duration: ".concat((result.duration / 1000).toFixed(2), "s"));
                if (result.failed.length > 0) {
                    console.log();
                    console.log("Failed pages:");
                    // Cap the listing at ten entries to keep the output readable.
                    result.failed.slice(0, 10).forEach(function (f) {
                        console.log(" \u274C ".concat(f.url, ": ").concat(f.error));
                    });
                    if (result.failed.length > 10) {
                        console.log(" ... and ".concat(result.failed.length - 10, " more"));
                    }
                }
                return [2 /*return*/];
        }
    });
}); });
|
|
104
|
+
/**
 * `discover` sub-command: lists the documentation pages found by
 * `MarkdownDocsScraper.discoverPages()` without downloading them.
 *
 * Flags: --url (required), --docs-path.
 */
commander_1.program
    .command("discover")
    .description("Discover all available documentation pages")
    .requiredOption("-u, --url <url>", "Base URL of the documentation site")
    .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
    .action(function (options) { return __awaiter(void 0, void 0, void 0, function () {
    var scraper, pages;
    return __generator(this, function (_a) {
        switch (_a.label) {
            case 0:
                scraper = new index_js_1.MarkdownDocsScraper({
                    baseUrl: options.url,
                    docsPath: options.docsPath,
                });
                console.log("\uD83D\uDD0D Discovering pages from ".concat(options.url, "..."));
                return [4 /*yield*/, scraper.discoverPages()];
            case 1:
                pages = _a.sent();
                console.log("\nFound ".concat(pages.length, " pages:\n"));
                pages.forEach(function (page) {
                    // Pages are { category, page } records in the TypeScript
                    // source; concatenating one directly prints
                    // "[object Object]". Plain strings are still accepted.
                    var label = typeof page === "string"
                        ? page
                        : (page.category ? "".concat(page.category, "/").concat(page.page) : page.page);
                    console.log(" - ".concat(label));
                });
                return [2 /*return*/];
        }
    });
}); });
|
|
130
|
+
/**
 * `anthropic` sub-command: preset scrape of https://code.claude.com/docs/en
 * with `useLlms` enabled, followed by a short summary.
 *
 * Flags: --output.
 */
commander_1.program
    .command("anthropic")
    .description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)")
    .option("-o, --output <dir>", "Output directory", "./docs")
    .action(function (options) {
    return __awaiter(void 0, void 0, void 0, function () {
        var request, outcome, elapsedSeconds;
        return __generator(this, function (state) {
            if (state.label === 0) {
                // First pass: announce the run and start the scrape.
                request = {
                    baseUrl: "https://code.claude.com",
                    docsPath: "/docs/en",
                    outputDir: options.output,
                    useLlms: true,
                };
                console.log("🔍 Scraping Anthropic Claude Code documentation...");
                console.log("📋 Using llms.txt index for complete page list");
                console.log("\uD83D\uDCC1 Output: ".concat(options.output));
                console.log();
                return [4 /*yield*/, (0, index_js_1.scrapeMarkdownDocs)(request)];
            }
            if (state.label === 1) {
                // Second pass: the scrape has settled; report the totals.
                outcome = state.sent();
                console.log();
                console.log("Summary:");
                console.log(" Downloaded: ".concat(outcome.downloaded.length));
                console.log(" Failed: ".concat(outcome.failed.length));
                elapsedSeconds = (outcome.duration / 1000).toFixed(2);
                console.log(" Duration: ".concat(elapsedSeconds, "s"));
                return [2 /*return*/];
            }
        });
    });
});
|
|
160
|
+
// Every sub-command registers an async action handler, so use parseAsync():
// plain parse() does not wait for the returned promises and a rejected
// handler would surface only as an unhandled rejection. Report the failure
// and signal it through the exit code instead.
commander_1.program.parseAsync().catch(function (err) {
    console.error(err instanceof Error ? err.message : err);
    process.exitCode = 1;
});
|
package/src/cli.ts
CHANGED
|
@@ -19,12 +19,16 @@ program
|
|
|
19
19
|
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
20
20
|
.option("-c, --concurrency <num>", "Concurrency level", "5")
|
|
21
21
|
.option("--discover", "Discover pages before scraping", false)
|
|
22
|
+
.option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
|
|
23
|
+
.option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
|
|
22
24
|
.action(async (options) => {
|
|
23
25
|
const scraperOptions: ScraperOptions = {
|
|
24
26
|
baseUrl: options.url,
|
|
25
27
|
docsPath: options.docsPath,
|
|
26
28
|
outputDir: options.output,
|
|
27
29
|
concurrency: parseInt(options.concurrency),
|
|
30
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
31
|
+
tryDocsSubdomain: !options.noSubdomain,
|
|
28
32
|
};
|
|
29
33
|
|
|
30
34
|
console.log(`🔍 Scraping ${options.url}...`);
|
|
@@ -56,10 +60,16 @@ program
|
|
|
56
60
|
.description("Discover all available documentation pages")
|
|
57
61
|
.requiredOption("-u, --url <url>", "Base URL of the documentation site")
|
|
58
62
|
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
63
|
+
.option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
|
|
64
|
+
.option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
|
|
59
65
|
.action(async (options) => {
|
|
66
|
+
console.log("DEBUG CLI: Options received:", options);
|
|
67
|
+
|
|
60
68
|
const scraper = new MarkdownDocsScraper({
|
|
61
69
|
baseUrl: options.url,
|
|
62
70
|
docsPath: options.docsPath,
|
|
71
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
72
|
+
tryDocsSubdomain: !options.noSubdomain,
|
|
63
73
|
});
|
|
64
74
|
|
|
65
75
|
console.log(`🔍 Discovering pages from ${options.url}...`);
|
|
@@ -67,7 +77,8 @@ program
|
|
|
67
77
|
|
|
68
78
|
console.log(`\nFound ${pages.length} pages:\n`);
|
|
69
79
|
pages.forEach((page) => {
|
|
70
|
-
|
|
80
|
+
const path = page.category ? `${page.category}/${page.page}` : page.page;
|
|
81
|
+
console.log(` - ${path}`);
|
|
71
82
|
});
|
|
72
83
|
});
|
|
73
84
|
|