mintlify 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -0
- package/bin/browser.js +24 -0
- package/bin/browser.js.map +1 -0
- package/bin/downloadImage.js +27 -0
- package/bin/downloadImage.js.map +1 -0
- package/bin/index.js +144 -22
- package/bin/index.js.map +1 -1
- package/bin/scraping/detectFramework.js +25 -0
- package/bin/scraping/detectFramework.js.map +1 -0
- package/bin/scraping/downloadAllImages.js +57 -0
- package/bin/scraping/downloadAllImages.js.map +1 -0
- package/bin/scraping/getSitemapLinks.js +16 -0
- package/bin/scraping/getSitemapLinks.js.map +1 -0
- package/bin/scraping/replaceImagePaths.js +17 -0
- package/bin/scraping/replaceImagePaths.js.map +1 -0
- package/bin/scraping/scrapeGettingFileNameFromUrl.js +43 -0
- package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +1 -0
- package/bin/scraping/scrapePage.js +9 -0
- package/bin/scraping/scrapePage.js.map +1 -0
- package/bin/scraping/scrapeSection.js +9 -0
- package/bin/scraping/scrapeSection.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +43 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +52 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +1 -0
- package/bin/{scrapeGitBookPage.js → scraping/site-scrapers/scrapeGitBookPage.js} +10 -5
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js +74 -0
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +1 -0
- package/bin/{scrapeReadMePage.js → scraping/site-scrapers/scrapeReadMePage.js} +15 -9
- package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js +48 -0
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +1 -0
- package/bin/util.js +27 -8
- package/bin/util.js.map +1 -1
- package/package.json +3 -2
- package/src/browser.ts +24 -0
- package/src/downloadImage.ts +35 -0
- package/src/index.ts +173 -22
- package/src/scraping/detectFramework.ts +31 -0
- package/src/scraping/downloadAllImages.ts +79 -0
- package/src/scraping/getSitemapLinks.ts +16 -0
- package/src/scraping/replaceImagePaths.ts +21 -0
- package/src/scraping/scrapeGettingFileNameFromUrl.ts +81 -0
- package/src/scraping/scrapePage.ts +24 -0
- package/src/scraping/scrapeSection.ts +16 -0
- package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +67 -0
- package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +80 -0
- package/src/{scrapeGitBookPage.ts → scraping/site-scrapers/scrapeGitBookPage.ts} +25 -5
- package/src/scraping/site-scrapers/scrapeGitBookSection.ts +116 -0
- package/src/{scrapeReadMePage.ts → scraping/site-scrapers/scrapeReadMePage.ts} +28 -10
- package/src/scraping/site-scrapers/scrapeReadMeSection.ts +77 -0
- package/src/util.ts +25 -7
- package/tsconfig.json +1 -1
- package/bin/scrapeGitBook.js +0 -28
- package/bin/scrapeGitBook.js.map +0 -1
- package/bin/scrapeGitBookPage.js.map +0 -1
- package/bin/scrapeReadMe.js +0 -60
- package/bin/scrapeReadMe.js.map +0 -1
- package/bin/scrapeReadMePage.js.map +0 -1
- package/src/scrapeReadMe.ts +0 -79
package/README.md
CHANGED
|
@@ -5,3 +5,11 @@ To test changes locally run:
|
|
|
5
5
|
`npm run build`
|
|
6
6
|
then
|
|
7
7
|
`npm i -g .`
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
`npm i -g mintlify`
|
|
12
|
+
|
|
13
|
+
If you installed a local version, you may need to uninstall it with
|
|
14
|
+
|
|
15
|
+
`npm uninstall -g mintlify` before installing the published version.
|
package/bin/browser.js
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { launch } from "puppeteer";
|
|
2
|
+
/**
 * Launches a headless Puppeteer browser with `ignoreHTTPSErrors` enabled.
 *
 * A launch failure is logged and terminates the process with exit code 1.
 *
 * @returns {Promise<object>} The browser instance returned by `launch`.
 */
export async function startBrowser() {
  const launchOptions = {
    headless: true,
    ignoreHTTPSErrors: true,
  };
  let browser;
  try {
    browser = await launch(launchOptions);
  } catch (err) {
    console.log("Could not create a browser instance: ", err);
    process.exit(1);
  }
  return browser;
}
|
|
14
|
+
/**
 * Opens `href` in a headless browser and returns the page's HTML after
 * navigation has settled (Puppeteer's "networkidle2" condition).
 *
 * @param {string} href - URL of the page to load.
 * @returns {Promise<string>} The HTML returned by `page.content()`.
 * @throws Propagates any navigation error after the browser has been closed.
 */
export async function getHtmlWithPuppeteer(href) {
  const browser = await startBrowser();
  try {
    const page = await browser.newPage();
    await page.goto(href, {
      waitUntil: "networkidle2",
    });
    return await page.content();
  } finally {
    // Close the browser on every path (including failed navigation) and wait
    // for it, so no browser process is left behind and close errors surface.
    await browser.close();
  }
}
|
|
24
|
+
//# sourceMappingURL=browser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../src/browser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,WAAW,CAAC;AAEnC,MAAM,CAAC,KAAK,UAAU,YAAY;IAChC,IAAI;QACF,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;KACJ;IAAC,OAAO,GAAG,EAAE;QACZ,OAAO,CAAC,GAAG,CAAC,uCAAuC,EAAE,GAAG,CAAC,CAAC;QAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;KACjB;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,IAAY;IACrD,MAAM,OAAO,GAAG,MAAM,YAAY,EAAE,CAAC;IACrC,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QACpB,SAAS,EAAE,cAAc;KAC1B,CAAC,CAAC;IACH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;IAClC,OAAO,CAAC,KAAK,EAAE,CAAC;IAChB,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, createWriteStream } from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import axios from "axios";
|
|
4
|
+
/**
 * Downloads an image and streams it to `writePath`.
 *
 * @param {string} imageSrc - Absolute URL of the image to fetch.
 * @param {string} writePath - Destination file path; missing parent folders are created.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 * @throws {Error} With `code === "EEXIST"` when the destination already exists
 *   (either up front, or because it appeared while the download was in flight).
 */
export default async function downloadImage(imageSrc, writePath) {
  // Avoid unnecessary downloads
  if (existsSync(writePath)) {
    // A real Error keeps a stack trace; callers keep matching on `code`.
    throw Object.assign(new Error(`File already exists: ${writePath}`), {
      code: "EEXIST",
    });
  }

  // Request before touching the disk so a failed request does not leave an
  // empty file (which would later be skipped as "already downloaded").
  const response = await axios.default.get(imageSrc, {
    responseType: "stream",
  });

  // Create the folders needed if they're missing
  mkdirSync(path.dirname(writePath), { recursive: true });

  return new Promise((resolve, reject) => {
    // "wx" must be given to the write stream itself (pipe() ignores it): it
    // makes the open fail with EEXIST if a file with the same name was
    // created while we were downloading.
    const writer = createWriteStream(writePath, { flags: "wx" });
    writer.on("finish", resolve);
    writer.on("error", (err) => {
      // Stop reading the response; nothing will consume it anymore.
      response.data.destroy();
      reject(err);
    });
    // pipe() does not forward source errors, so route them through the writer.
    response.data.on("error", (err) => writer.destroy(err));
    response.data.pipe(writer);
  });
}
|
|
27
|
+
//# sourceMappingURL=downloadImage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"downloadImage.js","sourceRoot":"","sources":["../src/downloadImage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,iBAAiB,EAAE,MAAM,IAAI,CAAC;AAC9D,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,KAAK,MAAM,OAAO,CAAC;AAE1B,MAAM,CAAC,OAAO,CAAC,KAAK,UAAU,aAAa,CACzC,QAAgB,EAChB,SAAiB;IAEjB,8BAA8B;IAC9B,IAAI,UAAU,CAAC,SAAS,CAAC,EAAE;QACzB,OAAO,OAAO,CAAC,MAAM,CAAC;YACpB,IAAI,EAAE,QAAQ;SACf,CAAC,CAAC;KACJ;IAED,+CAA+C;IAC/C,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAExD,MAAM,MAAM,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAE5C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE;QACjD,YAAY,EAAE,QAAQ;KACvB,CAAC,CAAC;IAEH,4DAA4D;IAC5D,gDAAgD;IAChD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;QACzB,IAAI,EAAE,IAAI;KACX,CAAC,CAAC;IAEH,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,CAAC,EAAE,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC7B,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/bin/index.js
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
1
|
#! /usr/bin/env node
|
|
2
|
+
import axios from "axios";
|
|
2
3
|
import { writeFileSync } from "fs";
|
|
3
4
|
import inquirer from "inquirer";
|
|
5
|
+
import minimistLite from "minimist-lite";
|
|
4
6
|
import { MintConfig } from "./templates.js";
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
import { scrapePage } from "./scraping/scrapePage.js";
|
|
8
|
+
import { scrapeSection } from "./scraping/scrapeSection.js";
|
|
9
|
+
import { createPage, toFilename, getOrigin } from "./util.js";
|
|
10
|
+
import { scrapeDocusaurusPage } from "./scraping/site-scrapers/scrapeDocusaurusPage.js";
|
|
11
|
+
import { scrapeDocusaurusSection } from "./scraping/site-scrapers/scrapeDocusaurusSection.js";
|
|
12
|
+
import { scrapeGitBookPage } from "./scraping/site-scrapers/scrapeGitBookPage.js";
|
|
13
|
+
import { scrapeGitBookSection } from "./scraping/site-scrapers/scrapeGitBookSection.js";
|
|
14
|
+
import { scrapeReadMePage } from "./scraping/site-scrapers/scrapeReadMePage.js";
|
|
15
|
+
import { scrapeReadMeSection } from "./scraping/site-scrapers/scrapeReadMeSection.js";
|
|
16
|
+
import { detectFramework, Frameworks } from "./scraping/detectFramework.js";
|
|
17
|
+
import { startBrowser, getHtmlWithPuppeteer } from "./browser.js";
|
|
18
|
+
// Parse the CLI arguments: positional arguments end up in `argv._`, and
// `--overwrite` is a boolean flag that defaults to false.
const argv = minimistLite(process.argv.slice(2), {
  boolean: ["overwrite"],
  default: {
    overwrite: false,
  },
});
if (argv._.length === 0) {
  console.error(`No command specified. Here is the list of commands that you can use:\ninit: initialize a Mintlify documentation instance`);
  process.exit(1); //an error occurred
}
// The first positional argument selects the command to run.
const command = argv._[0];
|
|
15
29
|
if (command === "init") {
|
|
16
30
|
inquirer
|
|
17
31
|
.prompt([
|
|
@@ -47,7 +61,7 @@ if (command === "init") {
|
|
|
47
61
|
])
|
|
48
62
|
.then((answers) => {
|
|
49
63
|
const { name, color, ctaName, ctaUrl, title } = answers;
|
|
50
|
-
writeFileSync("mint.
|
|
64
|
+
writeFileSync("mint.json", JSON.stringify(MintConfig(name, color, ctaName, ctaUrl, toFilename(title)), null, "\t"));
|
|
51
65
|
createPage(title);
|
|
52
66
|
console.log("🌱 Created initial files for Mintlify docs");
|
|
53
67
|
process.exit(1);
|
|
@@ -83,25 +97,133 @@ if (command === "page") {
|
|
|
83
97
|
process.exit(1);
|
|
84
98
|
});
|
|
85
99
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
100
|
+
/**
 * Aborts the CLI when the documentation framework could not be detected.
 *
 * Prints the framework-specific commands the user can run instead and exits
 * with status 1. Does nothing when `framework` is truthy.
 *
 * @param {string | undefined} framework - Result of `detectFramework`.
 */
function validateFramework(framework) {
  if (framework) {
    return;
  }
  console.log("Could not detect the framework automatically. Please use one of:");
  // These must match the command names handled by the dispatch code below
  // (`scrape-<framework>-page` / `scrape-<framework>-section`).
  const frameworkNames = ["docusaurus", "gitbook", "readme"];
  for (const scope of ["page", "section"]) {
    for (const frameworkName of frameworkNames) {
      console.log(`scrape-${frameworkName}-${scope}`);
    }
  }
  return process.exit(1);
}
|
|
109
|
+
/**
 * Scrapes the single page given as the second CLI argument, choosing the
 * scraper from the framework detected in the page's HTML.
 *
 * The GitBook scraper is run with the Puppeteer flag set; the others use the
 * plain HTTP fetch inside `scrapePageWrapper`.
 */
async function scrapePageAutomatically() {
  const href = argv._[1];
  const { data: html } = await axios.default.get(href);
  const framework = detectFramework(html);

  validateFramework(framework);
  console.log("Detected framework: " + framework);

  switch (framework) {
    case Frameworks.DOCUSAURUS:
      await scrapePageWrapper(scrapeDocusaurusPage);
      break;
    case Frameworks.GITBOOK:
      await scrapePageWrapper(scrapeGitBookPage, true);
      break;
    case Frameworks.README:
      await scrapePageWrapper(scrapeReadMePage);
      break;
  }
}
|
|
126
|
+
/**
 * Fetches the page given as the second CLI argument and hands it to
 * `scrapePage`, then terminates the process.
 *
 * @param {Function} scrapeFunc - Framework-specific page scraper passed through to `scrapePage`.
 * @param {boolean} [puppeteer=false] - When true the HTML is obtained through
 *   `getHtmlWithPuppeteer` instead of a plain HTTP GET.
 */
async function scrapePageWrapper(scrapeFunc, puppeteer = false) {
  const href = argv._[1];
  let html;
  if (puppeteer) {
    html = await getHtmlWithPuppeteer(href);
  } else {
    const res = await axios.default.get(href);
    html = res.data;
  }
  await scrapePage(scrapeFunc, href, html, argv.overwrite);
  // The scrape succeeded: exit with 0. Status 1 is what this CLI uses to
  // report an error, and a non-zero status breaks `&&` chaining in shells.
  process.exit(0);
}
|
|
139
|
+
// Page-scraping commands: either auto-detect the framework or force a
// specific scraper. GitBook pages are fetched with Puppeteer (second argument).
switch (command) {
  case "scrape-page":
    await scrapePageAutomatically();
    break;
  case "scrape-docusaurus-page":
    await scrapePageWrapper(scrapeDocusaurusPage);
    break;
  case "scrape-gitbook-page":
    await scrapePageWrapper(scrapeGitBookPage, true);
    break;
  case "scrape-readme-page":
    await scrapePageWrapper(scrapeReadMePage);
    break;
}
|
|
151
|
+
/**
 * Scrapes the documentation section whose URL is the second CLI argument,
 * choosing the section scraper from the framework detected in the page's HTML.
 *
 * GitBook goes through the Puppeteer-based wrapper; Docusaurus and ReadMe use
 * the plain HTTP wrapper.
 */
async function scrapeSectionAutomatically() {
  const href = argv._[1];
  const { data: html } = await axios.default.get(href);
  const framework = detectFramework(html);

  validateFramework(framework);
  console.log("Detected framework: " + framework);

  switch (framework) {
    case Frameworks.DOCUSAURUS:
      await scrapeSectionAxiosWrapper(scrapeDocusaurusSection);
      break;
    case Frameworks.GITBOOK:
      await scrapeSectionGitBookWrapper(scrapeGitBookSection);
      break;
    case Frameworks.README:
      await scrapeSectionAxiosWrapper(scrapeReadMeSection);
      break;
  }
}
|
|
168
|
+
/**
 * Fetches the URL given as the second CLI argument with a plain HTTP GET and
 * hands the HTML to `scrapeSection`, then terminates the process.
 *
 * @param {Function} scrapeFunc - Framework-specific section scraper passed through to `scrapeSection`.
 */
async function scrapeSectionAxiosWrapper(scrapeFunc) {
  const href = argv._[1];
  const res = await axios.default.get(href);
  const html = res.data;
  await scrapeSection(scrapeFunc, html, getOrigin(href), argv.overwrite);
  // The scrape succeeded: exit with 0. Status 1 is what this CLI uses to
  // report an error, and a non-zero status breaks `&&` chaining in shells.
  process.exit(0);
}
|
|
98
|
-
|
|
99
|
-
const
|
|
100
|
-
|
|
101
|
-
const
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
175
|
+
/**
 * Scrapes a GitBook section through a headless browser.
 *
 * Loads the URL given as the second CLI argument, repeatedly clicks the
 * navigation toggles found next to links until a pass discovers no link that
 * the previous pass had not already seen, then passes the resulting HTML to
 * `scrapeSection` and terminates the process.
 *
 * @param {Function} scrapeFunc - GitBook section scraper passed through to `scrapeSection`.
 */
async function scrapeSectionGitBookWrapper(scrapeFunc) {
  const href = argv._[1];
  const browser = await startBrowser();
  let html;
  try {
    const page = await browser.newPage();
    await page.goto(href, {
      waitUntil: "networkidle2",
    });

    let prevEncountered = [];
    // Sentinel that can never be in `prevEncountered`, so the loop body runs
    // at least once.
    let encounteredHref = ["fake"];
    // Loop until we've encountered every link
    while (!encounteredHref.every((link) => prevEncountered.includes(link))) {
      prevEncountered = encounteredHref;
      // This callback is executed inside the page, so it can only use what is
      // passed in as an argument.
      encounteredHref = await page.evaluate((alreadySeen) => {
        // Presumably the chevron icon of a collapsible navigation entry.
        const icons = Array.from(document.querySelectorAll('path[d="M9 18l6-6-6-6"]'));
        const linksFound = [];
        for (const icon of icons) {
          const toClick = icon.parentElement && icon.parentElement.parentElement;
          const link = toClick && toClick.parentElement && toClick.parentElement.parentElement;
          // Skip icons not in the side navigation
          if (!link || !link.hasAttribute("href")) {
            continue;
          }
          const linkHref = link.getAttribute("href");
          // Should never occur but we keep it as a fail-safe
          if (linkHref.startsWith("https://") || linkHref.startsWith("http://")) {
            continue;
          }
          // Click any links we haven't seen before
          if (!alreadySeen.includes(linkHref)) {
            toClick.click();
          }
          linksFound.push(linkHref);
        }
        return linksFound;
      }, encounteredHref // Need to pass array into the browser
      );
    }
    html = await page.content();
  } finally {
    // Close the browser on every path (including errors) and wait for it, so
    // no browser process is left behind.
    await browser.close();
  }
  await scrapeSection(scrapeFunc, html, getOrigin(href), argv.overwrite);
  // The scrape succeeded: exit with 0. Status 1 is what this CLI uses to
  // report an error.
  process.exit(0);
}
|
|
217
|
+
// Section-scraping commands: either auto-detect the framework or force a
// specific scraper. GitBook sections use the Puppeteer-based wrapper.
switch (command) {
  case "scrape-section":
    await scrapeSectionAutomatically();
    break;
  case "scrape-docusaurus-section":
    await scrapeSectionAxiosWrapper(scrapeDocusaurusSection);
    break;
  case "scrape-gitbook-section":
    await scrapeSectionGitBookWrapper(scrapeGitBookSection);
    break;
  case "scrape-readme-section":
    await scrapeSectionAxiosWrapper(scrapeReadMeSection);
    break;
}
|
|
107
229
|
//# sourceMappingURL=index.js.map
|
package/bin/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACnC,OAAO,QAAQ,MAAM,UAAU,CAAC;AAChC,OAAO,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACnC,OAAO,QAAQ,MAAM,UAAU,CAAC;AAChC,OAAO,YAAY,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAC9D,OAAO,EAAE,oBAAoB,EAAE,MAAM,kDAAkD,CAAC;AACxF,OAAO,EAAE,uBAAuB,EAAE,MAAM,qDAAqD,CAAC;AAC9F,OAAO,EAAE,iBAAiB,EAAE,MAAM,+CAA+C,CAAC;AAClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,kDAAkD,CAAC;AACxF,OAAO,EAAE,gBAAgB,EAAE,MAAM,8CAA8C,CAAC;AAChF,OAAO,EAAE,mBAAmB,EAAE,MAAM,iDAAiD,CAAC;AACtF,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,MAAM,+BAA+B,CAAC;AAC5E,OAAO,EAAE,YAAY,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAC;AAElE,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE;IAC/C,OAAO,EAAE,CAAC,WAAW,CAAC;IACtB,OAAO,EAAE;QACP,SAAS,EAAE,KAAK;KACjB;CACF,CAAC,CAAC;AAEH,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE;IACvB,OAAO,CAAC,KAAK,CACX,kHAAkH,CACnH,CAAC;IACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,mBAAmB;CACrC;AAED,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE1B,IAAI,OAAO,KAAK,MAAM,EAAE;IACtB,QAAQ;SACL,MAAM,CAAC;QACN;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE,uCAAuC;SACjD;QACD;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,OAAO;YACb,OAAO,EAAE,yCAAyC;YAClD,OAAO,EAAE,SAAS;SACnB;QACD;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,SAAS;YACf,OAAO,EAAE,gDAAgD;YACzD,OAAO,EAAE,aAAa;SACvB;QACD;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,2DAA2D;YACpE,OAAO,EAAE,GAAG;SACb;QACD;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,OAAO;YACb,OAAO,EAAE,sCAAsC;YAC/C,OAAO,EAAE,cAAc;SACxB;KACF,CAAC;SACD,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE;QAChB,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC;QACxD,aAAa,CACX,WAAW,EACX,IAAI,CAAC,SAAS,CACZ,UAAU,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,EAC3D,IAAI,EACJ,IAAI,CACL,CACF,CAAC;QACF,UAAU,CAAC,KAAK,CAAC,CAAC;QAClB,OAAO,
CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;QAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC;SACD,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;QACf,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,CAAC;CACN;AAED,IAAI,OAAO,KAAK,MAAM,EAAE;IACtB,QAAQ;SACL,MAAM,CAAC;QACN;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,OAAO;YACb,OAAO,EAAE,oCAAoC;SAC9C;QACD;YACE,IAAI,EAAE,OAAO;YACb,IAAI,EAAE,aAAa;YACnB,OAAO,EAAE,0BAA0B;YACnC,OAAO,EAAE,EAAE;SACZ;KACF,CAAC;SACD,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE;QAChB,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC;QAEvC,UAAU,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;QAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC;SACD,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;QACf,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,CAAC;CACN;AAED,SAAS,iBAAiB,CAAC,SAAS;IAClC,IAAI,CAAC,SAAS,EAAE;QACd,OAAO,CAAC,GAAG,CACT,kEAAkE,CACnE,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;QAClC,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;KACxB;AACH,CAAC;AAED,KAAK,UAAU,uBAAuB;IACpC,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;IACtB,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;IAExC,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAE7B,OAAO,CAAC,GAAG,CAAC,sBAAsB,GAAG,SAAS,CAAC,CAAC;IAEhD,IAAI,SAAS,KAAK,UAAU,CAAC,UAAU,EAAE;QACvC,MAAM,iBAAiB,CAAC,oBAAoB,CAAC,CAAC;KAC/C;SAAM,IAAI,SAAS,KAAK,UAAU,CAAC,OAAO,EAAE;QAC3C,MAAM,iBAAiB,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC;KAClD;SAAM,IAAI,SAAS,KAAK,UAAU,CAAC,MAAM,EAAE;QAC1C,MAAM,iBAAiB,CAAC,gBAAgB,CAAC,CAAC;KAC3C;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,UAAU,EAAE,SAAS,GAAG,KAAK;IAC5D,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,IAAI,IAAI,CAAC;IACT,IAAI,SAAS,EAAE;QACb,IAAI,GAAG,MAAM,oBAAoB,CAAC,IAAI,CAAC,CAAC;KACzC;SAAM;QACL,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAC
1C,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;KACjB;IACD,MAAM,UAAU,CAAC,UAAU,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,IAAI,OAAO,KAAK,aAAa,EAAE;IAC7B,MAAM,uBAAuB,EAAE,CAAC;CACjC;AAED,IAAI,OAAO,KAAK,wBAAwB,EAAE;IACxC,MAAM,iBAAiB,CAAC,oBAAoB,CAAC,CAAC;CAC/C;AAED,IAAI,OAAO,KAAK,qBAAqB,EAAE;IACrC,MAAM,iBAAiB,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC;CAClD;AAED,IAAI,OAAO,KAAK,oBAAoB,EAAE;IACpC,MAAM,iBAAiB,CAAC,gBAAgB,CAAC,CAAC;CAC3C;AAED,KAAK,UAAU,0BAA0B;IACvC,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;IACtB,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;IAExC,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAE7B,OAAO,CAAC,GAAG,CAAC,sBAAsB,GAAG,SAAS,CAAC,CAAC;IAEhD,IAAI,SAAS,KAAK,UAAU,CAAC,UAAU,EAAE;QACvC,MAAM,yBAAyB,CAAC,uBAAuB,CAAC,CAAC;KAC1D;SAAM,IAAI,SAAS,KAAK,UAAU,CAAC,OAAO,EAAE;QAC3C,MAAM,2BAA2B,CAAC,oBAAoB,CAAC,CAAC;KACzD;SAAM,IAAI,SAAS,KAAK,UAAU,CAAC,MAAM,EAAE;QAC1C,MAAM,yBAAyB,CAAC,mBAAmB,CAAC,CAAC;KACtD;AACH,CAAC;AAED,KAAK,UAAU,yBAAyB,CAAC,UAAe;IACtD,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;IACtB,MAAM,aAAa,CAAC,UAAU,EAAE,IAAI,EAAE,SAAS,CAAC,IAAI,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IACvE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,KAAK,UAAU,2BAA2B,CAAC,UAAe;IACxD,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEvB,MAAM,OAAO,GAAG,MAAM,YAAY,EAAE,CAAC;IACrC,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QACpB,SAAS,EAAE,cAAc;KAC1B,CAAC,CAAC;IAEH,IAAI,eAAe,GAAG,EAAE,CAAC;IACzB,IAAI,eAAe,GAAG,CAAC,MAAM,CAAC,CAAC;IAE/B,0CAA0C;IAC1C,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE;QACvE,eAAe,GAAG,eAAe,CAAC;QAClC,eAAe,GAAG,MAAM,IAAI,CAAC,QAAQ,CACnC,CAAC,eAAe,EAAE,EAAE;YAClB,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CACtB,QAAQ,CAAC,gBAAgB,CA
AC,yBAAyB,CAAC,CACrD,CAAC;YAEF,MAAM,UAAU,GAAG,EAAE,CAAC;YACtB,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,IAAiB,EAAE,EAAE;gBACxC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,CAAC;gBACjD,MAAM,IAAI,GAAG,OAAO,CAAC,aAAa,CAAC,aAAa,CAAC;gBAEjD,wCAAwC;gBACxC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE;oBAC9B,OAAO;iBACR;gBAED,MAAM,IAAI,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBAEvC,mDAAmD;gBACnD,IAAI,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;oBAC7D,OAAO;iBACR;gBAED,yCAAyC;gBACzC,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;oBACnC,OAAO,CAAC,KAAK,EAAE,CAAC;iBACjB;gBAED,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,CAAC,CAAC,CAAC;YAEH,OAAO,UAAU,CAAC;QACpB,CAAC,EACD,eAAe,CAAC,sCAAsC;SACvD,CAAC;KACH;IAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;IAClC,OAAO,CAAC,KAAK,EAAE,CAAC;IAChB,MAAM,aAAa,CAAC,UAAU,EAAE,IAAI,EAAE,SAAS,CAAC,IAAI,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IACvE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,IAAI,OAAO,KAAK,gBAAgB,EAAE;IAChC,MAAM,0BAA0B,EAAE,CAAC;CACpC;AAED,IAAI,OAAO,KAAK,2BAA2B,EAAE;IAC3C,MAAM,yBAAyB,CAAC,uBAAuB,CAAC,CAAC;CAC1D;AAED,IAAI,OAAO,KAAK,wBAAwB,EAAE;IACxC,MAAM,2BAA2B,CAAC,oBAAoB,CAAC,CAAC;CACzD;AAED,IAAI,OAAO,KAAK,uBAAuB,EAAE;IACvC,MAAM,yBAAyB,CAAC,mBAAmB,CAAC,CAAC;CACtD"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
/**
 * Documentation frameworks the scrapers know how to handle. Each key maps to
 * a string equal to its own name.
 */
export const Frameworks = {
  DOCUSAURUS: "DOCUSAURUS",
  GITBOOK: "GITBOOK",
  README: "README",
};
|
|
8
|
+
/**
 * Guesses which documentation framework produced a page.
 *
 * Checks, in order: a `generator` meta tag whose content mentions
 * "Docusaurus", an element with class `gitbook-root`, and a `readme-deploy`
 * meta tag.
 *
 * @param {string} html - HTML source of the page.
 * @returns {string | undefined} One of the `Frameworks` values, or `undefined`
 *   when none of the markers is present.
 */
export function detectFramework(html) {
  const $ = cheerio.load(html);

  // A generator tag without a `content` attribute yields undefined here;
  // fall back to "" so the check below cannot throw.
  const generator = $('meta[name="generator"]').attr("content") ?? "";
  if (generator.includes("Docusaurus")) {
    return Frameworks.DOCUSAURUS;
  }

  if ($(".gitbook-root").length > 0) {
    return Frameworks.GITBOOK;
  }

  if ($('meta[name="readme-deploy"]').length > 0) {
    return Frameworks.README;
  }

  return undefined;
}
|
|
25
|
+
//# sourceMappingURL=detectFramework.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detectFramework.js","sourceRoot":"","sources":["../../src/scraping/detectFramework.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAE9B,MAAM,CAAN,IAAY,UAIX;AAJD,WAAY,UAAU;IACpB,uCAAyB,CAAA;IACzB,iCAAmB,CAAA;IACnB,+BAAiB,CAAA;AACnB,CAAC,EAJW,UAAU,GAAV,UAAU,KAAV,UAAU,QAIrB;AAED,MAAM,UAAU,eAAe,CAAC,IAAI;IAClC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,cAAc,GAAG,CAAC,CAAC,wBAAwB,CAAC,CAAC;IAEnD,IACE,cAAc,CAAC,MAAM,GAAG,CAAC;QACzB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EACrD;QACA,OAAO,UAAU,CAAC,UAAU,CAAC;KAC9B;IAED,MAAM,SAAS,GAAG,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;IAChD,IAAI,SAAS,EAAE;QACb,OAAO,UAAU,CAAC,OAAO,CAAC;KAC3B;IAED,MAAM,QAAQ,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;IAC5D,IAAI,QAAQ,EAAE;QACZ,OAAO,UAAU,CAAC,MAAM,CAAC;KAC1B;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import downloadImage from "../downloadImage.js";
|
|
3
|
+
// To Do: Use CheerioElement instead of any when we bump the cheerio version
|
|
4
|
+
/**
 * Downloads every image referenced inside `content` into `baseDir`.
 *
 * @param {Function} $ - Cheerio instance used to wrap the matched elements.
 * @param {object} content - Cheerio selection to search for `img[src]` elements.
 * @param {string} origin - Origin used to resolve relative image sources.
 * @param {string} baseDir - Folder the images are written to; when falsy,
 *   downloading is skipped and `undefined` is returned.
 * @param {Function} [modifyFileName] - Optional hook that receives the derived
 *   file name and returns the name to use instead.
 * @returns {Promise<Object<string, string> | undefined>} Map from the original
 *   `src` value to the local path of the image. Only images that are on disk
 *   (freshly downloaded or already existing) are included.
 */
export default async function downloadAllImages($, content, origin, baseDir, modifyFileName) {
  if (!baseDir) {
    console.debug("Skipping image downloading");
    return;
  }

  // We remove duplicates because some frameworks duplicate img tags
  // to show the image larger when clicked on.
  const imageSrcs = [
    ...new Set(
      content
        .find("img[src]")
        .map((i, image) => $(image).attr("src"))
        .toArray()
    ),
  ];

  // Wait for all images to download before continuing
  const origToNewArray = await Promise.all(
    imageSrcs.map(async (origImageSrc) => {
      // Add origin if the image tags are using relative sources
      const imageHref = origImageSrc.startsWith("http")
        ? origImageSrc
        : new URL(origImageSrc, origin).href;

      // Strip the query/fragment BEFORE taking the basename: a "/" inside a
      // query string would otherwise be mistaken for a path separator.
      let fileName = path.basename(removeMetadataFromExtension(imageHref));
      if (modifyFileName) {
        fileName = modifyFileName(fileName);
      }

      if (!fileName) {
        console.error("Invalid image path " + imageHref);
        return;
      }

      const writePath = path.join(baseDir, fileName);

      try {
        await downloadImage(imageHref, writePath);
        console.log("🖼️ - " + writePath);
      } catch (e) {
        if (e.code !== "EEXIST") {
          console.error(e);
          // The file is not on disk, so do not map the src to it: the
          // caller would rewrite the link to a path that does not exist.
          return;
        }
        console.log(`❌ Skipping existing image ${writePath}`);
      }

      return { [origImageSrc]: writePath };
    })
  );

  // Entries for skipped images are undefined, which Object.assign ignores.
  return origToNewArray.reduce((result, current) => Object.assign(result, current), {});
}

/**
 * Drops everything from the first "?" or "#" onwards.
 * Some frameworks add metadata after the file extension, we need to remove that.
 */
function removeMetadataFromExtension(src) {
  // Part of the URL standard
  const metadataSymbols = ["?", "#"];
  return metadataSymbols.reduce((trimmed, dividerSymbol) => trimmed.split(dividerSymbol)[0], src);
}
|
|
57
|
+
//# sourceMappingURL=downloadAllImages.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"downloadAllImages.js","sourceRoot":"","sources":["../../src/scraping/downloadAllImages.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,aAAa,MAAM,qBAAqB,CAAC;AAEhD,4EAA4E;AAC5E,MAAM,CAAC,OAAO,CAAC,KAAK,UAAU,iBAAiB,CAC7C,CAAM,EACN,OAAY,EACZ,MAAc,EACd,OAAe,EACf,cAAoB;IAEpB,IAAI,CAAC,OAAO,EAAE;QACZ,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAC5C,OAAO;KACR;IAED,kEAAkE;IAClE,4CAA4C;IAC5C,MAAM,SAAS,GAAG;QAChB,GAAG,IAAI,GAAG,CACR,OAAO;aACJ,IAAI,CAAC,UAAU,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;aACvC,OAAO,EAAE,CACb;KACF,CAAC;IAEF,mDAAmD;IACnD,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,GAAG,CACtC,SAAS,CAAC,GAAG,CAAC,KAAK,EAAE,YAAoB,EAAE,EAAE;QAC3C,0DAA0D;QAC1D,MAAM,SAAS,GAAG,YAAY,CAAC,UAAU,CAAC,MAAM,CAAC;YAC/C,CAAC,CAAC,YAAY;YACd,CAAC,CAAC,IAAI,GAAG,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC;QAEvC,IAAI,QAAQ,GAAG,2BAA2B,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;QACrE,IAAI,cAAc,EAAE;YAClB,QAAQ,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;SACrC;QAED,IAAI,CAAC,QAAQ,EAAE;YACb,OAAO,CAAC,KAAK,CAAC,qBAAqB,GAAG,SAAS,CAAC,CAAC;YACjD,OAAO;SACR;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;QAE/C,MAAM,aAAa,CAAC,SAAS,EAAE,SAAS,CAAC;aACtC,IAAI,CAAC,GAAG,EAAE;YACT,OAAO,CAAC,GAAG,CAAC,QAAQ,GAAG,SAAS,CAAC,CAAC;QACpC,CAAC,CAAC;aACD,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;YACX,IAAI,CAAC,CAAC,IAAI,KAAK,QAAQ,EAAE;gBACvB,OAAO,CAAC,GAAG,CAAC,6BAA6B,SAAS,EAAE,CAAC,CAAC;aACvD;iBAAM;gBACL,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;aAClB;QACH,CAAC,CAAC,CAAC;QAEL,OAAO,EAAE,CAAC,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC;IACvC,CAAC,CAAC,CACH,CAAC;IAEF,OAAO,cAAc,CAAC,MAAM,CAC1B,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EACnD,EAAE,CACH,CAAC;AACJ,CAAC;AAED,SAAS,2BAA2B,CAAC,GAAW;IAC9C,2BAA2B;IAC3B,MAAM,eAAe,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAEnC,eAAe,CAAC,OAAO,CAAC,CAAC,aAAa,EAAE,EAAE;QACxC,iFAAiF;QACjF,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;IACH,OAAO,GAAG,CAAC;AACb,CAAC"
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import axios from "axios";
|
|
2
|
+
/**
 * Extracts the links under the sitemap's own hostname from a sitemap.
 *
 * @param {URL} url - Location of the sitemap; its hostname filters the links.
 * @returns {Promise<string[]>} Every `http(s)://<hostname>…` URL found in front
 *   of a `</loc>` tag, or an empty array when nothing matches or the request fails.
 */
export const getSitemapLinks = async (url) => {
  // Escape every regex metacharacter in the hostname (not just the first
  // "."), otherwise "docs.example.com" would also match "docs.exampleXcom".
  const hostname = url.hostname.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(`https?://${hostname}.+?(?=</loc>)`, "gmi");
  try {
    const indexData = (await axios.default.get(url.href)).data;
    const array = indexData.match(regex);
    return array || [];
  } catch (err) {
    console.error(err);
    console.log("Skipping sitemap links because we encountered an error.");
    return [];
  }
};
|
|
16
|
+
//# sourceMappingURL=getSitemapLinks.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"getSitemapLinks.js","sourceRoot":"","sources":["../../src/scraping/getSitemapLinks.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAE1B,MAAM,CAAC,MAAM,eAAe,GAAG,KAAK,EAAE,GAAQ,EAAE,EAAE;IAChD,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAClD,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,cAAc,QAAQ,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAExE,IAAI;QACF,MAAM,SAAS,GAAG,CAAC,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,IAAc,CAAC;QACrE,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,CAAoB,CAAC;QACxD,OAAO,KAAK,IAAI,EAAE,CAAC;KACpB;IAAC,OAAO,GAAG,EAAE;QACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;QACvE,OAAO,EAAE,CAAC;KACX;AACH,CAAC,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Rewrites image references in `markdown` to point at the downloaded files.
 *
 * @param {Object<string, string> | null | undefined} origToWritePath - Map from
 *   the original image `src` to the path the image was written to.
 * @param {string} cliDir - Directory the CLI runs in; write paths that start
 *   with it are inserted with that prefix removed.
 * @param {string} markdown - Markdown to rewrite.
 * @returns {string} The markdown with every mapped source replaced.
 */
export default function replaceImagePaths(origToWritePath, cliDir, markdown) {
  if (origToWritePath == null) {
    return markdown;
  }

  const replacements = new Map();
  for (const [origHref, writePath] of Object.entries(origToWritePath)) {
    // An empty search string would match between every character.
    if (origHref === "") {
      continue;
    }
    // Use relative paths within the folder we are in
    replacements.set(
      origHref,
      writePath.startsWith(cliDir) ? writePath.slice(cliDir.length) : writePath
    );
  }
  if (replacements.size === 0) {
    return markdown;
  }

  // Replace everything in ONE pass so text inserted for one image is never
  // rewritten again for another. Longer sources come first so a source that
  // is a prefix of another ("a.png" vs "a.png?v=2") cannot shadow it.
  const alternatives = [...replacements.keys()]
    .sort((a, b) => b.length - a.length)
    .map((source) => source.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  const pattern = new RegExp(alternatives.join("|"), "g");

  // A replacer function inserts the path verbatim; a replacement string
  // would interpret "$&", "$1", ... inside file names.
  return markdown.replace(pattern, (match) => replacements.get(match));
}
|
|
17
|
+
//# sourceMappingURL=replaceImagePaths.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"replaceImagePaths.js","sourceRoot":"","sources":["../../src/scraping/replaceImagePaths.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,OAAO,UAAU,iBAAiB,CACvC,eAAuB,EACvB,MAAc,EACd,QAAgB;IAEhB,IAAI,eAAe,IAAI,IAAI,EAAE;QAC3B,OAAO,QAAQ,CAAC;KACjB;IAED,qDAAqD;IACrD,KAAK,MAAM,CAAC,QAAQ,EAAE,SAAS,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,EAAE;QACnE,iDAAiD;QACjD,IAAI,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE;YAChC,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;SAC1E;aAAM;YACL,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;SACrD;KACF;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import axios from "axios";
|
|
3
|
+
import { getHtmlWithPuppeteer } from "../browser.js";
|
|
4
|
+
import { createPage } from "../util.js";
|
|
5
|
+
/**
 * Scrapes a single page identified by `pathname`, writes it through
 * `createPage`, and returns the path the page should be referenced by.
 *
 * @param {string} cliDir - Directory the page and its images are written under.
 * @param {string} origin - Origin used to resolve `pathname` into a full URL.
 * @param {string} pathname - Path of the page to scrape. Absolute http(s) URLs
 *   are treated as external links and returned untouched.
 * @param {boolean} overwrite - Forwarded to `createPage`.
 * @param {Function} scrapePageFunc - Called as
 *   `(html, origin, cliDir, imageBaseDir)`; must resolve to an object with
 *   `title`, `description` and `markdown`.
 * @param {boolean} [puppeteer=false] - When true the HTML is fetched with
 *   `getHtmlWithPuppeteer`, otherwise with axios.
 * @param {string} [baseToRemove] - Leading directory to strip from the folder
 *   part of `pathname` (only removed on a path-segment boundary).
 * @returns {Promise<string | undefined>} The external link unchanged, the
 *   folder-relative page path without a leading slash, or `undefined` when the
 *   scraped page had neither a title nor markdown.
 */
export async function scrapeGettingFileNameFromUrl(cliDir, origin, pathname, overwrite, scrapePageFunc, puppeteer = false, baseToRemove) {
  // Skip scraping external links
  if (pathname.startsWith("https://") || pathname.startsWith("http://")) {
    return pathname;
  }

  // Removes file name from the end
  const splitSubpath = pathname.split("/");
  let folders = splitSubpath.slice(0, -1).join("/");

  // Remove base dir if passed in. Only strip it when it ends on a path
  // boundary so "/docs" is not removed from "/docsify/guide".
  if (baseToRemove && folders.startsWith(baseToRemove)) {
    const remainder = folders.slice(baseToRemove.length);
    if (remainder === "" || remainder.startsWith("/") || baseToRemove.endsWith("/")) {
      folders = remainder;
    }
  }

  // TODO: Improve this by putting each page's images in a separate
  // folder named after the title of the page.
  const imageBaseDir = path.join(cliDir, "images", folders);

  // Scrape each page separately
  const href = new URL(pathname, origin).href;
  let html;
  if (puppeteer) {
    html = await getHtmlWithPuppeteer(href);
  } else {
    const res = await axios.default.get(href);
    html = res.data;
  }

  const { title, description, markdown } = await scrapePageFunc(html, origin, cliDir, imageBaseDir);

  // Check if page didn't have content
  if (!title && !markdown) {
    return undefined;
  }

  const newFileLocation = folders ? path.join(cliDir, folders) : cliDir;

  // Fall back to "introduction" when the pathname ends with a slash and
  // therefore has no final segment to name the file after.
  const fileName = splitSubpath[splitSubpath.length - 1] || "introduction";

  // Will create subfolders as needed
  createPage(title, description, markdown, overwrite, newFileLocation, fileName);

  if (!folders) {
    return fileName;
  }

  // Removes the first slash if there is one, Mintlify doesn't need it.
  // Relative folders have no leading separator and must be left intact.
  const pagePath = path.join(folders, fileName);
  const hasLeadingSeparator = pagePath.startsWith("/") || pagePath.startsWith(path.sep);
  return hasLeadingSeparator ? pagePath.slice(1) : pagePath;
}
|
|
43
|
+
//# sourceMappingURL=scrapeGettingFileNameFromUrl.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeGettingFileNameFromUrl.js","sourceRoot":"","sources":["../../src/scraping/scrapeGettingFileNameFromUrl.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,oBAAoB,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,CAAC,KAAK,UAAU,4BAA4B,CAChD,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,SAAkB,EAClB,cASE,EACF,SAAS,GAAG,KAAK,EACjB,YAAqB;IAErB,+BAA+B;IAC/B,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE;QACrE,OAAO,QAAQ,CAAC;KACjB;IAED,iCAAiC;IACjC,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IACzC,IAAI,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAEvE,+BAA+B;IAC/B,IAAI,YAAY,IAAI,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE;QACpD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;KAC7C;IAED,kEAAkE;IAClE,4CAA4C;IAC5C,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE1D,8BAA8B;IAC9B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC;IAC5C,IAAI,IAAY,CAAC;IACjB,IAAI,SAAS,EAAE;QACb,IAAI,GAAG,MAAM,oBAAoB,CAAC,IAAI,CAAC,CAAC;KACzC;SAAM;QACL,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAC1C,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;KACjB;IAED,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,MAAM,cAAc,CAC3D,IAAI,EACJ,MAAM,EACN,MAAM,EACN,YAAY,CACb,CAAC;IAEF,oCAAoC;IACpC,IAAI,CAAC,KAAK,IAAI,CAAC,QAAQ,EAAE;QACvB,OAAO,SAAS,CAAC;KAClB;IAED,MAAM,eAAe,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAEtE,2DAA2D;IAC3D,MAAM,QAAQ,GAAG,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,IAAI,cAAc,CAAC;IAEzE,mCAAmC;IACnC,UAAU,CACR,KAAK,EACL,WAAW,EACX,QAAQ,EACR,SAAS,EACT,eAAe,EACf,QAAQ,CACT,CAAC;IAEF,sEAAsE;IACtE,OAAO,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;AACxE,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import { createPage, getOrigin } from "../util.js";
|
|
3
|
+
/**
 * Scrapes a single page and writes it into the current working directory.
 *
 * @param {Function} scrapeFunc - Called as
 *   `(html, origin, cliDir, imageBaseDir)`; must resolve to an object with
 *   `title`, `description` and `markdown`.
 * @param {string} href - URL of the page; its origin is derived via `getOrigin`.
 * @param {string} html - HTML of the page to scrape.
 * @param {boolean} overwrite - Forwarded to `createPage`.
 * @returns {Promise<void>}
 */
export async function scrapePage(scrapeFunc, href, html, overwrite) {
  const siteOrigin = getOrigin(href);

  // Images are stored in an "images" folder next to the generated page.
  const imagesDir = path.join(process.cwd(), "images");

  const scraped = await scrapeFunc(html, siteOrigin, process.cwd(), imagesDir);
  const { title, description, markdown } = scraped;

  createPage(title, description, markdown, overwrite, process.cwd());
}
|
|
9
|
+
//# sourceMappingURL=scrapePage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapePage.js","sourceRoot":"","sources":["../../src/scraping/scrapePage.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEnD,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,UAKiB,EACjB,IAAY,EACZ,IAAY,EACZ,SAAkB;IAElB,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC/B,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,QAAQ,CAAC,CAAC;IACxD,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,MAAM,UAAU,CACvD,IAAI,EACJ,MAAM,EACN,OAAO,CAAC,GAAG,EAAE,EACb,YAAY,CACb,CAAC;IACF,UAAU,CAAC,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,SAAS,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;AACrE,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { objToReadableString } from "../util.js";
|
|
2
|
+
/**
 * Runs a section scraper against the current working directory and prints
 * the resulting navigation groups for the user to copy into mint.json.
 *
 * @param {Function} scrapeFunc - Called as `(html, origin, cliDir, overwrite)`;
 *   its resolved value is printed through `objToReadableString`.
 * @param {string} html - HTML of the page containing the navigation.
 * @param {string} origin - Origin of the site being scraped.
 * @param {boolean} overwrite - Forwarded to `scrapeFunc`; also noted in the
 *   start message.
 * @returns {Promise<void>}
 */
export async function scrapeSection(scrapeFunc, html, origin, overwrite) {
  const overwriteNote = overwrite ? ", overwrite mode is on" : "";
  console.log(`Started scraping${overwriteNote}...`);

  const navigationGroups = await scrapeFunc(html, origin, process.cwd(), overwrite);

  console.log("Finished scraping.");
  console.log("Add the following to your navigation in mint.json:");
  console.log(objToReadableString(navigationGroups));
}
|
|
9
|
+
//# sourceMappingURL=scrapeSection.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeSection.js","sourceRoot":"","sources":["../../src/scraping/scrapeSection.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,YAAY,CAAC;AAEjD,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,UAAe,EACf,IAAY,EACZ,MAAc,EACd,SAAkB;IAElB,OAAO,CAAC,GAAG,CACT,mBAAmB,SAAS,CAAC,CAAC,CAAC,wBAAwB,CAAC,CAAC,CAAC,EAAE,KAAK,CAClE,CAAC;IACF,MAAM,YAAY,GAAG,MAAM,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CAAC,CAAC;IAC9E,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;IAClC,OAAO,CAAC,GAAG,CAAC,oDAAoD,CAAC,CAAC;IAClE,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,YAAY,CAAC,CAAC,CAAC;AACjD,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
import { NodeHtmlMarkdown } from "node-html-markdown";
|
|
3
|
+
import downloadAllImages from "../downloadAllImages.js";
|
|
4
|
+
import replaceImagePaths from "../replaceImagePaths.js";
|
|
5
|
+
/**
 * Converts the main content of a Docusaurus page into markdown.
 *
 * @param {string} html - HTML of the page.
 * @param {string} origin - Origin of the site, forwarded to `downloadAllImages`.
 * @param {string} cliDir - Forwarded to `replaceImagePaths` as the directory
 *   prefix to strip from downloaded image paths.
 * @param {string} imageBaseDir - Forwarded to `downloadAllImages`.
 * @returns {Promise<{title?: string, description?: null, markdown?: string}>}
 *   An empty object when the page has no `.theme-doc-markdown` element,
 *   otherwise the title, a null description and the cleaned-up markdown.
 */
export async function scrapeDocusaurusPage(html, origin, cliDir, imageBaseDir) {
  const $ = cheerio.load(html);
  const content = $(".theme-doc-markdown").first();

  // Index pages with no additional text don't have the markdown class
  if (content.length === 0) {
    return {};
  }

  const titleComponent = content.find("h1");
  const title = titleComponent.text().trim();

  // Do not include title in the content when we insert it in our metadata
  titleComponent.remove();

  const origToWritePath = await downloadAllImages($, content, origin, imageBaseDir);

  const contentHtml = content.html();
  const nhm = new NodeHtmlMarkdown();
  let markdown = nhm.translate(contentHtml);

  // Description only exists in meta tags (og:description). It is deliberately
  // not read: when it is not set in the metadata Docusaurus defaults it to the
  // first line of text, and detecting that case is prone to false positives
  // when the first line contains markdown annotations like `.
  const description = null;

  // Remove Docusaurus links from headers
  // When we parse their HTML the parser adds things like:
  // [](#setup "Direct link to heading")
  // to the end of each header.
  markdown = markdown.replace(/\[\]\(#.+ ".+"\)\n/g, "\n");

  // Remove unnecessary zero-width space characters
  markdown = markdown.replace(/\u200b/g, "");

  // Reduce unnecessary blank lines. Match the whole run of newlines so that
  // runs longer than three also collapse to a single blank line.
  markdown = markdown.replace(/\n{3,}/g, "\n\n");

  // Mintlify doesn't support bolded headers, remove the asterisks
  markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");

  markdown = replaceImagePaths(origToWritePath, cliDir, markdown);

  return { title, description, markdown };
}
|
|
43
|
+
//# sourceMappingURL=scrapeDocusaurusPage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeDocusaurusPage.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeDocusaurusPage.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AACxD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AAExD,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,IAAY,EACZ,MAAc,EACd,MAAc,EACd,YAAoB;IAEpB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC;IAEjD,oEAAoE;IACpE,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE;QACxB,OAAO,EAAE,CAAC;KACX;IAED,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,KAAK,GAAG,cAAc,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAE3C,wEAAwE;IACxE,cAAc,CAAC,MAAM,EAAE,CAAC;IAExB,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAC7C,CAAC,EACD,OAAO,EACP,MAAM,EACN,YAAY,CACb,CAAC;IAEF,MAAM,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAEnC,MAAM,GAAG,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACnC,IAAI,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAE1C,mGAAmG;IACnG,qFAAqF;IACrF,yFAAyF;IACzF,kFAAkF;IAClF,MAAM,WAAW,GAAG,IAAI,CAAC;IACzB,0EAA0E;IAC1E,0CAA0C;IAC1C,wBAAwB;IACxB,IAAI;IAEJ,uCAAuC;IACvC,wDAAwD;IACxD,sCAAsC;IACtC,6BAA6B;IAC7B,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,qBAAqB,EAAE,IAAI,CAAC,CAAC;IAEzD,qDAAqD;IACrD,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAE3C,iCAAiC;IACjC,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,gEAAgE;IAChE,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,wBAAwB,EAAE,SAAS,CAAC,CAAC;IAEjE,QAAQ,GAAG,iBAAiB,CAAC,eAAe,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IAEhE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,CAAC;AAC1C,CAAC"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
|
|
3
|
+
import { scrapeDocusaurusPage } from "./scrapeDocusaurusPage.js";
|
|
4
|
+
/**
 * Reads the Docusaurus sidebar, scrapes every page it links to and returns
 * the navigation groups in the same structure as mint.json.
 *
 * @param {string} html - HTML of a page containing `.theme-doc-sidebar-menu`.
 * @param {string} origin - Origin used to resolve the sidebar links.
 * @param {string} cliDir - Directory the scraped pages are written under.
 * @param {boolean} overwrite - Forwarded to the page scraper.
 * @returns {Promise<Array<{group: string, pages: string[]}>>} One entry per
 *   top-level sidebar item; `pages` holds the values returned by
 *   `scrapeGettingFileNameFromUrl`, with skipped pages removed.
 */
export async function scrapeDocusaurusSection(html, origin, cliDir, overwrite) {
  const $ = cheerio.load(html);

  // "#" is a placeholder link and a missing href cannot be scraped at all;
  // passing either on would fail or produce a bogus page.
  const isScrapableHref = (href) => href != null && href !== "#";

  // Get all the navigation sections
  const navigationSections = $(".theme-doc-sidebar-menu").first().children();

  // Get all links per group
  const groupsConfig = navigationSections
    .map((_sectionIndex, section) => {
      const sectionComponent = $(section);

      // Links without a group
      if (sectionComponent.hasClass("theme-doc-sidebar-item-link")) {
        const linkHref = sectionComponent.find("a[href]").first().attr("href");
        return {
          group: "",
          pages: [linkHref].filter(isScrapableHref),
        };
      }

      const sectionTitle = sectionComponent
        .find(".menu__list-item-collapsible")
        .first()
        .text();

      // The category title can be a page too so we find from the
      // section component instead of the more specific menu__list child
      const linkPaths = sectionComponent
        .find("a[href]")
        .map((_linkIndex, link) => $(link).attr("href"))
        .toArray()
        .filter(isScrapableHref);

      // Follows the same structure as mint.json
      return {
        group: sectionTitle,
        pages: linkPaths,
      };
    })
    .toArray();

  // Scrape each link in the navigation.
  return Promise.all(
    groupsConfig.map(async (groupConfig) => {
      const scrapedPages = await Promise.all(
        groupConfig.pages.map((pathname) =>
          // Docusaurus requires a directory on all sections whereas we use root.
          // /docs is their default directory so we remove it
          scrapeGettingFileNameFromUrl(cliDir, origin, pathname, overwrite, scrapeDocusaurusPage, false, "/docs")
        )
      );

      // Remove skipped index pages (they return undefined from the above function)
      return { ...groupConfig, pages: scrapedPages.filter(Boolean) };
    })
  );
}
|
|
52
|
+
//# sourceMappingURL=scrapeDocusaurusSection.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeDocusaurusSection.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeDocusaurusSection.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,4BAA4B,EAAE,MAAM,oCAAoC,CAAC;AAClF,OAAO,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC;AAEjE,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC3C,IAAY,EACZ,MAAc,EACd,MAAc,EACd,SAAkB;IAElB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,KAAK,EAAE,CAAC,QAAQ,EAAE,CAAC;IAE3E,0BAA0B;IAC1B,MAAM,YAAY,GAAG,kBAAkB;SACpC,GAAG,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAClB,MAAM,gBAAgB,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;QAEpC,wBAAwB;QACxB,IAAI,gBAAgB,CAAC,QAAQ,CAAC,6BAA6B,CAAC,EAAE;YAC5D,MAAM,QAAQ,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvE,OAAO;gBACL,KAAK,EAAE,EAAE;gBACT,KAAK,EAAE,CAAC,QAAQ,CAAC;aAClB,CAAC;SACH;QAED,MAAM,YAAY,GAAG,gBAAgB;aAClC,IAAI,CAAC,8BAA8B,CAAC;aACpC,KAAK,EAAE;aACP,IAAI,EAAE,CAAC;QAEV,2DAA2D;QAC3D,kEAAkE;QAClE,MAAM,SAAS,GAAG,gBAAgB;aAC/B,IAAI,CAAC,SAAS,CAAC;aACf,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YACf,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAC9B,CAAC,CAAC;aACD,MAAM,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,GAAG,CAAC;aACjC,OAAO,EAAE,CAAC;QAEb,0CAA0C;QAC1C,OAAO;YACL,KAAK,EAAE,YAAY;YACnB,KAAK,EAAE,SAAS;SACjB,CAAC;IACJ,CAAC,CAAC;SACD,OAAO,EAAE,CAAC;IAEb,sCAAsC;IACtC,MAAM,sBAAsB,GAAG,MAAM,OAAO,CAAC,GAAG,CAC9C,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,WAAW,EAAE,EAAE;QACrC,WAAW,CAAC,KAAK,GAAG,CAClB,MAAM,OAAO,CAAC,GAAG,CACf,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,QAAgB,EAAE,EAAE;QAC/C,sEAAsE;QACtE,mDAAmD;QACnD,4BAA4B,CAC1B,MAAM,EACN,MAAM,EACN,QAAQ,EACR,SAAS,EACT,oBAAoB,EACpB,KAAK,EACL,OAAO,CACR,CACF,CACF,CACF;YACC,6EAA6E;aAC5E,MAAM,CAAC,OAAO,CAAC,CAAC;QACnB,OAAO,WAAW,CAAC;IACrB,CAAC,CAAC,CACH,CAAC;IAEF,OAAO,sBAAsB,CAAC;AAChC,CAAC"}
|