mintlify 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -0
- package/bin/browser.js +24 -0
- package/bin/browser.js.map +1 -0
- package/bin/downloadImage.js +27 -0
- package/bin/downloadImage.js.map +1 -0
- package/bin/index.js +144 -22
- package/bin/index.js.map +1 -1
- package/bin/scraping/detectFramework.js +25 -0
- package/bin/scraping/detectFramework.js.map +1 -0
- package/bin/scraping/downloadAllImages.js +57 -0
- package/bin/scraping/downloadAllImages.js.map +1 -0
- package/bin/scraping/getSitemapLinks.js +16 -0
- package/bin/scraping/getSitemapLinks.js.map +1 -0
- package/bin/scraping/replaceImagePaths.js +17 -0
- package/bin/scraping/replaceImagePaths.js.map +1 -0
- package/bin/scraping/scrapeGettingFileNameFromUrl.js +43 -0
- package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +1 -0
- package/bin/scraping/scrapePage.js +9 -0
- package/bin/scraping/scrapePage.js.map +1 -0
- package/bin/scraping/scrapeSection.js +9 -0
- package/bin/scraping/scrapeSection.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +43 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +52 -0
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +1 -0
- package/bin/{scrapeGitBookPage.js → scraping/site-scrapers/scrapeGitBookPage.js} +10 -5
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js +74 -0
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +1 -0
- package/bin/{scrapeReadMePage.js → scraping/site-scrapers/scrapeReadMePage.js} +15 -9
- package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +1 -0
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js +48 -0
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +1 -0
- package/bin/util.js +27 -8
- package/bin/util.js.map +1 -1
- package/package.json +3 -2
- package/src/browser.ts +24 -0
- package/src/downloadImage.ts +35 -0
- package/src/index.ts +173 -22
- package/src/scraping/detectFramework.ts +31 -0
- package/src/scraping/downloadAllImages.ts +79 -0
- package/src/scraping/getSitemapLinks.ts +16 -0
- package/src/scraping/replaceImagePaths.ts +21 -0
- package/src/scraping/scrapeGettingFileNameFromUrl.ts +81 -0
- package/src/scraping/scrapePage.ts +24 -0
- package/src/scraping/scrapeSection.ts +16 -0
- package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +67 -0
- package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +80 -0
- package/src/{scrapeGitBookPage.ts → scraping/site-scrapers/scrapeGitBookPage.ts} +25 -5
- package/src/scraping/site-scrapers/scrapeGitBookSection.ts +116 -0
- package/src/{scrapeReadMePage.ts → scraping/site-scrapers/scrapeReadMePage.ts} +28 -10
- package/src/scraping/site-scrapers/scrapeReadMeSection.ts +77 -0
- package/src/util.ts +25 -7
- package/tsconfig.json +1 -1
- package/bin/scrapeGitBook.js +0 -28
- package/bin/scrapeGitBook.js.map +0 -1
- package/bin/scrapeGitBookPage.js.map +0 -1
- package/bin/scrapeReadMe.js +0 -60
- package/bin/scrapeReadMe.js.map +0 -1
- package/bin/scrapeReadMePage.js.map +0 -1
- package/src/scrapeReadMe.ts +0 -79
|
@@ -1,18 +1,22 @@
|
|
|
1
|
-
import axios from "axios";
|
|
2
1
|
import cheerio from "cheerio";
|
|
3
2
|
import { NodeHtmlMarkdown } from "node-html-markdown";
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
import downloadAllImages from "../downloadAllImages.js";
|
|
4
|
+
import replaceImagePaths from "../replaceImagePaths.js";
|
|
5
|
+
export async function scrapeGitBookPage(html, origin, cliDir, imageBaseDir) {
|
|
6
|
+
const $ = cheerio.load(html);
|
|
7
7
|
const titleComponent = $('[data-testid="page.title"]').first();
|
|
8
8
|
const titleAndDescription = titleComponent.parent().parent().parent().text();
|
|
9
|
-
console.log(titleAndDescription);
|
|
10
9
|
const description = titleAndDescription
|
|
11
10
|
.replace(titleComponent.text(), "")
|
|
12
11
|
.trim();
|
|
13
12
|
const title = titleComponent.text().trim();
|
|
14
13
|
const content = $('[data-testid="page.contentEditor"]').first();
|
|
15
14
|
const contentHtml = $.html(content);
|
|
15
|
+
const modifyFileName = (fileName) =>
|
|
16
|
+
// Remove GitBook metadata from the start
|
|
17
|
+
// The first four %2F split metadata fields. Remaining ones are part of the file name.
|
|
18
|
+
fileName.split("%2F").slice(4).join("%2F");
|
|
19
|
+
const origToWritePath = await downloadAllImages($, content, origin, imageBaseDir, modifyFileName);
|
|
16
20
|
const nhm = new NodeHtmlMarkdown();
|
|
17
21
|
let markdown = nhm.translate(contentHtml);
|
|
18
22
|
// Keep headers on one line and increase their depth by one
|
|
@@ -23,6 +27,7 @@ export async function scrapeGitBookPage(url) {
|
|
|
23
27
|
markdown = markdown.replace(/\n\n\n/g, "\n\n");
|
|
24
28
|
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
25
29
|
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
|
|
30
|
+
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
26
31
|
return { title, description, markdown };
|
|
27
32
|
}
|
|
28
33
|
//# sourceMappingURL=scrapeGitBookPage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeGitBookPage.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeGitBookPage.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AACxD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AAExD,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,MAAc,EACd,MAAc,EACd,YAAoB;IAEpB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,MAAM,cAAc,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/D,MAAM,mBAAmB,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC;IAE7E,MAAM,WAAW,GAAG,mBAAmB;SACpC,OAAO,CAAC,cAAc,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC;SAClC,IAAI,EAAE,CAAC;IACV,MAAM,KAAK,GAAG,cAAc,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAE3C,MAAM,OAAO,GAAG,CAAC,CAAC,oCAAoC,CAAC,CAAC,KAAK,EAAE,CAAC;IAChE,MAAM,WAAW,GAAG,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAEpC,MAAM,cAAc,GAAG,CAAC,QAAQ,EAAE,EAAE;IAClC,yCAAyC;IACzC,sFAAsF;IACtF,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAE7C,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAC7C,CAAC,EACD,OAAO,EACP,MAAM,EACN,YAAY,EACZ,cAAc,CACf,CAAC;IAEF,MAAM,GAAG,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACnC,IAAI,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAE1C,2DAA2D;IAC3D,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IAE9C,qDAAqD;IACrD,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAE3C,iCAAiC;IACjC,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,gEAAgE;IAChE,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,wBAAwB,EAAE,SAAS,CAAC,CAAC;IAEjE,QAAQ,GAAG,iBAAiB,CAAC,eAAe,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IAEhE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,CAAC;AAC1C,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
|
|
3
|
+
import { getSitemapLinks } from "../getSitemapLinks.js";
|
|
4
|
+
import { scrapeGitBookPage } from "./scrapeGitBookPage.js";
|
|
5
|
+
export async function scrapeGitBookSection(html, origin, cliDir, overwrite) {
|
|
6
|
+
const $ = cheerio.load(html);
|
|
7
|
+
// Get all the navigation sections
|
|
8
|
+
const navigationSections = $('div[data-testid="page.desktopTableOfContents"] > div > div:first-child')
|
|
9
|
+
.children()
|
|
10
|
+
.first()
|
|
11
|
+
.children()
|
|
12
|
+
.first()
|
|
13
|
+
.children();
|
|
14
|
+
// Get all links per group
|
|
15
|
+
let allNavPathnames = [];
|
|
16
|
+
const groupsConfig = navigationSections
|
|
17
|
+
.map((i, section) => {
|
|
18
|
+
const sectionTitle = $(section)
|
|
19
|
+
.find('div > div[dir="auto"]')
|
|
20
|
+
.first()
|
|
21
|
+
.text();
|
|
22
|
+
const linkPaths = $(section)
|
|
23
|
+
.find("a[href]")
|
|
24
|
+
.map((i, link) => {
|
|
25
|
+
const linkHref = $(link).attr("href");
|
|
26
|
+
// Skip external links until Mintlify supports them
|
|
27
|
+
if (linkHref.startsWith("https://") ||
|
|
28
|
+
linkHref.startsWith("http://")) {
|
|
29
|
+
return undefined;
|
|
30
|
+
}
|
|
31
|
+
return linkHref;
|
|
32
|
+
})
|
|
33
|
+
.toArray();
|
|
34
|
+
allNavPathnames = allNavPathnames.concat(linkPaths);
|
|
35
|
+
// Follows the same structure as mint.json
|
|
36
|
+
return {
|
|
37
|
+
group: sectionTitle,
|
|
38
|
+
pages: linkPaths,
|
|
39
|
+
};
|
|
40
|
+
})
|
|
41
|
+
.toArray();
|
|
42
|
+
// Scrape every link not in the navigation. Nested docs
|
|
43
|
+
// don't show up in navigation without clicking buttons,
|
|
44
|
+
// so this lets us download the files for the user to add
|
|
45
|
+
// manually to mint.json.
|
|
46
|
+
const sitemapPaths = (await getSitemapLinks(new URL("sitemap.xml", origin)))
|
|
47
|
+
.map((sitemapLinks) => {
|
|
48
|
+
return new URL(sitemapLinks).pathname;
|
|
49
|
+
})
|
|
50
|
+
.filter((pathname) => !allNavPathnames.includes(pathname));
|
|
51
|
+
const sitemapPathnamesForConfig = [];
|
|
52
|
+
for (const pathname of sitemapPaths) {
|
|
53
|
+
sitemapPathnamesForConfig.push(await scrapeGettingFileNameFromUrl(cliDir, origin, pathname, overwrite, scrapeGitBookPage, true));
|
|
54
|
+
}
|
|
55
|
+
// Scrape each link in the navigation.
|
|
56
|
+
const groupsConfigCleanPaths = await Promise.all(groupsConfig.map(async (groupConfig) => {
|
|
57
|
+
const newPages = [];
|
|
58
|
+
for (const pathname of groupConfig.pages) {
|
|
59
|
+
newPages.push(await scrapeGettingFileNameFromUrl(cliDir, origin, pathname, overwrite, scrapeGitBookPage, true));
|
|
60
|
+
}
|
|
61
|
+
groupConfig.pages = newPages;
|
|
62
|
+
return groupConfig;
|
|
63
|
+
}));
|
|
64
|
+
if (sitemapPathnamesForConfig.length > 0) {
|
|
65
|
+
return groupsConfigCleanPaths.concat([
|
|
66
|
+
{
|
|
67
|
+
group: "ATTENTION! WE CANNOT DETECT GROUPS FOR NESTED DOCS. PLEASE MOVE THEM INTO THEIR ORIGINAL GROUPS.",
|
|
68
|
+
pages: sitemapPathnamesForConfig,
|
|
69
|
+
},
|
|
70
|
+
]);
|
|
71
|
+
}
|
|
72
|
+
return groupsConfigCleanPaths;
|
|
73
|
+
}
|
|
74
|
+
//# sourceMappingURL=scrapeGitBookSection.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeGitBookSection.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeGitBookSection.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,4BAA4B,EAAE,MAAM,oCAAoC,CAAC;AAClF,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAE3D,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,IAAY,EACZ,MAAc,EACd,MAAc,EACd,SAAkB;IAElB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,CAAC,CAC1B,wEAAwE,CACzE;SACE,QAAQ,EAAE;SACV,KAAK,EAAE;SACP,QAAQ,EAAE;SACV,KAAK,EAAE;SACP,QAAQ,EAAE,CAAC;IAEd,0BAA0B;IAC1B,IAAI,eAAe,GAAG,EAAE,CAAC;IACzB,MAAM,YAAY,GAAG,kBAAkB;SACpC,GAAG,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAClB,MAAM,YAAY,GAAG,CAAC,CAAC,OAAO,CAAC;aAC5B,IAAI,CAAC,uBAAuB,CAAC;aAC7B,KAAK,EAAE;aACP,IAAI,EAAE,CAAC;QAEV,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC;aACzB,IAAI,CAAC,SAAS,CAAC;aACf,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YACf,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEtC,mDAAmD;YACnD,IACE,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC;gBAC/B,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,EAC9B;gBACA,OAAO,SAAS,CAAC;aAClB;YAED,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC;aACD,OAAO,EAAE,CAAC;QAEb,eAAe,GAAG,eAAe,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAEpD,0CAA0C;QAC1C,OAAO;YACL,KAAK,EAAE,YAAY;YACnB,KAAK,EAAE,SAAS;SACjB,CAAC;IACJ,CAAC,CAAC;SACD,OAAO,EAAE,CAAC;IAEb,uDAAuD;IACvD,wDAAwD;IACxD,yDAAyD;IACzD,yBAAyB;IACzB,MAAM,YAAY,GAAG,CAAC,MAAM,eAAe,CAAC,IAAI,GAAG,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC,CAAC;SACzE,GAAG,CAAC,CAAC,YAAoB,EAAE,EAAE;QAC5B,OAAO,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC,QAAQ,CAAC;IACxC,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,QAAgB,EAAE,EAAE,CAAC,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IAErE,MAAM,yBAAyB,GAAG,EAAE,CAAC;IACrC,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE;QACnC,yBAAyB,CAAC,IAAI,CAC5B,MAAM,4BAA4B,CAChC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,SAAS,EACT,iBAAiB,EACjB,IAAI,CACL,CACF,CAAC;KACH;IAED,sCAAsC;IACtC,MAAM,sBAAsB,GAAG,MAAM,OAAO,CAAC,GAAG,CAC9C,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,WAAW,EAAE,EAAE;QACrC,MAAM,QAAQ,GAAG,EAAE,CAAC;QACpB
,KAAK,MAAM,QAAQ,IAAI,WAAW,CAAC,KAAK,EAAE;YACxC,QAAQ,CAAC,IAAI,CACX,MAAM,4BAA4B,CAChC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,SAAS,EACT,iBAAiB,EACjB,IAAI,CACL,CACF,CAAC;SACH;QACD,WAAW,CAAC,KAAK,GAAG,QAAQ,CAAC;QAC7B,OAAO,WAAW,CAAC;IACrB,CAAC,CAAC,CACH,CAAC;IAEF,IAAI,yBAAyB,CAAC,MAAM,GAAG,CAAC,EAAE;QACxC,OAAO,sBAAsB,CAAC,MAAM,CAAC;YACnC;gBACE,KAAK,EACH,kGAAkG;gBACpG,KAAK,EAAE,yBAAyB;aACjC;SACF,CAAC,CAAC;KACJ;IAED,OAAO,sBAAsB,CAAC;AAChC,CAAC"}
|
|
@@ -1,16 +1,21 @@
|
|
|
1
|
-
import axios from "axios";
|
|
2
1
|
import cheerio from "cheerio";
|
|
3
2
|
import { NodeHtmlMarkdown } from "node-html-markdown";
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
import downloadAllImages from "../downloadAllImages.js";
|
|
4
|
+
import replaceImagePaths from "../replaceImagePaths.js";
|
|
5
|
+
export async function scrapeReadMePage(html, origin, cliDir, imageBaseDir) {
|
|
6
|
+
const $ = cheerio.load(html);
|
|
7
7
|
const titleComponent = $("h1").first();
|
|
8
8
|
const title = titleComponent.text().trim();
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
.trim();
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
let description = $(".markdown-body", titleComponent.parent()).text().trim();
|
|
10
|
+
if (!description) {
|
|
11
|
+
description = $(".rm-Article > header p").text().trim();
|
|
12
|
+
}
|
|
13
|
+
let content = $(".content-body .markdown-body").first();
|
|
14
|
+
if (content.length === 0) {
|
|
15
|
+
content = $(".rm-Article > .markdown-body");
|
|
16
|
+
}
|
|
17
|
+
const contentHtml = content.html();
|
|
18
|
+
const origToWritePath = await downloadAllImages($, content, origin, imageBaseDir);
|
|
14
19
|
const nhm = new NodeHtmlMarkdown();
|
|
15
20
|
let markdown = nhm.translate(contentHtml);
|
|
16
21
|
// Keep headers on one line and increase their depth by one
|
|
@@ -23,6 +28,7 @@ export async function scrapeReadMePage(url) {
|
|
|
23
28
|
markdown = markdown.replace(/\n\n\n/g, "\n\n");
|
|
24
29
|
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
25
30
|
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
|
|
31
|
+
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
26
32
|
return { title, description, markdown };
|
|
27
33
|
}
|
|
28
34
|
//# sourceMappingURL=scrapeReadMePage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeReadMePage.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeReadMePage.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AACxD,OAAO,iBAAiB,MAAM,yBAAyB,CAAC;AAExD,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,IAAY,EACZ,MAAc,EACd,MAAc,EACd,YAAoB;IAEpB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,MAAM,cAAc,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;IACvC,MAAM,KAAK,GAAG,cAAc,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC3C,IAAI,WAAW,GAAG,CAAC,CAAC,gBAAgB,EAAE,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC7E,IAAI,CAAC,WAAW,EAAE;QAChB,WAAW,GAAG,CAAC,CAAC,wBAAwB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;KACzD;IAED,IAAI,OAAO,GAAG,CAAC,CAAC,8BAA8B,CAAC,CAAC,KAAK,EAAE,CAAC;IACxD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE;QACxB,OAAO,GAAG,CAAC,CAAC,8BAA8B,CAAC,CAAC;KAC7C;IACD,MAAM,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAEnC,MAAM,eAAe,GAAG,MAAM,iBAAiB,CAC7C,CAAC,EACD,OAAO,EACP,MAAM,EACN,YAAY,CACb,CAAC;IAEF,MAAM,GAAG,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACnC,IAAI,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAE1C,2DAA2D;IAC3D,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IAE9C,qDAAqD;IACrD,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAE3C,6BAA6B;IAC7B,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,CAAC;IAEtD,iCAAiC;IACjC,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,gEAAgE;IAChE,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,wBAAwB,EAAE,SAAS,CAAC,CAAC;IAEjE,QAAQ,GAAG,iBAAiB,CAAC,eAAe,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IAEhE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,CAAC;AAC1C,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
import { scrapeReadMePage } from "./scrapeReadMePage.js";
|
|
3
|
+
import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
|
|
4
|
+
export async function scrapeReadMeSection(html, origin, cliDir, overwrite) {
|
|
5
|
+
const $ = cheerio.load(html);
|
|
6
|
+
// Get all the navigation sections, but only from the first
|
|
7
|
+
// sidebar found. There are multiple in the HTML for mobile
|
|
8
|
+
// responsiveness but they all have the same links.
|
|
9
|
+
const navigationSections = $(".rm-Sidebar")
|
|
10
|
+
.first()
|
|
11
|
+
.find(".rm-Sidebar-section");
|
|
12
|
+
const groupsConfig = navigationSections
|
|
13
|
+
.map((i, section) => {
|
|
14
|
+
const sectionTitle = $(section).find("h3").first().text();
|
|
15
|
+
// Get all links, then use filter to remove duplicates.
|
|
16
|
+
// There are duplicates because of nested navigation, eg:
|
|
17
|
+
// subgroupTitle -> /first-page
|
|
18
|
+
// -- First Page -> /first-page ** DUPLICATE **
|
|
19
|
+
// -- Second Page -> /second-page
|
|
20
|
+
const linkPaths = $(section)
|
|
21
|
+
.find("a[href]")
|
|
22
|
+
.map((i, link) => {
|
|
23
|
+
const linkHref = $(link).attr("href");
|
|
24
|
+
// Skip external links until Mintlify supports them
|
|
25
|
+
if (linkHref.startsWith("https://") ||
|
|
26
|
+
linkHref.startsWith("http://")) {
|
|
27
|
+
return undefined;
|
|
28
|
+
}
|
|
29
|
+
return linkHref;
|
|
30
|
+
})
|
|
31
|
+
.toArray()
|
|
32
|
+
.filter((value, index, self) => self.indexOf(value) === index);
|
|
33
|
+
// Follows the same structure as mint.json
|
|
34
|
+
return {
|
|
35
|
+
group: sectionTitle,
|
|
36
|
+
pages: linkPaths,
|
|
37
|
+
};
|
|
38
|
+
})
|
|
39
|
+
.toArray();
|
|
40
|
+
return await Promise.all(groupsConfig.map(async (groupConfig) => {
|
|
41
|
+
groupConfig.pages = await Promise.all(groupConfig.pages.map(async (pathname) =>
|
|
42
|
+
// ReadMe requires a directory on all sections wheras we use root.
|
|
43
|
+
// /docs is their default directory so we remove it
|
|
44
|
+
scrapeGettingFileNameFromUrl(cliDir, origin, pathname, overwrite, scrapeReadMePage, false, "/docs")));
|
|
45
|
+
return groupConfig;
|
|
46
|
+
}));
|
|
47
|
+
}
|
|
48
|
+
//# sourceMappingURL=scrapeReadMeSection.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeReadMeSection.js","sourceRoot":"","sources":["../../../src/scraping/site-scrapers/scrapeReadMeSection.ts"],"names":[],"mappings":"AAAA,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,OAAO,EAAE,4BAA4B,EAAE,MAAM,oCAAoC,CAAC;AAElF,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,IAAY,EACZ,MAAc,EACd,MAAc,EACd,SAAkB;IAElB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,2DAA2D;IAC3D,2DAA2D;IAC3D,mDAAmD;IACnD,MAAM,kBAAkB,GAAG,CAAC,CAAC,aAAa,CAAC;SACxC,KAAK,EAAE;SACP,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAE/B,MAAM,YAAY,GAAG,kBAAkB;SACpC,GAAG,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAClB,MAAM,YAAY,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC;QAE1D,uDAAuD;QACvD,yDAAyD;QACzD,+BAA+B;QAC/B,iDAAiD;QACjD,iCAAiC;QACjC,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC;aACzB,IAAI,CAAC,SAAS,CAAC;aACf,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YACf,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEtC,mDAAmD;YACnD,IACE,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC;gBAC/B,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,EAC9B;gBACA,OAAO,SAAS,CAAC;aAClB;YAED,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC;aACD,OAAO,EAAE;aACT,MAAM,CACL,CAAC,KAAa,EAAE,KAAa,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,KAAK,CACtE,CAAC;QAEJ,0CAA0C;QAC1C,OAAO;YACL,KAAK,EAAE,YAAY;YACnB,KAAK,EAAE,SAAS;SACjB,CAAC;IACJ,CAAC,CAAC;SACD,OAAO,EAAE,CAAC;IAEb,OAAO,MAAM,OAAO,CAAC,GAAG,CACtB,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,WAAW,EAAE,EAAE;QACrC,WAAW,CAAC,KAAK,GAAG,MAAM,OAAO,CAAC,GAAG,CACnC,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,QAAgB,EAAE,EAAE;QAC/C,kEAAkE;QAClE,mDAAmD;QACnD,4BAA4B,CAC1B,MAAM,EACN,MAAM,EACN,QAAQ,EACR,SAAS,EACT,gBAAgB,EAChB,KAAK,EACL,OAAO,CACR,CACF,CACF,CAAC;QACF,OAAO,WAAW,CAAC;IACrB,CAAC,CAAC,CACH,CAAC;AACJ,CAAC"}
|
package/bin/util.js
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import { mkdirSync, writeFileSync } from "fs";
|
|
2
|
-
import { Page } from "./templates.js";
|
|
3
2
|
import path from "path";
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
import { Page } from "./templates.js";
|
|
4
|
+
export function getOrigin(url) {
|
|
6
5
|
// eg. https://google.com -> https://google.com
|
|
7
6
|
// https://google.com/page -> https://google.com
|
|
8
|
-
return url
|
|
7
|
+
return new URL(url).origin;
|
|
9
8
|
}
|
|
10
9
|
export function objToReadableString(objs) {
|
|
11
10
|
// Two spaces as indentation
|
|
@@ -26,12 +25,32 @@ export const addMdx = (fileName) => {
|
|
|
26
25
|
}
|
|
27
26
|
return fileName + ".mdx";
|
|
28
27
|
};
|
|
29
|
-
export const createPage = (title, description, markdown, rootDir = "", fileName) => {
|
|
28
|
+
export const createPage = (title, description, markdown, overwrite = false, rootDir = "", fileName) => {
|
|
29
|
+
const writePath = path.join(rootDir, addMdx(fileName || toFilename(title)));
|
|
30
30
|
// Create the folders needed if they're missing
|
|
31
31
|
mkdirSync(rootDir, { recursive: true });
|
|
32
32
|
// Write the page to memory
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
if (overwrite) {
|
|
34
|
+
writeFileSync(writePath, Page(title, description, markdown));
|
|
35
|
+
console.log("✏️ - " + writePath);
|
|
36
|
+
}
|
|
37
|
+
else {
|
|
38
|
+
try {
|
|
39
|
+
writeFileSync(writePath, Page(title, description, markdown), {
|
|
40
|
+
flag: "wx",
|
|
41
|
+
});
|
|
42
|
+
console.log("✏️ - " + writePath);
|
|
43
|
+
}
|
|
44
|
+
catch (e) {
|
|
45
|
+
// We do a try-catch instead of an if-statement to avoid a race condition
|
|
46
|
+
// of the file being created after we started writing.
|
|
47
|
+
if (e.code === "EEXIST") {
|
|
48
|
+
console.log(`❌ Skipping existing file ${writePath}`);
|
|
49
|
+
}
|
|
50
|
+
else {
|
|
51
|
+
console.error(e);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
36
55
|
};
|
|
37
56
|
//# sourceMappingURL=util.js.map
|
package/bin/util.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"util.js","sourceRoot":"","sources":["../src/util.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,
|
|
1
|
+
{"version":3,"file":"util.js","sourceRoot":"","sources":["../src/util.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,IAAI,EAAE,MAAM,gBAAgB,CAAC;AAEtC,MAAM,UAAU,SAAS,CAAC,GAAW;IACnC,+CAA+C;IAC/C,gDAAgD;IAChD,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;AAC7B,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,IAAc;IAChD,4BAA4B;IAC5B,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AACrE,CAAC;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,KAAa,EAAE,EAAE;IAC1C,sDAAsD;IACtD,uDAAuD;IACvD,OAAO,KAAK;SACT,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,IAAI,EAAE;SACN,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC;SAClB,WAAW,EAAE,CAAC;AACnB,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,MAAM,GAAG,CAAC,QAAgB,EAAE,EAAE;IACzC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE;QAC7B,OAAO,QAAQ,CAAC;KACjB;IACD,OAAO,QAAQ,GAAG,MAAM,CAAC;AAC3B,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,UAAU,GAAG,CACxB,KAAa,EACb,WAAoB,EACpB,QAAiB,EACjB,YAAqB,KAAK,EAC1B,UAAkB,EAAE,EACpB,QAAiB,EACjB,EAAE;IACF,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,QAAQ,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAE5E,+CAA+C;IAC/C,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAExC,2BAA2B;IAC3B,IAAI,SAAS,EAAE;QACb,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,KAAK,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,OAAO,GAAG,SAAS,CAAC,CAAC;KAClC;SAAM;QACL,IAAI;YACF,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,KAAK,EAAE,WAAW,EAAE,QAAQ,CAAC,EAAE;gBAC3D,IAAI,EAAE,IAAI;aACX,CAAC,CAAC;YACH,OAAO,CAAC,GAAG,CAAC,OAAO,GAAG,SAAS,CAAC,CAAC;SAClC;QAAC,OAAO,CAAC,EAAE;YACV,yEAAyE;YACzE,sDAAsD;YACtD,IAAI,CAAC,CAAC,IAAI,KAAK,QAAQ,EAAE;gBACvB,OAAO,CAAC,GAAG,CAAC,4BAA4B,SAAS,EAAE,CAAC,CAAC;aACtD;iBAAM;gBACL,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;aAClB;SACF;KACF;AACH,CAAC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mintlify",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "Mintlify CLI",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=14.16"
|
|
@@ -19,8 +19,9 @@
|
|
|
19
19
|
"axios": "^0.27.2",
|
|
20
20
|
"cheerio": "^0.22.0",
|
|
21
21
|
"inquirer": "^9.1.0",
|
|
22
|
+
"minimist-lite": "^2.2.1",
|
|
22
23
|
"node-html-markdown": "^1.2.0",
|
|
23
|
-
"puppeteer": "^17.1.
|
|
24
|
+
"puppeteer": "^17.1.3"
|
|
24
25
|
},
|
|
25
26
|
"devDependencies": {
|
|
26
27
|
"@types/inquirer": "^9.0.1",
|
package/src/browser.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { launch } from "puppeteer";
|
|
2
|
+
|
|
3
|
+
export async function startBrowser() {
|
|
4
|
+
try {
|
|
5
|
+
return await launch({
|
|
6
|
+
headless: true,
|
|
7
|
+
ignoreHTTPSErrors: true,
|
|
8
|
+
});
|
|
9
|
+
} catch (err) {
|
|
10
|
+
console.log("Could not create a browser instance: ", err);
|
|
11
|
+
process.exit(1);
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export async function getHtmlWithPuppeteer(href: string) {
|
|
16
|
+
const browser = await startBrowser();
|
|
17
|
+
const page = await browser.newPage();
|
|
18
|
+
await page.goto(href, {
|
|
19
|
+
waitUntil: "networkidle2",
|
|
20
|
+
});
|
|
21
|
+
const html = await page.content();
|
|
22
|
+
browser.close();
|
|
23
|
+
return html;
|
|
24
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, createWriteStream } from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import axios from "axios";
|
|
4
|
+
|
|
5
|
+
export default async function downloadImage(
|
|
6
|
+
imageSrc: string,
|
|
7
|
+
writePath: string
|
|
8
|
+
) {
|
|
9
|
+
// Avoid unnecessary downloads
|
|
10
|
+
if (existsSync(writePath)) {
|
|
11
|
+
return Promise.reject({
|
|
12
|
+
code: "EEXIST",
|
|
13
|
+
});
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
// Create the folders needed if they're missing
|
|
17
|
+
mkdirSync(path.dirname(writePath), { recursive: true });
|
|
18
|
+
|
|
19
|
+
const writer = createWriteStream(writePath);
|
|
20
|
+
|
|
21
|
+
const response = await axios.default.get(imageSrc, {
|
|
22
|
+
responseType: "stream",
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
// wx prevents overwriting an image with the exact same name
|
|
26
|
+
// being created in the time we were downloading
|
|
27
|
+
response.data.pipe(writer, {
|
|
28
|
+
flag: "wx",
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
return new Promise((resolve, reject) => {
|
|
32
|
+
writer.on("finish", resolve);
|
|
33
|
+
writer.on("error", reject);
|
|
34
|
+
});
|
|
35
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -1,23 +1,37 @@
|
|
|
1
1
|
#! /usr/bin/env node
|
|
2
2
|
|
|
3
|
+
import axios from "axios";
|
|
3
4
|
import { writeFileSync } from "fs";
|
|
4
5
|
import inquirer from "inquirer";
|
|
6
|
+
import minimistLite from "minimist-lite";
|
|
5
7
|
import { MintConfig } from "./templates.js";
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
8
|
+
import { scrapePage } from "./scraping/scrapePage.js";
|
|
9
|
+
import { scrapeSection } from "./scraping/scrapeSection.js";
|
|
10
|
+
import { createPage, toFilename, getOrigin } from "./util.js";
|
|
11
|
+
import { scrapeDocusaurusPage } from "./scraping/site-scrapers/scrapeDocusaurusPage.js";
|
|
12
|
+
import { scrapeDocusaurusSection } from "./scraping/site-scrapers/scrapeDocusaurusSection.js";
|
|
13
|
+
import { scrapeGitBookPage } from "./scraping/site-scrapers/scrapeGitBookPage.js";
|
|
14
|
+
import { scrapeGitBookSection } from "./scraping/site-scrapers/scrapeGitBookSection.js";
|
|
15
|
+
import { scrapeReadMePage } from "./scraping/site-scrapers/scrapeReadMePage.js";
|
|
16
|
+
import { scrapeReadMeSection } from "./scraping/site-scrapers/scrapeReadMeSection.js";
|
|
17
|
+
import { detectFramework, Frameworks } from "./scraping/detectFramework.js";
|
|
18
|
+
import { startBrowser, getHtmlWithPuppeteer } from "./browser.js";
|
|
10
19
|
|
|
11
|
-
const
|
|
20
|
+
const argv = minimistLite(process.argv.slice(2), {
|
|
21
|
+
boolean: ["overwrite"],
|
|
22
|
+
default: {
|
|
23
|
+
overwrite: false,
|
|
24
|
+
},
|
|
25
|
+
});
|
|
12
26
|
|
|
13
|
-
if (
|
|
27
|
+
if (argv._.length === 0) {
|
|
14
28
|
console.error(
|
|
15
29
|
`No command specified. Here are is the list that you can use:\ninit: initialize a Mintlify documentation instance`
|
|
16
30
|
);
|
|
17
31
|
process.exit(1); //an error occurred
|
|
18
32
|
}
|
|
19
33
|
|
|
20
|
-
const command =
|
|
34
|
+
const command = argv._[0];
|
|
21
35
|
|
|
22
36
|
if (command === "init") {
|
|
23
37
|
inquirer
|
|
@@ -55,7 +69,7 @@ if (command === "init") {
|
|
|
55
69
|
.then((answers) => {
|
|
56
70
|
const { name, color, ctaName, ctaUrl, title } = answers;
|
|
57
71
|
writeFileSync(
|
|
58
|
-
"mint.
|
|
72
|
+
"mint.json",
|
|
59
73
|
JSON.stringify(
|
|
60
74
|
MintConfig(name, color, ctaName, ctaUrl, toFilename(title)),
|
|
61
75
|
null,
|
|
@@ -100,26 +114,163 @@ if (command === "page") {
|
|
|
100
114
|
});
|
|
101
115
|
}
|
|
102
116
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
117
|
+
function validateFramework(framework) {
|
|
118
|
+
if (!framework) {
|
|
119
|
+
console.log(
|
|
120
|
+
"Could not detect the framework automatically. Please use one of:"
|
|
121
|
+
);
|
|
122
|
+
console.log("scrape-page-docusaurus");
|
|
123
|
+
console.log("scrape-page-gitbook");
|
|
124
|
+
console.log("scrape-page-readme");
|
|
125
|
+
return process.exit(1);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
async function scrapePageAutomatically() {
|
|
130
|
+
const href = argv._[1];
|
|
131
|
+
const res = await axios.default.get(href);
|
|
132
|
+
const html = res.data;
|
|
133
|
+
const framework = detectFramework(html);
|
|
134
|
+
|
|
135
|
+
validateFramework(framework);
|
|
136
|
+
|
|
137
|
+
console.log("Detected framework: " + framework);
|
|
138
|
+
|
|
139
|
+
if (framework === Frameworks.DOCUSAURUS) {
|
|
140
|
+
await scrapePageWrapper(scrapeDocusaurusPage);
|
|
141
|
+
} else if (framework === Frameworks.GITBOOK) {
|
|
142
|
+
await scrapePageWrapper(scrapeGitBookPage, true);
|
|
143
|
+
} else if (framework === Frameworks.README) {
|
|
144
|
+
await scrapePageWrapper(scrapeReadMePage);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
async function scrapePageWrapper(scrapeFunc, puppeteer = false) {
|
|
149
|
+
const href = argv._[1];
|
|
150
|
+
let html;
|
|
151
|
+
if (puppeteer) {
|
|
152
|
+
html = await getHtmlWithPuppeteer(href);
|
|
153
|
+
} else {
|
|
154
|
+
const res = await axios.default.get(href);
|
|
155
|
+
html = res.data;
|
|
156
|
+
}
|
|
157
|
+
await scrapePage(scrapeFunc, href, html, argv.overwrite);
|
|
107
158
|
process.exit(1);
|
|
108
159
|
}
|
|
109
160
|
|
|
161
|
+
if (command === "scrape-page") {
|
|
162
|
+
await scrapePageAutomatically();
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
if (command === "scrape-docusaurus-page") {
|
|
166
|
+
await scrapePageWrapper(scrapeDocusaurusPage);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (command === "scrape-gitbook-page") {
|
|
170
|
+
await scrapePageWrapper(scrapeGitBookPage, true);
|
|
171
|
+
}
|
|
172
|
+
|
|
110
173
|
if (command === "scrape-readme-page") {
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
174
|
+
await scrapePageWrapper(scrapeReadMePage);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
async function scrapeSectionAutomatically() {
|
|
178
|
+
const href = argv._[1];
|
|
179
|
+
const res = await axios.default.get(href);
|
|
180
|
+
const html = res.data;
|
|
181
|
+
const framework = detectFramework(html);
|
|
182
|
+
|
|
183
|
+
validateFramework(framework);
|
|
184
|
+
|
|
185
|
+
console.log("Detected framework: " + framework);
|
|
186
|
+
|
|
187
|
+
if (framework === Frameworks.DOCUSAURUS) {
|
|
188
|
+
await scrapeSectionAxiosWrapper(scrapeDocusaurusSection);
|
|
189
|
+
} else if (framework === Frameworks.GITBOOK) {
|
|
190
|
+
await scrapeSectionGitBookWrapper(scrapeGitBookSection);
|
|
191
|
+
} else if (framework === Frameworks.README) {
|
|
192
|
+
await scrapeSectionAxiosWrapper(scrapeReadMeSection);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
async function scrapeSectionAxiosWrapper(scrapeFunc: any) {
|
|
197
|
+
const href = argv._[1];
|
|
198
|
+
const res = await axios.default.get(href);
|
|
199
|
+
const html = res.data;
|
|
200
|
+
await scrapeSection(scrapeFunc, html, getOrigin(href), argv.overwrite);
|
|
114
201
|
process.exit(1);
|
|
115
202
|
}
|
|
116
203
|
|
|
117
|
-
|
|
118
|
-
const
|
|
119
|
-
|
|
120
|
-
const
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
204
|
+
/**
 * Section-scraping wrapper for GitBook sites. Loads the section root in
 * a headless browser, repeatedly clicks the collapsed side-navigation
 * entries until no new internal links appear, then hands the fully
 * expanded DOM to the supplied section scraper.
 *
 * @param scrapeFunc - framework-specific section scraper passed through
 *                     to scrapeSection
 */
async function scrapeSectionGitBookWrapper(scrapeFunc: any) {
  const href = argv._[1];

  const browser = await startBrowser();
  const page = await browser.newPage();
  // Wait until the network is (nearly) idle so the nav tree has rendered.
  await page.goto(href, {
    waitUntil: "networkidle2",
  });

  // "fake" sentinel guarantees the while-condition fails on the first
  // pass so the loop body runs at least once.
  let prevEncountered = [];
  let encounteredHref = ["fake"];

  // Loop until we've encountered every link: a fixed point is reached
  // when a pass discovers no href that the previous pass hadn't seen.
  while (!encounteredHref.every((href) => prevEncountered.includes(href))) {
    prevEncountered = encounteredHref;
    // Runs inside the browser context; only serializable values cross
    // the boundary, hence the array argument and array return value.
    encounteredHref = await page.evaluate(
      (encounteredHref) => {
        // The chevron icons marking expandable nav entries are located
        // by their exact SVG path data.
        const icons = Array.from(
          document.querySelectorAll('path[d="M9 18l6-6-6-6"]')
        );

        const linksFound = [];
        // NOTE(review): the async callback contains no awaits, so the
        // clicks run synchronously; the async keyword appears vestigial.
        icons.forEach(async (icon: HTMLElement) => {
          // Walk up from the <path> to the clickable toggle and the
          // enclosing anchor element.
          const toClick = icon.parentElement.parentElement;
          const link = toClick.parentElement.parentElement;

          // Skip icons not in the side navigation
          if (!link.hasAttribute("href")) {
            return;
          }

          const href = link.getAttribute("href");

          // Should never occur but we keep it as a fail-safe
          if (href.startsWith("https://") || href.startsWith("http://")) {
            return;
          }

          // Click any links we haven't seen before
          if (!encounteredHref.includes(href)) {
            toClick.click();
          }

          linksFound.push(href);
        });

        return linksFound;
      },
      encounteredHref // Need to pass array into the browser
    );
  }

  const html = await page.content();
  // NOTE(review): browser.close() returns a promise that is not awaited;
  // process.exit below likely masks it, but confirm this is intentional.
  browser.close();
  await scrapeSection(scrapeFunc, html, getOrigin(href), argv.overwrite);
  process.exit(1);
}
|
|
261
|
+
|
|
262
|
+
// CLI dispatch for section scraping. "scrape-section" auto-detects the
// docs framework; the framework-specific commands force a scraper.
if (command === "scrape-section") {
  await scrapeSectionAutomatically();
}

if (command === "scrape-docusaurus-section") {
  await scrapeSectionAxiosWrapper(scrapeDocusaurusSection);
}

// GitBook sections use the puppeteer-based wrapper, which expands the
// side navigation in a real browser before scraping.
if (command === "scrape-gitbook-section") {
  await scrapeSectionGitBookWrapper(scrapeGitBookSection);
}

if (command === "scrape-readme-section") {
  await scrapeSectionAxiosWrapper(scrapeReadMeSection);
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import cheerio from "cheerio";
|
|
2
|
+
|
|
3
|
+
// Documentation frameworks this scraper can detect and handle.
// String-valued so the name prints legibly in logs and comparisons.
export enum Frameworks {
  DOCUSAURUS = "DOCUSAURUS",
  GITBOOK = "GITBOOK",
  README = "README",
}
|
|
8
|
+
|
|
9
|
+
export function detectFramework(html) {
|
|
10
|
+
const $ = cheerio.load(html);
|
|
11
|
+
const docusaurusMeta = $('meta[name="generator"]');
|
|
12
|
+
|
|
13
|
+
if (
|
|
14
|
+
docusaurusMeta.length > 0 &&
|
|
15
|
+
docusaurusMeta.attr("content").includes("Docusaurus")
|
|
16
|
+
) {
|
|
17
|
+
return Frameworks.DOCUSAURUS;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
const isGitBook = $(".gitbook-root").length > 0;
|
|
21
|
+
if (isGitBook) {
|
|
22
|
+
return Frameworks.GITBOOK;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const isReadMe = $('meta[name="readme-deploy"]').length > 0;
|
|
26
|
+
if (isReadMe) {
|
|
27
|
+
return Frameworks.README;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
return undefined;
|
|
31
|
+
}
|