@mintlify/scraping 3.0.14 → 3.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +1 -0
- package/bin/browser.js +3 -3
- package/bin/constants.js +23 -23
- package/bin/constants.js.map +1 -1
- package/bin/downloadImage.js +18 -18
- package/bin/downloadImage.js.map +1 -1
- package/bin/scraping/detectFramework.js +13 -13
- package/bin/scraping/detectFramework.js.map +1 -1
- package/bin/scraping/downloadAllImages.js +5 -5
- package/bin/scraping/downloadAllImages.js.map +1 -1
- package/bin/scraping/downloadLogoImage.js +4 -4
- package/bin/scraping/downloadLogoImage.js.map +1 -1
- package/bin/scraping/getSitemapLinks.js +4 -4
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js +10 -10
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js.map +1 -1
- package/bin/scraping/scrapeGettingFileNameFromUrl.js +2 -2
- package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +1 -1
- package/bin/scraping/scrapePage.js +3 -3
- package/bin/scraping/scrapePage.js.map +1 -1
- package/bin/scraping/scrapePageCommands.d.ts +1 -1
- package/bin/scraping/scrapePageCommands.js +15 -15
- package/bin/scraping/scrapePageCommands.js.map +1 -1
- package/bin/scraping/scrapeSection.js +6 -6
- package/bin/scraping/scrapeSection.js.map +1 -1
- package/bin/scraping/scrapeSectionCommands.d.ts +1 -1
- package/bin/scraping/scrapeSectionCommands.js +14 -14
- package/bin/scraping/scrapeSectionCommands.js.map +1 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js +11 -11
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js.map +1 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js +12 -14
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js.map +1 -1
- package/bin/scraping/site-scrapers/alternateGroupTitle.js +1 -1
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js +8 -11
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js.map +1 -1
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js +6 -6
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js +4 -4
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.d.ts +1 -1
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js +4 -4
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.d.ts +1 -1
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +14 -14
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +9 -9
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js +12 -14
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js +10 -15
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeReadMePage.js +15 -15
- package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +1 -1
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js +11 -15
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +1 -1
- package/bin/tsconfig.tsbuildinfo +1 -1
- package/bin/util.d.ts +1 -1
- package/bin/util.js +23 -26
- package/bin/util.js.map +1 -1
- package/bin/validation/stopIfInvalidLink.js +3 -3
- package/package.json +9 -9
- package/src/browser.ts +3 -3
- package/src/constants.ts +23 -23
- package/src/downloadImage.ts +21 -26
- package/src/scraping/detectFramework.ts +18 -18
- package/src/scraping/downloadAllImages.ts +7 -9
- package/src/scraping/downloadLogoImage.ts +5 -4
- package/src/scraping/getSitemapLinks.ts +4 -4
- package/src/scraping/scrapeFileGettingFileNameFromUrl.ts +12 -18
- package/src/scraping/scrapeGettingFileNameFromUrl.ts +7 -5
- package/src/scraping/scrapePage.ts +4 -3
- package/src/scraping/scrapePageCommands.ts +17 -18
- package/src/scraping/scrapeSection.ts +8 -16
- package/src/scraping/scrapeSectionCommands.ts +19 -34
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomPage.ts +12 -11
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomSection.ts +23 -24
- package/src/scraping/site-scrapers/alternateGroupTitle.ts +1 -1
- package/src/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.ts +8 -11
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursively.ts +6 -6
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.ts +4 -4
- package/src/scraping/site-scrapers/openNestedDocusaurusMenus.ts +5 -5
- package/src/scraping/site-scrapers/openNestedGitbookMenus.ts +2 -4
- package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +15 -18
- package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +11 -14
- package/src/scraping/site-scrapers/scrapeGitBookPage.ts +13 -14
- package/src/scraping/site-scrapers/scrapeGitBookSection.ts +11 -15
- package/src/scraping/site-scrapers/scrapeReadMePage.ts +17 -22
- package/src/scraping/site-scrapers/scrapeReadMeSection.ts +27 -31
- package/src/util.ts +25 -36
- package/src/validation/stopIfInvalidLink.ts +3 -3
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import path from
|
|
2
|
-
|
|
1
|
+
import path from 'path';
|
|
2
|
+
|
|
3
|
+
import { createPage, getOrigin } from '../util.js';
|
|
3
4
|
|
|
4
5
|
export async function scrapePage(
|
|
5
6
|
scrapeFunc: ScrapePageFn,
|
|
@@ -10,7 +11,7 @@ export async function scrapePage(
|
|
|
10
11
|
) {
|
|
11
12
|
const origin = getOrigin(href);
|
|
12
13
|
const cwd = process.cwd();
|
|
13
|
-
const imageBaseDir = path.join(cwd,
|
|
14
|
+
const imageBaseDir = path.join(cwd, 'images');
|
|
14
15
|
|
|
15
16
|
const { title, description, markdown } = await scrapeFunc(
|
|
16
17
|
html,
|
|
@@ -1,23 +1,22 @@
|
|
|
1
|
-
import axios from
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import { detectFramework, Frameworks } from
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import { ArgumentsCamelCase } from 'yargs';
|
|
3
|
+
|
|
4
|
+
import { getHtmlWithPuppeteer } from '../browser.js';
|
|
5
|
+
import { getHrefFromArgs } from '../util.js';
|
|
6
|
+
import { detectFramework, Frameworks } from './detectFramework.js';
|
|
7
|
+
import { scrapePage } from './scrapePage.js';
|
|
8
|
+
import { scrapeIntercomPage } from './site-scrapers/Intercom/scrapeIntercomPage.js';
|
|
9
|
+
import { scrapeDocusaurusPage } from './site-scrapers/scrapeDocusaurusPage.js';
|
|
10
|
+
import { scrapeGitBookPage } from './site-scrapers/scrapeGitBookPage.js';
|
|
11
|
+
import { scrapeReadMePage } from './site-scrapers/scrapeReadMePage.js';
|
|
11
12
|
|
|
12
13
|
function validateFramework(framework) {
|
|
13
14
|
if (!framework) {
|
|
14
|
-
console.log(
|
|
15
|
-
|
|
16
|
-
);
|
|
17
|
-
console.log(
|
|
18
|
-
console.log(
|
|
19
|
-
console.log("scrape-page-readme");
|
|
20
|
-
console.log("scrape-page-intercom");
|
|
15
|
+
console.log('Could not detect the framework automatically. Please use one of:');
|
|
16
|
+
console.log('scrape-page-docusaurus');
|
|
17
|
+
console.log('scrape-page-gitbook');
|
|
18
|
+
console.log('scrape-page-readme');
|
|
19
|
+
console.log('scrape-page-intercom');
|
|
21
20
|
return process.exit(1);
|
|
22
21
|
}
|
|
23
22
|
}
|
|
@@ -47,7 +46,7 @@ export async function scrapePageAutomatically(argv: any) {
|
|
|
47
46
|
|
|
48
47
|
validateFramework(framework);
|
|
49
48
|
|
|
50
|
-
console.log(
|
|
49
|
+
console.log('Detected framework: ' + framework);
|
|
51
50
|
|
|
52
51
|
switch (framework) {
|
|
53
52
|
case Frameworks.DOCUSAURUS:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import path from
|
|
2
|
-
|
|
1
|
+
import path from 'path';
|
|
2
|
+
|
|
3
|
+
import { objToReadableString } from '../util.js';
|
|
3
4
|
|
|
4
5
|
export async function scrapeSection(
|
|
5
6
|
scrapeFunc: ScrapeSectionFn,
|
|
@@ -8,21 +9,12 @@ export async function scrapeSection(
|
|
|
8
9
|
overwrite: boolean,
|
|
9
10
|
version: string | undefined
|
|
10
11
|
) {
|
|
11
|
-
console.log(
|
|
12
|
-
`Started scraping${overwrite ? ", overwrite mode is on" : ""}...`
|
|
13
|
-
);
|
|
12
|
+
console.log(`Started scraping${overwrite ? ', overwrite mode is on' : ''}...`);
|
|
14
13
|
const cwd = process.cwd();
|
|
15
|
-
const imageBaseDir = path.join(cwd,
|
|
14
|
+
const imageBaseDir = path.join(cwd, 'images');
|
|
16
15
|
|
|
17
|
-
const groupsConfig = await scrapeFunc(
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
cwd,
|
|
21
|
-
imageBaseDir,
|
|
22
|
-
overwrite,
|
|
23
|
-
version
|
|
24
|
-
);
|
|
25
|
-
console.log("Finished scraping.");
|
|
26
|
-
console.log("Add the following to your navigation in mint.json:");
|
|
16
|
+
const groupsConfig = await scrapeFunc(html, origin, cwd, imageBaseDir, overwrite, version);
|
|
17
|
+
console.log('Finished scraping.');
|
|
18
|
+
console.log('Add the following to your navigation in mint.json:');
|
|
27
19
|
console.log(objToReadableString(groupsConfig));
|
|
28
20
|
}
|
|
@@ -1,15 +1,16 @@
|
|
|
1
|
-
import axios from
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
7
|
-
import {
|
|
8
|
-
import
|
|
9
|
-
import
|
|
10
|
-
import
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import { ArgumentsCamelCase } from 'yargs';
|
|
3
|
+
|
|
4
|
+
import { startBrowser } from '../browser.js';
|
|
5
|
+
import { getHrefFromArgs, getOrigin } from '../util.js';
|
|
6
|
+
import { detectFramework, Frameworks } from './detectFramework.js';
|
|
7
|
+
import { scrapeSection } from './scrapeSection.js';
|
|
8
|
+
import { scrapeIntercomSection } from './site-scrapers/Intercom/scrapeIntercomSection.js';
|
|
9
|
+
import openNestedDocusaurusMenus from './site-scrapers/openNestedDocusaurusMenus.js';
|
|
10
|
+
import openNestedGitbookMenus from './site-scrapers/openNestedGitbookMenus.js';
|
|
11
|
+
import { scrapeDocusaurusSection } from './site-scrapers/scrapeDocusaurusSection.js';
|
|
12
|
+
import { scrapeGitBookSection } from './site-scrapers/scrapeGitBookSection.js';
|
|
13
|
+
import { scrapeReadMeSection } from './site-scrapers/scrapeReadMeSection.js';
|
|
13
14
|
|
|
14
15
|
export async function scrapeSectionAxiosWrapper(
|
|
15
16
|
argv: ArgumentsCamelCase,
|
|
@@ -18,13 +19,7 @@ export async function scrapeSectionAxiosWrapper(
|
|
|
18
19
|
const href = getHrefFromArgs(argv);
|
|
19
20
|
const res = await axios.get(href);
|
|
20
21
|
const html = res.data;
|
|
21
|
-
await scrapeSection(
|
|
22
|
-
scrapeFunc,
|
|
23
|
-
html,
|
|
24
|
-
getOrigin(href),
|
|
25
|
-
!!argv.overwrite,
|
|
26
|
-
undefined
|
|
27
|
-
);
|
|
22
|
+
await scrapeSection(scrapeFunc, html, getOrigin(href), !!argv.overwrite, undefined);
|
|
28
23
|
process.exit(0);
|
|
29
24
|
}
|
|
30
25
|
|
|
@@ -41,11 +36,7 @@ export async function scrapeDocusaurusSectionCommand(
|
|
|
41
36
|
}
|
|
42
37
|
|
|
43
38
|
export async function scrapeGitbookSectionCommand(argv: any) {
|
|
44
|
-
await scrapeSectionOpeningAllNested(
|
|
45
|
-
argv,
|
|
46
|
-
openNestedGitbookMenus,
|
|
47
|
-
scrapeGitBookSection
|
|
48
|
-
);
|
|
39
|
+
await scrapeSectionOpeningAllNested(argv, openNestedGitbookMenus, scrapeGitBookSection);
|
|
49
40
|
}
|
|
50
41
|
|
|
51
42
|
async function scrapeSectionOpeningAllNested(
|
|
@@ -59,18 +50,12 @@ async function scrapeSectionOpeningAllNested(
|
|
|
59
50
|
const browser = await startBrowser();
|
|
60
51
|
const page = await browser.newPage();
|
|
61
52
|
await page.goto(href, {
|
|
62
|
-
waitUntil:
|
|
53
|
+
waitUntil: 'networkidle2',
|
|
63
54
|
});
|
|
64
55
|
|
|
65
56
|
const html = await openLinks(page);
|
|
66
57
|
browser.close();
|
|
67
|
-
await scrapeSection(
|
|
68
|
-
scrapeFunc,
|
|
69
|
-
html,
|
|
70
|
-
getOrigin(href),
|
|
71
|
-
!!argv.overwrite,
|
|
72
|
-
version
|
|
73
|
-
);
|
|
58
|
+
await scrapeSection(scrapeFunc, html, getOrigin(href), !!argv.overwrite, version);
|
|
74
59
|
process.exit(0);
|
|
75
60
|
}
|
|
76
61
|
|
|
@@ -81,7 +66,7 @@ export async function scrapeSectionAutomatically(argv: any) {
|
|
|
81
66
|
const { framework, version } = detectFramework(html);
|
|
82
67
|
|
|
83
68
|
validateFramework(framework);
|
|
84
|
-
console.log(
|
|
69
|
+
console.log('Detected framework: ' + framework);
|
|
85
70
|
|
|
86
71
|
switch (framework) {
|
|
87
72
|
case Frameworks.DOCUSAURUS:
|
|
@@ -102,7 +87,7 @@ export async function scrapeSectionAutomatically(argv: any) {
|
|
|
102
87
|
function validateFramework(framework: Frameworks | undefined) {
|
|
103
88
|
if (!framework) {
|
|
104
89
|
console.log(
|
|
105
|
-
|
|
90
|
+
'Could not detect the framework automatically. We only support Docusaurus (V2 and V3), GitBook, and ReadMe.'
|
|
106
91
|
);
|
|
107
92
|
process.exit();
|
|
108
93
|
}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import cheerio from
|
|
2
|
-
import { NodeHtmlMarkdown } from
|
|
3
|
-
|
|
4
|
-
import
|
|
1
|
+
import cheerio from 'cheerio';
|
|
2
|
+
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
3
|
+
|
|
4
|
+
import downloadAllImages from '../../downloadAllImages.js';
|
|
5
|
+
import replaceImagePaths from '../../replaceImagePaths.js';
|
|
5
6
|
|
|
6
7
|
export async function scrapeIntercomPage(
|
|
7
8
|
html: string,
|
|
@@ -13,11 +14,11 @@ export async function scrapeIntercomPage(
|
|
|
13
14
|
) {
|
|
14
15
|
const $ = cheerio.load(html);
|
|
15
16
|
|
|
16
|
-
const titleComponent = $(
|
|
17
|
+
const titleComponent = $('.t__h1').first();
|
|
17
18
|
const title = titleComponent.text().trim();
|
|
18
|
-
const description = $(
|
|
19
|
+
const description = $('.article__desc', titleComponent.parent()).text().trim();
|
|
19
20
|
|
|
20
|
-
const content = $(
|
|
21
|
+
const content = $('article').first();
|
|
21
22
|
const contentHtml = $.html(content);
|
|
22
23
|
|
|
23
24
|
const origToWritePath = await downloadAllImages(
|
|
@@ -33,16 +34,16 @@ export async function scrapeIntercomPage(
|
|
|
33
34
|
let markdown = nhm.translate(contentHtml);
|
|
34
35
|
|
|
35
36
|
// Keep headers on one line
|
|
36
|
-
markdown = markdown.replace(/# \n\n/g,
|
|
37
|
+
markdown = markdown.replace(/# \n\n/g, '# ');
|
|
37
38
|
|
|
38
39
|
// Remove unnecessary zero-width space characters
|
|
39
|
-
markdown = markdown.replace(/\u200b/g,
|
|
40
|
+
markdown = markdown.replace(/\u200b/g, '');
|
|
40
41
|
|
|
41
42
|
// Reduce unnecessary blank lines
|
|
42
|
-
markdown = markdown.replace(/\n\n\n/g,
|
|
43
|
+
markdown = markdown.replace(/\n\n\n/g, '\n\n');
|
|
43
44
|
|
|
44
45
|
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
45
|
-
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g,
|
|
46
|
+
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
|
|
46
47
|
if (origToWritePath) {
|
|
47
48
|
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
48
49
|
}
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
|
|
4
|
-
import downloadLogoImage from
|
|
5
|
-
import
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import cheerio from 'cheerio';
|
|
3
|
+
|
|
4
|
+
import downloadLogoImage from '../../downloadLogoImage.js';
|
|
5
|
+
import { scrapeGettingFileNameFromUrl } from '../../scrapeGettingFileNameFromUrl.js';
|
|
6
|
+
import { scrapeIntercomPage } from './scrapeIntercomPage.js';
|
|
6
7
|
|
|
7
8
|
export async function scrapeIntercomSection(
|
|
8
9
|
html: string,
|
|
@@ -14,27 +15,25 @@ export async function scrapeIntercomSection(
|
|
|
14
15
|
): Promise<MintNavigationEntry[]> {
|
|
15
16
|
let $ = cheerio.load(html);
|
|
16
17
|
|
|
17
|
-
const logoSrc = $(
|
|
18
|
+
const logoSrc = $('.header__logo img').first().attr('src');
|
|
18
19
|
downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite);
|
|
19
20
|
|
|
20
|
-
const collectionsLink = $(
|
|
21
|
-
const collectionsMap = collectionsLink
|
|
22
|
-
.
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
};
|
|
37
|
-
});
|
|
21
|
+
const collectionsLink = $('.section .g__space a');
|
|
22
|
+
const collectionsMap = collectionsLink.toArray().map(async (s: cheerio.Element) => {
|
|
23
|
+
const href = $(s).attr('href');
|
|
24
|
+
const res = await axios.get(`${origin}${href}`);
|
|
25
|
+
const html = res.data;
|
|
26
|
+
$ = cheerio.load(html);
|
|
27
|
+
const sectionTitle = $('.collection h1').first().text().trim();
|
|
28
|
+
const sectionPages = $('.section .g__space a')
|
|
29
|
+
.toArray()
|
|
30
|
+
.map((s: cheerio.Element) => $(s).attr('href'))
|
|
31
|
+
.filter((page) => page !== undefined) as string[];
|
|
32
|
+
return {
|
|
33
|
+
group: sectionTitle,
|
|
34
|
+
pages: sectionPages,
|
|
35
|
+
};
|
|
36
|
+
});
|
|
38
37
|
|
|
39
38
|
const collections: MintNavigation[] = await Promise.all(collectionsMap);
|
|
40
39
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import alternateGroupTitle from
|
|
2
|
-
import getLinksRecursively from
|
|
1
|
+
import alternateGroupTitle from '../alternateGroupTitle.js';
|
|
2
|
+
import getLinksRecursively from './getLinksRecursively.js';
|
|
3
3
|
|
|
4
4
|
export function getDocusaurusLinksPerGroup(
|
|
5
5
|
navigationSections: any,
|
|
6
6
|
$: any,
|
|
7
7
|
version: string | undefined
|
|
8
8
|
) {
|
|
9
|
-
if (version ===
|
|
9
|
+
if (version === '3' || version === '2') {
|
|
10
10
|
return getDocusaurusLinksPerGroupLoop(navigationSections, $);
|
|
11
11
|
}
|
|
12
12
|
return [];
|
|
@@ -17,21 +17,18 @@ function getDocusaurusLinksPerGroupLoop(navigationSections: any, $: any) {
|
|
|
17
17
|
const section = $(s);
|
|
18
18
|
|
|
19
19
|
// Links without a group
|
|
20
|
-
if (section.hasClass(
|
|
21
|
-
const linkHref = section.find(
|
|
20
|
+
if (section.hasClass('theme-doc-sidebar-item-link')) {
|
|
21
|
+
const linkHref = section.find('a[href]').first().attr('href');
|
|
22
22
|
return {
|
|
23
|
-
group:
|
|
23
|
+
group: '',
|
|
24
24
|
pages: [linkHref],
|
|
25
25
|
};
|
|
26
26
|
}
|
|
27
27
|
|
|
28
|
-
const firstLink = section
|
|
29
|
-
.find(".menu__list-item-collapsible")
|
|
30
|
-
.first()
|
|
31
|
-
.find("a[href]");
|
|
28
|
+
const firstLink = section.find('.menu__list-item-collapsible').first().find('a[href]');
|
|
32
29
|
|
|
33
30
|
const sectionTitle = firstLink.text();
|
|
34
|
-
const firstHref = firstLink.attr(
|
|
31
|
+
const firstHref = firstLink.attr('href');
|
|
35
32
|
const linkSections = section.children().eq(1).children();
|
|
36
33
|
|
|
37
34
|
const pages = getLinksRecursively(linkSections, $);
|
|
@@ -9,20 +9,20 @@ export default function getLinksRecursively(linkSections: any, $: any) {
|
|
|
9
9
|
const subsection = $(s);
|
|
10
10
|
let link = subsection.children().first();
|
|
11
11
|
|
|
12
|
-
if (!link.attr(
|
|
12
|
+
if (!link.attr('href')) {
|
|
13
13
|
// Docusaurus nests the <a> inside a <div>
|
|
14
|
-
link = link.find(
|
|
14
|
+
link = link.find('a[href]').first();
|
|
15
15
|
}
|
|
16
|
-
const linkHref = link.attr(
|
|
16
|
+
const linkHref = link.attr('href');
|
|
17
17
|
|
|
18
18
|
// Skip missing links. For example, GitBook uses
|
|
19
19
|
// empty divs for styling a line beside the nav.
|
|
20
20
|
// Skip external links until Mintlify supports them
|
|
21
21
|
if (
|
|
22
22
|
!linkHref ||
|
|
23
|
-
linkHref ===
|
|
24
|
-
linkHref.startsWith(
|
|
25
|
-
linkHref.startsWith(
|
|
23
|
+
linkHref === '#' ||
|
|
24
|
+
linkHref.startsWith('https://') ||
|
|
25
|
+
linkHref.startsWith('http://')
|
|
26
26
|
) {
|
|
27
27
|
return undefined;
|
|
28
28
|
}
|
|
@@ -14,16 +14,16 @@ export default function getLinksRecursivelyGitBook(linkSections: any, $: any) {
|
|
|
14
14
|
}
|
|
15
15
|
|
|
16
16
|
const link = subsection.children().first();
|
|
17
|
-
const linkHref = link.attr(
|
|
17
|
+
const linkHref = link.attr('href');
|
|
18
18
|
|
|
19
19
|
// Skip missing links. For example, GitBook uses
|
|
20
20
|
// empty divs for styling a line beside the nav.
|
|
21
21
|
// Skip external links until Mintlify supports them
|
|
22
22
|
if (
|
|
23
23
|
!linkHref ||
|
|
24
|
-
linkHref ===
|
|
25
|
-
linkHref.startsWith(
|
|
26
|
-
linkHref.startsWith(
|
|
24
|
+
linkHref === '#' ||
|
|
25
|
+
linkHref.startsWith('https://') ||
|
|
26
|
+
linkHref.startsWith('http://')
|
|
27
27
|
) {
|
|
28
28
|
return undefined;
|
|
29
29
|
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { Page } from
|
|
1
|
+
import { Page } from 'puppeteer';
|
|
2
2
|
|
|
3
3
|
export default async function openNestedDocusaurusMenus(page: Page) {
|
|
4
4
|
let prevEncountered: string[] = [];
|
|
5
|
-
let encounteredHref = [
|
|
5
|
+
let encounteredHref = ['fake-href-to-make-loop-run-at-least-once'];
|
|
6
6
|
|
|
7
7
|
// Loop until we've encountered every link
|
|
8
8
|
while (!encounteredHref.every((href) => prevEncountered.includes(href))) {
|
|
@@ -10,15 +10,15 @@ export default async function openNestedDocusaurusMenus(page: Page) {
|
|
|
10
10
|
encounteredHref = await page.evaluate(
|
|
11
11
|
(encounteredHref) => {
|
|
12
12
|
const collapsible: HTMLElement[] = Array.from(
|
|
13
|
-
document.querySelectorAll(
|
|
13
|
+
document.querySelectorAll('.menu__link.menu__link--sublist')
|
|
14
14
|
);
|
|
15
15
|
|
|
16
16
|
const linksFound: string[] = [];
|
|
17
17
|
collapsible.forEach(async (collapsibleItem: HTMLElement) => {
|
|
18
|
-
const href = collapsibleItem?.getAttribute(
|
|
18
|
+
const href = collapsibleItem?.getAttribute('href');
|
|
19
19
|
|
|
20
20
|
// Should never occur but we keep it as a fail-safe
|
|
21
|
-
if (href?.startsWith(
|
|
21
|
+
if (href?.startsWith('https://') || href?.startsWith('http://')) {
|
|
22
22
|
return;
|
|
23
23
|
}
|
|
24
24
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Page } from
|
|
1
|
+
import { Page } from 'puppeteer';
|
|
2
2
|
|
|
3
3
|
export default async function openNestedGitbookMenus(page: Page) {
|
|
4
4
|
let clickedAny = true;
|
|
@@ -7,9 +7,7 @@ export default async function openNestedGitbookMenus(page: Page) {
|
|
|
7
7
|
while (clickedAny) {
|
|
8
8
|
clickedAny = await page.evaluate(() => {
|
|
9
9
|
// Right pointing arrow. Only closed menus have this icon
|
|
10
|
-
const icons: HTMLElement[] = Array.from(
|
|
11
|
-
document.querySelectorAll('path[d="M9 18l6-6-6-6"]')
|
|
12
|
-
);
|
|
10
|
+
const icons: HTMLElement[] = Array.from(document.querySelectorAll('path[d="M9 18l6-6-6-6"]'));
|
|
13
11
|
|
|
14
12
|
icons.forEach(async (icon: HTMLElement) => {
|
|
15
13
|
const toClick = icon?.parentElement?.parentElement;
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import cheerio from
|
|
2
|
-
import { NodeHtmlMarkdown } from
|
|
3
|
-
|
|
4
|
-
import
|
|
1
|
+
import cheerio from 'cheerio';
|
|
2
|
+
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
3
|
+
|
|
4
|
+
import downloadAllImages from '../downloadAllImages.js';
|
|
5
|
+
import replaceImagePaths from '../replaceImagePaths.js';
|
|
5
6
|
|
|
6
7
|
export async function scrapeDocusaurusPage(
|
|
7
8
|
html: string,
|
|
@@ -17,24 +18,22 @@ export async function scrapeDocusaurusPage(
|
|
|
17
18
|
}> {
|
|
18
19
|
const $ = cheerio.load(html);
|
|
19
20
|
|
|
20
|
-
const article =
|
|
21
|
-
version === "3" ? $(".theme-doc-markdown").first() : $("article").first();
|
|
21
|
+
const article = version === '3' ? $('.theme-doc-markdown').first() : $('article').first();
|
|
22
22
|
|
|
23
23
|
if (article.length === 0) {
|
|
24
24
|
// Index pages with no additional text don't have the markdown class
|
|
25
25
|
return {
|
|
26
|
-
title: ''
|
|
26
|
+
title: '',
|
|
27
27
|
};
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
-
const titleComponent = article.find(
|
|
30
|
+
const titleComponent = article.find('h1');
|
|
31
31
|
const title = titleComponent.text().trim();
|
|
32
32
|
|
|
33
33
|
// Do not include title in the content when we insert it in our metadata
|
|
34
34
|
titleComponent.remove();
|
|
35
35
|
|
|
36
|
-
const markdownContent =
|
|
37
|
-
version === "3" ? article : article.find(".markdown").first();
|
|
36
|
+
const markdownContent = version === '3' ? article : article.find('.markdown').first();
|
|
38
37
|
|
|
39
38
|
const origToWritePath = await downloadAllImages(
|
|
40
39
|
$,
|
|
@@ -50,10 +49,8 @@ export async function scrapeDocusaurusPage(
|
|
|
50
49
|
let markdown = markdownHtml ? nhm.translate(markdownHtml) : null;
|
|
51
50
|
|
|
52
51
|
if (markdown == null) {
|
|
53
|
-
console.error(
|
|
54
|
-
|
|
55
|
-
);
|
|
56
|
-
return { title, description: undefined, markdown: "" };
|
|
52
|
+
console.error('We do not support scraping this page. Content will be empty');
|
|
53
|
+
return { title, description: undefined, markdown: '' };
|
|
57
54
|
}
|
|
58
55
|
|
|
59
56
|
// Description only exists in meta tags. The code is commented out because it's prone to incorrectly
|
|
@@ -70,16 +67,16 @@ export async function scrapeDocusaurusPage(
|
|
|
70
67
|
// When we parse their HTML the parser adds things like:
|
|
71
68
|
// [](#setup "Direct link to heading")
|
|
72
69
|
// to the end of each header.
|
|
73
|
-
markdown = markdown.replace(/\[\]\(#.+ ".+"\)\n/g,
|
|
70
|
+
markdown = markdown.replace(/\[\]\(#.+ ".+"\)\n/g, '\n');
|
|
74
71
|
|
|
75
72
|
// Remove unnecessary zero-width space characters
|
|
76
|
-
markdown = markdown.replace(/\u200b/g,
|
|
73
|
+
markdown = markdown.replace(/\u200b/g, '');
|
|
77
74
|
|
|
78
75
|
// Reduce unnecessary blank lines
|
|
79
|
-
markdown = markdown.replace(/\n\n\n/g,
|
|
76
|
+
markdown = markdown.replace(/\n\n\n/g, '\n\n');
|
|
80
77
|
|
|
81
78
|
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
82
|
-
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g,
|
|
79
|
+
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
|
|
83
80
|
if (origToWritePath) {
|
|
84
81
|
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
85
82
|
}
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import cheerio from
|
|
2
|
-
|
|
3
|
-
import combineNavWithEmptyGroupTitles from
|
|
4
|
-
import
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
1
|
+
import cheerio from 'cheerio';
|
|
2
|
+
|
|
3
|
+
import combineNavWithEmptyGroupTitles from '../combineNavWithEmptyGroupTitles.js';
|
|
4
|
+
import downloadLogoImage from '../downloadLogoImage.js';
|
|
5
|
+
import { scrapeGettingFileNameFromUrl } from '../scrapeGettingFileNameFromUrl.js';
|
|
6
|
+
import { getDocusaurusLinksPerGroup } from './links-per-group/getDocusaurusLinksPerGroup.js';
|
|
7
|
+
import { scrapeDocusaurusPage } from './scrapeDocusaurusPage.js';
|
|
7
8
|
|
|
8
9
|
export async function scrapeDocusaurusSection(
|
|
9
10
|
html: string,
|
|
@@ -16,18 +17,14 @@ export async function scrapeDocusaurusSection(
|
|
|
16
17
|
const $ = cheerio.load(html);
|
|
17
18
|
|
|
18
19
|
// Download the logo
|
|
19
|
-
const logoSrc = $(
|
|
20
|
+
const logoSrc = $('.navbar__logo img').attr('src');
|
|
20
21
|
downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite);
|
|
21
22
|
|
|
22
23
|
// Get all the navigation sections
|
|
23
|
-
const navigationSections = $(
|
|
24
|
+
const navigationSections = $('.theme-doc-sidebar-menu').first().children();
|
|
24
25
|
|
|
25
26
|
// Get all links per group
|
|
26
|
-
const groupsConfig: MintNavigation[] = getDocusaurusLinksPerGroup(
|
|
27
|
-
navigationSections,
|
|
28
|
-
$,
|
|
29
|
-
version
|
|
30
|
-
);
|
|
27
|
+
const groupsConfig: MintNavigation[] = getDocusaurusLinksPerGroup(navigationSections, $, version);
|
|
31
28
|
|
|
32
29
|
// Merge groups with empty titles together
|
|
33
30
|
const reducedGroupsConfig = combineNavWithEmptyGroupTitles(groupsConfig);
|
|
@@ -48,7 +45,7 @@ export async function scrapeDocusaurusSection(
|
|
|
48
45
|
scrapeDocusaurusPage,
|
|
49
46
|
false,
|
|
50
47
|
version,
|
|
51
|
-
|
|
48
|
+
'/docs'
|
|
52
49
|
)
|
|
53
50
|
)
|
|
54
51
|
)
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import cheerio from
|
|
2
|
-
import { NodeHtmlMarkdown } from
|
|
3
|
-
|
|
4
|
-
import
|
|
1
|
+
import cheerio from 'cheerio';
|
|
2
|
+
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
3
|
+
|
|
4
|
+
import downloadAllImages from '../downloadAllImages.js';
|
|
5
|
+
import replaceImagePaths from '../replaceImagePaths.js';
|
|
5
6
|
|
|
6
7
|
export async function scrapeGitBookPage(
|
|
7
8
|
html: string,
|
|
@@ -16,9 +17,7 @@ export async function scrapeGitBookPage(
|
|
|
16
17
|
const titleComponent = $('[data-testid="page.title"]').first();
|
|
17
18
|
const titleAndDescription = titleComponent.parent().parent().parent().text();
|
|
18
19
|
|
|
19
|
-
const description = titleAndDescription
|
|
20
|
-
.replace(titleComponent.text(), "")
|
|
21
|
-
.trim();
|
|
20
|
+
const description = titleAndDescription.replace(titleComponent.text(), '').trim();
|
|
22
21
|
const title = titleComponent.text().trim();
|
|
23
22
|
|
|
24
23
|
const content = $('[data-testid="page.contentEditor"]').first();
|
|
@@ -32,8 +31,8 @@ export async function scrapeGitBookPage(
|
|
|
32
31
|
.children()
|
|
33
32
|
.toArray()
|
|
34
33
|
.map((d) => $(d).text())
|
|
35
|
-
.filter((text) => text !==
|
|
36
|
-
.join(
|
|
34
|
+
.filter((text) => text !== '')
|
|
35
|
+
.join('\n');
|
|
37
36
|
code.replaceWith(`<pre><code>${codeContent}</code></pre>`);
|
|
38
37
|
});
|
|
39
38
|
|
|
@@ -42,7 +41,7 @@ export async function scrapeGitBookPage(
|
|
|
42
41
|
const modifyFileName = (fileName: string) =>
|
|
43
42
|
// Remove GitBook metadata from the start
|
|
44
43
|
// The first four %2F split metadata fields. Remaining ones are part of the file name.
|
|
45
|
-
fileName.split(
|
|
44
|
+
fileName.split('%2F').slice(4).join('%2F');
|
|
46
45
|
|
|
47
46
|
const origToWritePath = await downloadAllImages(
|
|
48
47
|
$,
|
|
@@ -57,16 +56,16 @@ export async function scrapeGitBookPage(
|
|
|
57
56
|
let markdown = nhm.translate(contentHtml);
|
|
58
57
|
|
|
59
58
|
// Keep headers on one line
|
|
60
|
-
markdown = markdown.replace(/# \n\n/g,
|
|
59
|
+
markdown = markdown.replace(/# \n\n/g, '# ');
|
|
61
60
|
|
|
62
61
|
// Remove unnecessary zero-width space characters
|
|
63
|
-
markdown = markdown.replace(/\u200b/g,
|
|
62
|
+
markdown = markdown.replace(/\u200b/g, '');
|
|
64
63
|
|
|
65
64
|
// Reduce unnecessary blank lines
|
|
66
|
-
markdown = markdown.replace(/\n\n\n/g,
|
|
65
|
+
markdown = markdown.replace(/\n\n\n/g, '\n\n');
|
|
67
66
|
|
|
68
67
|
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
69
|
-
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g,
|
|
68
|
+
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
|
|
70
69
|
if (origToWritePath) {
|
|
71
70
|
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
72
71
|
}
|