@mintlify/scraping 3.0.187 → 3.0.189
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -5
- package/bin/assert.d.ts +5 -0
- package/bin/assert.js +13 -0
- package/bin/assert.js.map +1 -0
- package/bin/cli.js +43 -72
- package/bin/cli.js.map +1 -1
- package/bin/components/Accordion.d.ts +5 -0
- package/bin/components/Accordion.js +54 -0
- package/bin/components/Accordion.js.map +1 -0
- package/bin/components/AccordionGroup.d.ts +5 -0
- package/bin/components/AccordionGroup.js +52 -0
- package/bin/components/AccordionGroup.js.map +1 -0
- package/bin/components/Callout.d.ts +5 -0
- package/bin/components/Callout.js +114 -0
- package/bin/components/Callout.js.map +1 -0
- package/bin/components/Card.d.ts +5 -0
- package/bin/components/Card.js +135 -0
- package/bin/components/Card.js.map +1 -0
- package/bin/components/CardGroup.d.ts +5 -0
- package/bin/components/CardGroup.js +52 -0
- package/bin/components/CardGroup.js.map +1 -0
- package/bin/components/CodeGroup.d.ts +5 -0
- package/bin/components/CodeGroup.js +166 -0
- package/bin/components/CodeGroup.js.map +1 -0
- package/bin/components/Frame.d.ts +5 -0
- package/bin/components/Frame.js +51 -0
- package/bin/components/Frame.js.map +1 -0
- package/bin/components/Tabs.d.ts +5 -0
- package/bin/components/Tabs.js +122 -0
- package/bin/components/Tabs.js.map +1 -0
- package/bin/components/link.d.ts +2 -0
- package/bin/components/link.js +16 -0
- package/bin/components/link.js.map +1 -0
- package/bin/constants.d.ts +6 -7
- package/bin/constants.js +31 -12
- package/bin/constants.js.map +1 -1
- package/bin/customComponents/create.d.ts +10 -0
- package/bin/customComponents/create.js +69 -0
- package/bin/customComponents/create.js.map +1 -0
- package/bin/customComponents/plugin.d.ts +2 -0
- package/bin/customComponents/plugin.js +26 -0
- package/bin/customComponents/plugin.js.map +1 -0
- package/bin/customComponents/selective.d.ts +6 -0
- package/bin/customComponents/selective.js +29 -0
- package/bin/customComponents/selective.js.map +1 -0
- package/bin/nav/iterate.d.ts +2 -0
- package/bin/nav/iterate.js +15 -0
- package/bin/nav/iterate.js.map +1 -0
- package/bin/nav/listItems.d.ts +8 -0
- package/bin/nav/listItems.js +62 -0
- package/bin/nav/listItems.js.map +1 -0
- package/bin/nav/retrieve.d.ts +3 -0
- package/bin/nav/retrieve.js +75 -0
- package/bin/nav/retrieve.js.map +1 -0
- package/bin/nav/root.d.ts +2 -0
- package/bin/nav/root.js +40 -0
- package/bin/nav/root.js.map +1 -0
- package/bin/openapi/generateOpenApiPages.js +2 -2
- package/bin/openapi/generateOpenApiPages.js.map +1 -1
- package/bin/root/retrieve.d.ts +2 -0
- package/bin/root/retrieve.js +46 -0
- package/bin/root/retrieve.js.map +1 -0
- package/bin/scrapingPipeline/group.d.ts +5 -0
- package/bin/scrapingPipeline/group.js +46 -0
- package/bin/scrapingPipeline/group.js.map +1 -0
- package/bin/scrapingPipeline/icon.d.ts +2 -0
- package/bin/scrapingPipeline/icon.js +22 -0
- package/bin/scrapingPipeline/icon.js.map +1 -0
- package/bin/scrapingPipeline/images.d.ts +3 -0
- package/bin/scrapingPipeline/images.js +50 -0
- package/bin/scrapingPipeline/images.js.map +1 -0
- package/bin/scrapingPipeline/logo.d.ts +5 -0
- package/bin/scrapingPipeline/logo.js +92 -0
- package/bin/scrapingPipeline/logo.js.map +1 -0
- package/bin/scrapingPipeline/page.d.ts +6 -0
- package/bin/scrapingPipeline/page.js +102 -0
- package/bin/scrapingPipeline/page.js.map +1 -0
- package/bin/scrapingPipeline/root.d.ts +2 -0
- package/bin/scrapingPipeline/root.js +8 -0
- package/bin/scrapingPipeline/root.js.map +1 -0
- package/bin/scrapingPipeline/site.d.ts +7 -0
- package/bin/scrapingPipeline/site.js +129 -0
- package/bin/scrapingPipeline/site.js.map +1 -0
- package/bin/scrapingPipeline/tabs.d.ts +3 -0
- package/bin/scrapingPipeline/tabs.js +67 -0
- package/bin/scrapingPipeline/tabs.js.map +1 -0
- package/bin/tabs/retrieveReadme.d.ts +3 -0
- package/bin/tabs/retrieveReadme.js +78 -0
- package/bin/tabs/retrieveReadme.js.map +1 -0
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/types/components.d.ts +2 -0
- package/bin/types/components.js +2 -0
- package/bin/types/components.js.map +1 -0
- package/bin/types/framework.d.ts +8 -0
- package/bin/types/framework.js +3 -0
- package/bin/types/framework.js.map +1 -0
- package/bin/types/hast.d.ts +6 -0
- package/bin/types/hast.js +2 -0
- package/bin/types/hast.js.map +1 -0
- package/bin/types/result.d.ts +7 -0
- package/bin/types/result.js +2 -0
- package/bin/types/result.js.map +1 -0
- package/bin/types/scrapeFunc.d.ts +3 -0
- package/bin/types/scrapeFunc.js +2 -0
- package/bin/types/scrapeFunc.js.map +1 -0
- package/bin/utils/append.d.ts +1 -0
- package/bin/utils/append.js +12 -0
- package/bin/utils/append.js.map +1 -0
- package/bin/utils/children.d.ts +5 -0
- package/bin/utils/children.js +35 -0
- package/bin/utils/children.js.map +1 -0
- package/bin/utils/className.d.ts +3 -0
- package/bin/utils/className.js +13 -0
- package/bin/utils/className.js.map +1 -0
- package/bin/utils/detectFramework.d.ts +4 -0
- package/bin/utils/detectFramework.js +60 -0
- package/bin/utils/detectFramework.js.map +1 -0
- package/bin/utils/emptyParagraphs.d.ts +3 -0
- package/bin/utils/emptyParagraphs.js +19 -0
- package/bin/utils/emptyParagraphs.js.map +1 -0
- package/bin/utils/errors.d.ts +3 -0
- package/bin/utils/errors.js +16 -0
- package/bin/utils/errors.js.map +1 -0
- package/bin/utils/escape.d.ts +2 -0
- package/bin/utils/escape.js +25 -0
- package/bin/utils/escape.js.map +1 -0
- package/bin/utils/extension.d.ts +3 -0
- package/bin/utils/extension.js +18 -0
- package/bin/utils/extension.js.map +1 -0
- package/bin/utils/file.d.ts +4 -0
- package/bin/utils/file.js +43 -0
- package/bin/utils/file.js.map +1 -0
- package/bin/utils/firstChild.d.ts +2 -0
- package/bin/utils/firstChild.js +12 -0
- package/bin/utils/firstChild.js.map +1 -0
- package/bin/utils/images.d.ts +5 -0
- package/bin/utils/images.js +86 -0
- package/bin/utils/images.js.map +1 -0
- package/bin/utils/img.d.ts +2 -0
- package/bin/utils/img.js +15 -0
- package/bin/utils/img.js.map +1 -0
- package/bin/utils/log.d.ts +18 -0
- package/bin/utils/log.js +68 -0
- package/bin/utils/log.js.map +1 -0
- package/bin/utils/nestedRoots.d.ts +7 -0
- package/bin/utils/nestedRoots.js +19 -0
- package/bin/utils/nestedRoots.js.map +1 -0
- package/bin/utils/network.d.ts +5 -0
- package/bin/utils/network.js +82 -0
- package/bin/utils/network.js.map +1 -0
- package/bin/utils/path.d.ts +1 -0
- package/bin/utils/path.js +22 -0
- package/bin/utils/path.js.map +1 -0
- package/bin/utils/position.d.ts +3 -0
- package/bin/utils/position.js +12 -0
- package/bin/utils/position.js.map +1 -0
- package/bin/utils/reservedNames.d.ts +4 -0
- package/bin/utils/reservedNames.js +27 -0
- package/bin/utils/reservedNames.js.map +1 -0
- package/bin/utils/strings.d.ts +2 -0
- package/bin/utils/strings.js +7 -0
- package/bin/utils/strings.js.map +1 -0
- package/bin/utils/text.d.ts +2 -0
- package/bin/utils/text.js +11 -0
- package/bin/utils/text.js.map +1 -0
- package/bin/utils/title.d.ts +10 -0
- package/bin/utils/title.js +58 -0
- package/bin/utils/title.js.map +1 -0
- package/bin/utils/url.d.ts +3 -0
- package/bin/utils/url.js +10 -0
- package/bin/utils/url.js.map +1 -0
- package/package.json +20 -11
- package/src/assert.ts +15 -0
- package/src/cli.ts +53 -90
- package/src/components/Accordion.ts +84 -0
- package/src/components/AccordionGroup.ts +69 -0
- package/src/components/Callout.ts +159 -0
- package/src/components/Card.ts +168 -0
- package/src/components/CardGroup.ts +69 -0
- package/src/components/CodeGroup.ts +209 -0
- package/src/components/Frame.ts +86 -0
- package/src/components/Tabs.ts +154 -0
- package/src/components/link.ts +17 -0
- package/src/constants.ts +37 -19
- package/src/customComponents/create.ts +106 -0
- package/src/customComponents/plugin.ts +31 -0
- package/src/customComponents/selective.ts +37 -0
- package/src/nav/iterate.ts +18 -0
- package/src/nav/listItems.ts +82 -0
- package/src/nav/retrieve.ts +88 -0
- package/src/nav/root.ts +47 -0
- package/src/openapi/generateOpenApiPages.ts +2 -2
- package/src/root/retrieve.ts +52 -0
- package/src/scrapingPipeline/group.ts +62 -0
- package/src/scrapingPipeline/icon.ts +26 -0
- package/src/scrapingPipeline/images.ts +67 -0
- package/src/scrapingPipeline/logo.ts +127 -0
- package/src/scrapingPipeline/page.ts +130 -0
- package/src/scrapingPipeline/root.ts +10 -0
- package/src/scrapingPipeline/site.ts +161 -0
- package/src/scrapingPipeline/tabs.ts +87 -0
- package/src/tabs/retrieveReadme.ts +99 -0
- package/src/types/components.ts +3 -0
- package/src/types/framework.ts +10 -0
- package/src/types/hast.ts +12 -0
- package/src/types/result.ts +1 -0
- package/src/types/scrapeFunc.ts +9 -0
- package/src/utils/append.ts +9 -0
- package/src/utils/children.ts +51 -0
- package/src/utils/className.ts +14 -0
- package/src/utils/detectFramework.ts +72 -0
- package/src/utils/emptyParagraphs.ts +21 -0
- package/src/utils/errors.ts +24 -0
- package/src/utils/escape.ts +30 -0
- package/src/utils/extension.ts +19 -0
- package/src/utils/file.ts +58 -0
- package/src/utils/firstChild.ts +13 -0
- package/src/utils/images.ts +101 -0
- package/src/utils/img.ts +17 -0
- package/src/utils/log.ts +82 -0
- package/src/utils/nestedRoots.ts +20 -0
- package/src/utils/network.ts +95 -0
- package/src/utils/path.ts +27 -0
- package/src/utils/position.ts +14 -0
- package/src/utils/reservedNames.ts +31 -0
- package/src/utils/strings.ts +7 -0
- package/src/utils/text.ts +11 -0
- package/src/utils/title.ts +68 -0
- package/src/utils/url.ts +8 -0
- package/bin/browser.d.ts +0 -2
- package/bin/browser.js +0 -24
- package/bin/browser.js.map +0 -1
- package/bin/checks.d.ts +0 -8
- package/bin/checks.js +0 -24
- package/bin/checks.js.map +0 -1
- package/bin/downloadImage.d.ts +0 -5
- package/bin/downloadImage.js +0 -88
- package/bin/downloadImage.js.map +0 -1
- package/bin/scraping/combineNavWithEmptyGroupTitles.d.ts +0 -2
- package/bin/scraping/combineNavWithEmptyGroupTitles.js +0 -20
- package/bin/scraping/combineNavWithEmptyGroupTitles.js.map +0 -1
- package/bin/scraping/detectFramework.d.ts +0 -9
- package/bin/scraping/detectFramework.js +0 -36
- package/bin/scraping/detectFramework.js.map +0 -1
- package/bin/scraping/downloadAllImages.d.ts +0 -4
- package/bin/scraping/downloadAllImages.js +0 -36
- package/bin/scraping/downloadAllImages.js.map +0 -1
- package/bin/scraping/downloadLogoImage.d.ts +0 -1
- package/bin/scraping/downloadLogoImage.js +0 -12
- package/bin/scraping/downloadLogoImage.js.map +0 -1
- package/bin/scraping/replaceImagePaths.d.ts +0 -1
- package/bin/scraping/replaceImagePaths.js +0 -14
- package/bin/scraping/replaceImagePaths.js.map +0 -1
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.d.ts +0 -6
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js +0 -46
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js.map +0 -1
- package/bin/scraping/scrapeGettingFileNameFromUrl.d.ts +0 -6
- package/bin/scraping/scrapeGettingFileNameFromUrl.js +0 -13
- package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +0 -1
- package/bin/scraping/scrapePage.d.ts +0 -8
- package/bin/scraping/scrapePage.js +0 -10
- package/bin/scraping/scrapePage.js.map +0 -1
- package/bin/scraping/scrapePageCommands.d.ts +0 -7
- package/bin/scraping/scrapePageCommands.js +0 -50
- package/bin/scraping/scrapePageCommands.js.map +0 -1
- package/bin/scraping/scrapeSection.d.ts +0 -3
- package/bin/scraping/scrapeSection.js +0 -12
- package/bin/scraping/scrapeSection.js.map +0 -1
- package/bin/scraping/scrapeSectionCommands.d.ts +0 -6
- package/bin/scraping/scrapeSectionCommands.js +0 -63
- package/bin/scraping/scrapeSectionCommands.js.map +0 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js +0 -29
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js.map +0 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js +0 -31
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js.map +0 -1
- package/bin/scraping/site-scrapers/alternateGroupTitle.d.ts +0 -3
- package/bin/scraping/site-scrapers/alternateGroupTitle.js +0 -9
- package/bin/scraping/site-scrapers/alternateGroupTitle.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.d.ts +0 -5
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js +0 -33
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.d.ts +0 -3
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js +0 -35
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.d.ts +0 -3
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js +0 -33
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js.map +0 -1
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.d.ts +0 -2
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js +0 -30
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js.map +0 -1
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.d.ts +0 -2
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.js +0 -21
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +0 -53
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +0 -32
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeGitBookPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js +0 -56
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeGitBookSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js +0 -42
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeReadMePage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeReadMePage.js +0 -38
- package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeReadMeSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js +0 -39
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +0 -1
- package/bin/util.d.ts +0 -29
- package/bin/util.js +0 -97
- package/bin/util.js.map +0 -1
- package/src/browser.ts +0 -24
- package/src/checks.ts +0 -32
- package/src/downloadImage.ts +0 -102
- package/src/scraping/combineNavWithEmptyGroupTitles.ts +0 -21
- package/src/scraping/detectFramework.ts +0 -55
- package/src/scraping/downloadAllImages.ts +0 -61
- package/src/scraping/downloadLogoImage.ts +0 -24
- package/src/scraping/replaceImagePaths.ts +0 -17
- package/src/scraping/scrapeFileGettingFileNameFromUrl.ts +0 -84
- package/src/scraping/scrapeGettingFileNameFromUrl.ts +0 -56
- package/src/scraping/scrapePage.ts +0 -40
- package/src/scraping/scrapePageCommands.ts +0 -68
- package/src/scraping/scrapeSection.ts +0 -30
- package/src/scraping/scrapeSectionCommands.ts +0 -98
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomPage.ts +0 -52
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomSection.ts +0 -54
- package/src/scraping/site-scrapers/alternateGroupTitle.ts +0 -11
- package/src/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.ts +0 -45
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursively.ts +0 -47
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.ts +0 -44
- package/src/scraping/site-scrapers/openNestedDocusaurusMenus.ts +0 -42
- package/src/scraping/site-scrapers/openNestedGitbookMenus.ts +0 -27
- package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +0 -85
- package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +0 -63
- package/src/scraping/site-scrapers/scrapeGitBookPage.ts +0 -82
- package/src/scraping/site-scrapers/scrapeGitBookSection.ts +0 -69
- package/src/scraping/site-scrapers/scrapeReadMePage.ts +0 -56
- package/src/scraping/site-scrapers/scrapeReadMeSection.ts +0 -66
- package/src/util.ts +0 -122
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import path from 'path';
|
|
2
|
-
|
|
3
|
-
import { createPage, getOrigin } from '../util.js';
|
|
4
|
-
|
|
5
|
-
type ScrapePageResult = {
|
|
6
|
-
title: string;
|
|
7
|
-
description?: string;
|
|
8
|
-
markdown?: string;
|
|
9
|
-
};
|
|
10
|
-
|
|
11
|
-
export type ScrapePageFn = (
|
|
12
|
-
html: string,
|
|
13
|
-
origin: string,
|
|
14
|
-
cliDir: string,
|
|
15
|
-
imageBaseDir: string,
|
|
16
|
-
overwrite: boolean,
|
|
17
|
-
version: string | undefined
|
|
18
|
-
) => Promise<ScrapePageResult>;
|
|
19
|
-
|
|
20
|
-
export async function scrapePage(
|
|
21
|
-
scrapeFunc: ScrapePageFn,
|
|
22
|
-
href: string,
|
|
23
|
-
html: string,
|
|
24
|
-
overwrite: boolean,
|
|
25
|
-
version: string | undefined
|
|
26
|
-
) {
|
|
27
|
-
const origin = getOrigin(href);
|
|
28
|
-
const cwd = process.cwd();
|
|
29
|
-
const imageBaseDir = path.join(cwd, 'images');
|
|
30
|
-
|
|
31
|
-
const { title, description, markdown } = await scrapeFunc(
|
|
32
|
-
html,
|
|
33
|
-
origin,
|
|
34
|
-
cwd,
|
|
35
|
-
imageBaseDir,
|
|
36
|
-
overwrite,
|
|
37
|
-
version
|
|
38
|
-
);
|
|
39
|
-
createPage(title, description, markdown, overwrite, process.cwd());
|
|
40
|
-
}
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
|
|
3
|
-
import { getHtmlWithPuppeteer } from '../browser.js';
|
|
4
|
-
import { detectFramework, Framework, FrameworkHint, frameworks } from './detectFramework.js';
|
|
5
|
-
import { scrapePage, ScrapePageFn } from './scrapePage.js';
|
|
6
|
-
import { scrapeIntercomPage } from './site-scrapers/Intercom/scrapeIntercomPage.js';
|
|
7
|
-
import { scrapeDocusaurusPage } from './site-scrapers/scrapeDocusaurusPage.js';
|
|
8
|
-
import { scrapeGitBookPage } from './site-scrapers/scrapeGitBookPage.js';
|
|
9
|
-
import { scrapeReadMePage } from './site-scrapers/scrapeReadMePage.js';
|
|
10
|
-
|
|
11
|
-
function validateFramework(framework: Framework | undefined) {
|
|
12
|
-
if (!framework) {
|
|
13
|
-
console.log(
|
|
14
|
-
`Could not detect the framework automatically. Please use the -t flag to specify one of: ${frameworks.join(
|
|
15
|
-
', '
|
|
16
|
-
)}`
|
|
17
|
-
);
|
|
18
|
-
return process.exit(1);
|
|
19
|
-
}
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
export async function scrapePageWrapper(
|
|
23
|
-
url: string,
|
|
24
|
-
overwrite: boolean,
|
|
25
|
-
scrapeFunc: ScrapePageFn,
|
|
26
|
-
options?: { version?: string; puppeteer?: boolean }
|
|
27
|
-
) {
|
|
28
|
-
let html: string;
|
|
29
|
-
if (options?.puppeteer) {
|
|
30
|
-
html = await getHtmlWithPuppeteer(url);
|
|
31
|
-
} else {
|
|
32
|
-
const res = await axios.get(url);
|
|
33
|
-
html = res.data;
|
|
34
|
-
}
|
|
35
|
-
await scrapePage(scrapeFunc, url, html, overwrite, options?.version);
|
|
36
|
-
process.exit(0);
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
export async function scrapePageAutomatically(
|
|
40
|
-
url: string,
|
|
41
|
-
overwrite: boolean,
|
|
42
|
-
frameworkHint: FrameworkHint
|
|
43
|
-
) {
|
|
44
|
-
const res = await axios.get(url);
|
|
45
|
-
const html = res.data;
|
|
46
|
-
frameworkHint = frameworkHint.framework ? frameworkHint : detectFramework(html);
|
|
47
|
-
|
|
48
|
-
validateFramework(frameworkHint.framework);
|
|
49
|
-
|
|
50
|
-
console.log('Detected framework: ' + frameworkHint.framework);
|
|
51
|
-
|
|
52
|
-
switch (frameworkHint.framework) {
|
|
53
|
-
case 'docusaurus':
|
|
54
|
-
await scrapePageWrapper(url, overwrite, scrapeDocusaurusPage, {
|
|
55
|
-
version: frameworkHint.version,
|
|
56
|
-
});
|
|
57
|
-
break;
|
|
58
|
-
case 'gitbook':
|
|
59
|
-
await scrapePageWrapper(url, overwrite, scrapeGitBookPage, { puppeteer: true });
|
|
60
|
-
break;
|
|
61
|
-
case 'readme':
|
|
62
|
-
await scrapePageWrapper(url, overwrite, scrapeReadMePage);
|
|
63
|
-
break;
|
|
64
|
-
case 'intercom':
|
|
65
|
-
await scrapePageWrapper(url, overwrite, scrapeIntercomPage);
|
|
66
|
-
break;
|
|
67
|
-
}
|
|
68
|
-
}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
import { NavigationEntry } from '@mintlify/models';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
|
|
4
|
-
import { objToReadableString } from '../util.js';
|
|
5
|
-
|
|
6
|
-
export type ScrapeSectionFn = (
|
|
7
|
-
html: string,
|
|
8
|
-
origin: string,
|
|
9
|
-
cliDir: string,
|
|
10
|
-
imageBaseDir: string,
|
|
11
|
-
overwrite: boolean,
|
|
12
|
-
version: string | undefined
|
|
13
|
-
) => Promise<NavigationEntry[]>;
|
|
14
|
-
|
|
15
|
-
export async function scrapeSection(
|
|
16
|
-
scrapeFunc: ScrapeSectionFn,
|
|
17
|
-
html: string,
|
|
18
|
-
origin: string,
|
|
19
|
-
overwrite: boolean,
|
|
20
|
-
version: string | undefined
|
|
21
|
-
) {
|
|
22
|
-
console.log(`Started scraping${overwrite ? ', overwrite mode is on' : ''}...`);
|
|
23
|
-
const cwd = process.cwd();
|
|
24
|
-
const imageBaseDir = path.join(cwd, 'images');
|
|
25
|
-
|
|
26
|
-
const groupsConfig = await scrapeFunc(html, origin, cwd, imageBaseDir, overwrite, version);
|
|
27
|
-
console.log('Finished scraping.');
|
|
28
|
-
console.log('Add the following to your navigation in mint.json:');
|
|
29
|
-
console.log(objToReadableString(groupsConfig));
|
|
30
|
-
}
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
import { Page } from 'puppeteer';
|
|
3
|
-
|
|
4
|
-
import { startBrowser } from '../browser.js';
|
|
5
|
-
import { getOrigin } from '../util.js';
|
|
6
|
-
import { detectFramework, Framework, FrameworkHint } from './detectFramework.js';
|
|
7
|
-
import { ScrapeSectionFn, scrapeSection } from './scrapeSection.js';
|
|
8
|
-
import { scrapeIntercomSection } from './site-scrapers/Intercom/scrapeIntercomSection.js';
|
|
9
|
-
import openNestedDocusaurusMenus from './site-scrapers/openNestedDocusaurusMenus.js';
|
|
10
|
-
import openNestedGitbookMenus from './site-scrapers/openNestedGitbookMenus.js';
|
|
11
|
-
import { scrapeDocusaurusSection } from './site-scrapers/scrapeDocusaurusSection.js';
|
|
12
|
-
import { scrapeGitBookSection } from './site-scrapers/scrapeGitBookSection.js';
|
|
13
|
-
import { scrapeReadMeSection } from './site-scrapers/scrapeReadMeSection.js';
|
|
14
|
-
|
|
15
|
-
export async function scrapeSectionAxiosWrapper(
|
|
16
|
-
url: string,
|
|
17
|
-
overwrite: boolean,
|
|
18
|
-
scrapeFunc: ScrapeSectionFn
|
|
19
|
-
) {
|
|
20
|
-
const res = await axios.get(url);
|
|
21
|
-
const html = res.data;
|
|
22
|
-
await scrapeSection(scrapeFunc, html, getOrigin(url), overwrite, undefined);
|
|
23
|
-
process.exit(0);
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
export async function scrapeDocusaurusSectionCommand(
|
|
27
|
-
url: string,
|
|
28
|
-
overwrite: boolean,
|
|
29
|
-
version: string | undefined // "1" | "2" | "3"
|
|
30
|
-
) {
|
|
31
|
-
await scrapeSectionOpeningAllNested(
|
|
32
|
-
url,
|
|
33
|
-
overwrite,
|
|
34
|
-
openNestedDocusaurusMenus,
|
|
35
|
-
scrapeDocusaurusSection,
|
|
36
|
-
version
|
|
37
|
-
);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
export async function scrapeGitbookSectionCommand(url: string, overwrite: boolean) {
|
|
41
|
-
await scrapeSectionOpeningAllNested(url, overwrite, openNestedGitbookMenus, scrapeGitBookSection);
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
async function scrapeSectionOpeningAllNested(
|
|
45
|
-
url: string,
|
|
46
|
-
overwrite: boolean,
|
|
47
|
-
openLinks: (page: Page) => Promise<string>,
|
|
48
|
-
scrapeFunc: ScrapeSectionFn,
|
|
49
|
-
version?: string
|
|
50
|
-
) {
|
|
51
|
-
const browser = await startBrowser();
|
|
52
|
-
const page = await browser.newPage();
|
|
53
|
-
await page.goto(url, {
|
|
54
|
-
waitUntil: 'networkidle2',
|
|
55
|
-
});
|
|
56
|
-
|
|
57
|
-
const html = await openLinks(page);
|
|
58
|
-
void browser.close();
|
|
59
|
-
await scrapeSection(scrapeFunc, html, getOrigin(url), overwrite, version);
|
|
60
|
-
process.exit(0);
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
export async function scrapeSectionAutomatically(
|
|
64
|
-
url: string,
|
|
65
|
-
overwrite: boolean,
|
|
66
|
-
frameworkHint: FrameworkHint
|
|
67
|
-
) {
|
|
68
|
-
const res = await axios.get(url);
|
|
69
|
-
const html = res.data;
|
|
70
|
-
frameworkHint = frameworkHint.framework ? frameworkHint : detectFramework(html);
|
|
71
|
-
|
|
72
|
-
validateFramework(frameworkHint.framework);
|
|
73
|
-
console.log('Detected framework: ' + frameworkHint.framework);
|
|
74
|
-
|
|
75
|
-
switch (frameworkHint.framework) {
|
|
76
|
-
case 'docusaurus':
|
|
77
|
-
await scrapeDocusaurusSectionCommand(url, overwrite, frameworkHint.version);
|
|
78
|
-
break;
|
|
79
|
-
case 'gitbook':
|
|
80
|
-
await scrapeGitbookSectionCommand(url, overwrite);
|
|
81
|
-
break;
|
|
82
|
-
case 'readme':
|
|
83
|
-
await scrapeSectionAxiosWrapper(url, overwrite, scrapeReadMeSection);
|
|
84
|
-
break;
|
|
85
|
-
case 'intercom':
|
|
86
|
-
await scrapeSectionAxiosWrapper(url, overwrite, scrapeIntercomSection);
|
|
87
|
-
break;
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
function validateFramework(framework: Framework | undefined) {
|
|
92
|
-
if (!framework) {
|
|
93
|
-
console.log(
|
|
94
|
-
'Could not detect the framework automatically. We only support Docusaurus (V2 and V3), GitBook, and ReadMe.'
|
|
95
|
-
);
|
|
96
|
-
process.exit();
|
|
97
|
-
}
|
|
98
|
-
}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import * as cheerio from 'cheerio';
|
|
2
|
-
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
3
|
-
|
|
4
|
-
import downloadAllImages from '../../downloadAllImages.js';
|
|
5
|
-
import replaceImagePaths from '../../replaceImagePaths.js';
|
|
6
|
-
|
|
7
|
-
export async function scrapeIntercomPage(
|
|
8
|
-
html: string,
|
|
9
|
-
origin: string,
|
|
10
|
-
cliDir: string,
|
|
11
|
-
imageBaseDir: string,
|
|
12
|
-
overwrite: boolean,
|
|
13
|
-
_: string | undefined // version
|
|
14
|
-
) {
|
|
15
|
-
const $ = cheerio.load(html);
|
|
16
|
-
|
|
17
|
-
const titleComponent = $('.t__h1').first();
|
|
18
|
-
const title = titleComponent.text().trim();
|
|
19
|
-
const description = $('.article__desc', titleComponent.parent()).text().trim();
|
|
20
|
-
|
|
21
|
-
const content = $('article').first();
|
|
22
|
-
const contentHtml = $.html(content);
|
|
23
|
-
|
|
24
|
-
const origToWritePath = await downloadAllImages(
|
|
25
|
-
$,
|
|
26
|
-
content,
|
|
27
|
-
origin,
|
|
28
|
-
imageBaseDir,
|
|
29
|
-
overwrite,
|
|
30
|
-
undefined
|
|
31
|
-
);
|
|
32
|
-
|
|
33
|
-
const nhm = new NodeHtmlMarkdown({ useInlineLinks: false });
|
|
34
|
-
let markdown = nhm.translate(contentHtml);
|
|
35
|
-
|
|
36
|
-
// Keep headers on one line
|
|
37
|
-
markdown = markdown.replace(/# \n\n/g, '# ');
|
|
38
|
-
|
|
39
|
-
// Remove unnecessary nonwidth blank space characters
|
|
40
|
-
markdown = markdown.replace(/\u200b/g, '');
|
|
41
|
-
|
|
42
|
-
// Reduce unnecessary blank lines
|
|
43
|
-
markdown = markdown.replace(/\n\n\n/g, '\n\n');
|
|
44
|
-
|
|
45
|
-
// Mintlify doesn't support bolded headers, remove the asterisks
|
|
46
|
-
markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
|
|
47
|
-
if (origToWritePath) {
|
|
48
|
-
markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
return { title, description, markdown };
|
|
52
|
-
}
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import { Navigation, NavigationEntry } from '@mintlify/models';
|
|
2
|
-
import axios from 'axios';
|
|
3
|
-
import * as cheerio from 'cheerio';
|
|
4
|
-
|
|
5
|
-
import downloadLogoImage from '../../downloadLogoImage.js';
|
|
6
|
-
import { scrapeGettingFileNameFromUrl } from '../../scrapeGettingFileNameFromUrl.js';
|
|
7
|
-
import { scrapeIntercomPage } from './scrapeIntercomPage.js';
|
|
8
|
-
|
|
9
|
-
export async function scrapeIntercomSection(
|
|
10
|
-
html: string,
|
|
11
|
-
origin: string,
|
|
12
|
-
cliDir: string,
|
|
13
|
-
imageBaseDir: string,
|
|
14
|
-
overwrite: boolean,
|
|
15
|
-
version: string | undefined
|
|
16
|
-
): Promise<NavigationEntry[]> {
|
|
17
|
-
let $ = cheerio.load(html);
|
|
18
|
-
|
|
19
|
-
const logoSrc = $('.header__logo img').first().attr('src');
|
|
20
|
-
void downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite);
|
|
21
|
-
|
|
22
|
-
const collectionsLink = $('.section .g__space a');
|
|
23
|
-
const collectionsMap = collectionsLink.toArray().map(async (s: cheerio.Element) => {
|
|
24
|
-
const href = $(s).attr('href');
|
|
25
|
-
const res = await axios.get(`${origin}${href}`);
|
|
26
|
-
const html = res.data;
|
|
27
|
-
$ = cheerio.load(html);
|
|
28
|
-
const sectionTitle = $('.collection h1').first().text().trim();
|
|
29
|
-
const sectionPages = $('.section .g__space a')
|
|
30
|
-
.toArray()
|
|
31
|
-
.map((s: cheerio.Element) => $(s).attr('href'))
|
|
32
|
-
.filter((page) => page !== undefined) as string[];
|
|
33
|
-
return {
|
|
34
|
-
group: sectionTitle,
|
|
35
|
-
pages: sectionPages,
|
|
36
|
-
};
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
const collections: Navigation = await Promise.all(collectionsMap);
|
|
40
|
-
|
|
41
|
-
return await Promise.all(
|
|
42
|
-
collections.map(async (entry: NavigationEntry) => {
|
|
43
|
-
return await scrapeGettingFileNameFromUrl(
|
|
44
|
-
entry,
|
|
45
|
-
cliDir,
|
|
46
|
-
origin,
|
|
47
|
-
overwrite,
|
|
48
|
-
scrapeIntercomPage,
|
|
49
|
-
false,
|
|
50
|
-
version
|
|
51
|
-
);
|
|
52
|
-
})
|
|
53
|
-
);
|
|
54
|
-
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
import { NavigationEntry } from '@mintlify/models';
|
|
2
|
-
import { Cheerio, Element } from 'cheerio';
|
|
3
|
-
|
|
4
|
-
export default function alternateGroupTitle(firstLink: Cheerio<Element>, pages: NavigationEntry[]) {
|
|
5
|
-
// Only assign titles to nested navigation menus outside a section.
|
|
6
|
-
// Others should not have a title so we can merge them into one section.
|
|
7
|
-
if (pages.length > 0) {
|
|
8
|
-
return firstLink.text();
|
|
9
|
-
}
|
|
10
|
-
return '';
|
|
11
|
-
}
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import { Cheerio, CheerioAPI, Element } from 'cheerio';
|
|
2
|
-
|
|
3
|
-
import alternateGroupTitle from '../alternateGroupTitle.js';
|
|
4
|
-
import getLinksRecursively from './getLinksRecursively.js';
|
|
5
|
-
|
|
6
|
-
export function getDocusaurusLinksPerGroup(
|
|
7
|
-
navigationSections: Cheerio<Element>,
|
|
8
|
-
$: CheerioAPI,
|
|
9
|
-
version: string | undefined
|
|
10
|
-
) {
|
|
11
|
-
if (version === '3' || version === '2') {
|
|
12
|
-
return getDocusaurusLinksPerGroupLoop(navigationSections, $);
|
|
13
|
-
}
|
|
14
|
-
return [];
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
/**
 * Maps every sidebar section to a `{ group, pages }` record.
 *
 * Sections carrying the `theme-doc-sidebar-item-link` or `menu__link` class are
 * treated as stand-alone links and produce an untitled group holding at most one
 * href. Every other section is treated as a category: its first link supplies the
 * title and leading page, and the children of its second child are walked with
 * `getLinksRecursively`.
 *
 * @param navigationSections - Cheerio selection of the top-level sidebar sections.
 * @param $ - Cheerio API bound to the document the sections belong to.
 * @returns One `{ group, pages }` object per section, in document order.
 */
function getDocusaurusLinksPerGroupLoop(navigationSections: Cheerio<Element>, $: CheerioAPI) {
  return navigationSections.toArray().map((element) => {
    const item = $(element);

    const isStandaloneLink =
      item.hasClass('theme-doc-sidebar-item-link') || item.hasClass('menu__link');
    if (isStandaloneLink) {
      const href = item.find('a[href]').first().attr('href');
      return {
        group: '',
        pages: href === undefined ? [] : [href],
      };
    }

    // Prefer the anchor inside the collapsible header; fall back to the sublist link.
    const collapsibleAnchors = item.find('.menu__list-item-collapsible').first().find('a[href]');
    const firstLink =
      collapsibleAnchors.length > 0
        ? collapsibleAnchors
        : item.find('.menu__link--sublist').first().find('a[href]');

    const title = firstLink.text();
    const categoryHref = firstLink.attr('href');
    const nestedItems = item.children().eq(1).children();
    const pages = getLinksRecursively(nestedItems, $);

    return {
      group: title || alternateGroupTitle(firstLink, pages),
      // The category's own link, when present, becomes the first page of the group.
      pages: categoryHref ? [categoryHref, ...pages] : pages,
    };
  });
}
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import { NavigationEntry } from '@mintlify/models';
|
|
2
|
-
import { Cheerio, CheerioAPI, Element } from 'cheerio';
|
|
3
|
-
|
|
4
|
-
// Used by Docusaurus and ReadMe section scrapers
|
|
5
|
-
/**
 * Walks a list of sidebar items and returns their links as navigation entries.
 *
 * An item with nested children becomes `{ group, pages }`, where `pages` starts
 * with the item's own href followed by the recursively collected children. An
 * item without children becomes its bare href string.
 *
 * @param linkSections - Cheerio selection of sibling sidebar items.
 * @param $ - Cheerio API bound to the document the items belong to.
 * @returns The collected entries; items without a usable internal href are dropped.
 */
export default function getLinksRecursively(
  linkSections: Cheerio<Element>,
  $: CheerioAPI
): NavigationEntry[] {
  return linkSections
    .map((_, element) => {
      const item = $(element);
      const firstChild = item.children().first();

      // Docusaurus nests the <a> inside a <div>, so look inside the first child
      // when it does not carry an href itself.
      const anchor = firstChild.attr('href') ? firstChild : firstChild.find('a[href]').first();
      const href = anchor.attr('href');

      // Drop items with no link (e.g. empty divs used only for styling), bare
      // "#" anchors, and external links, which Mintlify does not support yet.
      if (!href || href === '#' || /^https?:\/\//.test(href)) {
        return undefined;
      }

      const nestedItems = item.children().eq(1).children();
      if (nestedItems.length === 0) {
        return href;
      }

      // The section's own link is listed as its first page until sections can be
      // links themselves.
      return {
        group: anchor.text(),
        pages: [href, ...getLinksRecursively(nestedItems, $)],
      };
    })
    .toArray()
    .filter(Boolean);
}
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
import { NavigationEntry } from '@mintlify/models';
|
|
2
|
-
import { Cheerio, CheerioAPI, Element } from 'cheerio';
|
|
3
|
-
|
|
4
|
-
// Used by GitBook section scraper
|
|
5
|
-
/**
 * Walks a list of GitBook sidebar items and returns their links as navigation entries.
 *
 * For each item the first `<a>` supplies the href. When the item's first `<ul>`
 * has children, the result is `{ group, pages }` with the item's href first and
 * the recursively collected children after it; otherwise it is the bare href.
 *
 * @param linkSections - Cheerio selection of sibling sidebar items.
 * @param $ - Cheerio API bound to the document the items belong to.
 * @returns The collected entries; items without a usable internal href are dropped.
 */
export default function getLinksRecursivelyGitBook(
  linkSections: Cheerio<Element>,
  $: CheerioAPI
): NavigationEntry[] {
  return linkSections
    .map((_, element) => {
      const item = $(element);
      const anchor = item.find('a').first();
      const href = anchor.attr('href');

      // Drop items with no link (GitBook uses empty divs to draw a line beside
      // the nav), bare "#" anchors, and external links, which Mintlify does not
      // support yet.
      if (!href || href === '#' || /^https?:\/\//.test(href)) {
        return undefined;
      }

      const nestedItems = item.find('ul').first().children();
      if (nestedItems.length === 0) {
        return href;
      }

      // Title comes from the link text, falling back to the first <div>'s text.
      const header = item.find('div').first();
      const title = anchor.text() || header.text() || '';

      // The section's own link is listed as its first page until sections can be
      // links themselves.
      return {
        group: title,
        pages: [href, ...getLinksRecursivelyGitBook(nestedItems, $)],
      };
    })
    .toArray()
    .filter(Boolean);
}
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import { Page } from 'puppeteer';
|
|
2
|
-
|
|
3
|
-
/**
 * Clicks the collapsible sidebar links of a Docusaurus page until a pass reports
 * no href that the previous pass had not already reported, then returns the
 * page's HTML.
 *
 * @param page - Puppeteer page showing the Docusaurus site.
 * @returns The result of `page.content()` after the loop finishes.
 */
export default async function openNestedDocusaurusMenus(page: Page) {
  let previousHrefs: string[] = [];
  // Seeded with a placeholder so the first pass always runs.
  let currentHrefs = ['fake-href-to-make-loop-run-at-least-once'];

  const foundNewHref = () => currentHrefs.some((href) => !previousHrefs.includes(href));

  while (foundNewHref()) {
    previousHrefs = currentHrefs;
    currentHrefs = await page.evaluate(
      (seenHrefs) => {
        const toggles: HTMLElement[] = Array.from(
          document.querySelectorAll('.menu__link.menu__link--sublist')
        );

        const hrefs: string[] = [];
        for (const toggle of toggles) {
          const href = toggle.getAttribute('href');
          if (!href) {
            continue;
          }

          // External links should never appear here; skipped as a fail-safe.
          if (href.startsWith('https://') || href.startsWith('http://')) {
            continue;
          }

          // Only click links that the previous pass did not report.
          if (!seenHrefs.includes(href)) {
            toggle.click();
          }
          hrefs.push(href);
        }

        return hrefs;
      },
      previousHrefs // The array has to be handed to the browser explicitly
    );
  }

  return await page.content();
}
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { Page } from 'puppeteer';
|
|
2
|
-
|
|
3
|
-
/**
 * Repeatedly clicks closed GitBook sidebar menus until a pass clicks nothing,
 * then returns the page's HTML.
 *
 * A menu is recognised by its right-pointing chevron icon, and counts as closed
 * when the icon's parent element has `rotate-0` in its class name.
 *
 * @param page - Puppeteer page showing the GitBook site.
 * @returns The result of `page.content()` after the loop finishes.
 */
export default async function openNestedGitbookMenus(page: Page) {
  let openedAny: boolean;

  do {
    openedAny = await page.evaluate(() => {
      // Right pointing arrow. Only menus have this icon
      const chevrons = document.querySelectorAll(
        'div > a > span > svg[style*="mask-image:url(https://ka-p.fontawesome.com/releases/v6.6.0/svgs/regular/chevron-right.svg?v=1&token=a463935e93)"]'
      );

      let clickedInThisPass = false;
      for (const chevron of Array.from(chevrons)) {
        const toggle = chevron.parentElement;
        if (!toggle || !toggle.className.includes('rotate-0')) {
          continue;
        }
        toggle.click();
        clickedInThisPass = true;
      }
      return clickedInThisPass;
    });
  } while (openedAny);

  return await page.content();
}
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
import * as cheerio from 'cheerio';
|
|
2
|
-
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
3
|
-
|
|
4
|
-
import downloadAllImages from '../downloadAllImages.js';
|
|
5
|
-
import replaceImagePaths from '../replaceImagePaths.js';
|
|
6
|
-
|
|
7
|
-
/**
 * Converts the main article of a Docusaurus page into markdown.
 *
 * @param html - Full HTML of the page.
 * @param origin - Passed through to `downloadAllImages`.
 * @param cliDir - Passed through to `replaceImagePaths`.
 * @param imageBaseDir - Passed through to `downloadAllImages`.
 * @param overwrite - Passed through to `downloadAllImages`.
 * @param version - Docusaurus major version; selects which element holds the content.
 * @returns The page title plus, when an article element was found, the cleaned
 *   markdown. `description` is always left undefined.
 */
export async function scrapeDocusaurusPage(
  html: string,
  origin: string,
  cliDir: string,
  imageBaseDir: string,
  overwrite: boolean,
  version: string | undefined // expects "2", or "3". Have not written support for "1" yet
): Promise<{
  title: string;
  description?: string;
  markdown?: string;
}> {
  const $ = cheerio.load(html);
  const isVersion3 = version === '3';

  const article = (isVersion3 ? $('.theme-doc-markdown') : $('article')).first();
  if (article.length === 0) {
    // Index pages with no additional text don't have the markdown class
    return { title: '' };
  }

  // The title goes into the metadata, so strip the heading out of the content.
  const heading = article.find('h1');
  const title = heading.text().trim();
  heading.remove();

  const content = isVersion3 ? article : article.find('.markdown').first();

  const origToWritePath = await downloadAllImages($, content, origin, imageBaseDir, overwrite);

  const converter = new NodeHtmlMarkdown({ useInlineLinks: false });
  const contentHtml = content.html();
  const translated = contentHtml ? converter.translate(contentHtml) : null;

  if (translated == null) {
    console.error('We do not support scraping this page. Content will be empty');
    return { title, description: undefined, markdown: '' };
  }

  // The description is deliberately not scraped. It only exists in meta tags, and
  // when it is not set in the metadata Docusaurus defaults it to the first line of
  // text, which is prone to being picked up incorrectly when that line contains
  // markdown annotations.
  const description = undefined;

  const cleaned = translated
    // Docusaurus appends anchors such as [](#setup "Direct link to heading")
    // to the end of each header; drop them.
    .replace(/\[\]\(#.+ ".+"\)\n/g, '\n')
    // Strip zero-width space characters.
    .replace(/\u200b/g, '')
    // Reduce unnecessary blank lines.
    .replace(/\n\n\n/g, '\n\n')
    // Mintlify doesn't support bolded headers, so remove the asterisks.
    .replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');

  const markdown = origToWritePath
    ? replaceImagePaths(origToWritePath, cliDir, cleaned)
    : cleaned;

  return { title, description, markdown };
}
|