@mintlify/scraping 3.0.186 → 3.0.188
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/assert.d.ts +5 -0
- package/bin/assert.js +13 -0
- package/bin/assert.js.map +1 -0
- package/bin/cli.js +43 -72
- package/bin/cli.js.map +1 -1
- package/bin/components/Accordion.d.ts +5 -0
- package/bin/components/Accordion.js +54 -0
- package/bin/components/Accordion.js.map +1 -0
- package/bin/components/AccordionGroup.d.ts +5 -0
- package/bin/components/AccordionGroup.js +52 -0
- package/bin/components/AccordionGroup.js.map +1 -0
- package/bin/components/Callout.d.ts +5 -0
- package/bin/components/Callout.js +114 -0
- package/bin/components/Callout.js.map +1 -0
- package/bin/components/Card.d.ts +5 -0
- package/bin/components/Card.js +135 -0
- package/bin/components/Card.js.map +1 -0
- package/bin/components/CardGroup.d.ts +5 -0
- package/bin/components/CardGroup.js +52 -0
- package/bin/components/CardGroup.js.map +1 -0
- package/bin/components/CodeGroup.d.ts +5 -0
- package/bin/components/CodeGroup.js +166 -0
- package/bin/components/CodeGroup.js.map +1 -0
- package/bin/components/Frame.d.ts +5 -0
- package/bin/components/Frame.js +51 -0
- package/bin/components/Frame.js.map +1 -0
- package/bin/components/Tabs.d.ts +5 -0
- package/bin/components/Tabs.js +122 -0
- package/bin/components/Tabs.js.map +1 -0
- package/bin/components/link.d.ts +2 -0
- package/bin/components/link.js +16 -0
- package/bin/components/link.js.map +1 -0
- package/bin/constants.d.ts +6 -7
- package/bin/constants.js +31 -12
- package/bin/constants.js.map +1 -1
- package/bin/customComponents/create.d.ts +10 -0
- package/bin/customComponents/create.js +69 -0
- package/bin/customComponents/create.js.map +1 -0
- package/bin/customComponents/plugin.d.ts +2 -0
- package/bin/customComponents/plugin.js +26 -0
- package/bin/customComponents/plugin.js.map +1 -0
- package/bin/customComponents/selective.d.ts +6 -0
- package/bin/customComponents/selective.js +29 -0
- package/bin/customComponents/selective.js.map +1 -0
- package/bin/nav/iterate.d.ts +2 -0
- package/bin/nav/iterate.js +15 -0
- package/bin/nav/iterate.js.map +1 -0
- package/bin/nav/listItems.d.ts +8 -0
- package/bin/nav/listItems.js +62 -0
- package/bin/nav/listItems.js.map +1 -0
- package/bin/nav/retrieve.d.ts +3 -0
- package/bin/nav/retrieve.js +75 -0
- package/bin/nav/retrieve.js.map +1 -0
- package/bin/nav/root.d.ts +2 -0
- package/bin/nav/root.js +40 -0
- package/bin/nav/root.js.map +1 -0
- package/bin/openapi/generateOpenApiPages.js +18 -5
- package/bin/openapi/generateOpenApiPages.js.map +1 -1
- package/bin/root/retrieve.d.ts +2 -0
- package/bin/root/retrieve.js +46 -0
- package/bin/root/retrieve.js.map +1 -0
- package/bin/scrapingPipeline/group.d.ts +5 -0
- package/bin/scrapingPipeline/group.js +46 -0
- package/bin/scrapingPipeline/group.js.map +1 -0
- package/bin/scrapingPipeline/icon.d.ts +2 -0
- package/bin/scrapingPipeline/icon.js +22 -0
- package/bin/scrapingPipeline/icon.js.map +1 -0
- package/bin/scrapingPipeline/images.d.ts +3 -0
- package/bin/scrapingPipeline/images.js +50 -0
- package/bin/scrapingPipeline/images.js.map +1 -0
- package/bin/scrapingPipeline/logo.d.ts +5 -0
- package/bin/scrapingPipeline/logo.js +92 -0
- package/bin/scrapingPipeline/logo.js.map +1 -0
- package/bin/scrapingPipeline/page.d.ts +6 -0
- package/bin/scrapingPipeline/page.js +102 -0
- package/bin/scrapingPipeline/page.js.map +1 -0
- package/bin/scrapingPipeline/root.d.ts +2 -0
- package/bin/scrapingPipeline/root.js +8 -0
- package/bin/scrapingPipeline/root.js.map +1 -0
- package/bin/scrapingPipeline/site.d.ts +7 -0
- package/bin/scrapingPipeline/site.js +129 -0
- package/bin/scrapingPipeline/site.js.map +1 -0
- package/bin/scrapingPipeline/tabs.d.ts +3 -0
- package/bin/scrapingPipeline/tabs.js +67 -0
- package/bin/scrapingPipeline/tabs.js.map +1 -0
- package/bin/tabs/retrieveReadme.d.ts +3 -0
- package/bin/tabs/retrieveReadme.js +78 -0
- package/bin/tabs/retrieveReadme.js.map +1 -0
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/types/components.d.ts +2 -0
- package/bin/types/components.js +2 -0
- package/bin/types/components.js.map +1 -0
- package/bin/types/framework.d.ts +8 -0
- package/bin/types/framework.js +3 -0
- package/bin/types/framework.js.map +1 -0
- package/bin/types/hast.d.ts +6 -0
- package/bin/types/hast.js +2 -0
- package/bin/types/hast.js.map +1 -0
- package/bin/types/result.d.ts +7 -0
- package/bin/types/result.js +2 -0
- package/bin/types/result.js.map +1 -0
- package/bin/types/scrapeFunc.d.ts +3 -0
- package/bin/types/scrapeFunc.js +2 -0
- package/bin/types/scrapeFunc.js.map +1 -0
- package/bin/utils/append.d.ts +1 -0
- package/bin/utils/append.js +12 -0
- package/bin/utils/append.js.map +1 -0
- package/bin/utils/children.d.ts +5 -0
- package/bin/utils/children.js +35 -0
- package/bin/utils/children.js.map +1 -0
- package/bin/utils/className.d.ts +3 -0
- package/bin/utils/className.js +13 -0
- package/bin/utils/className.js.map +1 -0
- package/bin/utils/detectFramework.d.ts +4 -0
- package/bin/utils/detectFramework.js +60 -0
- package/bin/utils/detectFramework.js.map +1 -0
- package/bin/utils/emptyParagraphs.d.ts +3 -0
- package/bin/utils/emptyParagraphs.js +19 -0
- package/bin/utils/emptyParagraphs.js.map +1 -0
- package/bin/utils/errors.d.ts +3 -0
- package/bin/utils/errors.js +16 -0
- package/bin/utils/errors.js.map +1 -0
- package/bin/utils/escape.d.ts +2 -0
- package/bin/utils/escape.js +25 -0
- package/bin/utils/escape.js.map +1 -0
- package/bin/utils/extension.d.ts +3 -0
- package/bin/utils/extension.js +18 -0
- package/bin/utils/extension.js.map +1 -0
- package/bin/utils/file.d.ts +4 -0
- package/bin/utils/file.js +43 -0
- package/bin/utils/file.js.map +1 -0
- package/bin/utils/firstChild.d.ts +2 -0
- package/bin/utils/firstChild.js +12 -0
- package/bin/utils/firstChild.js.map +1 -0
- package/bin/utils/images.d.ts +5 -0
- package/bin/utils/images.js +86 -0
- package/bin/utils/images.js.map +1 -0
- package/bin/utils/img.d.ts +2 -0
- package/bin/utils/img.js +15 -0
- package/bin/utils/img.js.map +1 -0
- package/bin/utils/log.d.ts +18 -0
- package/bin/utils/log.js +68 -0
- package/bin/utils/log.js.map +1 -0
- package/bin/utils/nestedRoots.d.ts +7 -0
- package/bin/utils/nestedRoots.js +19 -0
- package/bin/utils/nestedRoots.js.map +1 -0
- package/bin/utils/network.d.ts +5 -0
- package/bin/utils/network.js +82 -0
- package/bin/utils/network.js.map +1 -0
- package/bin/utils/path.d.ts +1 -0
- package/bin/utils/path.js +22 -0
- package/bin/utils/path.js.map +1 -0
- package/bin/utils/position.d.ts +3 -0
- package/bin/utils/position.js +12 -0
- package/bin/utils/position.js.map +1 -0
- package/bin/utils/reservedNames.d.ts +4 -0
- package/bin/utils/reservedNames.js +27 -0
- package/bin/utils/reservedNames.js.map +1 -0
- package/bin/utils/strings.d.ts +2 -0
- package/bin/utils/strings.js +7 -0
- package/bin/utils/strings.js.map +1 -0
- package/bin/utils/text.d.ts +2 -0
- package/bin/utils/text.js +11 -0
- package/bin/utils/text.js.map +1 -0
- package/bin/utils/title.d.ts +10 -0
- package/bin/utils/title.js +58 -0
- package/bin/utils/title.js.map +1 -0
- package/bin/utils/url.d.ts +3 -0
- package/bin/utils/url.js +10 -0
- package/bin/utils/url.js.map +1 -0
- package/package.json +18 -9
- package/src/assert.ts +15 -0
- package/src/cli.ts +53 -90
- package/src/components/Accordion.ts +84 -0
- package/src/components/AccordionGroup.ts +69 -0
- package/src/components/Callout.ts +159 -0
- package/src/components/Card.ts +168 -0
- package/src/components/CardGroup.ts +69 -0
- package/src/components/CodeGroup.ts +209 -0
- package/src/components/Frame.ts +86 -0
- package/src/components/Tabs.ts +154 -0
- package/src/components/link.ts +17 -0
- package/src/constants.ts +37 -19
- package/src/customComponents/create.ts +106 -0
- package/src/customComponents/plugin.ts +31 -0
- package/src/customComponents/selective.ts +37 -0
- package/src/nav/iterate.ts +18 -0
- package/src/nav/listItems.ts +82 -0
- package/src/nav/retrieve.ts +88 -0
- package/src/nav/root.ts +47 -0
- package/src/openapi/generateOpenApiPages.ts +19 -4
- package/src/root/retrieve.ts +52 -0
- package/src/scrapingPipeline/group.ts +62 -0
- package/src/scrapingPipeline/icon.ts +26 -0
- package/src/scrapingPipeline/images.ts +67 -0
- package/src/scrapingPipeline/logo.ts +127 -0
- package/src/scrapingPipeline/page.ts +130 -0
- package/src/scrapingPipeline/root.ts +10 -0
- package/src/scrapingPipeline/site.ts +161 -0
- package/src/scrapingPipeline/tabs.ts +87 -0
- package/src/tabs/retrieveReadme.ts +99 -0
- package/src/types/components.ts +3 -0
- package/src/types/framework.ts +10 -0
- package/src/types/hast.ts +12 -0
- package/src/types/result.ts +1 -0
- package/src/types/scrapeFunc.ts +9 -0
- package/src/utils/append.ts +9 -0
- package/src/utils/children.ts +51 -0
- package/src/utils/className.ts +14 -0
- package/src/utils/detectFramework.ts +72 -0
- package/src/utils/emptyParagraphs.ts +21 -0
- package/src/utils/errors.ts +24 -0
- package/src/utils/escape.ts +30 -0
- package/src/utils/extension.ts +19 -0
- package/src/utils/file.ts +58 -0
- package/src/utils/firstChild.ts +13 -0
- package/src/utils/images.ts +101 -0
- package/src/utils/img.ts +17 -0
- package/src/utils/log.ts +82 -0
- package/src/utils/nestedRoots.ts +20 -0
- package/src/utils/network.ts +95 -0
- package/src/utils/path.ts +27 -0
- package/src/utils/position.ts +14 -0
- package/src/utils/reservedNames.ts +31 -0
- package/src/utils/strings.ts +7 -0
- package/src/utils/text.ts +11 -0
- package/src/utils/title.ts +68 -0
- package/src/utils/url.ts +8 -0
- package/bin/browser.d.ts +0 -2
- package/bin/browser.js +0 -24
- package/bin/browser.js.map +0 -1
- package/bin/checks.d.ts +0 -8
- package/bin/checks.js +0 -24
- package/bin/checks.js.map +0 -1
- package/bin/downloadImage.d.ts +0 -5
- package/bin/downloadImage.js +0 -88
- package/bin/downloadImage.js.map +0 -1
- package/bin/scraping/combineNavWithEmptyGroupTitles.d.ts +0 -2
- package/bin/scraping/combineNavWithEmptyGroupTitles.js +0 -20
- package/bin/scraping/combineNavWithEmptyGroupTitles.js.map +0 -1
- package/bin/scraping/detectFramework.d.ts +0 -9
- package/bin/scraping/detectFramework.js +0 -36
- package/bin/scraping/detectFramework.js.map +0 -1
- package/bin/scraping/downloadAllImages.d.ts +0 -4
- package/bin/scraping/downloadAllImages.js +0 -36
- package/bin/scraping/downloadAllImages.js.map +0 -1
- package/bin/scraping/downloadLogoImage.d.ts +0 -1
- package/bin/scraping/downloadLogoImage.js +0 -12
- package/bin/scraping/downloadLogoImage.js.map +0 -1
- package/bin/scraping/replaceImagePaths.d.ts +0 -1
- package/bin/scraping/replaceImagePaths.js +0 -14
- package/bin/scraping/replaceImagePaths.js.map +0 -1
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.d.ts +0 -6
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js +0 -46
- package/bin/scraping/scrapeFileGettingFileNameFromUrl.js.map +0 -1
- package/bin/scraping/scrapeGettingFileNameFromUrl.d.ts +0 -6
- package/bin/scraping/scrapeGettingFileNameFromUrl.js +0 -13
- package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +0 -1
- package/bin/scraping/scrapePage.d.ts +0 -8
- package/bin/scraping/scrapePage.js +0 -10
- package/bin/scraping/scrapePage.js.map +0 -1
- package/bin/scraping/scrapePageCommands.d.ts +0 -7
- package/bin/scraping/scrapePageCommands.js +0 -50
- package/bin/scraping/scrapePageCommands.js.map +0 -1
- package/bin/scraping/scrapeSection.d.ts +0 -3
- package/bin/scraping/scrapeSection.js +0 -12
- package/bin/scraping/scrapeSection.js.map +0 -1
- package/bin/scraping/scrapeSectionCommands.d.ts +0 -6
- package/bin/scraping/scrapeSectionCommands.js +0 -63
- package/bin/scraping/scrapeSectionCommands.js.map +0 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js +0 -29
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js.map +0 -1
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js +0 -31
- package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js.map +0 -1
- package/bin/scraping/site-scrapers/alternateGroupTitle.d.ts +0 -3
- package/bin/scraping/site-scrapers/alternateGroupTitle.js +0 -9
- package/bin/scraping/site-scrapers/alternateGroupTitle.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.d.ts +0 -5
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js +0 -33
- package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.d.ts +0 -3
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js +0 -35
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js.map +0 -1
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.d.ts +0 -3
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js +0 -33
- package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js.map +0 -1
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.d.ts +0 -2
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js +0 -30
- package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js.map +0 -1
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.d.ts +0 -2
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.js +0 -21
- package/bin/scraping/site-scrapers/openNestedGitbookMenus.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +0 -53
- package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +0 -32
- package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeGitBookPage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js +0 -56
- package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeGitBookSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js +0 -42
- package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeReadMePage.d.ts +0 -5
- package/bin/scraping/site-scrapers/scrapeReadMePage.js +0 -38
- package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +0 -1
- package/bin/scraping/site-scrapers/scrapeReadMeSection.d.ts +0 -2
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js +0 -39
- package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +0 -1
- package/bin/util.d.ts +0 -29
- package/bin/util.js +0 -97
- package/bin/util.js.map +0 -1
- package/src/browser.ts +0 -24
- package/src/checks.ts +0 -32
- package/src/downloadImage.ts +0 -102
- package/src/scraping/combineNavWithEmptyGroupTitles.ts +0 -21
- package/src/scraping/detectFramework.ts +0 -55
- package/src/scraping/downloadAllImages.ts +0 -61
- package/src/scraping/downloadLogoImage.ts +0 -24
- package/src/scraping/replaceImagePaths.ts +0 -17
- package/src/scraping/scrapeFileGettingFileNameFromUrl.ts +0 -84
- package/src/scraping/scrapeGettingFileNameFromUrl.ts +0 -56
- package/src/scraping/scrapePage.ts +0 -40
- package/src/scraping/scrapePageCommands.ts +0 -68
- package/src/scraping/scrapeSection.ts +0 -30
- package/src/scraping/scrapeSectionCommands.ts +0 -98
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomPage.ts +0 -52
- package/src/scraping/site-scrapers/Intercom/scrapeIntercomSection.ts +0 -54
- package/src/scraping/site-scrapers/alternateGroupTitle.ts +0 -11
- package/src/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.ts +0 -45
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursively.ts +0 -47
- package/src/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.ts +0 -44
- package/src/scraping/site-scrapers/openNestedDocusaurusMenus.ts +0 -42
- package/src/scraping/site-scrapers/openNestedGitbookMenus.ts +0 -27
- package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +0 -85
- package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +0 -63
- package/src/scraping/site-scrapers/scrapeGitBookPage.ts +0 -82
- package/src/scraping/site-scrapers/scrapeGitBookSection.ts +0 -69
- package/src/scraping/site-scrapers/scrapeReadMePage.ts +0 -56
- package/src/scraping/site-scrapers/scrapeReadMeSection.ts +0 -66
- package/src/util.ts +0 -122
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { join } from 'node:path';
|
|
2
|
+
import { EXIT, visit } from 'unist-util-visit';
|
|
3
|
+
import { framework } from '../utils/detectFramework.js';
|
|
4
|
+
import { downloadImage } from '../utils/images.js';
|
|
5
|
+
import { fetchPageHtml } from '../utils/network.js';
|
|
6
|
+
import { htmlToHast } from './root.js';
|
|
7
|
+
/**
 * Collects the ReadMe logo images from a hast tree.
 *
 * An element qualifies when it is an `<img>` whose `className` property is an
 * array containing `rm-Logo-img`.
 *
 * @param root - hast tree to search.
 * @returns the matching element nodes in traversal order, or `undefined`
 *   when nothing matched.
 */
function findReadmeLogoNodes(root) {
    const logoNodes = [];
    visit(root, 'element', (element) => {
        if (element.tagName !== 'img') {
            return;
        }
        const { className } = element.properties;
        if (Array.isArray(className) && className.includes('rm-Logo-img')) {
            logoNodes.push(element);
        }
    });
    if (logoNodes.length === 0) {
        return undefined;
    }
    return logoNodes;
}
|
|
17
|
+
/**
 * Collects the GitBook logo images from a hast tree.
 *
 * An element qualifies when it is an `<img>` whose `alt` property is exactly
 * `Logo`.
 *
 * @param root - hast tree to search.
 * @returns the matching element nodes in traversal order, or `undefined`
 *   when nothing matched.
 */
function findGitBookLogoNodes(root) {
    const logoNodes = [];
    const isLogoImage = (element) => element.tagName === 'img' && element.properties.alt === 'Logo';
    visit(root, 'element', (element) => {
        if (isLogoImage(element)) {
            logoNodes.push(element);
        }
    });
    if (logoNodes.length === 0) {
        return undefined;
    }
    return logoNodes;
}
|
|
26
|
+
/**
 * Collects the Docusaurus logo images from a hast tree.
 *
 * The search locates the first `<div>` whose `className` array contains
 * `navbar__brand`, gathers every `<img>` inside that div, and then stops the
 * outer traversal (`EXIT`), so later brand containers are never inspected.
 *
 * @param root - hast tree to search.
 * @returns the `<img>` element nodes found inside the first brand container,
 *   or `undefined` when there are none.
 */
function findDocusaurusLogoNodes(root) {
    const logoNodes = [];
    const isNavbarBrand = (element) => {
        if (element.tagName !== 'div') {
            return false;
        }
        const { className } = element.properties;
        return Array.isArray(className) && className.includes('navbar__brand');
    };
    visit(root, 'element', (element) => {
        if (!isNavbarBrand(element)) {
            return undefined;
        }
        visit(element, 'element', (descendant) => {
            if (descendant.tagName === 'img') {
                logoNodes.push(descendant);
            }
        });
        // Only the first brand container is relevant; abort the outer walk.
        return EXIT;
    });
    if (logoNodes.length === 0) {
        return undefined;
    }
    return logoNodes;
}
|
|
41
|
+
/**
 * Parses `html`, locates logo `<img>` nodes and downloads each one into the
 * `images` directory under the current working directory, appending the
 * resulting file paths to `filepaths`.
 *
 * @param html - HTML source to parse with `htmlToHast`.
 * @param downloadFn - despite the name, a *finder*: it receives the parsed
 *   hast root and returns the logo element nodes, or `undefined` when none
 *   were found.
 * @param filepaths - output array, mutated in place. The second element of
 *   each successful download result is appended; failed downloads add
 *   nothing. On return the array contains no empty entries.
 * @returns a promise that resolves once all downloads have settled.
 */
async function findLogosFromHtml(html, downloadFn, filepaths) {
    const hast = htmlToHast(html);
    const imgNodes = downloadFn(hast);
    if (imgNodes) {
        const imagesDir = join(process.cwd(), 'images');
        const downloaded = await Promise.all(imgNodes.map(async (node) => {
            const res = await downloadImage(node.properties.src, imagesDir);
            return res.success && res.data ? res.data[1] : '';
        }));
        filepaths.push(...downloaded);
    }
    // Drop blank entries (failed downloads). Walk backwards so that removing
    // an element never shifts an index we have yet to inspect; splicing
    // inside a forward `forEach` skips the element following every removal
    // and leaves adjacent blanks behind.
    for (let index = filepaths.length - 1; index >= 0; index -= 1) {
        if (!filepaths[index]) {
            filepaths.splice(index, 1);
        }
    }
}
|
|
60
|
+
/**
 * Downloads the logo image(s) of the documentation site at `url`.
 *
 * With a `browser` (a Puppeteer-style object exposing `newPage()` and
 * `close()`), the page is rendered, its HTML captured, the `.rm-ThemeToggle`
 * element clicked if present, and the HTML captured again; both snapshots
 * are searched with the ReadMe logo finder. Without a browser the page HTML
 * is fetched directly and searched with the GitBook or Docusaurus finder,
 * depending on `framework.vendor`.
 *
 * @param url - site URL, as a string or `URL`.
 * @param browser - optional browser instance; it is always closed before
 *   this function settles, including when an error is thrown.
 * @returns `undefined` when no logo was downloaded, the single file path
 *   when exactly one distinct logo was found, otherwise
 *   `{ light, dark }` built from the first two distinct paths
 *   (pre-toggle snapshot first).
 */
export async function downloadLogos(url, browser) {
    const siteUrl = new URL(url);
    const filepaths = [];
    try {
        if (browser) {
            const page = await browser.newPage();
            await page.goto(siteUrl.toString(), {
                waitUntil: 'networkidle2',
            });
            const htmls = [await page.content()];
            // Not every site ships a theme toggle; clicking a missing
            // selector throws, so only take the second snapshot when the
            // toggle actually exists.
            if (await page.$('.rm-ThemeToggle')) {
                await page.click('.rm-ThemeToggle');
                htmls.push(await page.content());
            }
            // Process snapshots one after another: `filepaths` is shared and
            // its order decides which logo is reported as light vs. dark, so
            // it must not depend on which download finishes first.
            for (const html of htmls) {
                await findLogosFromHtml(html, findReadmeLogoNodes, filepaths);
            }
        }
        else {
            const html = await fetchPageHtml(siteUrl);
            await findLogosFromHtml(html, framework.vendor === 'gitbook' ? findGitBookLogoNodes : findDocusaurusLogoNodes, filepaths);
        }
    }
    finally {
        // Release the browser even when navigation or a download failed.
        if (browser) {
            await browser.close();
        }
    }
    const uniqueFilepaths = [...new Set(filepaths)];
    if (uniqueFilepaths.length === 0) {
        return undefined;
    }
    if (uniqueFilepaths.length === 1) {
        return uniqueFilepaths[0];
    }
    return {
        light: uniqueFilepaths[0],
        dark: uniqueFilepaths[1],
    };
}
|
|
92
|
+
//# sourceMappingURL=logo.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"logo.js","sourceRoot":"","sources":["../../src/scrapingPipeline/logo.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAE/C,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,SAAS,mBAAmB,CAAC,IAAc;IACzC,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC;YAEjD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAc;IAC1C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,KAAK,MAAM,EAAE,CAAC;YAC7D,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,uBAAuB,CAAC,IAAc;IAC7C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,eAAe,CAAC,EACnD,CAAC;YACD,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO;gBACtC,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK;oBAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACxD,CAAC,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,IAAY,EACZ,UAA0D,EAC1D,SAAwB;IAExB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,QAAQ,EAAE,CAAC;QACb,SAAS,CAAC,IAAI,CACZ,GAAG,CAAC,MAAM,OAAO,CAAC,GAAG,CACnB,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC1B,MAAM,GAAG,GAAG,MAAM,aAAa,CAC7B,IAAI,CAAC,UAAU,CAAC,GA
Aa,EAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,QAAQ,CAAC,CAC9B,CAAC;YAEF,IAAI,GAAG,CAAC,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;gBAC5B,OAAO,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CACH,CAAC,CACH,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE;QACpC,IAAI,CAAC,QAAQ;YAAE,SAAS,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,OAA4B;IAE5B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACnB,MAAM,SAAS,GAAkB,EAAE,CAAC;IACpC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAkB,EAAE,CAAC;QAEhC,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YAC9B,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QACjC,MAAM,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QAEjC,MAAM,OAAO,CAAC,GAAG,CACf,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YACvB,OAAO,MAAM,iBAAiB,CAAC,IAAI,EAAE,mBAAmB,EAAE,SAAS,CAAC,CAAC;QACvE,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,CAAC,CAAC;QACtC,MAAM,iBAAiB,CACrB,IAAI,EACJ,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,uBAAuB,EAC/E,SAAS,CACV,CAAC;IACJ,CAAC;IAED,IAAI,OAAO;QAAE,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IAEnC,MAAM,eAAe,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAEzD,OAAO,eAAe,CAAC,MAAM,KAAK,CAAC;QACjC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC;YAC1B,CAAC,CAAC;gBACE,KAAK,EAAE,eAAe,CAAC,CAAC,CAAW;gBACnC,IAAI,EAAE,eAAe,CAAC,CAAC,CAAW;aACnC;YACH,CAAC,CAAC,SAAS,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import remarkGfm from 'remark-gfm';
|
|
2
|
+
import remarkMdx from 'remark-mdx';
|
|
3
|
+
import remarkStringify from 'remark-stringify';
|
|
4
|
+
import { unified } from 'unified';
|
|
5
|
+
import { convertHeaderLinksToText } from '../components/link.js';
|
|
6
|
+
import { CONTENT_FAILURE_MSG, MDAST_FAILURE_MSG } from '../constants.js';
|
|
7
|
+
import { createCallout, createCard, createAccordion, createAccordionGroup, createFrame, createCodeGroup, createTabs, createCardGroup, } from '../customComponents/create.js';
|
|
8
|
+
import { rehypeToRemarkCustomComponents } from '../customComponents/plugin.js';
|
|
9
|
+
import { selectiveRehypeRemark } from '../customComponents/selective.js';
|
|
10
|
+
import { retrieveRootContent } from '../root/retrieve.js';
|
|
11
|
+
import { unifiedRemoveClassNames } from '../utils/className.js';
|
|
12
|
+
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
13
|
+
import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
|
|
14
|
+
import { getErrorMessage, logErrorResults } from '../utils/errors.js';
|
|
15
|
+
import { escapeCharactersOutsideCodeBlocks } from '../utils/escape.js';
|
|
16
|
+
import { write, writePage } from '../utils/file.js';
|
|
17
|
+
import { log } from '../utils/log.js';
|
|
18
|
+
import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
|
|
19
|
+
import { unifiedRemovePositions } from '../utils/position.js';
|
|
20
|
+
import { removeLeadingSlash, removeTrailingSlash } from '../utils/strings.js';
|
|
21
|
+
import { getDescriptionFromRoot, getTitleFromHeading } from '../utils/title.js';
|
|
22
|
+
import { downloadImagesFromFile } from './images.js';
|
|
23
|
+
import { htmlToHast } from './root.js';
|
|
24
|
+
/**
 * Converts one scraped HTML page to MDX and writes it to disk.
 *
 * When `opts.externalLink` is set, `html` is used as the file name of a stub
 * page pointing at `url`, and nothing is parsed.
 *
 * Otherwise the HTML is parsed to hast, the framework is detected if it is
 * not known yet, the root content is extracted and run through the
 * custom-component / rehype-to-remark plugin chain, images are downloaded,
 * and the resulting mdast tree is stringified and written via `writePage`.
 *
 * @param html - page HTML, or the target file name when `opts.externalLink`
 *   is set.
 * @param url - page URL, as a string or `URL`.
 * @param opts - options read by this function: `externalLink`, `rootPath`
 *   and `isOverviewPage`.
 * @returns a result object: `{ success: true, data }` on success, or
 *   `{ success: false, message }` when the content could not be located or
 *   the tree could not be stringified/written.
 * @throws rethrows any error raised while downloading the page's images
 *   (after logging it).
 */
export async function scrapePage(html, url, opts = { externalLink: false }) {
    const pageUrl = new URL(url);
    const urlStr = pageUrl.toString();

    if (opts.externalLink) {
        // For external links `html` carries the file name, not markup.
        const filename = html;
        writePage(`${filename}.mdx`, '', '', '', urlStr);
        return { success: true, data: [urlStr, filename] };
    }

    const hast = htmlToHast(html);
    if (!framework.vendor) {
        detectFramework(hast);
    }

    const content = retrieveRootContent(hast);
    if (!content) {
        return { success: false, message: `${urlStr}: ${CONTENT_FAILURE_MSG}` };
    }

    const hastToMdast = unified()
        .use(createCard)
        .use(createAccordion)
        .use(createFrame)
        .use(createTabs)
        .use(createCallout)
        .use(createCardGroup)
        .use(createAccordionGroup)
        .use(createCodeGroup)
        .use(unifiedRemoveClassNames)
        .use(unifiedRemovePositions)
        .use(unifiedRemoveEmptyParagraphs)
        .use(escapeCharactersOutsideCodeBlocks)
        .use(selectiveRehypeRemark)
        // `selectiveRehypeRemark` can leave nested components untouched;
        // this pass turns the leftovers into MDX compatible components.
        .use(rehypeToRemarkCustomComponents)
        .use(convertHeaderLinksToText)
        .use(unifiedRemoveNestedRoots);
    const mdastTree = hastToMdast.runSync({
        type: 'root',
        children: [content],
    });

    try {
        const imageResults = await downloadImagesFromFile(mdastTree, pageUrl);
        logErrorResults(`scraping images from ${urlStr}`, imageResults);
    }
    catch (error) {
        const errorMessage = getErrorMessage(error);
        log(`We encountered an error when scraping the images from ${urlStr}${errorMessage}`);
        throw error;
    }

    const title = getTitleFromHeading(mdastTree);
    const description = getDescriptionFromRoot(mdastTree);

    try {
        const result = unified()
            .use(remarkMdx)
            .use(remarkGfm)
            // @ts-expect-error remarkStringify errors even if used for valid code from documentation examples
            .use(remarkStringify)
            .stringify(mdastTree);

        // Decide where the page is written: an explicit root path wins, and
        // a bare origin (site root) is stored as `home`.
        let outputUrl = pageUrl;
        if (opts.rootPath) {
            outputUrl = new URL(opts.rootPath, pageUrl.origin);
        }
        else if (pageUrl.origin === removeTrailingSlash(urlStr)) {
            outputUrl = new URL('home', pageUrl.origin);
        }

        const pageTitle = opts.isOverviewPage ? 'Overview' : title;
        writePage(outputUrl, pageTitle, description, String(result));

        let data;
        if (opts.rootPath) {
            const originalPath = removeLeadingSlash(removeTrailingSlash(new URL(urlStr).pathname));
            data = [originalPath, opts.rootPath];
        }
        return { success: true, data };
    }
    catch (error) {
        // Persist the offending tree so the failure can be inspected.
        write('error.json', JSON.stringify(mdastTree, undefined, 2));
        const errorMessage = getErrorMessage(error);
        return { success: false, message: `${urlStr}: ${MDAST_FAILURE_MSG}${errorMessage}` };
    }
}
|
|
102
|
+
//# sourceMappingURL=page.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"page.js","sourceRoot":"","sources":["../../src/scrapingPipeline/page.ts"],"names":[],"mappings":"AAEA,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,eAAe,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,mBAAmB,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EACL,aAAa,EACb,UAAU,EACV,eAAe,EACf,oBAAoB,EACpB,WAAW,EACX,eAAe,EACf,UAAU,EACV,eAAe,GAChB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,8BAA8B,EAAE,MAAM,+BAA+B,CAAC;AAC/E,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AACzE,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAE1D,OAAO,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,4BAA4B,EAAE,MAAM,6BAA6B,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACtE,OAAO,EAAE,iCAAiC,EAAE,MAAM,oBAAoB,CAAC;AACvE,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAC9E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAChF,OAAO,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,IAAY,EACZ,GAAiB,EACjB,OAII,EAAE,YAAY,EAAE,KAAK,EAAE;IAE3B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;QACtB,MAAM,QAAQ,GAAG,IAAI,CAAC;QACtB,MAAM,eAAe,GAAG,GAAG,QAAQ,MAAM,CAAC;QAC1C,SAAS,CAAC,eAAe,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;QACvD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE,QAAQ,CAAC,EAAE,CAAC;IAC7D,CAAC;IAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAE9B,IAAI,CAAC,SAAS,CAAC,MAAM;QAAE,eAAe,CAAC,IAAI,CAAC,CAAC;IAE7C,MAAM,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;IAC9B,MAAM,OAAO,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,KAAK,mBAAmB,EAAE,EAAE,CAAC;IAEtF,MAAM,aAAa,GAAa;QAC9B,IA
AI,EAAE,MAAM;QACZ,QAAQ,EAAE,CAAC,OAAO,CAAC;KACpB,CAAC;IAEF,MAAM,SAAS,GAAc,OAAO,EAAE;SACnC,GAAG,CAAC,UAAU,CAAC;SACf,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,WAAW,CAAC;SAChB,GAAG,CAAC,UAAU,CAAC;SACf,GAAG,CAAC,aAAa,CAAC;SAClB,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,oBAAoB,CAAC;SACzB,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,uBAAuB,CAAC;SAC5B,GAAG,CAAC,sBAAsB,CAAC;SAC3B,GAAG,CAAC,4BAA4B,CAAC;SACjC,GAAG,CAAC,iCAAiC,CAAC;SACtC,GAAG,CAAC,qBAAqB,CAAC;QAE3B,iDAAiD;QACjD,mDAAmD;QACnD,4BAA4B;SAC3B,GAAG,CAAC,8BAA8B,CAAC;SACnC,GAAG,CAAC,wBAAwB,CAAC;SAC7B,GAAG,CAAC,wBAAwB,CAAC;SAC7B,OAAO,CAAC,aAAa,CAAc,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAClE,eAAe,CAAC,wBAAwB,GAAG,CAAC,QAAQ,EAAE,EAAE,EAAE,YAAY,CAAC,CAAC;IAC1E,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,GAAG,CAAC,yDAAyD,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;QAC9F,MAAM,KAAK,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,mBAAmB,CAAC,SAAS,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAEtD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,OAAO,EAAE;aACrB,GAAG,CAAC,SAAS,CAAC;aACd,GAAG,CAAC,SAAS,CAAC;YACf,kGAAkG;aACjG,GAAG,CAAC,eAAe,CAAC;aACpB,SAAS,CAAC,SAAS,CAAC,CAAC;QAExB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;QAC3C,CAAC;aAAM,IAAI,GAAG,CAAC,MAAM,KAAK,mBAAmB,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,CAAC;YAC9D,GAAG,GAAG,IAAI,GAAG,CAAC,MAAM,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;QAC7C,CAAC;QAED,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,KAAK,EAAE,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACtF,OAAO;YACL,OAAO,EAAE,IAAI;YACb,IAAI,EAAE,IAAI,CAAC,QAAQ;gBACjB,CAAC,CAAC,CAAC,kBAAkB,CAAC,mBAAmB,CAAC,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,QAAQ,CAAC;gBACpF,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,KAAK,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,OAAO,EAAE,OAAO,EAAE,KAAK
,EAAE,OAAO,EAAE,GAAG,MAAM,KAAK,iBAAiB,GAAG,YAAY,EAAE,EAAE,CAAC;IACvF,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import rehypeParse from 'rehype-parse';
|
|
2
|
+
import { unified } from 'unified';
|
|
3
|
+
import { unifiedRemovePositions } from '../utils/position.js';
|
|
4
|
+
/**
 * Parse an HTML string into a syntax tree.
 *
 * Builds a unified processor with `rehypeParse` and `unifiedRemovePositions`
 * attached (in that order) and returns whatever its `parse` call produces.
 *
 * NOTE(review): unified's `parse()` runs only the parser phase, not
 * transformers — confirm `unifiedRemovePositions` takes effect without `run()`.
 *
 * @param {string} html - HTML source to parse.
 * @returns The tree returned by the processor's `parse` call.
 */
export function htmlToHast(html) {
    const processor = unified().use(rehypeParse).use(unifiedRemovePositions);
    return processor.parse(html);
}
|
|
8
|
+
//# sourceMappingURL=root.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"root.js","sourceRoot":"","sources":["../../src/scrapingPipeline/root.ts"],"names":[],"mappings":"AACA,OAAO,WAAW,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAE9D,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,kGAAkG;IAClG,OAAO,OAAO,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC,KAAK,CAAC,IAAI,CAAa,CAAC;AACxF,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { MintConfig, Tab } from '@mintlify/models';
|
|
2
|
+
import type { Root as HastRoot } from 'hast';
|
|
3
|
+
import type { Result } from '../types/result.js';
|
|
4
|
+
/**
 * Type declaration for `scrapeSite`.
 *
 * @param html - HTML of the page to scrape.
 * @param url - Address of that page, as a string or `URL`.
 * @param opts - Optional extras: `hast`, an already-parsed tree, and `tabs`,
 *   a list of tab entries.
 * @returns A promise of a `Result` wrapping a `MintConfig`.
 */
export declare function scrapeSite(
    html: string,
    url: string | URL,
    opts?: {
        hast?: HastRoot;
        tabs?: Array<Tab>;
    },
): Promise<Result<MintConfig>>;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import traverse from 'traverse';
|
|
2
|
+
import { NAV_FAILURE_MSG } from '../constants.js';
|
|
3
|
+
import { iterateOverNavItems } from '../nav/iterate.js';
|
|
4
|
+
import { retrieveNavItems } from '../nav/retrieve.js';
|
|
5
|
+
import { retrieveRootNavElement } from '../nav/root.js';
|
|
6
|
+
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
7
|
+
import { logErrorResults } from '../utils/errors.js';
|
|
8
|
+
import { startPuppeteer } from '../utils/network.js';
|
|
9
|
+
import { INDEX_NAMES, iterateThroughReservedNames } from '../utils/reservedNames.js';
|
|
10
|
+
import { removeTrailingSlash } from '../utils/strings.js';
|
|
11
|
+
import { scrapePageGroup } from './group.js';
|
|
12
|
+
import { downloadFavicon } from './icon.js';
|
|
13
|
+
import { downloadLogos } from './logo.js';
|
|
14
|
+
import { htmlToHast } from './root.js';
|
|
15
|
+
/**
 * Scrape a documentation site, starting from one already-fetched page, into a
 * config `Result`.
 *
 * Steps: parse the page (unless `opts.hast` is supplied), locate the root
 * navigation element, collect the links it contains, scrape external /
 * internal / site-root links as three groups, rewrite the navigation entries
 * from the scrape results, then download the favicon and logos.
 *
 * @param {string} html - HTML of the page at `url`; only parsed when `opts.hast` is absent.
 * @param {string | URL} url - Address of the page; its origin decides which links count as internal.
 * @param {{ hast?: object, tabs?: Array<object> }} [opts] - `hast`: pre-parsed tree for `html`;
 *   `tabs`: passed through unchanged as `data.tabs`.
 * @returns {Promise<object>} `{ success: true, data }` on success, otherwise
 *   `{ success: false, message }`.
 * @throws {TypeError} When `url` cannot be parsed by `new URL` (this happens outside the `try`).
 */
export async function scrapeSite(html, url, opts = {}) {
    const hast = opts.hast ?? htmlToHast(html);
    const siteUrl = new URL(url);
    const origin = siteUrl.origin;
    detectFramework(hast);

    const sidebar = retrieveRootNavElement(hast);
    if (!sidebar) {
        return { success: false, message: `${siteUrl.toString()}: ${NAV_FAILURE_MSG}` };
    }
    const navItems = retrieveNavItems(sidebar);

    // `URL#origin` is never the empty string: URLs without a usable origin
    // (data:, file:, blob-less custom schemes, ...) serialize it as 'null'.
    if (origin === '' || origin === 'null') {
        return { success: false, message: `invalid URL provided to scrape site: ${siteUrl}` };
    }

    const listOfLinks = iterateOverNavItems(navItems, origin);
    if (listOfLinks.length === 0) {
        return { success: false, message: `no navigation links were able to be found: ${siteUrl}` };
    }

    const needsBrowser = framework.vendor === 'gitbook';

    // Partition the links in one pass: other origins, the bare site root, and
    // every other same-origin page. Relative order within each group is kept.
    const externalLinks = [];
    const internalLinks = [];
    const rootLinks = [];
    for (const link of listOfLinks) {
        if (link.origin !== origin) {
            externalLinks.push(link);
        } else if (removeTrailingSlash(link.toString()) === origin) {
            rootLinks.push(link);
        } else {
            internalLinks.push(link);
        }
    }

    // Each root link gets a reserved index name; every name handed out is
    // appended to the taken list so the next root link sees it as used.
    const allPathnames = [...internalLinks, ...rootLinks].map((link) => link.toString());
    const rootPaths = rootLinks.map(() => {
        const name = iterateThroughReservedNames(INDEX_NAMES, allPathnames);
        allPathnames.push(name);
        return name;
    });

    try {
        const externalResults = await scrapePageGroup(externalLinks, needsBrowser, {
            externalLinks: true,
        });
        const internalResults = await scrapePageGroup(internalLinks, needsBrowser);
        const rootResults = await scrapePageGroup(rootLinks, needsBrowser, {
            externalLinks: false,
            rootPaths,
        });

        // Successful results carry a [from, to] pair in `data`.
        const toReplaceMap = (results) =>
            new Map(results.filter((result) => result.success).map((result) => result.data));
        const externalLinkReplaceMap = toReplaceMap(externalResults);
        const rootPathReplaceMap = toReplaceMap(rootResults);

        const rewriteLink = (value) => {
            if (externalLinkReplaceMap.has(value)) {
                return externalLinkReplaceMap.get(value) ?? value;
            }
            if (rootPathReplaceMap.has(value)) {
                return rootPathReplaceMap.get(value) ?? value;
            }
            return value;
        };

        // Rewrite string leaves only. Replacing a whole array through
        // `this.update` cannot be written back when that array is the root, so
        // top-level entries of `navItems` would silently keep their old value;
        // leaf updates are always applied in place on the parent.
        traverse(navItems).forEach(function (value) {
            if (typeof value !== 'string') {
                return;
            }
            const rewritten = rewriteLink(value);
            this.update(typeof rewritten === 'string' ? rewritten.replace('/mintie_overview', '') : rewritten);
        });

        // A bare top-level page becomes a single-page group titled after the
        // page string ("getting-started" -> "Getting Started").
        navItems.forEach((navItem, index) => {
            if (typeof navItem !== 'string') {
                return;
            }
            const name = navItem
                .split('-')
                .map((str) => (str[0] ? `${str[0].toUpperCase()}${str.substring(1)}` : str))
                .join(' ');
            navItems[index] = {
                group: name,
                pages: [navItem],
            };
        });

        logErrorResults('linking to external pages', externalResults);
        logErrorResults('scraping your docs', [...internalResults, ...rootResults]);

        const browser = needsBrowser ? await startPuppeteer() : undefined;
        const favicon = await downloadFavicon(hast);
        const logo = await downloadLogos(siteUrl, browser);

        return {
            success: true,
            data: {
                $schema: 'https://mintlify.com/schema.json',
                name: '',
                logo,
                colors: {
                    primary: '',
                },
                favicon: favicon ?? '',
                navigation: navItems,
                tabs: opts.tabs,
            },
        };
    } catch (error) {
        if (error instanceof Error) {
            return { success: false, message: error.message };
        }
        return {
            success: false,
            message: 'An unknown error occurred when scraping this site. Please try again.',
        };
    }
}
|
|
129
|
+
//# sourceMappingURL=site.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"site.js","sourceRoot":"","sources":["../../src/scrapingPipeline/site.ts"],"names":[],"mappings":"AAEA,OAAO,QAAQ,MAAM,UAAU,CAAC;AAEhC,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,EAAE,sBAAsB,EAAE,MAAM,gBAAgB,CAAC;AAExD,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,2BAA2B,EAAE,MAAM,2BAA2B,CAAC;AACrF,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAC7C,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,IAAY,EACZ,GAAiB,EACjB,OAA+C,EAAE;IAEjD,IAAI,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;IACrB,IAAI,CAAC,IAAI;QAAE,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAEnC,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACnB,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAE1B,eAAe,CAAC,IAAI,CAAC,CAAC;IAEtB,MAAM,OAAO,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC;IAC7C,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,GAAG,CAAC,QAAQ,EAAE,KAAK,eAAe,EAAE,EAAE,CAAC;IAE1F,MAAM,QAAQ,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAE3C,IAAI,MAAM,KAAK,EAAE,EAAE,CAAC;QAClB,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,wCAAwC,GAAG,EAAE,EAAE,CAAC;IACpF,CAAC;IAED,MAAM,WAAW,GAAG,mBAAmB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC1D,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7B,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,8CAA8C,GAAG,EAAE,EAAE,CAAC;IAC1F,CAAC;IAED,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC;IAEpD,MAAM,aAAa,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,WAAW,CAAC,MAAM,CACtC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,KAAK,MAAM,IAAI,mBAAmB,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,KAAK,MAAM,CACjF,CAAC;IACF,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,CAClC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,KAAK,MAAM,IAAI,mBAAmB,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,KAAK,MAAM,CACjF,CAAC;IA
EF,MAAM,YAAY,GAAG;QACnB,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC7C,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;KAC1C,CAAC;IACF,MAAM,SAAS,GAAG,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,2BAA2B,CAAC,WAAW,EAAE,YAAY,CAAC,CAAC;QACpE,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,MAAM,eAAe,GAAG,MAAM,eAAe,CAAC,aAAa,EAAE,YAAY,EAAE;YACzE,aAAa,EAAE,IAAI;SACpB,CAAC,CAAC;QACH,MAAM,eAAe,GAAG,MAAM,eAAe,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;QAC3E,MAAM,WAAW,GAAG,MAAM,eAAe,CAAC,SAAS,EAAE,YAAY,EAAE;YACjE,aAAa,EAAE,KAAK;YACpB,SAAS;SACV,CAAC,CAAC;QAEH,MAAM,sBAAsB,GAAG,IAAI,GAAG,CACpC,eAAe;aACZ,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC;aAClC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,IAAwB,CAAC,CACpD,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,GAAG,CAChC,WAAW;aACR,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC;aAClC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,IAAwB,CAAC,CACpD,CAAC;QAEF,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,UAAU,KAAK;YACxC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC9B,IAAI,sBAAsB,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;oBACtC,IAAI,CAAC,MAAM,CAAC,sBAAsB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC;gBAC1D,CAAC;qBAAM,IAAI,kBAAkB,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;oBACzC,IAAI,CAAC,MAAM,CAAC,kBAAkB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBAChC,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;oBAC3D,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,sBAAsB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC;gBAC7E,CAAC;qBAAM,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;oBAC9D,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,kBAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC;gBACzE,CAAC;YACH,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CA
AC,UAAU,KAAK;YACxC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC9B,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC,CAAC;YACrD,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBAChC,IAAI,CAAC,MAAM,CACT,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CACjB,OAAO,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CACvE,CACF,CAAC;YACJ,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE;YAClC,IAAI,OAAO,OAAO,KAAK,QAAQ;gBAAE,OAAO;YACxC,MAAM,IAAI,GAAG,OAAO;iBACjB,KAAK,CAAC,GAAG,CAAC;iBACV,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;iBAC3E,IAAI,CAAC,GAAG,CAAC,CAAC;YAEb,QAAQ,CAAC,KAAK,CAAC,GAAG;gBAChB,KAAK,EAAE,IAAI;gBACX,KAAK,EAAE,CAAC,OAAO,CAAC;aACjB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,eAAe,CAAC,2BAA2B,EAAE,eAAe,CAAC,CAAC;QAC9D,eAAe,CAAC,oBAAoB,EAAE,CAAC,GAAG,eAAe,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC;QAE5E,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;QAElE,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,CAAC;QAC5C,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAE/C,OAAO;YACL,OAAO,EAAE,IAAI;YACb,IAAI,EAAE;gBACJ,OAAO,EAAE,kCAAkC;gBAC3C,IAAI,EAAE,EAAE;gBACR,IAAI;gBACJ,MAAM,EAAE;oBACN,OAAO,EAAE,EAAE;iBACZ;gBACD,OAAO,EAAE,OAAO,IAAI,EAAE;gBACtB,UAAU,EAAE,QAAsB;gBAClC,IAAI,EAAE,IAAI,CAAC,IAAI;aAChB;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,EAAE,CAAC;QACpD,CAAC;QACD,OAAO;YACL,OAAO,EAAE,KAAK;YACd,OAAO,EAAE,sEAAsE;SAChF,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { retrieveTabLinks } from '../tabs/retrieveReadme.js';
|
|
2
|
+
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
3
|
+
import { log } from '../utils/log.js';
|
|
4
|
+
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
|
|
5
|
+
import { getTitleFromLink } from '../utils/title.js';
|
|
6
|
+
import { downloadFavicon } from './icon.js';
|
|
7
|
+
import { downloadLogos } from './logo.js';
|
|
8
|
+
import { htmlToHast } from './root.js';
|
|
9
|
+
import { scrapeSite } from './site.js';
|
|
10
|
+
/**
 * Scrape a site that may expose several top-level tabs.
 *
 * For `readme` and `docusaurus` sites with more than the current page as a
 * tab, every tab is fetched and scraped with `scrapeSite` in parallel and the
 * resulting navigations and tabs are merged into one config. In every other
 * case the work is delegated to a single `scrapeSite` call.
 *
 * Favicon and logos are only downloaded here on the multi-tab path: when
 * delegating, `scrapeSite` performs those downloads (and starts a browser if
 * it needs one) itself, so doing it here as well would repeat the work.
 *
 * @param {string} html - HTML of the page at `url`.
 * @param {string | URL} url - Address of the page the scrape starts from.
 * @returns {Promise<object>} `{ success: true, data }` or, when delegating,
 *   whatever `scrapeSite` resolves to.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 */
export async function scrapeAllSiteTabs(html, url) {
    const hast = htmlToHast(html);
    const siteUrl = new URL(url);
    detectFramework(hast);

    // Read once, synchronously after detection, so later awaits cannot observe a different vendor.
    const vendor = framework.vendor;
    if (vendor !== 'readme' && vendor !== 'docusaurus') {
        return scrapeSite(html, siteUrl, { hast });
    }

    const links = retrieveTabLinks(hast);
    const isOnlyCurrentPage = links?.length === 1 && links[0]?.url === siteUrl.pathname;
    if (!links || links.length === 0 || isOnlyCurrentPage) {
        return scrapeSite(html, siteUrl, { hast });
    }

    // Make sure the page we started from is covered by some tab.
    if (!links.some((link) => siteUrl.pathname.startsWith(link.url))) {
        links.push({
            name: getTitleFromLink(siteUrl.pathname),
            url: siteUrl.pathname,
        });
    }

    // The browser is only started for 'gitbook', which never reaches this branch.
    const favicon = await downloadFavicon(hast);
    const logo = await downloadLogos(siteUrl, undefined);

    const results = await Promise.all(links.map(async (tabEntry) => {
        const tabUrl = new URL(siteUrl);
        tabUrl.pathname = tabEntry.url;
        try {
            const tabHtml = await fetchPageHtml(tabUrl, undefined);
            return await scrapeSite(tabHtml, tabUrl, { tabs: [tabEntry] });
        } catch (error) {
            // One unreachable tab must not reject Promise.all and discard the others;
            // report it through the same failure channel scrapeSite uses.
            const reason = error instanceof Error ? error.message : String(error);
            return { success: false, message: `${tabUrl.toString()}: ${reason}` };
        }
    }));

    const navigations = [];
    const tabs = [];
    for (const result of results) {
        if (!result.success) {
            log(`Failed to scrape tab: ${result.message}`);
            continue;
        }
        if (!result.data) {
            continue;
        }
        navigations.push(...result.data.navigation);
        if (result.data.tabs) {
            tabs.push(...result.data.tabs);
        }
    }

    return {
        success: true,
        data: {
            $schema: 'https://mintlify.com/schema.json',
            name: '',
            logo,
            colors: {
                primary: '',
            },
            favicon: favicon ?? '',
            navigation: navigations,
            tabs,
        },
    };
}
|
|
67
|
+
//# sourceMappingURL=tabs.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tabs.js","sourceRoot":"","sources":["../../src/scrapingPipeline/tabs.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAE7D,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,GAAiB;IAEjB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,eAAe,CAAC,IAAI,CAAC,CAAC;IAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC;IACpD,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAElE,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAE/C,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;QACvE,MAAM,KAAK,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;QACrC,IACE,CAAC,KAAK;YACN,CAAC,KAAK,CAAC,MAAM;YACb,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,QAAQ,CAAC;YAEjE,OAAO,UAAU,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QAEzC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC7D,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,gBAAgB,CAAC,GAAG,CAAC,QAAQ,CAAC;gBACpC,GAAG,EAAE,GAAG,CAAC,QAAQ;aAClB,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;YAC3B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC;YAC/B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;YACvD,OAAO,MAAM,UAAU,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QACjE,CAAC,CAAC,CACH,CAAC;QAEF,MAAM,WAAW,GAA2B,EAAE,CAAC;QAC/C,MAAM,IAAI,GAAe,EAAE,CAAC;QAE5B,MAAM,SAAS,
GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC7D,SAAS,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YAC3B,IAAI,CAAC,MAAM,CAAC,IAAI;gBAAE,OAAO;YACzB,WAAW,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAC5C,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,CAAC,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC7D,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YAC1B,GAAG,CAAC,wBAAwB,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,OAAO,EAAE,IAAI;YACb,IAAI,EAAE;gBACJ,OAAO,EAAE,kCAAkC;gBAC3C,IAAI,EAAE,EAAE;gBACR,IAAI;gBACJ,MAAM,EAAE;oBACN,OAAO,EAAE,EAAE;iBACZ;gBACD,OAAO,EAAE,OAAO,IAAI,EAAE;gBACtB,UAAU,EAAE,WAAyB;gBACrC,IAAI;aACL;SACF,CAAC;IACJ,CAAC;IAED,OAAO,UAAU,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { visit, EXIT, CONTINUE } from 'unist-util-visit';
|
|
2
|
+
import { framework } from '../utils/detectFramework.js';
|
|
3
|
+
import { findTitle, getTitleFromLink } from '../utils/title.js';
|
|
4
|
+
/**
 * Collect the site-relative tab links from a page's top navigation.
 *
 * Only the `readme` and `docusaurus` frameworks are handled:
 * - readme: anchors inside any `nav`, or inside `div.rm-Header-right`, within
 *   the `header.rm-Header` element;
 * - docusaurus: anchors whose direct parent's class list is exactly
 *   `['navbar__items']`, within the `nav.navbar` element.
 *
 * Anchors whose href starts with `http` are ignored. Each href is reported
 * once, in first-seen order: matching containers can be nested, and the
 * nested walks would otherwise report the same anchor several times.
 *
 * @param {object} rootNode - Tree to search.
 * @returns {Array<{ name: string, url: string }> | undefined} The links, or
 *   `undefined` when the framework is not handled or no container is found.
 */
export function retrieveTabLinks(rootNode) {
    const vendor = framework.vendor;
    if (vendor !== 'readme' && vendor !== 'docusaurus') {
        return undefined;
    }

    const hasClass = (node, className) =>
        Array.isArray(node.properties.className) && node.properties.className.includes(className);

    // Anchor with a non-empty string href that does not start with 'http'.
    const isRelativeAnchor = (node) =>
        node.tagName === 'a' &&
        typeof node.properties.href === 'string' &&
        node.properties.href !== '' &&
        !node.properties.href.startsWith('http');

    const [containerTag, containerClass] = vendor === 'readme' ? ['header', 'rm-Header'] : ['nav', 'navbar'];

    let element;
    visit(rootNode, 'element', (node) => {
        if (node.tagName === containerTag && hasClass(node, containerClass)) {
            element = node;
            return EXIT;
        }
        return CONTINUE;
    });
    if (!element) {
        return undefined;
    }

    const links = [];
    const seenHrefs = new Set();
    const collect = (anchor) => {
        const href = anchor.properties.href;
        if (seenHrefs.has(href)) {
            return;
        }
        seenHrefs.add(href);
        links.push({
            name: findTitle(anchor) || getTitleFromLink(href),
            url: href,
        });
    };

    visit(element, 'element', (node) => {
        if (vendor === 'readme') {
            const isLinkContainer =
                node.tagName === 'nav' || (node.tagName === 'div' && hasClass(node, 'rm-Header-right'));
            if (!isLinkContainer) {
                return CONTINUE;
            }
            visit(node, 'element', (subNode) => {
                if (isRelativeAnchor(subNode)) {
                    collect(subNode);
                }
                return CONTINUE;
            });
            return CONTINUE;
        }

        // docusaurus
        if (node.tagName !== 'nav') {
            return CONTINUE;
        }
        visit(node, 'element', (subNode, _, parent) => {
            // Requiring the class list to be exactly ['navbar__items'] already
            // excludes the right-hand group ('navbar__items navbar__items--right').
            const parentIsLeftItems =
                Boolean(parent) &&
                parent.type === 'element' &&
                Array.isArray(parent.properties.className) &&
                parent.properties.className.length === 1 &&
                parent.properties.className[0] === 'navbar__items';
            if (isRelativeAnchor(subNode) && parentIsLeftItems) {
                collect(subNode);
            }
            return CONTINUE;
        });
        return CONTINUE;
    });

    return links;
}
|
|
78
|
+
//# sourceMappingURL=retrieveReadme.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"retrieveReadme.js","sourceRoot":"","sources":["../../src/tabs/retrieveReadme.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAEzD,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAEhE,MAAM,UAAU,gBAAgB,CAAC,QAAkB;IACjD,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY;QAAE,OAAO,SAAS,CAAC;IAEzF,IAAI,OAAO,GAAwB,SAAS,CAAC;IAC7C,KAAK,CAAC,QAAQ,EAAE,SAAS,EAAE,UAAU,IAAI;QACvC,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YAClC,IACE,IAAI,CAAC,OAAO,KAAK,QAAQ;gBACzB,IAAI,CAAC,UAAU,CAAC,SAAS;gBACzB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;gBACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC,EAC/C,CAAC;gBACD,OAAO,GAAG,IAAI,CAAC;gBACf,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;YACtC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;gBACtB,IAAI,CAAC,UAAU,CAAC,SAAS;gBACzB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;gBACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC5C,CAAC;gBACD,OAAO,GAAG,IAAI,CAAC;gBACf,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,uEAAuE;IACvE,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAC;IAE/B,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,KAAK,CAAC,OAAkB,EAAE,SAAS,EAAE,UAAU,IAAI;QACjD,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YAClC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;gBACtB,CAAC,CACC,IAAI,CAAC,OAAO,KAAK,KAAK;oBACtB,IAAI,CAAC,UAAU,CAAC,SAAS;oBACzB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;oBACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CACtD;gBAED,OAAO,QAAQ,CAAC;YAElB,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO;gBACtC,IACE,OAAO,CAAC,OAAO,KAAK,GAAG;oBACvB,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI;oBACxB,OAAO,OAAO,CAAC,UAAU,CAAC,IAAI,KAAK,QAAQ;oBAC3C,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;oBAE1C,OAAO,QAAQ,CAAC;gBAClB,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;gBACjC,KAAK,CAAC,IAAI,CAAC;oBACT,IAAI,EAAE,KAAK,IAAI,gBAAgB,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC;oBACxD,GAAG,EAAE,OAAO,CAAC,UAAU,CAAC,IAAI;iBAC7B,CAAC,CAAC;YACL,CAA
C,CAAC,CAAC;QACL,CAAC;QAED,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;YACtC,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK;gBAAE,OAAO,QAAQ,CAAC;YAE5C,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO,EAAE,CAAC,EAAE,MAAM;gBACjD,IACE,OAAO,CAAC,OAAO,KAAK,GAAG;oBACvB,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI;oBACxB,OAAO,OAAO,CAAC,UAAU,CAAC,IAAI,KAAK,QAAQ;oBAC3C,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;oBAC1C,CAAC,MAAM;oBACP,MAAM,CAAC,IAAI,KAAK,SAAS;oBACzB,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC;oBAC3C,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC;oBACxC,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,eAAe;oBAClD,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,sBAAsB,CAAC;oBAE5D,OAAO,QAAQ,CAAC;gBAElB,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;gBACjC,KAAK,CAAC,IAAI,CAAC;oBACT,IAAI,EAAE,KAAK,IAAI,gBAAgB,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC;oBACxD,GAAG,EAAE,OAAO,CAAC,UAAU,CAAC,IAAI;iBAC7B,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
|