@mintlify/scraping 4.0.5 → 4.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/components/AccordionGroup.d.ts +3 -3
- package/bin/components/AccordionGroup.js +54 -27
- package/bin/components/AccordionGroup.js.map +1 -1
- package/bin/components/Card.js +3 -2
- package/bin/components/Card.js.map +1 -1
- package/bin/components/CardGroup.js +3 -6
- package/bin/components/CardGroup.js.map +1 -1
- package/bin/components/CodeGroup.d.ts +1 -1
- package/bin/components/CodeGroup.js +107 -79
- package/bin/components/CodeGroup.js.map +1 -1
- package/bin/components/Tabs.d.ts +1 -1
- package/bin/components/Tabs.js +50 -23
- package/bin/components/Tabs.js.map +1 -1
- package/bin/constants.js +3 -3
- package/bin/constants.js.map +1 -1
- package/bin/nav/listItems.js +0 -1
- package/bin/nav/listItems.js.map +1 -1
- package/bin/scrapingPipeline/color.d.ts +8 -0
- package/bin/scrapingPipeline/color.js +91 -0
- package/bin/scrapingPipeline/color.js.map +1 -0
- package/bin/scrapingPipeline/group.js +1 -3
- package/bin/scrapingPipeline/group.js.map +1 -1
- package/bin/scrapingPipeline/icon.d.ts +1 -1
- package/bin/scrapingPipeline/icon.js +7 -6
- package/bin/scrapingPipeline/icon.js.map +1 -1
- package/bin/scrapingPipeline/logo.js +13 -9
- package/bin/scrapingPipeline/logo.js.map +1 -1
- package/bin/scrapingPipeline/page.js +28 -9
- package/bin/scrapingPipeline/page.js.map +1 -1
- package/bin/scrapingPipeline/site.js +64 -7
- package/bin/scrapingPipeline/site.js.map +1 -1
- package/bin/scrapingPipeline/tabs.js +15 -10
- package/bin/scrapingPipeline/tabs.js.map +1 -1
- package/bin/scrapingPipeline/title.d.ts +2 -0
- package/bin/scrapingPipeline/title.js +34 -0
- package/bin/scrapingPipeline/title.js.map +1 -0
- package/bin/tabs/retrieveReadme.js +0 -1
- package/bin/tabs/retrieveReadme.js.map +1 -1
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/types/result.d.ts +1 -0
- package/bin/utils/breaks.d.ts +3 -0
- package/bin/utils/breaks.js +17 -0
- package/bin/utils/breaks.js.map +1 -0
- package/bin/utils/children.js +9 -3
- package/bin/utils/children.js.map +1 -1
- package/bin/utils/className.d.ts +0 -1
- package/bin/utils/className.js +1 -1
- package/bin/utils/className.js.map +1 -1
- package/bin/utils/copyButton.d.ts +3 -0
- package/bin/utils/copyButton.js +30 -0
- package/bin/utils/copyButton.js.map +1 -0
- package/bin/utils/emptyEmphasis.d.ts +2 -0
- package/bin/utils/emptyEmphasis.js +18 -0
- package/bin/utils/emptyEmphasis.js.map +1 -0
- package/bin/utils/emptyParagraphs.d.ts +0 -1
- package/bin/utils/emptyParagraphs.js +1 -1
- package/bin/utils/emptyParagraphs.js.map +1 -1
- package/bin/utils/formatEmphasis.d.ts +2 -0
- package/bin/utils/formatEmphasis.js +32 -0
- package/bin/utils/formatEmphasis.js.map +1 -0
- package/bin/utils/images.js +9 -1
- package/bin/utils/images.js.map +1 -1
- package/bin/utils/lists.d.ts +2 -0
- package/bin/utils/lists.js +21 -0
- package/bin/utils/lists.js.map +1 -0
- package/bin/utils/log.d.ts +17 -0
- package/bin/utils/log.js +15 -5
- package/bin/utils/log.js.map +1 -1
- package/bin/utils/metadata.d.ts +2 -0
- package/bin/utils/metadata.js +23 -0
- package/bin/utils/metadata.js.map +1 -0
- package/bin/utils/nestedRoots.d.ts +0 -1
- package/bin/utils/nestedRoots.js +1 -1
- package/bin/utils/nestedRoots.js.map +1 -1
- package/bin/utils/position.d.ts +0 -1
- package/bin/utils/position.js +1 -1
- package/bin/utils/position.js.map +1 -1
- package/bin/utils/tableCells.d.ts +2 -0
- package/bin/utils/tableCells.js +22 -0
- package/bin/utils/tableCells.js.map +1 -0
- package/bin/utils/title.d.ts +1 -0
- package/bin/utils/title.js +9 -3
- package/bin/utils/title.js.map +1 -1
- package/bin/utils/updatedAt.d.ts +2 -0
- package/bin/utils/updatedAt.js +21 -0
- package/bin/utils/updatedAt.js.map +1 -0
- package/package.json +2 -2
- package/src/components/AccordionGroup.ts +55 -25
- package/src/components/Card.ts +3 -2
- package/src/components/CardGroup.ts +3 -6
- package/src/components/CodeGroup.ts +127 -83
- package/src/components/Tabs.ts +57 -24
- package/src/constants.ts +3 -3
- package/src/nav/listItems.ts +1 -2
- package/src/scrapingPipeline/color.ts +107 -0
- package/src/scrapingPipeline/group.ts +1 -4
- package/src/scrapingPipeline/icon.ts +8 -6
- package/src/scrapingPipeline/logo.ts +14 -9
- package/src/scrapingPipeline/page.ts +30 -9
- package/src/scrapingPipeline/site.ts +83 -7
- package/src/scrapingPipeline/tabs.ts +15 -13
- package/src/scrapingPipeline/title.ts +38 -0
- package/src/tabs/retrieveReadme.ts +1 -2
- package/src/types/result.ts +1 -1
- package/src/utils/breaks.ts +19 -0
- package/src/utils/children.ts +10 -3
- package/src/utils/className.ts +1 -1
- package/src/utils/copyButton.ts +35 -0
- package/src/utils/emptyEmphasis.ts +18 -0
- package/src/utils/emptyParagraphs.ts +1 -1
- package/src/utils/formatEmphasis.ts +37 -0
- package/src/utils/images.ts +13 -2
- package/src/utils/lists.ts +22 -0
- package/src/utils/log.ts +18 -5
- package/src/utils/metadata.ts +26 -0
- package/src/utils/nestedRoots.ts +1 -1
- package/src/utils/position.ts +1 -1
- package/src/utils/tableCells.ts +23 -0
- package/src/utils/title.ts +10 -4
- package/src/utils/updatedAt.ts +25 -0
- package/bin/utils/escape.d.ts +0 -2
- package/bin/utils/escape.js +0 -25
- package/bin/utils/escape.js.map +0 -1
- package/src/utils/escape.ts +0 -30
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { Colors } from '@mintlify/models';
|
|
2
|
+
import type { Root as HastRoot } from 'hast';
|
|
3
|
+
import { CONTINUE, visit } from 'unist-util-visit';
|
|
4
|
+
|
|
5
|
+
import { framework } from '../utils/detectFramework.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Formats a single RGB channel as a two-digit lowercase hexadecimal string.
 *
 * @param value - Channel value; it is rounded to the nearest integer first.
 * @returns The zero-padded hex representation, e.g. `13` -> `"0d"`.
 */
function toHex(value: number): string {
  return Math.round(value).toString(16).padStart(2, '0');
}

/**
 * Checks whether a string is a `#RGB` or `#RRGGBB` hex color.
 *
 * @param str - Candidate string; `undefined` and the empty string are rejected.
 * @returns `true` only for a leading `#` followed by exactly 3 or 6 hex digits.
 */
function checkValidHex(str: string | undefined): boolean {
  if (!str) return false;
  return /^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})$/.test(str);
}

/**
 * Checks that every given number is a finite value in the inclusive range 0-255.
 *
 * @param numbers - Channel values to validate.
 * @returns `false` as soon as one value is out of range or not finite (e.g. `NaN`).
 */
function checkRgbBounds(...numbers: Array<number>): boolean {
  return numbers.every((num) => Number.isFinite(num) && num >= 0 && num <= 255);
}

/**
 * Converts a CSS color value to a hex color string.
 *
 * Accepted inputs (surrounding whitespace is ignored):
 * - a `#RGB` / `#RRGGBB` hex color, returned as written (only trimmed);
 * - space-separated channels with an optional alpha, e.g. `13 147 115` or `13 147 115 0.5`;
 * - `rgb(r,g,b)` / `rgba(r,g,b,a)`, with optional whitespace around the numbers.
 *
 * The alpha component, when present, is ignored.
 *
 * @param color - Raw CSS color value.
 * @returns An uppercase `#RRGGBB` string for RGB inputs, the trimmed input for hex
 *   inputs, or `undefined` when the value cannot be parsed or a channel is outside 0-255.
 */
function rgbToHex(color: string): string | undefined {
  const trimmed = color.trim();
  if (checkValidHex(trimmed)) return trimmed;

  const normalized = trimmed.toLowerCase();

  let channels: Array<number>;
  if (/^\d+\s+\d+\s+\d+(\s+[0-9.]+)?$/.test(normalized)) {
    // Only the first three fields are channels; a fourth one is the alpha.
    channels = normalized.split(/\s+/).slice(0, 3).map(Number);
  } else {
    const values = normalized.match(
      /^rgba?\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*(?:,\s*([0-9.]+)\s*)?\)$/
    );
    if (!values) return undefined;

    channels = [Number(values[1]), Number(values[2]), Number(values[3])];
  }

  const [r, g, b] = channels;

  // Compare against undefined explicitly: 0 is a valid channel value and must
  // not be treated as "missing" (a truthiness check would reject black).
  if (r === undefined || g === undefined || b === undefined) return undefined;

  if (!checkRgbBounds(r, g, b)) return undefined;

  return `#${toHex(r)}${toHex(g)}${toHex(b)}`.toUpperCase();
}
|
|
47
|
+
|
|
48
|
+
/**
 * Finds the first occurrence of `key` in a raw CSS string and returns the value
 * that follows it.
 *
 * The key is matched literally and case-insensitively, followed by a `:`, `|`
 * or `,` separator. The captured value ends at the first `;` or `)`, so a
 * function-notation value such as `rgb(1,2,3)` is returned without its closing
 * parenthesis.
 *
 * @param cssString - CSS text to search.
 * @param key - Property or custom-property name to look for, e.g. `--primary-color-600`.
 * @returns The trimmed value, or `undefined` when the key is not present.
 */
function getCssValue(cssString: string, key: string): string | undefined {
  // Escape regex metacharacters so the key cannot alter the pattern
  // (e.g. `.` matching any character, or `(` making the pattern invalid).
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`${escapedKey}\\s*[:|,]\\s*([^;)]+)`, 'i');
  return cssString.match(regex)?.[1]?.trim();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Fallback color palette, returned by `downloadColors` when the framework is
 * Docusaurus or when no valid primary color can be extracted from the page.
 * `dark` intentionally repeats the `primary` value.
 */
export const defaultColors = {
|
|
55
|
+
primary: '#0D9373',
|
|
56
|
+
light: '#55D799',
|
|
57
|
+
dark: '#0D9373',
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
/**
 * Extracts the site's accent colors from inline `<style>` elements of the page.
 *
 * Behavior by detected framework:
 * - `docusaurus`: no extraction is attempted; `defaultColors` is returned.
 * - `gitbook`: only `<style>` elements without any properties are inspected.
 * - `readme`: only the `<style>` element titled `rm-custom-css` is inspected, and
 *   both the primary and light colors are read from `--color-link-primary`.
 * - otherwise: primary is read from `--primary-color-600`, light from `--primary-color-400`.
 *
 * When several `<style>` elements qualify, the last one that yields a convertible
 * primary color wins; an element whose primary value cannot be converted no longer
 * discards a palette found earlier.
 *
 * @param hast - Parsed HTML tree of the page.
 * @returns `{ primary, light, dark }` when both colors are valid, `{ primary, dark }`
 *   when only the primary is valid, and `defaultColors` otherwise. `dark` always
 *   mirrors `primary`.
 */
export async function downloadColors(hast: HastRoot): Promise<Colors> {
  if (framework.vendor === 'docusaurus') return defaultColors;

  // Declared through an assertion so that assignments made inside the visitor
  // callback are not narrowed away by control-flow analysis.
  let primaryHexCode = undefined as string | undefined;
  let lightHexCode = undefined as string | undefined;

  visit(hast, 'element', function (node) {
    if (node.tagName !== 'style') return CONTINUE;
    if (
      (framework.vendor === 'gitbook' && !!Object.keys(node.properties).length) ||
      (framework.vendor === 'readme' && node.properties.title !== 'rm-custom-css')
    )
      return CONTINUE;

    // Only plain stylesheets consisting of a single text child are parsed.
    if (node.children.length !== 1 || !node.children[0] || node.children[0].type !== 'text')
      return CONTINUE;

    const cssStr = node.children[0].value;
    const isReadme = framework.vendor === 'readme';
    const primaryColorKey = isReadme ? '--color-link-primary' : '--primary-color-600';
    const lightColorKey = isReadme ? '--color-link-primary' : '--primary-color-400';

    const primaryCssColorValue = getCssValue(cssStr, primaryColorKey);
    const lightCssColorValue = getCssValue(cssStr, lightColorKey);
    if (!primaryCssColorValue || !lightCssColorValue) return CONTINUE;

    // Keep the previously found palette if this stylesheet's primary color
    // cannot be converted; otherwise take both colors from this stylesheet so
    // the palette always comes from a single source.
    const primaryCandidate = rgbToHex(primaryCssColorValue);
    if (!primaryCandidate) return CONTINUE;

    primaryHexCode = primaryCandidate;
    lightHexCode = rgbToHex(lightCssColorValue);
    return CONTINUE;
  });

  if (!primaryHexCode || !checkValidHex(primaryHexCode)) return defaultColors;

  if (lightHexCode && checkValidHex(lightHexCode)) {
    return {
      primary: primaryHexCode,
      light: lightHexCode,
      dark: primaryHexCode,
    };
  }

  return {
    primary: primaryHexCode,
    dark: primaryHexCode,
  };
}
|
|
@@ -52,11 +52,8 @@ export async function scrapePageGroup(
|
|
|
52
52
|
navGroup[0]?.origin ?? 'the URL provided'
|
|
53
53
|
}${errorMessage}`
|
|
54
54
|
);
|
|
55
|
-
console.error(error);
|
|
56
55
|
throw error;
|
|
57
56
|
} finally {
|
|
58
|
-
if (browser)
|
|
59
|
-
await browser.close();
|
|
60
|
-
}
|
|
57
|
+
if (browser) await browser.close();
|
|
61
58
|
}
|
|
62
59
|
}
|
|
@@ -3,24 +3,26 @@ import { EXIT, visit } from 'unist-util-visit';
|
|
|
3
3
|
|
|
4
4
|
import { downloadImage } from '../utils/images.js';
|
|
5
5
|
|
|
6
|
-
export async function downloadFavicon(hast: HastRoot): Promise<string
|
|
6
|
+
export async function downloadFavicon(hast: HastRoot): Promise<string> {
|
|
7
7
|
let src: string = '';
|
|
8
8
|
visit(hast, 'element', function (node) {
|
|
9
9
|
if (
|
|
10
10
|
node.tagName === 'link' &&
|
|
11
|
-
|
|
12
|
-
|
|
11
|
+
Array.isArray(node.properties.rel) &&
|
|
12
|
+
node.properties.rel.includes('icon')
|
|
13
13
|
) {
|
|
14
14
|
src = node.properties.href as string;
|
|
15
15
|
return EXIT;
|
|
16
16
|
}
|
|
17
17
|
});
|
|
18
18
|
|
|
19
|
-
if (!src)
|
|
19
|
+
if (!src) {
|
|
20
|
+
return '/favicon.svg';
|
|
21
|
+
}
|
|
20
22
|
|
|
21
23
|
const res = await downloadImage(src, process.cwd());
|
|
22
|
-
if (!res.success) return
|
|
23
|
-
if (!res.data) return
|
|
24
|
+
if (!res.success) return '/favicon.svg';
|
|
25
|
+
if (!res.data) return '/favicon.svg';
|
|
24
26
|
|
|
25
27
|
return res.data[1];
|
|
26
28
|
}
|
|
@@ -88,17 +88,24 @@ export async function downloadLogos(
|
|
|
88
88
|
): Promise<string | { light: string; dark: string } | undefined> {
|
|
89
89
|
url = new URL(url);
|
|
90
90
|
const filepaths: Array<string> = [];
|
|
91
|
+
|
|
91
92
|
if (browser) {
|
|
92
93
|
const htmls: Array<string> = [];
|
|
93
94
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
95
|
+
try {
|
|
96
|
+
const page = await browser.newPage();
|
|
97
|
+
await page.goto(url.toString(), {
|
|
98
|
+
waitUntil: 'networkidle2',
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
htmls.push(await page.content());
|
|
102
|
+
await page.click('.rm-ThemeToggle');
|
|
103
|
+
htmls.push(await page.content());
|
|
104
|
+
} catch {
|
|
105
|
+
// do nothing, it just means there's no theme toggle
|
|
106
|
+
}
|
|
98
107
|
|
|
99
|
-
|
|
100
|
-
await page.click('.rm-ThemeToggle');
|
|
101
|
-
htmls.push(await page.content());
|
|
108
|
+
await browser.close();
|
|
102
109
|
|
|
103
110
|
await Promise.all(
|
|
104
111
|
htmls.map(async (html) => {
|
|
@@ -119,8 +126,6 @@ export async function downloadLogos(
|
|
|
119
126
|
}
|
|
120
127
|
}
|
|
121
128
|
|
|
122
|
-
if (browser) await browser.close();
|
|
123
|
-
|
|
124
129
|
const uniqueFilepaths = [...new Set(filepaths).values()];
|
|
125
130
|
|
|
126
131
|
return uniqueFilepaths.length === 1
|
|
@@ -21,18 +21,25 @@ import { rehypeToRemarkCustomComponents } from '../customComponents/plugin.js';
|
|
|
21
21
|
import { selectiveRehypeRemark } from '../customComponents/selective.js';
|
|
22
22
|
import { retrieveRootContent } from '../root/retrieve.js';
|
|
23
23
|
import type { Result } from '../types/result.js';
|
|
24
|
+
import { unifiedRemoveBreaks } from '../utils/breaks.js';
|
|
24
25
|
import { unifiedRemoveClassNames } from '../utils/className.js';
|
|
26
|
+
import { unifiedRemoveCopyButtons } from '../utils/copyButton.js';
|
|
25
27
|
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
28
|
+
import { remarkRemoveEmptyEmphases } from '../utils/emptyEmphasis.js';
|
|
26
29
|
import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
|
|
27
30
|
import { getErrorMessage, logErrorResults } from '../utils/errors.js';
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
31
|
+
import { writePage } from '../utils/file.js';
|
|
32
|
+
import { remarkProperlyFormatEmphasis } from '../utils/formatEmphasis.js';
|
|
30
33
|
import { removeHastComments } from '../utils/hastComments.js';
|
|
34
|
+
import { remarkSpaceListsOut } from '../utils/lists.js';
|
|
31
35
|
import { log } from '../utils/log.js';
|
|
36
|
+
import { remarkRemoveBottomMetadata } from '../utils/metadata.js';
|
|
32
37
|
import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
|
|
33
38
|
import { unifiedRemovePositions } from '../utils/position.js';
|
|
34
39
|
import { removeLeadingSlash, removeTrailingSlash } from '../utils/strings.js';
|
|
40
|
+
import { remarkRemoveCodeBlocksInCells } from '../utils/tableCells.js';
|
|
35
41
|
import { getDescriptionFromRoot, getTitleFromHeading } from '../utils/title.js';
|
|
42
|
+
import { remarkRemoveUpdatedAt } from '../utils/updatedAt.js';
|
|
36
43
|
import { downloadImagesFromFile } from './images.js';
|
|
37
44
|
import { htmlToHast } from './root.js';
|
|
38
45
|
|
|
@@ -61,7 +68,8 @@ export async function scrapePage(
|
|
|
61
68
|
|
|
62
69
|
const urlStr = url.toString();
|
|
63
70
|
const content = retrieveRootContent(hast);
|
|
64
|
-
if (!content)
|
|
71
|
+
if (!content)
|
|
72
|
+
return { success: false, message: `${urlStr}: ${CONTENT_FAILURE_MSG}`, data: [urlStr, ''] };
|
|
65
73
|
|
|
66
74
|
const contentAsRoot: HastRoot = {
|
|
67
75
|
type: 'root',
|
|
@@ -69,18 +77,19 @@ export async function scrapePage(
|
|
|
69
77
|
};
|
|
70
78
|
|
|
71
79
|
const mdastTree: MdastRoot = unified()
|
|
80
|
+
.use(unifiedRemoveBreaks)
|
|
81
|
+
.use(unifiedRemoveCopyButtons)
|
|
72
82
|
.use(createCard)
|
|
73
83
|
.use(createAccordion)
|
|
74
84
|
.use(createFrame)
|
|
75
|
-
.use(createTabs)
|
|
76
85
|
.use(createCallout)
|
|
77
86
|
.use(createCardGroup)
|
|
78
87
|
.use(createAccordionGroup)
|
|
79
88
|
.use(createCodeGroup)
|
|
89
|
+
.use(createTabs)
|
|
80
90
|
.use(unifiedRemoveClassNames)
|
|
81
|
-
.use(unifiedRemovePositions)
|
|
82
91
|
.use(unifiedRemoveEmptyParagraphs)
|
|
83
|
-
.use(
|
|
92
|
+
.use(unifiedRemovePositions)
|
|
84
93
|
.use(selectiveRehypeRemark)
|
|
85
94
|
|
|
86
95
|
// Cleans up any nested components left untouched
|
|
@@ -89,6 +98,13 @@ export async function scrapePage(
|
|
|
89
98
|
.use(rehypeToRemarkCustomComponents)
|
|
90
99
|
.use(convertHeaderLinksToText)
|
|
91
100
|
.use(unifiedRemoveNestedRoots)
|
|
101
|
+
.use(remarkSpaceListsOut)
|
|
102
|
+
.use(remarkRemoveBottomMetadata)
|
|
103
|
+
.use(remarkRemoveUpdatedAt)
|
|
104
|
+
.use(remarkRemoveEmptyEmphases)
|
|
105
|
+
.use(remarkProperlyFormatEmphasis)
|
|
106
|
+
.use(remarkRemoveCodeBlocksInCells)
|
|
107
|
+
// @ts-expect-error moving some of the pipeline around results in contentAsRoot being treated differently than its type which is Root Element
|
|
92
108
|
.runSync(contentAsRoot) as MdastRoot;
|
|
93
109
|
|
|
94
110
|
try {
|
|
@@ -111,13 +127,15 @@ export async function scrapePage(
|
|
|
111
127
|
.use(remarkStringify)
|
|
112
128
|
.stringify(mdastTree);
|
|
113
129
|
|
|
130
|
+
const resultStr = String(result).replace(/\n{3,}/g, '\n\n');
|
|
131
|
+
|
|
114
132
|
if (opts.rootPath) {
|
|
115
133
|
url = new URL(opts.rootPath, url.origin);
|
|
116
134
|
} else if (url.origin === removeTrailingSlash(url.toString())) {
|
|
117
135
|
url = new URL('home', new URL(url).origin);
|
|
118
136
|
}
|
|
119
137
|
|
|
120
|
-
writePage(url, opts.isOverviewPage ? 'Overview' : title, description,
|
|
138
|
+
writePage(url, opts.isOverviewPage ? 'Overview' : title, description, resultStr);
|
|
121
139
|
return {
|
|
122
140
|
success: true,
|
|
123
141
|
data: opts.rootPath
|
|
@@ -125,8 +143,11 @@ export async function scrapePage(
|
|
|
125
143
|
: undefined,
|
|
126
144
|
};
|
|
127
145
|
} catch (error) {
|
|
128
|
-
write('error.json', JSON.stringify(mdastTree, undefined, 2));
|
|
129
146
|
const errorMessage = getErrorMessage(error);
|
|
130
|
-
return {
|
|
147
|
+
return {
|
|
148
|
+
success: false,
|
|
149
|
+
message: `${urlStr}: ${MDAST_FAILURE_MSG}${errorMessage}`,
|
|
150
|
+
data: [urlStr, ''],
|
|
151
|
+
};
|
|
131
152
|
}
|
|
132
153
|
}
|
|
@@ -11,11 +11,13 @@ import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
|
11
11
|
import { logErrorResults } from '../utils/errors.js';
|
|
12
12
|
import { startPuppeteer } from '../utils/network.js';
|
|
13
13
|
import { INDEX_NAMES, iterateThroughReservedNames } from '../utils/reservedNames.js';
|
|
14
|
-
import { removeTrailingSlash } from '../utils/strings.js';
|
|
14
|
+
import { removeTrailingSlash, removeLeadingSlash } from '../utils/strings.js';
|
|
15
|
+
import { downloadColors } from './color.js';
|
|
15
16
|
import { scrapePageGroup } from './group.js';
|
|
16
17
|
import { downloadFavicon } from './icon.js';
|
|
17
18
|
import { downloadLogos } from './logo.js';
|
|
18
19
|
import { htmlToHast } from './root.js';
|
|
20
|
+
import { downloadTitle } from './title.js';
|
|
19
21
|
|
|
20
22
|
export async function scrapeSite(
|
|
21
23
|
html: string,
|
|
@@ -127,24 +129,98 @@ export async function scrapeSite(
|
|
|
127
129
|
};
|
|
128
130
|
});
|
|
129
131
|
|
|
132
|
+
const allErrors = [
|
|
133
|
+
...externalResults.filter((result) => !result.success),
|
|
134
|
+
...internalResults.filter((result) => !result.success),
|
|
135
|
+
...rootResults.filter((result) => !result.success),
|
|
136
|
+
];
|
|
137
|
+
|
|
138
|
+
const allErroredPaths = allErrors
|
|
139
|
+
.map((result) => {
|
|
140
|
+
if (result.data) {
|
|
141
|
+
const url = new URL(result.data[0]);
|
|
142
|
+
const pathname = url.pathname;
|
|
143
|
+
const normalizedPathname = removeLeadingSlash(removeTrailingSlash(pathname));
|
|
144
|
+
return normalizedPathname;
|
|
145
|
+
} else {
|
|
146
|
+
return '';
|
|
147
|
+
}
|
|
148
|
+
})
|
|
149
|
+
.filter(Boolean);
|
|
150
|
+
|
|
151
|
+
traverse(navItems).forEach(function (value) {
|
|
152
|
+
if (typeof value === 'string' && allErroredPaths.includes(value)) {
|
|
153
|
+
this.remove();
|
|
154
|
+
} else if (Array.isArray(value)) {
|
|
155
|
+
this.update(
|
|
156
|
+
value
|
|
157
|
+
.filter((item) =>
|
|
158
|
+
typeof item === 'string' && allErroredPaths.includes(item) ? undefined : item
|
|
159
|
+
)
|
|
160
|
+
.filter(Boolean)
|
|
161
|
+
);
|
|
162
|
+
}
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
let count = 1;
|
|
166
|
+
while (count > 0) {
|
|
167
|
+
count = 0;
|
|
168
|
+
traverse(navItems).forEach(function (value) {
|
|
169
|
+
if (Array.isArray(value) && value.filter(Boolean).length === 0) {
|
|
170
|
+
count++;
|
|
171
|
+
if (this.parent) {
|
|
172
|
+
this.parent.remove();
|
|
173
|
+
} else {
|
|
174
|
+
this.remove();
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
});
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
traverse(navItems).forEach(function (value) {
|
|
181
|
+
if (
|
|
182
|
+
typeof value === 'string' &&
|
|
183
|
+
(value.startsWith('https://') || value.startsWith('http://'))
|
|
184
|
+
) {
|
|
185
|
+
this.remove();
|
|
186
|
+
} else if (
|
|
187
|
+
Array.isArray(value) &&
|
|
188
|
+
value.find(
|
|
189
|
+
(val) =>
|
|
190
|
+
typeof val === 'string' && (val.startsWith('https://') || val.startsWith('http://'))
|
|
191
|
+
)
|
|
192
|
+
) {
|
|
193
|
+
this.update(
|
|
194
|
+
value.filter(
|
|
195
|
+
(val) =>
|
|
196
|
+
!(
|
|
197
|
+
typeof val === 'string' &&
|
|
198
|
+
(val.startsWith('https://') || val.startsWith('http://'))
|
|
199
|
+
)
|
|
200
|
+
)
|
|
201
|
+
);
|
|
202
|
+
}
|
|
203
|
+
});
|
|
204
|
+
|
|
130
205
|
logErrorResults('linking to external pages', externalResults);
|
|
131
206
|
logErrorResults('scraping your docs', [...internalResults, ...rootResults]);
|
|
132
207
|
|
|
133
|
-
const
|
|
208
|
+
const needsBrowserForLogos = framework.vendor === 'readme';
|
|
209
|
+
const browser = needsBrowserForLogos ? await startPuppeteer() : undefined;
|
|
134
210
|
|
|
135
211
|
const favicon = await downloadFavicon(hast);
|
|
212
|
+
const colors = await downloadColors(hast);
|
|
136
213
|
const logo = await downloadLogos(url, browser);
|
|
214
|
+
const name = await downloadTitle(hast);
|
|
137
215
|
|
|
138
216
|
return {
|
|
139
217
|
success: true,
|
|
140
218
|
data: {
|
|
141
219
|
$schema: 'https://mintlify.com/schema.json',
|
|
142
|
-
name
|
|
220
|
+
name,
|
|
143
221
|
logo,
|
|
144
|
-
colors
|
|
145
|
-
|
|
146
|
-
},
|
|
147
|
-
favicon: favicon ?? '',
|
|
222
|
+
colors,
|
|
223
|
+
favicon,
|
|
148
224
|
navigation: navItems as Navigation,
|
|
149
225
|
tabs: opts.tabs,
|
|
150
226
|
},
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { Navigation, NavigationEntry } from '@mintlify/models';
|
|
1
|
+
import type { Colors, Navigation, NavigationEntry } from '@mintlify/models';
|
|
2
2
|
import { MintConfig, Tab } from '@mintlify/models';
|
|
3
3
|
|
|
4
4
|
import { retrieveTabLinks } from '../tabs/retrieveReadme.js';
|
|
@@ -8,10 +8,11 @@ import { getErrorMessage } from '../utils/errors.js';
|
|
|
8
8
|
import { log } from '../utils/log.js';
|
|
9
9
|
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
|
|
10
10
|
import { getTitleFromLink } from '../utils/title.js';
|
|
11
|
-
import {
|
|
11
|
+
import { defaultColors } from './color.js';
|
|
12
12
|
import { downloadLogos } from './logo.js';
|
|
13
13
|
import { htmlToHast } from './root.js';
|
|
14
14
|
import { scrapeSite } from './site.js';
|
|
15
|
+
import { downloadTitle } from './title.js';
|
|
15
16
|
|
|
16
17
|
export async function scrapeAllSiteTabs(
|
|
17
18
|
html: string,
|
|
@@ -22,12 +23,6 @@ export async function scrapeAllSiteTabs(
|
|
|
22
23
|
|
|
23
24
|
detectFramework(hast);
|
|
24
25
|
|
|
25
|
-
const needsBrowser = framework.vendor === 'gitbook';
|
|
26
|
-
const browser = needsBrowser ? await startPuppeteer() : undefined;
|
|
27
|
-
|
|
28
|
-
const favicon = await downloadFavicon(hast);
|
|
29
|
-
const logo = await downloadLogos(url, browser);
|
|
30
|
-
|
|
31
26
|
if (framework.vendor === 'readme' || framework.vendor === 'docusaurus') {
|
|
32
27
|
const links = retrieveTabLinks(hast);
|
|
33
28
|
if (
|
|
@@ -59,12 +54,16 @@ export async function scrapeAllSiteTabs(
|
|
|
59
54
|
|
|
60
55
|
const navigations: Array<NavigationEntry> = [];
|
|
61
56
|
const tabs: Array<Tab> = [];
|
|
57
|
+
let favicon = '/favicon.svg';
|
|
58
|
+
let colors: Colors = defaultColors;
|
|
62
59
|
|
|
63
60
|
const successes = results.filter((result) => result.success);
|
|
64
61
|
successes.forEach((result) => {
|
|
65
62
|
if (!result.data) return;
|
|
66
63
|
navigations.push(...result.data.navigation);
|
|
67
64
|
if (result.data.tabs) tabs.push(...result.data.tabs);
|
|
65
|
+
if (result.data.favicon !== '/favicon.svg') favicon = result.data.favicon;
|
|
66
|
+
if (result.data.colors !== defaultColors) colors = result.data.colors;
|
|
68
67
|
});
|
|
69
68
|
|
|
70
69
|
const failures = results.filter((result) => !result.success);
|
|
@@ -72,16 +71,19 @@ export async function scrapeAllSiteTabs(
|
|
|
72
71
|
log('Failed to scrape tab' + result.message);
|
|
73
72
|
});
|
|
74
73
|
|
|
74
|
+
const needsBrowser = framework.vendor === 'readme';
|
|
75
|
+
const browser = needsBrowser ? await startPuppeteer() : undefined;
|
|
76
|
+
const logo = await downloadLogos(url, browser);
|
|
77
|
+
const name = await downloadTitle(hast);
|
|
78
|
+
|
|
75
79
|
return {
|
|
76
80
|
success: true,
|
|
77
81
|
data: {
|
|
78
82
|
$schema: 'https://mintlify.com/schema.json',
|
|
79
|
-
name
|
|
83
|
+
name,
|
|
80
84
|
logo,
|
|
81
|
-
colors
|
|
82
|
-
|
|
83
|
-
},
|
|
84
|
-
favicon: favicon ?? '',
|
|
85
|
+
colors,
|
|
86
|
+
favicon,
|
|
85
87
|
navigation: navigations as Navigation,
|
|
86
88
|
tabs,
|
|
87
89
|
},
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import type { Root as HastRoot } from 'hast';
|
|
2
|
+
import { CONTINUE, EXIT, visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
/** Placeholder returned when no usable site name can be derived from the page. */
const defaultTitle = 'Enter name here';

/**
 * Derives the site name from the page's `<title>` element.
 *
 * The value of the first text node found inside a `<title>` element is used. If it
 * contains `|`, `–` (en dash) or `-` (checked in that order), only the part after
 * the last occurrence of the first separator present is kept; the result is trimmed.
 *
 * @param hast - Parsed HTML tree of the page.
 * @returns The derived site name, or `defaultTitle` when there is no title text
 *   or the derived name is empty.
 */
export async function downloadTitle(hast: HastRoot): Promise<string> {
  let rawTitle = undefined as string | undefined;

  visit(hast, 'element', function (element) {
    if (element.tagName !== 'title') return CONTINUE;

    // Grab the first text node below <title>.
    visit(element, 'text', function (textNode) {
      rawTitle = textNode.value;
      return EXIT;
    });

    // Stop the outer walk once a non-empty title has been found.
    if (rawTitle) {
      return EXIT;
    }
  });

  if (!rawTitle) return defaultTitle;

  const title: string = rawTitle;
  // Separators in priority order: the first one present in the title is used.
  const separator = ['|', '–', '-'].find((candidate) => title.includes(candidate));
  const lastSegment = separator === undefined ? title : title.split(separator).at(-1) ?? '';
  const siteGroupTitle = lastSegment.trim();

  return siteGroupTitle || defaultTitle;
}
|
|
@@ -8,7 +8,7 @@ import { findTitle, getTitleFromLink } from '../utils/title.js';
|
|
|
8
8
|
export function retrieveTabLinks(rootNode: HastRoot): Array<Tab> | undefined {
|
|
9
9
|
if (framework.vendor !== 'readme' && framework.vendor !== 'docusaurus') return undefined;
|
|
10
10
|
|
|
11
|
-
let element: Element | undefined = undefined;
|
|
11
|
+
let element: Element | undefined = undefined as Element | undefined;
|
|
12
12
|
visit(rootNode, 'element', function (node) {
|
|
13
13
|
if (framework.vendor === 'readme') {
|
|
14
14
|
if (
|
|
@@ -35,7 +35,6 @@ export function retrieveTabLinks(rootNode: HastRoot): Array<Tab> | undefined {
|
|
|
35
35
|
}
|
|
36
36
|
});
|
|
37
37
|
|
|
38
|
-
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
|
|
39
38
|
if (!element) return undefined;
|
|
40
39
|
|
|
41
40
|
const links: Array<Tab> = [];
|
package/src/types/result.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export type Result<T> = { success: true; data?: T } | { success: false; message: string };
|
|
1
|
+
/**
 * Outcome of an operation, discriminated by `success`.
 *
 * - `success: true` may carry a `data` payload.
 * - `success: false` always carries a `message` and may also carry a `data` payload.
 */
export type Result<T> = { success: true; data?: T } | { success: false; message: string; data?: T };
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { Element } from 'hast';
|
|
2
|
+
import { visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
/**
 * unified plugin factory: produces a transformer that runs `removeBreaks`
 * on the tree it is given.
 */
export function unifiedRemoveBreaks() {
  return (tree: Element) => removeBreaks(tree);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Removes every `<br>` element from the given tree, in place.
 *
 * ReadMe-specific: ReadMe puts breaks in between every element, but our
 * parser adds whitespace automatically either way.
 *
 * @param node - Root of the hast subtree to clean.
 */
export function removeBreaks(node: Element) {
  return visit(node, 'element', function (subNode, index, parent) {
    if (subNode.tagName === 'br' && parent && typeof index === 'number') {
      parent.children.splice(index, 1);
      // After the splice the next sibling sits at `index`; returning it makes
      // the traversal revisit that slot instead of skipping the sibling
      // (otherwise the second of two consecutive <br> elements survives).
      return index;
    }
    return undefined;
  });
}
|
package/src/utils/children.ts
CHANGED
|
@@ -11,6 +11,7 @@ import type { State, Handle } from 'hast-util-to-mdast';
|
|
|
11
11
|
import type { RootContent as MdastRootContent, Root as MdastRoot } from 'mdast';
|
|
12
12
|
import { unified } from 'unified';
|
|
13
13
|
|
|
14
|
+
import { ESCAPED_COMPONENTS } from '../constants.js';
|
|
14
15
|
import { mdxJsxFlowElementHandler } from '../customComponents/selective.js';
|
|
15
16
|
|
|
16
17
|
export function turnChildrenIntoMdx(
|
|
@@ -19,19 +20,25 @@ export function turnChildrenIntoMdx(
|
|
|
19
20
|
): Array<MdastRootContent> {
|
|
20
21
|
const hast: HastRoot = {
|
|
21
22
|
type: 'root',
|
|
22
|
-
children
|
|
23
|
+
children,
|
|
23
24
|
};
|
|
24
25
|
|
|
25
26
|
const handlers: Record<string, Handle> = { ...defaultHandlers };
|
|
26
27
|
if (opts.jsxImages) {
|
|
27
|
-
handlers['img'] = function (
|
|
28
|
+
handlers['img'] = function (h: State, node: Element) {
|
|
28
29
|
Object.keys(node.properties).forEach((key) => {
|
|
29
30
|
if (key !== 'src') delete node.properties[key];
|
|
30
31
|
});
|
|
31
|
-
return mdxJsxFlowElementHandler(
|
|
32
|
+
return mdxJsxFlowElementHandler(h, node);
|
|
32
33
|
};
|
|
33
34
|
}
|
|
34
35
|
|
|
36
|
+
ESCAPED_COMPONENTS.forEach((component) => {
|
|
37
|
+
handlers[component] = function (h: State, node: Element) {
|
|
38
|
+
return mdxJsxFlowElementHandler(h, node);
|
|
39
|
+
};
|
|
40
|
+
});
|
|
41
|
+
|
|
35
42
|
const mdxAst = unified()
|
|
36
43
|
.use(function () {
|
|
37
44
|
return function (tree: HastRoot): MdastRoot {
|
package/src/utils/className.ts
CHANGED
|
@@ -7,7 +7,7 @@ export function unifiedRemoveClassNames() {
|
|
|
7
7
|
};
|
|
8
8
|
}
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
function removeClassNames(node: Element) {
|
|
11
11
|
return visit(node, 'element', function (subNode) {
|
|
12
12
|
if ('properties' in subNode) delete subNode.properties.className;
|
|
13
13
|
});
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { Root as HastRoot } from 'hast';
|
|
2
|
+
import { CONTINUE, EXIT, visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
/**
 * unified plugin factory: produces a transformer that runs `removeCopyButtons`
 * on the tree it is given.
 */
export function unifiedRemoveCopyButtons() {
  return (tree: HastRoot) => removeCopyButtons(tree);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Removes code-block "Copy" buttons from the tree, in place.
 *
 * GitBook-specific: GitBook renders a 'Copy' button inside every code block,
 * and because it is part of the HTML output it would otherwise be scraped as
 * content.
 *
 * A node is treated as a copy button when it is a `<button>` whose class list
 * contains `group-hover/codeblock:opacity-[1]` and which contains a text node
 * equal to `Copy` or `copy`.
 *
 * @param root - Root of the hast tree to clean.
 */
export function removeCopyButtons(root: HastRoot) {
  visit(root, 'element', function (node, index, parent) {
    if (
      node.tagName !== 'button' ||
      !Array.isArray(node.properties.className) ||
      !node.properties.className.includes('group-hover/codeblock:opacity-[1]')
    )
      return CONTINUE;

    // Asserted to `boolean` so the assignment inside the nested visitor is not
    // narrowed away by control-flow analysis.
    let isCopyButton = false as boolean;
    visit(node, 'text', function (textNode) {
      if (textNode.value === 'Copy' || textNode.value === 'copy') {
        isCopyButton = true;
        return EXIT;
      }
      return CONTINUE;
    });

    if (!isCopyButton || !parent || typeof index !== 'number') return CONTINUE;

    parent.children.splice(index, 1);
    // The next sibling now occupies `index`; returning it keeps the traversal
    // from skipping that sibling after the removal.
    return index;
  });
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { Root as MdastRoot } from 'mdast';
|
|
2
|
+
import { CONTINUE, visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
/**
 * remark plugin factory: produces a transformer that runs `removeEmptyEmphases`
 * on the tree it is given.
 */
export function remarkRemoveEmptyEmphases() {
  return (tree: MdastRoot) => removeEmptyEmphases(tree);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Removes `emphasis` and `strong` nodes that have no children, in place.
 *
 * @param root - Root of the mdast tree to clean.
 */
function removeEmptyEmphases(root: MdastRoot) {
  visit(root, function (node, index, parent) {
    if (node.type !== 'emphasis' && node.type !== 'strong') return CONTINUE;
    if (node.children.length !== 0) return CONTINUE;
    if (!parent || typeof index !== 'number') return CONTINUE;

    parent.children.splice(index, 1);
    // The next sibling now occupies `index`; returning it keeps the traversal
    // from skipping that sibling (e.g. two adjacent empty emphasis nodes).
    return index;
  });
}
|