@mintlify/scraping 3.0.189 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +69 -24
- package/bin/cli.js.map +1 -1
- package/bin/scrapingPipeline/logo.js +10 -2
- package/bin/scrapingPipeline/logo.js.map +1 -1
- package/bin/scrapingPipeline/page.js +2 -0
- package/bin/scrapingPipeline/page.js.map +1 -1
- package/bin/scrapingPipeline/root.js +7 -2
- package/bin/scrapingPipeline/root.js.map +1 -1
- package/bin/scrapingPipeline/tabs.js +9 -3
- package/bin/scrapingPipeline/tabs.js.map +1 -1
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/utils/hastComments.d.ts +3 -0
- package/bin/utils/hastComments.js +15 -0
- package/bin/utils/hastComments.js.map +1 -0
- package/bin/utils/network.js +26 -18
- package/bin/utils/network.js.map +1 -1
- package/package.json +2 -2
- package/src/cli.ts +76 -24
- package/src/scrapingPipeline/logo.ts +13 -6
- package/src/scrapingPipeline/page.ts +2 -0
- package/src/scrapingPipeline/root.ts +9 -2
- package/src/scrapingPipeline/tabs.ts +8 -3
- package/src/utils/hastComments.ts +17 -0
- package/src/utils/network.ts +23 -18
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { visit, CONTINUE } from 'unist-util-visit';
|
|
2
|
+
export function removeHastComments(root) {
|
|
3
|
+
visit(root, 'comment', function (_, index, parent) {
|
|
4
|
+
if (parent && typeof index === 'number') {
|
|
5
|
+
parent.children.splice(index, 1);
|
|
6
|
+
return [CONTINUE, index];
|
|
7
|
+
}
|
|
8
|
+
});
|
|
9
|
+
}
|
|
10
|
+
export function rehypeRemoveHastComments() {
|
|
11
|
+
return function (root) {
|
|
12
|
+
return removeHastComments(root);
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=hastComments.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hastComments.js","sourceRoot":"","sources":["../../src/utils/hastComments.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAEnD,MAAM,UAAU,kBAAkB,CAAC,IAAc;IAC/C,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,CAAC,EAAE,KAAK,EAAE,MAAM;QAC/C,IAAI,MAAM,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjC,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,wBAAwB;IACtC,OAAO,UAAU,IAAc;QAC7B,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAClC,CAAC,CAAC;AACJ,CAAC"}
|
package/bin/utils/network.js
CHANGED
|
@@ -29,12 +29,17 @@ export async function startPuppeteer() {
|
|
|
29
29
|
}
|
|
30
30
|
}
|
|
31
31
|
export async function getHtmlWithPuppeteer(browser, url) {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
try {
|
|
33
|
+
const page = await browser.newPage();
|
|
34
|
+
await page.goto(url.toString(), {
|
|
35
|
+
waitUntil: 'networkidle2',
|
|
36
|
+
});
|
|
37
|
+
return await exponentialBackoff(() => page.content());
|
|
38
|
+
}
|
|
39
|
+
catch (error) {
|
|
40
|
+
const errorMessage = getErrorMessage(error);
|
|
41
|
+
throw new Error(`Failed to download page from Puppeteer${errorMessage}`);
|
|
42
|
+
}
|
|
38
43
|
}
|
|
39
44
|
async function fetchPageResponse(url) {
|
|
40
45
|
try {
|
|
@@ -46,23 +51,26 @@ async function fetchPageResponse(url) {
|
|
|
46
51
|
}
|
|
47
52
|
catch (error) {
|
|
48
53
|
const errorMessage = getErrorMessage(error);
|
|
49
|
-
throw new Error(
|
|
54
|
+
throw new Error(`Failed to fetch page from source${errorMessage}`);
|
|
50
55
|
}
|
|
51
56
|
}
|
|
52
57
|
export async function fetchPageHtml(url, browser = undefined) {
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
return res;
|
|
58
|
-
throw new Error('an unknown error occured');
|
|
58
|
+
try {
|
|
59
|
+
let res = undefined;
|
|
60
|
+
if (browser) {
|
|
61
|
+
res = await getHtmlWithPuppeteer(browser, url);
|
|
59
62
|
}
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
throw new Error(`${url}\n\t- Puppeteer failed to retrieve page from source${errorMessage}`);
|
|
63
|
+
else {
|
|
64
|
+
res = await exponentialBackoff(() => fetchPageResponse(url));
|
|
63
65
|
}
|
|
66
|
+
if (res)
|
|
67
|
+
return res;
|
|
68
|
+
throw new Error('An unknown error occured.');
|
|
69
|
+
}
|
|
70
|
+
catch (error) {
|
|
71
|
+
const errorMessage = getErrorMessage(error);
|
|
72
|
+
throw new Error(`Error retrieving HTML for ${url.toString()}${errorMessage}`);
|
|
64
73
|
}
|
|
65
|
-
return await exponentialBackoff(() => fetchPageResponse(url));
|
|
66
74
|
}
|
|
67
75
|
export async function fetchImage(url) {
|
|
68
76
|
try {
|
|
@@ -76,7 +84,7 @@ export async function fetchImage(url) {
|
|
|
76
84
|
}
|
|
77
85
|
catch (error) {
|
|
78
86
|
const errorMessage = getErrorMessage(error);
|
|
79
|
-
throw new Error(`${url}
|
|
87
|
+
throw new Error(`${url} - failed to retrieve image from source${errorMessage}`);
|
|
80
88
|
}
|
|
81
89
|
}
|
|
82
90
|
//# sourceMappingURL=network.js.map
|
package/bin/utils/network.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,KAAK,UAAU,kBAAkB,CAC/B,SAA2B,EAC3B,UAAkB,CAAC,EACnB,QAAgB,IAAI,EACpB,SAAiB,CAAC;IAElB,IAAI,CAAC;QACH,OAAO,MAAM,SAAS,EAAE,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,OAAO,kBAAkB,CAAC,SAAS,EAAE,OAAO,GAAG,CAAC,EAAE,KAAK,GAAG,MAAM,EAAE,MAAM,CAAC,CAAC;QAC5E,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,KAAK,UAAU,kBAAkB,CAC/B,SAA2B,EAC3B,UAAkB,CAAC,EACnB,QAAgB,IAAI,EACpB,SAAiB,CAAC;IAElB,IAAI,CAAC;QACH,OAAO,MAAM,SAAS,EAAE,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,OAAO,kBAAkB,CAAC,SAAS,EAAE,OAAO,GAAG,CAAC,EAAE,KAAK,GAAG,MAAM,EAAE,MAAM,CAAC,CAAC;QAC5E,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YAC9B,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QACH,OAAO,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,yCAAyC,YAAY,EAAE,CAAC,CAAC;IAC3E,CAAC;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,GAAiB;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,mCAAmC,YAAY,EAAE,CAAC,CAAC;IACrE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,UAA+B,SAAS;IAExC,IAAI,CAAC;QACH,IAAI,GAAG,GAAuB,SAAS,CAAC;QACxC,IAAI,OAAO,EAAE,CAAC;YACZ,GAAG,GAAG,MAAM,oBAAoB,C
AAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjD,CAAC;aAAM,CAAC;YACN,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/D,CAAC;QACD,IAAI,GAAG;YAAE,OAAO,GAAG,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;IAChF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,GAAW;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAE9C,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,0CAA0C,YAAY,EAAE,CAAC,CAAC;IAClF,CAAC;AACH,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mintlify/scraping",
|
|
3
|
-
"version": "3.0.189",
|
|
3
|
+
"version": "4.0.1",
|
|
4
4
|
"description": "Scrape documentation frameworks to Mintlify docs",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=18.0.0"
|
|
@@ -81,5 +81,5 @@
|
|
|
81
81
|
"typescript": "^5.5.3",
|
|
82
82
|
"vitest": "^2.0.4"
|
|
83
83
|
},
|
|
84
|
-
"gitHead": "
|
|
84
|
+
"gitHead": "4ef3ed8269c758723ad39cb522b831ee6db81067"
|
|
85
85
|
}
|
package/src/cli.ts
CHANGED
|
@@ -3,10 +3,12 @@ import yargs from 'yargs';
|
|
|
3
3
|
import { hideBin } from 'yargs/helpers';
|
|
4
4
|
|
|
5
5
|
import { FINAL_SUCCESS_MESSAGE } from './constants.js';
|
|
6
|
+
import { generateOpenApiPages } from './openapi/generateOpenApiPages.js';
|
|
6
7
|
import { scrapePageGroup } from './scrapingPipeline/group.js';
|
|
7
8
|
import { htmlToHast } from './scrapingPipeline/root.js';
|
|
8
9
|
import { scrapeAllSiteTabs } from './scrapingPipeline/tabs.js';
|
|
9
10
|
import { detectFramework, framework } from './utils/detectFramework.js';
|
|
11
|
+
import { getErrorMessage } from './utils/errors.js';
|
|
10
12
|
import { write } from './utils/file.js';
|
|
11
13
|
import { log } from './utils/log.js';
|
|
12
14
|
import { fetchPageHtml } from './utils/network.js';
|
|
@@ -27,6 +29,46 @@ await yargs(hideBin(process.argv))
|
|
|
27
29
|
async ({ url }) => await site(url)
|
|
28
30
|
)
|
|
29
31
|
|
|
32
|
+
.command(
|
|
33
|
+
'openapi-file <openapiFilename>',
|
|
34
|
+
'Creates MDX files from an OpenAPI spec',
|
|
35
|
+
(yargs) =>
|
|
36
|
+
yargs
|
|
37
|
+
.positional('openapiFilename', {
|
|
38
|
+
describe: 'The filename of the OpenAPI spec',
|
|
39
|
+
type: 'string',
|
|
40
|
+
demandOption: true,
|
|
41
|
+
})
|
|
42
|
+
.option('writeFiles', {
|
|
43
|
+
describe: 'Whether or not to write the frontmatter files',
|
|
44
|
+
default: true,
|
|
45
|
+
type: 'boolean',
|
|
46
|
+
alias: 'w',
|
|
47
|
+
})
|
|
48
|
+
.option('outDir', {
|
|
49
|
+
describe: 'The folder in which to write any created frontmatter files',
|
|
50
|
+
type: 'string',
|
|
51
|
+
alias: 'o',
|
|
52
|
+
}),
|
|
53
|
+
async (argv) => {
|
|
54
|
+
try {
|
|
55
|
+
const { nav } = await generateOpenApiPages(
|
|
56
|
+
argv.openapiFilename,
|
|
57
|
+
argv.writeFiles,
|
|
58
|
+
argv.outDir
|
|
59
|
+
);
|
|
60
|
+
console.log('navigation object suggestion:');
|
|
61
|
+
console.log(JSON.stringify(nav, undefined, 2));
|
|
62
|
+
} catch (error) {
|
|
63
|
+
if (error instanceof Error) {
|
|
64
|
+
console.error(error.message);
|
|
65
|
+
} else {
|
|
66
|
+
console.error(error);
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
)
|
|
71
|
+
|
|
30
72
|
.strictCommands()
|
|
31
73
|
.demandCommand(1, 'Unknown command. See above for the list of supported commands.')
|
|
32
74
|
.alias('h', 'help')
|
|
@@ -34,37 +76,47 @@ await yargs(hideBin(process.argv))
|
|
|
34
76
|
.parse();
|
|
35
77
|
|
|
36
78
|
async function page(url: string) {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
79
|
+
try {
|
|
80
|
+
const urlObj = new URL(url);
|
|
81
|
+
const html = await fetchPageHtml(urlObj);
|
|
82
|
+
log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
|
|
40
83
|
|
|
41
|
-
|
|
42
|
-
|
|
84
|
+
const hast = htmlToHast(html);
|
|
85
|
+
detectFramework(hast);
|
|
43
86
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
87
|
+
const needsBrowser = framework.vendor === 'gitbook';
|
|
88
|
+
const results = await scrapePageGroup([urlObj], needsBrowser);
|
|
89
|
+
const result = results[0] || {
|
|
90
|
+
success: false,
|
|
91
|
+
message: `An unknown error occurred when scraping ${url}`,
|
|
92
|
+
};
|
|
50
93
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
94
|
+
if (result.success) {
|
|
95
|
+
log(`Successfully scraped ${url} ${result.data ? `into ${result.data[1]}` : ''}`);
|
|
96
|
+
} else {
|
|
97
|
+
log(result.message);
|
|
98
|
+
}
|
|
99
|
+
} catch (error) {
|
|
100
|
+
const errorMessage = getErrorMessage(error);
|
|
101
|
+
log(errorMessage);
|
|
55
102
|
}
|
|
56
103
|
}
|
|
57
104
|
|
|
58
105
|
async function site(url: string) {
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
106
|
+
try {
|
|
107
|
+
const urlObj = new URL(url);
|
|
108
|
+
const html = await fetchPageHtml(urlObj);
|
|
109
|
+
log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
|
|
62
110
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
111
|
+
const result = await scrapeAllSiteTabs(html, urlObj);
|
|
112
|
+
if (result.success) {
|
|
113
|
+
write('mint.json', JSON.stringify(result.data, undefined, 2));
|
|
114
|
+
log(FINAL_SUCCESS_MESSAGE);
|
|
115
|
+
} else {
|
|
116
|
+
log(result.message);
|
|
117
|
+
}
|
|
118
|
+
} catch (error) {
|
|
119
|
+
const errorMessage = getErrorMessage(error);
|
|
120
|
+
log(errorMessage);
|
|
69
121
|
}
|
|
70
122
|
}
|
|
@@ -4,7 +4,9 @@ import { Browser } from 'puppeteer';
|
|
|
4
4
|
import { EXIT, visit } from 'unist-util-visit';
|
|
5
5
|
|
|
6
6
|
import { framework } from '../utils/detectFramework.js';
|
|
7
|
+
import { getErrorMessage } from '../utils/errors.js';
|
|
7
8
|
import { downloadImage } from '../utils/images.js';
|
|
9
|
+
import { log } from '../utils/log.js';
|
|
8
10
|
import { fetchPageHtml } from '../utils/network.js';
|
|
9
11
|
import { htmlToHast } from './root.js';
|
|
10
12
|
|
|
@@ -104,12 +106,17 @@ export async function downloadLogos(
|
|
|
104
106
|
})
|
|
105
107
|
);
|
|
106
108
|
} else {
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
109
|
+
try {
|
|
110
|
+
const html = await fetchPageHtml(url);
|
|
111
|
+
await findLogosFromHtml(
|
|
112
|
+
html,
|
|
113
|
+
framework.vendor === 'gitbook' ? findGitBookLogoNodes : findDocusaurusLogoNodes,
|
|
114
|
+
filepaths
|
|
115
|
+
);
|
|
116
|
+
} catch (error) {
|
|
117
|
+
const errorMessage = getErrorMessage(error);
|
|
118
|
+
log(`Failed to retrieve logo from HTML: ${errorMessage}`);
|
|
119
|
+
}
|
|
113
120
|
}
|
|
114
121
|
|
|
115
122
|
if (browser) await browser.close();
|
|
@@ -27,6 +27,7 @@ import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
|
|
|
27
27
|
import { getErrorMessage, logErrorResults } from '../utils/errors.js';
|
|
28
28
|
import { escapeCharactersOutsideCodeBlocks } from '../utils/escape.js';
|
|
29
29
|
import { write, writePage } from '../utils/file.js';
|
|
30
|
+
import { removeHastComments } from '../utils/hastComments.js';
|
|
30
31
|
import { log } from '../utils/log.js';
|
|
31
32
|
import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
|
|
32
33
|
import { unifiedRemovePositions } from '../utils/position.js';
|
|
@@ -54,6 +55,7 @@ export async function scrapePage(
|
|
|
54
55
|
}
|
|
55
56
|
|
|
56
57
|
const hast = htmlToHast(html);
|
|
58
|
+
removeHastComments(hast);
|
|
57
59
|
|
|
58
60
|
if (!framework.vendor) detectFramework(hast);
|
|
59
61
|
|
|
@@ -2,9 +2,16 @@ import type { Root as HastRoot } from 'hast';
|
|
|
2
2
|
import rehypeParse from 'rehype-parse';
|
|
3
3
|
import { unified } from 'unified';
|
|
4
4
|
|
|
5
|
+
import { rehypeRemoveHastComments } from '../utils/hastComments.js';
|
|
5
6
|
import { unifiedRemovePositions } from '../utils/position.js';
|
|
6
7
|
|
|
7
8
|
export function htmlToHast(html: string): HastRoot {
|
|
8
|
-
|
|
9
|
-
|
|
9
|
+
return (
|
|
10
|
+
unified()
|
|
11
|
+
// @ts-expect-error remarkStringify errors even if used for valid code from documentation examples
|
|
12
|
+
.use(rehypeParse)
|
|
13
|
+
.use(unifiedRemovePositions)
|
|
14
|
+
.use(rehypeRemoveHastComments)
|
|
15
|
+
.parse(html) as HastRoot
|
|
16
|
+
);
|
|
10
17
|
}
|
|
@@ -4,6 +4,7 @@ import { MintConfig, Tab } from '@mintlify/models';
|
|
|
4
4
|
import { retrieveTabLinks } from '../tabs/retrieveReadme.js';
|
|
5
5
|
import type { Result } from '../types/result.js';
|
|
6
6
|
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
7
|
+
import { getErrorMessage } from '../utils/errors.js';
|
|
7
8
|
import { log } from '../utils/log.js';
|
|
8
9
|
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
|
|
9
10
|
import { getTitleFromLink } from '../utils/title.js';
|
|
@@ -47,8 +48,12 @@ export async function scrapeAllSiteTabs(
|
|
|
47
48
|
links.map(async (tabEntry) => {
|
|
48
49
|
const newUrl = new URL(url);
|
|
49
50
|
newUrl.pathname = tabEntry.url;
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
try {
|
|
52
|
+
const newHtml = await fetchPageHtml(newUrl, undefined);
|
|
53
|
+
return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry] });
|
|
54
|
+
} catch (error) {
|
|
55
|
+
return { success: false as const, message: getErrorMessage(error) };
|
|
56
|
+
}
|
|
52
57
|
})
|
|
53
58
|
);
|
|
54
59
|
|
|
@@ -64,7 +69,7 @@ export async function scrapeAllSiteTabs(
|
|
|
64
69
|
|
|
65
70
|
const failures = results.filter((result) => !result.success);
|
|
66
71
|
failures.forEach((result) => {
|
|
67
|
-
log('Failed to scrape tab
|
|
72
|
+
log('Failed to scrape tab' + result.message);
|
|
68
73
|
});
|
|
69
74
|
|
|
70
75
|
return {
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { Root as HastRoot } from 'hast';
|
|
2
|
+
import { visit, CONTINUE } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
export function removeHastComments(root: HastRoot): void {
|
|
5
|
+
visit(root, 'comment', function (_, index, parent) {
|
|
6
|
+
if (parent && typeof index === 'number') {
|
|
7
|
+
parent.children.splice(index, 1);
|
|
8
|
+
return [CONTINUE, index];
|
|
9
|
+
}
|
|
10
|
+
});
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export function rehypeRemoveHastComments() {
|
|
14
|
+
return function (root: HastRoot) {
|
|
15
|
+
return removeHastComments(root);
|
|
16
|
+
};
|
|
17
|
+
}
|
package/src/utils/network.ts
CHANGED
|
@@ -38,12 +38,16 @@ export async function getHtmlWithPuppeteer(
|
|
|
38
38
|
browser: Browser,
|
|
39
39
|
url: string | URL
|
|
40
40
|
): Promise<string | undefined> {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
try {
|
|
42
|
+
const page = await browser.newPage();
|
|
43
|
+
await page.goto(url.toString(), {
|
|
44
|
+
waitUntil: 'networkidle2',
|
|
45
|
+
});
|
|
46
|
+
return await exponentialBackoff(() => page.content());
|
|
47
|
+
} catch (error) {
|
|
48
|
+
const errorMessage = getErrorMessage(error);
|
|
49
|
+
throw new Error(`Failed to download page from Puppeteer${errorMessage}`);
|
|
50
|
+
}
|
|
47
51
|
}
|
|
48
52
|
|
|
49
53
|
async function fetchPageResponse(url: string | URL): Promise<string> {
|
|
@@ -55,7 +59,7 @@ async function fetchPageResponse(url: string | URL): Promise<string> {
|
|
|
55
59
|
return await res.text();
|
|
56
60
|
} catch (error) {
|
|
57
61
|
const errorMessage = getErrorMessage(error);
|
|
58
|
-
throw new Error(
|
|
62
|
+
throw new Error(`Failed to fetch page from source${errorMessage}`);
|
|
59
63
|
}
|
|
60
64
|
}
|
|
61
65
|
|
|
@@ -63,18 +67,19 @@ export async function fetchPageHtml(
|
|
|
63
67
|
url: string | URL,
|
|
64
68
|
browser: Browser | undefined = undefined
|
|
65
69
|
): Promise<string> {
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
const errorMessage = getErrorMessage(error);
|
|
73
|
-
throw new Error(`${url}\n\t- Puppeteer failed to retrieve page from source${errorMessage}`);
|
|
70
|
+
try {
|
|
71
|
+
let res: string | undefined = undefined;
|
|
72
|
+
if (browser) {
|
|
73
|
+
res = await getHtmlWithPuppeteer(browser, url);
|
|
74
|
+
} else {
|
|
75
|
+
res = await exponentialBackoff(() => fetchPageResponse(url));
|
|
74
76
|
}
|
|
77
|
+
if (res) return res;
|
|
78
|
+
throw new Error('An unknown error occured.');
|
|
79
|
+
} catch (error) {
|
|
80
|
+
const errorMessage = getErrorMessage(error);
|
|
81
|
+
throw new Error(`Error retrieving HTML for ${url.toString()}${errorMessage}`);
|
|
75
82
|
}
|
|
76
|
-
|
|
77
|
-
return await exponentialBackoff(() => fetchPageResponse(url));
|
|
78
83
|
}
|
|
79
84
|
|
|
80
85
|
export async function fetchImage(url: string): Promise<NodeJS.TypedArray> {
|
|
@@ -90,6 +95,6 @@ export async function fetchImage(url: string): Promise<NodeJS.TypedArray> {
|
|
|
90
95
|
return imageData;
|
|
91
96
|
} catch (error) {
|
|
92
97
|
const errorMessage = getErrorMessage(error);
|
|
93
|
-
throw new Error(`${url}
|
|
98
|
+
throw new Error(`${url} - failed to retrieve image from source${errorMessage}`);
|
|
94
99
|
}
|
|
95
100
|
}
|