@mintlify/scraping 3.0.189 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
import type { Root as HastRoot } from 'hast';
/**
 * Removes every comment node from the given hast tree, mutating it in place.
 */
export declare function removeHastComments(root: HastRoot): void;
/**
 * Plugin-style factory for unified/rehype pipelines; the returned transformer
 * applies `removeHastComments` to the tree it receives.
 */
export declare function rehypeRemoveHastComments(): (root: HastRoot) => void;
@@ -0,0 +1,15 @@
1
+ import { visit, CONTINUE } from 'unist-util-visit';
2
// Strips all comment nodes out of a hast tree, mutating it in place.
export function removeHastComments(root) {
    visit(root, 'comment', (_node, idx, parentNode) => {
        if (!parentNode || typeof idx !== 'number')
            return;
        parentNode.children.splice(idx, 1);
        // Revisit the same index: the next sibling has shifted into this slot.
        return [CONTINUE, idx];
    });
}
10
// Plugin factory for unified/rehype: the returned transformer removes all
// comment nodes from the tree it is given.
export function rehypeRemoveHastComments() {
    return (root) => removeHastComments(root);
}
15
+ //# sourceMappingURL=hastComments.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hastComments.js","sourceRoot":"","sources":["../../src/utils/hastComments.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAEnD,MAAM,UAAU,kBAAkB,CAAC,IAAc;IAC/C,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,CAAC,EAAE,KAAK,EAAE,MAAM;QAC/C,IAAI,MAAM,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjC,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,wBAAwB;IACtC,OAAO,UAAU,IAAc;QAC7B,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAClC,CAAC,CAAC;AACJ,CAAC"}
@@ -29,12 +29,17 @@ export async function startPuppeteer() {
29
29
  }
30
30
  }
31
31
export async function getHtmlWithPuppeteer(browser, url) {
    // Renders `url` in a fresh Puppeteer tab, waits until the network is
    // mostly idle, and returns the page HTML. The tab is always closed so
    // that scraping many pages does not leak tabs on the shared browser.
    // Throws a wrapped Error on any navigation/content failure.
    try {
        const page = await browser.newPage();
        try {
            await page.goto(url.toString(), {
                waitUntil: 'networkidle2',
            });
            return await exponentialBackoff(() => page.content());
        }
        finally {
            // Best-effort close; swallow close failures so they cannot mask
            // the original error or clobber a successful return.
            await page.close().catch(() => { });
        }
    }
    catch (error) {
        const errorMessage = getErrorMessage(error);
        throw new Error(`Failed to download page from Puppeteer${errorMessage}`);
    }
}
39
44
  async function fetchPageResponse(url) {
40
45
  try {
@@ -46,23 +51,26 @@ async function fetchPageResponse(url) {
46
51
  }
47
52
  catch (error) {
48
53
  const errorMessage = getErrorMessage(error);
49
- throw new Error(`${url}\n\t- failed to fetch page from source${errorMessage}`);
54
+ throw new Error(`Failed to fetch page from source${errorMessage}`);
50
55
  }
51
56
  }
52
57
export async function fetchPageHtml(url, browser = undefined) {
    // Retrieves a page's HTML: rendered through Puppeteer when a browser is
    // supplied, otherwise fetched directly with exponential backoff.
    // Throws an Error naming the URL on any failure.
    try {
        const res = browser
            ? await getHtmlWithPuppeteer(browser, url)
            : await exponentialBackoff(() => fetchPageResponse(url));
        if (res)
            return res;
        throw new Error('An unknown error occurred.');
    }
    catch (error) {
        const errorMessage = getErrorMessage(error);
        throw new Error(`Error retrieving HTML for ${url.toString()}${errorMessage}`);
    }
}
67
75
  export async function fetchImage(url) {
68
76
  try {
@@ -76,7 +84,7 @@ export async function fetchImage(url) {
76
84
  }
77
85
  catch (error) {
78
86
  const errorMessage = getErrorMessage(error);
79
- throw new Error(`${url}\n\t- failed to retrieve image from source${errorMessage}`);
87
+ throw new Error(`${url} - failed to retrieve image from source${errorMessage}`);
80
88
  }
81
89
  }
82
90
  //# sourceMappingURL=network.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,KAAK,UAAU,kBAAkB,CAC/B,SAA2B,EAC3B,UAAkB,CAAC,EACnB,QAAgB,IAAI,EACpB,SAAiB,CAAC;IAElB,IAAI,CAAC;QACH,OAAO,MAAM,SAAS,EAAE,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,OAAO,kBAAkB,CAAC,SAAS,EAAE,OAAO,GAAG,CAAC,EAAE,KAAK,GAAG,MAAM,EAAE,MAAM,CAAC,CAAC;QAC5E,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;QAC9B,SAAS,EAAE,cAAc;KAC1B,CAAC,CAAC;IACH,MAAM,IAAI,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;IAC5D,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,GAAiB;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,yCAAyC,YAAY,EAAE,CAAC,CAAC;IACjF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,UAA+B,SAAS;IAExC,IAAI,OAAO,EAAE,CAAC;QACZ,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,oBAAoB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YACrD,IAAI,GAAG;gBAAE,OAAO,GAAG,CAAC;YACpB,MAAM,IAAI,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC9C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC
;YACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;YAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,sDAAsD,YAAY,EAAE,CAAC,CAAC;QAC9F,CAAC;IACH,CAAC;IAED,OAAO,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;AAChE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,GAAW;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAE9C,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,6CAA6C,YAAY,EAAE,CAAC,CAAC;IACrF,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,KAAK,UAAU,kBAAkB,CAC/B,SAA2B,EAC3B,UAAkB,CAAC,EACnB,QAAgB,IAAI,EACpB,SAAiB,CAAC;IAElB,IAAI,CAAC;QACH,OAAO,MAAM,SAAS,EAAE,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,OAAO,kBAAkB,CAAC,SAAS,EAAE,OAAO,GAAG,CAAC,EAAE,KAAK,GAAG,MAAM,EAAE,MAAM,CAAC,CAAC;QAC5E,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YAC9B,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QACH,OAAO,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,yCAAyC,YAAY,EAAE,CAAC,CAAC;IAC3E,CAAC;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,GAAiB;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,mCAAmC,YAAY,EAAE,CAAC,CAAC;IACrE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,UAA+B,SAAS;IAExC,IAAI,CAAC;QACH,IAAI,GAAG,GAAuB,SAAS,CAAC;QACxC,IAAI,OAAO,EAAE,CAAC;YACZ,GAAG,GAAG,MAAM,oBAAoB
,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjD,CAAC;aAAM,CAAC;YACN,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/D,CAAC;QACD,IAAI,GAAG;YAAE,OAAO,GAAG,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;IAChF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,GAAW;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAE9C,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,0CAA0C,YAAY,EAAE,CAAC,CAAC;IAClF,CAAC;AACH,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mintlify/scraping",
3
- "version": "3.0.189",
3
+ "version": "4.0.2",
4
4
  "description": "Scrape documentation frameworks to Mintlify docs",
5
5
  "engines": {
6
6
  "node": ">=18.0.0"
@@ -38,7 +38,7 @@
38
38
  "format:check": "prettier . --check"
39
39
  },
40
40
  "dependencies": {
41
- "@mintlify/common": "1.0.165",
41
+ "@mintlify/common": "1.0.166",
42
42
  "@mintlify/openapi-parser": "^0.0.0",
43
43
  "fs-extra": "^11.1.1",
44
44
  "hast": "^1.0.0",
@@ -61,10 +61,10 @@
61
61
  "devDependencies": {
62
62
  "@mintlify/eslint-config": "1.0.5",
63
63
  "@mintlify/eslint-config-typescript": "1.0.10",
64
- "@mintlify/models": "0.0.137",
64
+ "@mintlify/models": "0.0.138",
65
65
  "@mintlify/prettier-config": "1.0.4",
66
66
  "@mintlify/ts-config": "2.0.2",
67
- "@mintlify/validation": "0.1.206",
67
+ "@mintlify/validation": "0.1.207",
68
68
  "@trivago/prettier-plugin-sort-imports": "^4.2.1",
69
69
  "@tsconfig/recommended": "1.x",
70
70
  "@types/node": "^18.7.13",
@@ -81,5 +81,5 @@
81
81
  "typescript": "^5.5.3",
82
82
  "vitest": "^2.0.4"
83
83
  },
84
- "gitHead": "e69843f13b6c803075baa3325cc70f0ab62062c8"
84
+ "gitHead": "b1ee4946ab3011662b0bb8491415b00e26e7ea24"
85
85
  }
package/src/cli.ts CHANGED
@@ -3,10 +3,12 @@ import yargs from 'yargs';
3
3
  import { hideBin } from 'yargs/helpers';
4
4
 
5
5
  import { FINAL_SUCCESS_MESSAGE } from './constants.js';
6
+ import { generateOpenApiPages } from './openapi/generateOpenApiPages.js';
6
7
  import { scrapePageGroup } from './scrapingPipeline/group.js';
7
8
  import { htmlToHast } from './scrapingPipeline/root.js';
8
9
  import { scrapeAllSiteTabs } from './scrapingPipeline/tabs.js';
9
10
  import { detectFramework, framework } from './utils/detectFramework.js';
11
+ import { getErrorMessage } from './utils/errors.js';
10
12
  import { write } from './utils/file.js';
11
13
  import { log } from './utils/log.js';
12
14
  import { fetchPageHtml } from './utils/network.js';
@@ -27,6 +29,46 @@ await yargs(hideBin(process.argv))
27
29
  async ({ url }) => await site(url)
28
30
  )
29
31
 
32
+ .command(
33
+ 'openapi-file <openapiFilename>',
34
+ 'Creates MDX files from an OpenAPI spec',
35
+ (yargs) =>
36
+ yargs
37
+ .positional('openapiFilename', {
38
+ describe: 'The filename of the OpenAPI spec',
39
+ type: 'string',
40
+ demandOption: true,
41
+ })
42
+ .option('writeFiles', {
43
+ describe: 'Whether or not to write the frontmatter files',
44
+ default: true,
45
+ type: 'boolean',
46
+ alias: 'w',
47
+ })
48
+ .option('outDir', {
49
+ describe: 'The folder in which to write any created frontmatter files',
50
+ type: 'string',
51
+ alias: 'o',
52
+ }),
53
+ async (argv) => {
54
+ try {
55
+ const { nav } = await generateOpenApiPages(
56
+ argv.openapiFilename,
57
+ argv.writeFiles,
58
+ argv.outDir
59
+ );
60
+ console.log('navigation object suggestion:');
61
+ console.log(JSON.stringify(nav, undefined, 2));
62
+ } catch (error) {
63
+ if (error instanceof Error) {
64
+ console.error(error.message);
65
+ } else {
66
+ console.error(error);
67
+ }
68
+ }
69
+ }
70
+ )
71
+
30
72
  .strictCommands()
31
73
  .demandCommand(1, 'Unknown command. See above for the list of supported commands.')
32
74
  .alias('h', 'help')
@@ -34,37 +76,47 @@ await yargs(hideBin(process.argv))
34
76
  .parse();
35
77
 
36
78
  async function page(url: string) {
37
- const urlObj = new URL(url);
38
- const html = await fetchPageHtml(urlObj);
39
- log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
79
+ try {
80
+ const urlObj = new URL(url);
81
+ const html = await fetchPageHtml(urlObj);
82
+ log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
40
83
 
41
- const hast = htmlToHast(html);
42
- detectFramework(hast);
84
+ const hast = htmlToHast(html);
85
+ detectFramework(hast);
43
86
 
44
- const needsBrowser = framework.vendor === 'gitbook';
45
- const results = await scrapePageGroup([urlObj], needsBrowser);
46
- const result = results[0] || {
47
- success: false,
48
- message: `An unknown error occurred when scraping ${url}`,
49
- };
87
+ const needsBrowser = framework.vendor === 'gitbook';
88
+ const results = await scrapePageGroup([urlObj], needsBrowser);
89
+ const result = results[0] || {
90
+ success: false,
91
+ message: `An unknown error occurred when scraping ${url}`,
92
+ };
50
93
 
51
- if (result.success) {
52
- log(`Successfully scraped ${url} ${result.data ? `into ${result.data[1]}` : ''}`);
53
- } else {
54
- log(result.message);
94
+ if (result.success) {
95
+ log(`Successfully scraped ${url} ${result.data ? `into ${result.data[1]}` : ''}`);
96
+ } else {
97
+ log(result.message);
98
+ }
99
+ } catch (error) {
100
+ const errorMessage = getErrorMessage(error);
101
+ log(errorMessage);
55
102
  }
56
103
  }
57
104
 
58
105
  async function site(url: string) {
59
- const urlObj = new URL(url);
60
- const html = await fetchPageHtml(urlObj);
61
- log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
106
+ try {
107
+ const urlObj = new URL(url);
108
+ const html = await fetchPageHtml(urlObj);
109
+ log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
62
110
 
63
- const result = await scrapeAllSiteTabs(html, urlObj);
64
- if (result.success) {
65
- write('mint.json', JSON.stringify(result.data, undefined, 2));
66
- log(FINAL_SUCCESS_MESSAGE);
67
- } else {
68
- log(result.message);
111
+ const result = await scrapeAllSiteTabs(html, urlObj);
112
+ if (result.success) {
113
+ write('mint.json', JSON.stringify(result.data, undefined, 2));
114
+ log(FINAL_SUCCESS_MESSAGE);
115
+ } else {
116
+ log(result.message);
117
+ }
118
+ } catch (error) {
119
+ const errorMessage = getErrorMessage(error);
120
+ log(errorMessage);
69
121
  }
70
122
  }
@@ -4,7 +4,9 @@ import { Browser } from 'puppeteer';
4
4
  import { EXIT, visit } from 'unist-util-visit';
5
5
 
6
6
  import { framework } from '../utils/detectFramework.js';
7
+ import { getErrorMessage } from '../utils/errors.js';
7
8
  import { downloadImage } from '../utils/images.js';
9
+ import { log } from '../utils/log.js';
8
10
  import { fetchPageHtml } from '../utils/network.js';
9
11
  import { htmlToHast } from './root.js';
10
12
 
@@ -104,12 +106,17 @@ export async function downloadLogos(
104
106
  })
105
107
  );
106
108
  } else {
107
- const html = await fetchPageHtml(url);
108
- await findLogosFromHtml(
109
- html,
110
- framework.vendor === 'gitbook' ? findGitBookLogoNodes : findDocusaurusLogoNodes,
111
- filepaths
112
- );
109
+ try {
110
+ const html = await fetchPageHtml(url);
111
+ await findLogosFromHtml(
112
+ html,
113
+ framework.vendor === 'gitbook' ? findGitBookLogoNodes : findDocusaurusLogoNodes,
114
+ filepaths
115
+ );
116
+ } catch (error) {
117
+ const errorMessage = getErrorMessage(error);
118
+ log(`Failed to retrieve logo from HTML: ${errorMessage}`);
119
+ }
113
120
  }
114
121
 
115
122
  if (browser) await browser.close();
@@ -27,6 +27,7 @@ import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
27
27
  import { getErrorMessage, logErrorResults } from '../utils/errors.js';
28
28
  import { escapeCharactersOutsideCodeBlocks } from '../utils/escape.js';
29
29
  import { write, writePage } from '../utils/file.js';
30
+ import { removeHastComments } from '../utils/hastComments.js';
30
31
  import { log } from '../utils/log.js';
31
32
  import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
32
33
  import { unifiedRemovePositions } from '../utils/position.js';
@@ -54,6 +55,7 @@ export async function scrapePage(
54
55
  }
55
56
 
56
57
  const hast = htmlToHast(html);
58
+ removeHastComments(hast);
57
59
 
58
60
  if (!framework.vendor) detectFramework(hast);
59
61
 
@@ -2,9 +2,16 @@ import type { Root as HastRoot } from 'hast';
2
2
  import rehypeParse from 'rehype-parse';
3
3
  import { unified } from 'unified';
4
4
 
5
+ import { rehypeRemoveHastComments } from '../utils/hastComments.js';
5
6
  import { unifiedRemovePositions } from '../utils/position.js';
6
7
 
7
8
  export function htmlToHast(html: string): HastRoot {
8
- // @ts-expect-error remarkStringify errors even if used for valid code from documentation examples
9
- return unified().use(rehypeParse).use(unifiedRemovePositions).parse(html) as HastRoot;
9
+ return (
10
+ unified()
11
+ // @ts-expect-error remarkStringify errors even if used for valid code from documentation examples
12
+ .use(rehypeParse)
13
+ .use(unifiedRemovePositions)
14
+ .use(rehypeRemoveHastComments)
15
+ .parse(html) as HastRoot
16
+ );
10
17
  }
@@ -4,6 +4,7 @@ import { MintConfig, Tab } from '@mintlify/models';
4
4
  import { retrieveTabLinks } from '../tabs/retrieveReadme.js';
5
5
  import type { Result } from '../types/result.js';
6
6
  import { detectFramework, framework } from '../utils/detectFramework.js';
7
+ import { getErrorMessage } from '../utils/errors.js';
7
8
  import { log } from '../utils/log.js';
8
9
  import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
9
10
  import { getTitleFromLink } from '../utils/title.js';
@@ -47,8 +48,12 @@ export async function scrapeAllSiteTabs(
47
48
  links.map(async (tabEntry) => {
48
49
  const newUrl = new URL(url);
49
50
  newUrl.pathname = tabEntry.url;
50
- const newHtml = await fetchPageHtml(newUrl, undefined);
51
- return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry] });
51
+ try {
52
+ const newHtml = await fetchPageHtml(newUrl, undefined);
53
+ return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry] });
54
+ } catch (error) {
55
+ return { success: false as const, message: getErrorMessage(error) };
56
+ }
52
57
  })
53
58
  );
54
59
 
@@ -64,7 +69,7 @@ export async function scrapeAllSiteTabs(
64
69
 
65
70
  const failures = results.filter((result) => !result.success);
66
71
  failures.forEach((result) => {
67
- log('Failed to scrape tab: ' + result.message);
72
+ log('Failed to scrape tab' + result.message);
68
73
  });
69
74
 
70
75
  return {
@@ -0,0 +1,17 @@
1
+ import type { Root as HastRoot } from 'hast';
2
+ import { visit, CONTINUE } from 'unist-util-visit';
3
+
4
+ export function removeHastComments(root: HastRoot): void {
5
+ visit(root, 'comment', function (_, index, parent) {
6
+ if (parent && typeof index === 'number') {
7
+ parent.children.splice(index, 1);
8
+ return [CONTINUE, index];
9
+ }
10
+ });
11
+ }
12
+
13
+ export function rehypeRemoveHastComments() {
14
+ return function (root: HastRoot) {
15
+ return removeHastComments(root);
16
+ };
17
+ }
@@ -38,12 +38,16 @@ export async function getHtmlWithPuppeteer(
38
38
  browser: Browser,
39
39
  url: string | URL
40
40
  ): Promise<string | undefined> {
41
- const page = await browser.newPage();
42
- await page.goto(url.toString(), {
43
- waitUntil: 'networkidle2',
44
- });
45
- const html = await exponentialBackoff(() => page.content());
46
- return html;
41
+ try {
42
+ const page = await browser.newPage();
43
+ await page.goto(url.toString(), {
44
+ waitUntil: 'networkidle2',
45
+ });
46
+ return await exponentialBackoff(() => page.content());
47
+ } catch (error) {
48
+ const errorMessage = getErrorMessage(error);
49
+ throw new Error(`Failed to download page from Puppeteer${errorMessage}`);
50
+ }
47
51
  }
48
52
 
49
53
  async function fetchPageResponse(url: string | URL): Promise<string> {
@@ -55,7 +59,7 @@ async function fetchPageResponse(url: string | URL): Promise<string> {
55
59
  return await res.text();
56
60
  } catch (error) {
57
61
  const errorMessage = getErrorMessage(error);
58
- throw new Error(`${url}\n\t- failed to fetch page from source${errorMessage}`);
62
+ throw new Error(`Failed to fetch page from source${errorMessage}`);
59
63
  }
60
64
  }
61
65
 
@@ -63,18 +67,19 @@ export async function fetchPageHtml(
63
67
  url: string | URL,
64
68
  browser: Browser | undefined = undefined
65
69
  ): Promise<string> {
66
- if (browser) {
67
- try {
68
- const res = await getHtmlWithPuppeteer(browser, url);
69
- if (res) return res;
70
- throw new Error('an unknown error occured');
71
- } catch (error) {
72
- const errorMessage = getErrorMessage(error);
73
- throw new Error(`${url}\n\t- Puppeteer failed to retrieve page from source${errorMessage}`);
70
+ try {
71
+ let res: string | undefined = undefined;
72
+ if (browser) {
73
+ res = await getHtmlWithPuppeteer(browser, url);
74
+ } else {
75
+ res = await exponentialBackoff(() => fetchPageResponse(url));
74
76
  }
77
+ if (res) return res;
78
+ throw new Error('An unknown error occured.');
79
+ } catch (error) {
80
+ const errorMessage = getErrorMessage(error);
81
+ throw new Error(`Error retrieving HTML for ${url.toString()}${errorMessage}`);
75
82
  }
76
-
77
- return await exponentialBackoff(() => fetchPageResponse(url));
78
83
  }
79
84
 
80
85
  export async function fetchImage(url: string): Promise<NodeJS.TypedArray> {
@@ -90,6 +95,6 @@ export async function fetchImage(url: string): Promise<NodeJS.TypedArray> {
90
95
  return imageData;
91
96
  } catch (error) {
92
97
  const errorMessage = getErrorMessage(error);
93
- throw new Error(`${url}\n\t- failed to retrieve image from source${errorMessage}`);
98
+ throw new Error(`${url} - failed to retrieve image from source${errorMessage}`);
94
99
  }
95
100
  }