@mintlify/scraping 3.0.14 → 3.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/.prettierrc +1 -0
  2. package/bin/browser.js +3 -3
  3. package/bin/constants.js +23 -23
  4. package/bin/constants.js.map +1 -1
  5. package/bin/downloadImage.js +18 -18
  6. package/bin/downloadImage.js.map +1 -1
  7. package/bin/scraping/detectFramework.js +13 -13
  8. package/bin/scraping/detectFramework.js.map +1 -1
  9. package/bin/scraping/downloadAllImages.js +5 -5
  10. package/bin/scraping/downloadAllImages.js.map +1 -1
  11. package/bin/scraping/downloadLogoImage.js +4 -4
  12. package/bin/scraping/downloadLogoImage.js.map +1 -1
  13. package/bin/scraping/getSitemapLinks.js +4 -4
  14. package/bin/scraping/scrapeFileGettingFileNameFromUrl.js +10 -10
  15. package/bin/scraping/scrapeFileGettingFileNameFromUrl.js.map +1 -1
  16. package/bin/scraping/scrapeGettingFileNameFromUrl.js +2 -2
  17. package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +1 -1
  18. package/bin/scraping/scrapePage.js +3 -3
  19. package/bin/scraping/scrapePage.js.map +1 -1
  20. package/bin/scraping/scrapePageCommands.d.ts +1 -1
  21. package/bin/scraping/scrapePageCommands.js +15 -15
  22. package/bin/scraping/scrapePageCommands.js.map +1 -1
  23. package/bin/scraping/scrapeSection.js +6 -6
  24. package/bin/scraping/scrapeSection.js.map +1 -1
  25. package/bin/scraping/scrapeSectionCommands.d.ts +1 -1
  26. package/bin/scraping/scrapeSectionCommands.js +14 -14
  27. package/bin/scraping/scrapeSectionCommands.js.map +1 -1
  28. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js +11 -11
  29. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js.map +1 -1
  30. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js +12 -14
  31. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js.map +1 -1
  32. package/bin/scraping/site-scrapers/alternateGroupTitle.js +1 -1
  33. package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js +8 -11
  34. package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js.map +1 -1
  35. package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js +6 -6
  36. package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js +4 -4
  37. package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.d.ts +1 -1
  38. package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js +4 -4
  39. package/bin/scraping/site-scrapers/openNestedGitbookMenus.d.ts +1 -1
  40. package/bin/scraping/site-scrapers/openNestedGitbookMenus.js.map +1 -1
  41. package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +14 -14
  42. package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +1 -1
  43. package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +9 -9
  44. package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +1 -1
  45. package/bin/scraping/site-scrapers/scrapeGitBookPage.js +12 -14
  46. package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +1 -1
  47. package/bin/scraping/site-scrapers/scrapeGitBookSection.js +10 -15
  48. package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +1 -1
  49. package/bin/scraping/site-scrapers/scrapeReadMePage.js +15 -15
  50. package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +1 -1
  51. package/bin/scraping/site-scrapers/scrapeReadMeSection.js +11 -15
  52. package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +1 -1
  53. package/bin/tsconfig.tsbuildinfo +1 -1
  54. package/bin/util.d.ts +1 -1
  55. package/bin/util.js +23 -26
  56. package/bin/util.js.map +1 -1
  57. package/bin/validation/stopIfInvalidLink.js +3 -3
  58. package/package.json +9 -9
  59. package/src/browser.ts +3 -3
  60. package/src/constants.ts +23 -23
  61. package/src/downloadImage.ts +21 -26
  62. package/src/scraping/detectFramework.ts +18 -18
  63. package/src/scraping/downloadAllImages.ts +7 -9
  64. package/src/scraping/downloadLogoImage.ts +5 -4
  65. package/src/scraping/getSitemapLinks.ts +4 -4
  66. package/src/scraping/scrapeFileGettingFileNameFromUrl.ts +12 -18
  67. package/src/scraping/scrapeGettingFileNameFromUrl.ts +7 -5
  68. package/src/scraping/scrapePage.ts +4 -3
  69. package/src/scraping/scrapePageCommands.ts +17 -18
  70. package/src/scraping/scrapeSection.ts +8 -16
  71. package/src/scraping/scrapeSectionCommands.ts +19 -34
  72. package/src/scraping/site-scrapers/Intercom/scrapeIntercomPage.ts +12 -11
  73. package/src/scraping/site-scrapers/Intercom/scrapeIntercomSection.ts +23 -24
  74. package/src/scraping/site-scrapers/alternateGroupTitle.ts +1 -1
  75. package/src/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.ts +8 -11
  76. package/src/scraping/site-scrapers/links-per-group/getLinksRecursively.ts +6 -6
  77. package/src/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.ts +4 -4
  78. package/src/scraping/site-scrapers/openNestedDocusaurusMenus.ts +5 -5
  79. package/src/scraping/site-scrapers/openNestedGitbookMenus.ts +2 -4
  80. package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +15 -18
  81. package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +11 -14
  82. package/src/scraping/site-scrapers/scrapeGitBookPage.ts +13 -14
  83. package/src/scraping/site-scrapers/scrapeGitBookSection.ts +11 -15
  84. package/src/scraping/site-scrapers/scrapeReadMePage.ts +17 -22
  85. package/src/scraping/site-scrapers/scrapeReadMeSection.ts +27 -31
  86. package/src/util.ts +25 -36
  87. package/src/validation/stopIfInvalidLink.ts +3 -3
@@ -1,5 +1,6 @@
1
- import path from "path";
2
- import { createPage, getOrigin } from "../util.js";
1
+ import path from 'path';
2
+
3
+ import { createPage, getOrigin } from '../util.js';
3
4
 
4
5
  export async function scrapePage(
5
6
  scrapeFunc: ScrapePageFn,
@@ -10,7 +11,7 @@ export async function scrapePage(
10
11
  ) {
11
12
  const origin = getOrigin(href);
12
13
  const cwd = process.cwd();
13
- const imageBaseDir = path.join(cwd, "images");
14
+ const imageBaseDir = path.join(cwd, 'images');
14
15
 
15
16
  const { title, description, markdown } = await scrapeFunc(
16
17
  html,
@@ -1,23 +1,22 @@
1
- import axios from "axios";
2
- import { scrapePage } from "./scrapePage.js";
3
- import { scrapeDocusaurusPage } from "./site-scrapers/scrapeDocusaurusPage.js";
4
- import { scrapeGitBookPage } from "./site-scrapers/scrapeGitBookPage.js";
5
- import { scrapeReadMePage } from "./site-scrapers/scrapeReadMePage.js";
6
- import { detectFramework, Frameworks } from "./detectFramework.js";
7
- import { getHrefFromArgs } from "../util.js";
8
- import { getHtmlWithPuppeteer } from "../browser.js";
9
- import { ArgumentsCamelCase } from "yargs";
10
- import { scrapeIntercomPage } from "./site-scrapers/Intercom/scrapeIntercomPage.js";
1
+ import axios from 'axios';
2
+ import { ArgumentsCamelCase } from 'yargs';
3
+
4
+ import { getHtmlWithPuppeteer } from '../browser.js';
5
+ import { getHrefFromArgs } from '../util.js';
6
+ import { detectFramework, Frameworks } from './detectFramework.js';
7
+ import { scrapePage } from './scrapePage.js';
8
+ import { scrapeIntercomPage } from './site-scrapers/Intercom/scrapeIntercomPage.js';
9
+ import { scrapeDocusaurusPage } from './site-scrapers/scrapeDocusaurusPage.js';
10
+ import { scrapeGitBookPage } from './site-scrapers/scrapeGitBookPage.js';
11
+ import { scrapeReadMePage } from './site-scrapers/scrapeReadMePage.js';
11
12
 
12
13
  function validateFramework(framework) {
13
14
  if (!framework) {
14
- console.log(
15
- "Could not detect the framework automatically. Please use one of:"
16
- );
17
- console.log("scrape-page-docusaurus");
18
- console.log("scrape-page-gitbook");
19
- console.log("scrape-page-readme");
20
- console.log("scrape-page-intercom");
15
+ console.log('Could not detect the framework automatically. Please use one of:');
16
+ console.log('scrape-page-docusaurus');
17
+ console.log('scrape-page-gitbook');
18
+ console.log('scrape-page-readme');
19
+ console.log('scrape-page-intercom');
21
20
  return process.exit(1);
22
21
  }
23
22
  }
@@ -47,7 +46,7 @@ export async function scrapePageAutomatically(argv: any) {
47
46
 
48
47
  validateFramework(framework);
49
48
 
50
- console.log("Detected framework: " + framework);
49
+ console.log('Detected framework: ' + framework);
51
50
 
52
51
  switch (framework) {
53
52
  case Frameworks.DOCUSAURUS:
@@ -1,5 +1,6 @@
1
- import path from "path";
2
- import { objToReadableString } from "../util.js";
1
+ import path from 'path';
2
+
3
+ import { objToReadableString } from '../util.js';
3
4
 
4
5
  export async function scrapeSection(
5
6
  scrapeFunc: ScrapeSectionFn,
@@ -8,21 +9,12 @@ export async function scrapeSection(
8
9
  overwrite: boolean,
9
10
  version: string | undefined
10
11
  ) {
11
- console.log(
12
- `Started scraping${overwrite ? ", overwrite mode is on" : ""}...`
13
- );
12
+ console.log(`Started scraping${overwrite ? ', overwrite mode is on' : ''}...`);
14
13
  const cwd = process.cwd();
15
- const imageBaseDir = path.join(cwd, "images");
14
+ const imageBaseDir = path.join(cwd, 'images');
16
15
 
17
- const groupsConfig = await scrapeFunc(
18
- html,
19
- origin,
20
- cwd,
21
- imageBaseDir,
22
- overwrite,
23
- version
24
- );
25
- console.log("Finished scraping.");
26
- console.log("Add the following to your navigation in mint.json:");
16
+ const groupsConfig = await scrapeFunc(html, origin, cwd, imageBaseDir, overwrite, version);
17
+ console.log('Finished scraping.');
18
+ console.log('Add the following to your navigation in mint.json:');
27
19
  console.log(objToReadableString(groupsConfig));
28
20
  }
@@ -1,15 +1,16 @@
1
- import axios from "axios";
2
- import { detectFramework, Frameworks } from "./detectFramework.js";
3
- import { getHrefFromArgs, getOrigin } from "../util.js";
4
- import { scrapeSection } from "./scrapeSection.js";
5
- import { scrapeDocusaurusSection } from "./site-scrapers/scrapeDocusaurusSection.js";
6
- import openNestedDocusaurusMenus from "./site-scrapers/openNestedDocusaurusMenus.js";
7
- import { scrapeGitBookSection } from "./site-scrapers/scrapeGitBookSection.js";
8
- import openNestedGitbookMenus from "./site-scrapers/openNestedGitbookMenus.js";
9
- import { scrapeReadMeSection } from "./site-scrapers/scrapeReadMeSection.js";
10
- import { startBrowser } from "../browser.js";
11
- import { ArgumentsCamelCase } from "yargs";
12
- import { scrapeIntercomSection } from "./site-scrapers/Intercom/scrapeIntercomSection.js";
1
+ import axios from 'axios';
2
+ import { ArgumentsCamelCase } from 'yargs';
3
+
4
+ import { startBrowser } from '../browser.js';
5
+ import { getHrefFromArgs, getOrigin } from '../util.js';
6
+ import { detectFramework, Frameworks } from './detectFramework.js';
7
+ import { scrapeSection } from './scrapeSection.js';
8
+ import { scrapeIntercomSection } from './site-scrapers/Intercom/scrapeIntercomSection.js';
9
+ import openNestedDocusaurusMenus from './site-scrapers/openNestedDocusaurusMenus.js';
10
+ import openNestedGitbookMenus from './site-scrapers/openNestedGitbookMenus.js';
11
+ import { scrapeDocusaurusSection } from './site-scrapers/scrapeDocusaurusSection.js';
12
+ import { scrapeGitBookSection } from './site-scrapers/scrapeGitBookSection.js';
13
+ import { scrapeReadMeSection } from './site-scrapers/scrapeReadMeSection.js';
13
14
 
14
15
  export async function scrapeSectionAxiosWrapper(
15
16
  argv: ArgumentsCamelCase,
@@ -18,13 +19,7 @@ export async function scrapeSectionAxiosWrapper(
18
19
  const href = getHrefFromArgs(argv);
19
20
  const res = await axios.get(href);
20
21
  const html = res.data;
21
- await scrapeSection(
22
- scrapeFunc,
23
- html,
24
- getOrigin(href),
25
- !!argv.overwrite,
26
- undefined
27
- );
22
+ await scrapeSection(scrapeFunc, html, getOrigin(href), !!argv.overwrite, undefined);
28
23
  process.exit(0);
29
24
  }
30
25
 
@@ -41,11 +36,7 @@ export async function scrapeDocusaurusSectionCommand(
41
36
  }
42
37
 
43
38
  export async function scrapeGitbookSectionCommand(argv: any) {
44
- await scrapeSectionOpeningAllNested(
45
- argv,
46
- openNestedGitbookMenus,
47
- scrapeGitBookSection
48
- );
39
+ await scrapeSectionOpeningAllNested(argv, openNestedGitbookMenus, scrapeGitBookSection);
49
40
  }
50
41
 
51
42
  async function scrapeSectionOpeningAllNested(
@@ -59,18 +50,12 @@ async function scrapeSectionOpeningAllNested(
59
50
  const browser = await startBrowser();
60
51
  const page = await browser.newPage();
61
52
  await page.goto(href, {
62
- waitUntil: "networkidle2",
53
+ waitUntil: 'networkidle2',
63
54
  });
64
55
 
65
56
  const html = await openLinks(page);
66
57
  browser.close();
67
- await scrapeSection(
68
- scrapeFunc,
69
- html,
70
- getOrigin(href),
71
- !!argv.overwrite,
72
- version
73
- );
58
+ await scrapeSection(scrapeFunc, html, getOrigin(href), !!argv.overwrite, version);
74
59
  process.exit(0);
75
60
  }
76
61
 
@@ -81,7 +66,7 @@ export async function scrapeSectionAutomatically(argv: any) {
81
66
  const { framework, version } = detectFramework(html);
82
67
 
83
68
  validateFramework(framework);
84
- console.log("Detected framework: " + framework);
69
+ console.log('Detected framework: ' + framework);
85
70
 
86
71
  switch (framework) {
87
72
  case Frameworks.DOCUSAURUS:
@@ -102,7 +87,7 @@ export async function scrapeSectionAutomatically(argv: any) {
102
87
  function validateFramework(framework: Frameworks | undefined) {
103
88
  if (!framework) {
104
89
  console.log(
105
- "Could not detect the framework automatically. We only support Docusaurus (V2 and V3), GitBook, and ReadMe."
90
+ 'Could not detect the framework automatically. We only support Docusaurus (V2 and V3), GitBook, and ReadMe.'
106
91
  );
107
92
  process.exit();
108
93
  }
@@ -1,7 +1,8 @@
1
- import cheerio from "cheerio";
2
- import { NodeHtmlMarkdown } from "node-html-markdown";
3
- import downloadAllImages from "../../downloadAllImages.js";
4
- import replaceImagePaths from "../../replaceImagePaths.js";
1
+ import cheerio from 'cheerio';
2
+ import { NodeHtmlMarkdown } from 'node-html-markdown';
3
+
4
+ import downloadAllImages from '../../downloadAllImages.js';
5
+ import replaceImagePaths from '../../replaceImagePaths.js';
5
6
 
6
7
  export async function scrapeIntercomPage(
7
8
  html: string,
@@ -13,11 +14,11 @@ export async function scrapeIntercomPage(
13
14
  ) {
14
15
  const $ = cheerio.load(html);
15
16
 
16
- const titleComponent = $(".t__h1").first();
17
+ const titleComponent = $('.t__h1').first();
17
18
  const title = titleComponent.text().trim();
18
- const description = $(".article__desc", titleComponent.parent()).text().trim();
19
+ const description = $('.article__desc', titleComponent.parent()).text().trim();
19
20
 
20
- const content = $("article").first();
21
+ const content = $('article').first();
21
22
  const contentHtml = $.html(content);
22
23
 
23
24
  const origToWritePath = await downloadAllImages(
@@ -33,16 +34,16 @@ export async function scrapeIntercomPage(
33
34
  let markdown = nhm.translate(contentHtml);
34
35
 
35
36
  // Keep headers on one line
36
- markdown = markdown.replace(/# \n\n/g, "# ");
37
+ markdown = markdown.replace(/# \n\n/g, '# ');
37
38
 
38
39
  // Remove unnecessary nonwidth blank space characters
39
- markdown = markdown.replace(/\u200b/g, "");
40
+ markdown = markdown.replace(/\u200b/g, '');
40
41
 
41
42
  // Reduce unnecessary blank lines
42
- markdown = markdown.replace(/\n\n\n/g, "\n\n");
43
+ markdown = markdown.replace(/\n\n\n/g, '\n\n');
43
44
 
44
45
  // Mintlify doesn't support bolded headers, remove the asterisks
45
- markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
46
+ markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
46
47
  if (origToWritePath) {
47
48
  markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
48
49
  }
@@ -1,8 +1,9 @@
1
- import cheerio from "cheerio";
2
- import { scrapeIntercomPage } from "./scrapeIntercomPage.js";
3
- import { scrapeGettingFileNameFromUrl } from "../../scrapeGettingFileNameFromUrl.js";
4
- import downloadLogoImage from "../../downloadLogoImage.js";
5
- import axios from "axios";
1
+ import axios from 'axios';
2
+ import cheerio from 'cheerio';
3
+
4
+ import downloadLogoImage from '../../downloadLogoImage.js';
5
+ import { scrapeGettingFileNameFromUrl } from '../../scrapeGettingFileNameFromUrl.js';
6
+ import { scrapeIntercomPage } from './scrapeIntercomPage.js';
6
7
 
7
8
  export async function scrapeIntercomSection(
8
9
  html: string,
@@ -14,27 +15,25 @@ export async function scrapeIntercomSection(
14
15
  ): Promise<MintNavigationEntry[]> {
15
16
  let $ = cheerio.load(html);
16
17
 
17
- const logoSrc = $(".header__logo img").first().attr("src");
18
+ const logoSrc = $('.header__logo img').first().attr('src');
18
19
  downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite);
19
20
 
20
- const collectionsLink = $(".section .g__space a");
21
- const collectionsMap = collectionsLink
22
- .toArray()
23
- .map(async (s: cheerio.Element) => {
24
- const href = $(s).attr("href");
25
- const res = await axios.get(`${origin}${href}`);
26
- const html = res.data;
27
- $ = cheerio.load(html);
28
- const sectionTitle = $(".collection h1").first().text().trim();
29
- const sectionPages = $(".section .g__space a")
30
- .toArray()
31
- .map((s: cheerio.Element) => $(s).attr("href"))
32
- .filter((page) => page !== undefined) as string[];
33
- return {
34
- group: sectionTitle,
35
- pages: sectionPages,
36
- };
37
- });
21
+ const collectionsLink = $('.section .g__space a');
22
+ const collectionsMap = collectionsLink.toArray().map(async (s: cheerio.Element) => {
23
+ const href = $(s).attr('href');
24
+ const res = await axios.get(`${origin}${href}`);
25
+ const html = res.data;
26
+ $ = cheerio.load(html);
27
+ const sectionTitle = $('.collection h1').first().text().trim();
28
+ const sectionPages = $('.section .g__space a')
29
+ .toArray()
30
+ .map((s: cheerio.Element) => $(s).attr('href'))
31
+ .filter((page) => page !== undefined) as string[];
32
+ return {
33
+ group: sectionTitle,
34
+ pages: sectionPages,
35
+ };
36
+ });
38
37
 
39
38
  const collections: MintNavigation[] = await Promise.all(collectionsMap);
40
39
 
@@ -4,5 +4,5 @@ export default function alternateGroupTitle(firstLink: cheerio.Cheerio, pages) {
4
4
  if (pages.length > 0) {
5
5
  return firstLink?.text();
6
6
  }
7
- return "";
7
+ return '';
8
8
  }
@@ -1,12 +1,12 @@
1
- import alternateGroupTitle from "../alternateGroupTitle.js";
2
- import getLinksRecursively from "./getLinksRecursively.js";
1
+ import alternateGroupTitle from '../alternateGroupTitle.js';
2
+ import getLinksRecursively from './getLinksRecursively.js';
3
3
 
4
4
  export function getDocusaurusLinksPerGroup(
5
5
  navigationSections: any,
6
6
  $: any,
7
7
  version: string | undefined
8
8
  ) {
9
- if (version === "3" || version === "2") {
9
+ if (version === '3' || version === '2') {
10
10
  return getDocusaurusLinksPerGroupLoop(navigationSections, $);
11
11
  }
12
12
  return [];
@@ -17,21 +17,18 @@ function getDocusaurusLinksPerGroupLoop(navigationSections: any, $: any) {
17
17
  const section = $(s);
18
18
 
19
19
  // Links without a group
20
- if (section.hasClass("theme-doc-sidebar-item-link")) {
21
- const linkHref = section.find("a[href]").first().attr("href");
20
+ if (section.hasClass('theme-doc-sidebar-item-link')) {
21
+ const linkHref = section.find('a[href]').first().attr('href');
22
22
  return {
23
- group: "",
23
+ group: '',
24
24
  pages: [linkHref],
25
25
  };
26
26
  }
27
27
 
28
- const firstLink = section
29
- .find(".menu__list-item-collapsible")
30
- .first()
31
- .find("a[href]");
28
+ const firstLink = section.find('.menu__list-item-collapsible').first().find('a[href]');
32
29
 
33
30
  const sectionTitle = firstLink.text();
34
- const firstHref = firstLink.attr("href");
31
+ const firstHref = firstLink.attr('href');
35
32
  const linkSections = section.children().eq(1).children();
36
33
 
37
34
  const pages = getLinksRecursively(linkSections, $);
@@ -9,20 +9,20 @@ export default function getLinksRecursively(linkSections: any, $: any) {
9
9
  const subsection = $(s);
10
10
  let link = subsection.children().first();
11
11
 
12
- if (!link.attr("href")) {
12
+ if (!link.attr('href')) {
13
13
  // Docusaurus nests the <a> inside a <div>
14
- link = link.find("a[href]").first();
14
+ link = link.find('a[href]').first();
15
15
  }
16
- const linkHref = link.attr("href");
16
+ const linkHref = link.attr('href');
17
17
 
18
18
  // Skip missing links. For example, GitBook uses
19
19
  // empty divs are used for styling a line beside the nav.
20
20
  // Skip external links until Mintlify supports them
21
21
  if (
22
22
  !linkHref ||
23
- linkHref === "#" ||
24
- linkHref.startsWith("https://") ||
25
- linkHref.startsWith("http://")
23
+ linkHref === '#' ||
24
+ linkHref.startsWith('https://') ||
25
+ linkHref.startsWith('http://')
26
26
  ) {
27
27
  return undefined;
28
28
  }
@@ -14,16 +14,16 @@ export default function getLinksRecursivelyGitBook(linkSections: any, $: any) {
14
14
  }
15
15
 
16
16
  const link = subsection.children().first();
17
- const linkHref = link.attr("href");
17
+ const linkHref = link.attr('href');
18
18
 
19
19
  // Skip missing links. For example, GitBook uses
20
20
  // empty divs are used for styling a line beside the nav.
21
21
  // Skip external links until Mintlify supports them
22
22
  if (
23
23
  !linkHref ||
24
- linkHref === "#" ||
25
- linkHref.startsWith("https://") ||
26
- linkHref.startsWith("http://")
24
+ linkHref === '#' ||
25
+ linkHref.startsWith('https://') ||
26
+ linkHref.startsWith('http://')
27
27
  ) {
28
28
  return undefined;
29
29
  }
@@ -1,8 +1,8 @@
1
- import { Page } from "puppeteer";
1
+ import { Page } from 'puppeteer';
2
2
 
3
3
  export default async function openNestedDocusaurusMenus(page: Page) {
4
4
  let prevEncountered: string[] = [];
5
- let encounteredHref = ["fake-href-to-make-loop-run-at-least-once"];
5
+ let encounteredHref = ['fake-href-to-make-loop-run-at-least-once'];
6
6
 
7
7
  // Loop until we've encountered every link
8
8
  while (!encounteredHref.every((href) => prevEncountered.includes(href))) {
@@ -10,15 +10,15 @@ export default async function openNestedDocusaurusMenus(page: Page) {
10
10
  encounteredHref = await page.evaluate(
11
11
  (encounteredHref) => {
12
12
  const collapsible: HTMLElement[] = Array.from(
13
- document.querySelectorAll(".menu__link.menu__link--sublist")
13
+ document.querySelectorAll('.menu__link.menu__link--sublist')
14
14
  );
15
15
 
16
16
  const linksFound: string[] = [];
17
17
  collapsible.forEach(async (collapsibleItem: HTMLElement) => {
18
- const href = collapsibleItem?.getAttribute("href");
18
+ const href = collapsibleItem?.getAttribute('href');
19
19
 
20
20
  // Should never occur but we keep it as a fail-safe
21
- if (href?.startsWith("https://") || href?.startsWith("http://")) {
21
+ if (href?.startsWith('https://') || href?.startsWith('http://')) {
22
22
  return;
23
23
  }
24
24
 
@@ -1,4 +1,4 @@
1
- import { Page } from "puppeteer";
1
+ import { Page } from 'puppeteer';
2
2
 
3
3
  export default async function openNestedGitbookMenus(page: Page) {
4
4
  let clickedAny = true;
@@ -7,9 +7,7 @@ export default async function openNestedGitbookMenus(page: Page) {
7
7
  while (clickedAny) {
8
8
  clickedAny = await page.evaluate(() => {
9
9
  // Right pointing arrow. Only closed menus have this icon
10
- const icons: HTMLElement[] = Array.from(
11
- document.querySelectorAll('path[d="M9 18l6-6-6-6"]')
12
- );
10
+ const icons: HTMLElement[] = Array.from(document.querySelectorAll('path[d="M9 18l6-6-6-6"]'));
13
11
 
14
12
  icons.forEach(async (icon: HTMLElement) => {
15
13
  const toClick = icon?.parentElement?.parentElement;
@@ -1,7 +1,8 @@
1
- import cheerio from "cheerio";
2
- import { NodeHtmlMarkdown } from "node-html-markdown";
3
- import downloadAllImages from "../downloadAllImages.js";
4
- import replaceImagePaths from "../replaceImagePaths.js";
1
+ import cheerio from 'cheerio';
2
+ import { NodeHtmlMarkdown } from 'node-html-markdown';
3
+
4
+ import downloadAllImages from '../downloadAllImages.js';
5
+ import replaceImagePaths from '../replaceImagePaths.js';
5
6
 
6
7
  export async function scrapeDocusaurusPage(
7
8
  html: string,
@@ -17,24 +18,22 @@ export async function scrapeDocusaurusPage(
17
18
  }> {
18
19
  const $ = cheerio.load(html);
19
20
 
20
- const article =
21
- version === "3" ? $(".theme-doc-markdown").first() : $("article").first();
21
+ const article = version === '3' ? $('.theme-doc-markdown').first() : $('article').first();
22
22
 
23
23
  if (article.length === 0) {
24
24
  // Index pages with no additional text don't have the markdown class
25
25
  return {
26
- title: ''
26
+ title: '',
27
27
  };
28
28
  }
29
29
 
30
- const titleComponent = article.find("h1");
30
+ const titleComponent = article.find('h1');
31
31
  const title = titleComponent.text().trim();
32
32
 
33
33
  // Do not include title in the content when we insert it in our metadata
34
34
  titleComponent.remove();
35
35
 
36
- const markdownContent =
37
- version === "3" ? article : article.find(".markdown").first();
36
+ const markdownContent = version === '3' ? article : article.find('.markdown').first();
38
37
 
39
38
  const origToWritePath = await downloadAllImages(
40
39
  $,
@@ -50,10 +49,8 @@ export async function scrapeDocusaurusPage(
50
49
  let markdown = markdownHtml ? nhm.translate(markdownHtml) : null;
51
50
 
52
51
  if (markdown == null) {
53
- console.error(
54
- "We do not support scraping this page. Content will be empty"
55
- );
56
- return { title, description: undefined, markdown: "" };
52
+ console.error('We do not support scraping this page. Content will be empty');
53
+ return { title, description: undefined, markdown: '' };
57
54
  }
58
55
 
59
56
  // Description only exists in meta tags. The code is commented out because its prone to incorrectly
@@ -70,16 +67,16 @@ export async function scrapeDocusaurusPage(
70
67
  // When we parse their HTML the parser adds things like:
71
68
  // [](#setup "Direct link to heading")
72
69
  // to the end of each header.
73
- markdown = markdown.replace(/\[\]\(#.+ ".+"\)\n/g, "\n");
70
+ markdown = markdown.replace(/\[\]\(#.+ ".+"\)\n/g, '\n');
74
71
 
75
72
  // Remove unnecessary nonwidth blank space characters
76
- markdown = markdown.replace(/\u200b/g, "");
73
+ markdown = markdown.replace(/\u200b/g, '');
77
74
 
78
75
  // Reduce unnecessary blank lines
79
- markdown = markdown.replace(/\n\n\n/g, "\n\n");
76
+ markdown = markdown.replace(/\n\n\n/g, '\n\n');
80
77
 
81
78
  // Mintlify doesn't support bolded headers, remove the asterisks
82
- markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
79
+ markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
83
80
  if (origToWritePath) {
84
81
  markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
85
82
  }
@@ -1,9 +1,10 @@
1
- import cheerio from "cheerio";
2
- import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
3
- import combineNavWithEmptyGroupTitles from "../combineNavWithEmptyGroupTitles.js";
4
- import { scrapeDocusaurusPage } from "./scrapeDocusaurusPage.js";
5
- import { getDocusaurusLinksPerGroup } from "./links-per-group/getDocusaurusLinksPerGroup.js";
6
- import downloadLogoImage from "../downloadLogoImage.js";
1
+ import cheerio from 'cheerio';
2
+
3
+ import combineNavWithEmptyGroupTitles from '../combineNavWithEmptyGroupTitles.js';
4
+ import downloadLogoImage from '../downloadLogoImage.js';
5
+ import { scrapeGettingFileNameFromUrl } from '../scrapeGettingFileNameFromUrl.js';
6
+ import { getDocusaurusLinksPerGroup } from './links-per-group/getDocusaurusLinksPerGroup.js';
7
+ import { scrapeDocusaurusPage } from './scrapeDocusaurusPage.js';
7
8
 
8
9
  export async function scrapeDocusaurusSection(
9
10
  html: string,
@@ -16,18 +17,14 @@ export async function scrapeDocusaurusSection(
16
17
  const $ = cheerio.load(html);
17
18
 
18
19
  // Download the logo
19
- const logoSrc = $(".navbar__logo img").attr("src");
20
+ const logoSrc = $('.navbar__logo img').attr('src');
20
21
  downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite);
21
22
 
22
23
  // Get all the navigation sections
23
- const navigationSections = $(".theme-doc-sidebar-menu").first().children();
24
+ const navigationSections = $('.theme-doc-sidebar-menu').first().children();
24
25
 
25
26
  // Get all links per group
26
- const groupsConfig: MintNavigation[] = getDocusaurusLinksPerGroup(
27
- navigationSections,
28
- $,
29
- version
30
- );
27
+ const groupsConfig: MintNavigation[] = getDocusaurusLinksPerGroup(navigationSections, $, version);
31
28
 
32
29
  // Merge groups with empty titles together
33
30
  const reducedGroupsConfig = combineNavWithEmptyGroupTitles(groupsConfig);
@@ -48,7 +45,7 @@ export async function scrapeDocusaurusSection(
48
45
  scrapeDocusaurusPage,
49
46
  false,
50
47
  version,
51
- "/docs"
48
+ '/docs'
52
49
  )
53
50
  )
54
51
  )
@@ -1,7 +1,8 @@
1
- import cheerio from "cheerio";
2
- import { NodeHtmlMarkdown } from "node-html-markdown";
3
- import downloadAllImages from "../downloadAllImages.js";
4
- import replaceImagePaths from "../replaceImagePaths.js";
1
+ import cheerio from 'cheerio';
2
+ import { NodeHtmlMarkdown } from 'node-html-markdown';
3
+
4
+ import downloadAllImages from '../downloadAllImages.js';
5
+ import replaceImagePaths from '../replaceImagePaths.js';
5
6
 
6
7
  export async function scrapeGitBookPage(
7
8
  html: string,
@@ -16,9 +17,7 @@ export async function scrapeGitBookPage(
16
17
  const titleComponent = $('[data-testid="page.title"]').first();
17
18
  const titleAndDescription = titleComponent.parent().parent().parent().text();
18
19
 
19
- const description = titleAndDescription
20
- .replace(titleComponent.text(), "")
21
- .trim();
20
+ const description = titleAndDescription.replace(titleComponent.text(), '').trim();
22
21
  const title = titleComponent.text().trim();
23
22
 
24
23
  const content = $('[data-testid="page.contentEditor"]').first();
@@ -32,8 +31,8 @@ export async function scrapeGitBookPage(
32
31
  .children()
33
32
  .toArray()
34
33
  .map((d) => $(d).text())
35
- .filter((text) => text !== "")
36
- .join("\n");
34
+ .filter((text) => text !== '')
35
+ .join('\n');
37
36
  code.replaceWith(`<pre><code>${codeContent}</code></pre>`);
38
37
  });
39
38
 
@@ -42,7 +41,7 @@ export async function scrapeGitBookPage(
42
41
  const modifyFileName = (fileName: string) =>
43
42
  // Remove GitBook metadata from the start
44
43
  // The first four %2F split metadata fields. Remaining ones are part of the file name.
45
- fileName.split("%2F").slice(4).join("%2F");
44
+ fileName.split('%2F').slice(4).join('%2F');
46
45
 
47
46
  const origToWritePath = await downloadAllImages(
48
47
  $,
@@ -57,16 +56,16 @@ export async function scrapeGitBookPage(
57
56
  let markdown = nhm.translate(contentHtml);
58
57
 
59
58
  // Keep headers on one line
60
- markdown = markdown.replace(/# \n\n/g, "# ");
59
+ markdown = markdown.replace(/# \n\n/g, '# ');
61
60
 
62
61
  // Remove unnecessary nonwidth blank space characters
63
- markdown = markdown.replace(/\u200b/g, "");
62
+ markdown = markdown.replace(/\u200b/g, '');
64
63
 
65
64
  // Reduce unnecessary blank lines
66
- markdown = markdown.replace(/\n\n\n/g, "\n\n");
65
+ markdown = markdown.replace(/\n\n\n/g, '\n\n');
67
66
 
68
67
  // Mintlify doesn't support bolded headers, remove the asterisks
69
- markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
68
+ markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
70
69
  if (origToWritePath) {
71
70
  markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
72
71
  }