@mintlify/scraping 3.0.14 → 3.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/.prettierrc +1 -0
  2. package/bin/browser.js +3 -3
  3. package/bin/constants.js +23 -23
  4. package/bin/constants.js.map +1 -1
  5. package/bin/downloadImage.js +18 -18
  6. package/bin/downloadImage.js.map +1 -1
  7. package/bin/scraping/detectFramework.js +13 -13
  8. package/bin/scraping/detectFramework.js.map +1 -1
  9. package/bin/scraping/downloadAllImages.js +5 -5
  10. package/bin/scraping/downloadAllImages.js.map +1 -1
  11. package/bin/scraping/downloadLogoImage.js +4 -4
  12. package/bin/scraping/downloadLogoImage.js.map +1 -1
  13. package/bin/scraping/getSitemapLinks.js +4 -4
  14. package/bin/scraping/scrapeFileGettingFileNameFromUrl.js +10 -10
  15. package/bin/scraping/scrapeFileGettingFileNameFromUrl.js.map +1 -1
  16. package/bin/scraping/scrapeGettingFileNameFromUrl.js +2 -2
  17. package/bin/scraping/scrapeGettingFileNameFromUrl.js.map +1 -1
  18. package/bin/scraping/scrapePage.js +3 -3
  19. package/bin/scraping/scrapePage.js.map +1 -1
  20. package/bin/scraping/scrapePageCommands.d.ts +1 -1
  21. package/bin/scraping/scrapePageCommands.js +15 -15
  22. package/bin/scraping/scrapePageCommands.js.map +1 -1
  23. package/bin/scraping/scrapeSection.js +6 -6
  24. package/bin/scraping/scrapeSection.js.map +1 -1
  25. package/bin/scraping/scrapeSectionCommands.d.ts +1 -1
  26. package/bin/scraping/scrapeSectionCommands.js +14 -14
  27. package/bin/scraping/scrapeSectionCommands.js.map +1 -1
  28. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js +11 -11
  29. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomPage.js.map +1 -1
  30. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js +12 -14
  31. package/bin/scraping/site-scrapers/Intercom/scrapeIntercomSection.js.map +1 -1
  32. package/bin/scraping/site-scrapers/alternateGroupTitle.js +1 -1
  33. package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js +8 -11
  34. package/bin/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.js.map +1 -1
  35. package/bin/scraping/site-scrapers/links-per-group/getLinksRecursively.js +6 -6
  36. package/bin/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.js +4 -4
  37. package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.d.ts +1 -1
  38. package/bin/scraping/site-scrapers/openNestedDocusaurusMenus.js +4 -4
  39. package/bin/scraping/site-scrapers/openNestedGitbookMenus.d.ts +1 -1
  40. package/bin/scraping/site-scrapers/openNestedGitbookMenus.js.map +1 -1
  41. package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js +14 -14
  42. package/bin/scraping/site-scrapers/scrapeDocusaurusPage.js.map +1 -1
  43. package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js +9 -9
  44. package/bin/scraping/site-scrapers/scrapeDocusaurusSection.js.map +1 -1
  45. package/bin/scraping/site-scrapers/scrapeGitBookPage.js +12 -14
  46. package/bin/scraping/site-scrapers/scrapeGitBookPage.js.map +1 -1
  47. package/bin/scraping/site-scrapers/scrapeGitBookSection.js +10 -15
  48. package/bin/scraping/site-scrapers/scrapeGitBookSection.js.map +1 -1
  49. package/bin/scraping/site-scrapers/scrapeReadMePage.js +15 -15
  50. package/bin/scraping/site-scrapers/scrapeReadMePage.js.map +1 -1
  51. package/bin/scraping/site-scrapers/scrapeReadMeSection.js +11 -15
  52. package/bin/scraping/site-scrapers/scrapeReadMeSection.js.map +1 -1
  53. package/bin/tsconfig.tsbuildinfo +1 -1
  54. package/bin/util.d.ts +1 -1
  55. package/bin/util.js +23 -26
  56. package/bin/util.js.map +1 -1
  57. package/bin/validation/stopIfInvalidLink.js +3 -3
  58. package/package.json +9 -9
  59. package/src/browser.ts +3 -3
  60. package/src/constants.ts +23 -23
  61. package/src/downloadImage.ts +21 -26
  62. package/src/scraping/detectFramework.ts +18 -18
  63. package/src/scraping/downloadAllImages.ts +7 -9
  64. package/src/scraping/downloadLogoImage.ts +5 -4
  65. package/src/scraping/getSitemapLinks.ts +4 -4
  66. package/src/scraping/scrapeFileGettingFileNameFromUrl.ts +12 -18
  67. package/src/scraping/scrapeGettingFileNameFromUrl.ts +7 -5
  68. package/src/scraping/scrapePage.ts +4 -3
  69. package/src/scraping/scrapePageCommands.ts +17 -18
  70. package/src/scraping/scrapeSection.ts +8 -16
  71. package/src/scraping/scrapeSectionCommands.ts +19 -34
  72. package/src/scraping/site-scrapers/Intercom/scrapeIntercomPage.ts +12 -11
  73. package/src/scraping/site-scrapers/Intercom/scrapeIntercomSection.ts +23 -24
  74. package/src/scraping/site-scrapers/alternateGroupTitle.ts +1 -1
  75. package/src/scraping/site-scrapers/links-per-group/getDocusaurusLinksPerGroup.ts +8 -11
  76. package/src/scraping/site-scrapers/links-per-group/getLinksRecursively.ts +6 -6
  77. package/src/scraping/site-scrapers/links-per-group/getLinksRecursivelyGitBook.ts +4 -4
  78. package/src/scraping/site-scrapers/openNestedDocusaurusMenus.ts +5 -5
  79. package/src/scraping/site-scrapers/openNestedGitbookMenus.ts +2 -4
  80. package/src/scraping/site-scrapers/scrapeDocusaurusPage.ts +15 -18
  81. package/src/scraping/site-scrapers/scrapeDocusaurusSection.ts +11 -14
  82. package/src/scraping/site-scrapers/scrapeGitBookPage.ts +13 -14
  83. package/src/scraping/site-scrapers/scrapeGitBookSection.ts +11 -15
  84. package/src/scraping/site-scrapers/scrapeReadMePage.ts +17 -22
  85. package/src/scraping/site-scrapers/scrapeReadMeSection.ts +27 -31
  86. package/src/util.ts +25 -36
  87. package/src/validation/stopIfInvalidLink.ts +3 -3
@@ -1,10 +1,11 @@
1
- import cheerio from "cheerio";
2
- import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
3
- import { scrapeGitBookPage } from "./scrapeGitBookPage.js";
4
- import combineNavWithEmptyGroupTitles from "../combineNavWithEmptyGroupTitles.js";
5
- import getLinksRecursivelyGitBook from "./links-per-group/getLinksRecursivelyGitBook.js";
6
- import alternateGroupTitle from "./alternateGroupTitle.js";
7
- import downloadLogoImage from "../downloadLogoImage.js";
1
+ import cheerio from 'cheerio';
2
+
3
+ import combineNavWithEmptyGroupTitles from '../combineNavWithEmptyGroupTitles.js';
4
+ import downloadLogoImage from '../downloadLogoImage.js';
5
+ import { scrapeGettingFileNameFromUrl } from '../scrapeGettingFileNameFromUrl.js';
6
+ import alternateGroupTitle from './alternateGroupTitle.js';
7
+ import getLinksRecursivelyGitBook from './links-per-group/getLinksRecursivelyGitBook.js';
8
+ import { scrapeGitBookPage } from './scrapeGitBookPage.js';
8
9
 
9
10
  export async function scrapeGitBookSection(
10
11
  html: string,
@@ -17,9 +18,7 @@ export async function scrapeGitBookSection(
17
18
  const $ = cheerio.load(html);
18
19
 
19
20
  // Download the logo
20
- const logoSrc = $('a[data-testid="public.headerHomeLink"] img')
21
- .first()
22
- .attr("src");
21
+ const logoSrc = $('a[data-testid="public.headerHomeLink"] img').first().attr('src');
23
22
  downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite).catch(console.error);
24
23
 
25
24
  // Get all the navigation sections
@@ -39,14 +38,11 @@ export async function scrapeGitBookSection(
39
38
  .toArray()
40
39
  .map((s: cheerio.Element) => {
41
40
  const section = $(s);
42
- const sectionTitle = $(section)
43
- .find('div > div[dir="auto"]')
44
- .first()
45
- .text();
41
+ const sectionTitle = $(section).find('div > div[dir="auto"]').first().text();
46
42
 
47
43
  // Only present if the nested navigation is not in a group
48
44
  const firstLink = section.children().eq(0);
49
- const firstHref = firstLink.attr("href");
45
+ const firstHref = firstLink.attr('href');
50
46
 
51
47
  const linkSections: cheerio.Cheerio = section.children().eq(1).children();
52
48
  const pages = getLinksRecursivelyGitBook(linkSections, $);
@@ -1,7 +1,8 @@
1
- import cheerio from "cheerio";
2
- import { NodeHtmlMarkdown } from "node-html-markdown";
3
- import downloadAllImages from "../downloadAllImages.js";
4
- import replaceImagePaths from "../replaceImagePaths.js";
1
+ import cheerio from 'cheerio';
2
+ import { NodeHtmlMarkdown } from 'node-html-markdown';
3
+
4
+ import downloadAllImages from '../downloadAllImages.js';
5
+ import replaceImagePaths from '../replaceImagePaths.js';
5
6
 
6
7
  export async function scrapeReadMePage(
7
8
  html: string,
@@ -13,46 +14,40 @@ export async function scrapeReadMePage(
13
14
  ) {
14
15
  const $ = cheerio.load(html);
15
16
 
16
- const titleComponent = $("h1").first();
17
+ const titleComponent = $('h1').first();
17
18
  const title = titleComponent.text().trim();
18
- let description = $(".markdown-body", titleComponent.parent()).text().trim();
19
+ let description = $('.markdown-body', titleComponent.parent()).text().trim();
19
20
  if (!description) {
20
- description = $(".rm-Article > header p").text().trim();
21
+ description = $('.rm-Article > header p').text().trim();
21
22
  }
22
23
 
23
- let content = $(".content-body .markdown-body").first();
24
+ let content = $('.content-body .markdown-body').first();
24
25
  if (content.length === 0) {
25
- content = $(".rm-Article > .markdown-body");
26
+ content = $('.rm-Article > .markdown-body');
26
27
  }
27
28
 
28
29
  // API Pages don't have a markdown body in the same position so there's no HTML
29
- const contentHtml = content.html() || "";
30
+ const contentHtml = content.html() || '';
30
31
 
31
- const origToWritePath = await downloadAllImages(
32
- $,
33
- content,
34
- origin,
35
- imageBaseDir,
36
- overwrite
37
- );
32
+ const origToWritePath = await downloadAllImages($, content, origin, imageBaseDir, overwrite);
38
33
 
39
34
  const nhm = new NodeHtmlMarkdown({ useInlineLinks: false });
40
35
  let markdown = nhm.translate(contentHtml);
41
36
 
42
37
  // Keep headers on one line and increase their depth by one
43
- markdown = markdown.replace(/# \n\n/g, "## ");
38
+ markdown = markdown.replace(/# \n\n/g, '## ');
44
39
 
45
40
  // Remove unnecessary nonwidth blank space characters
46
- markdown = markdown.replace(/\u200b/g, "");
41
+ markdown = markdown.replace(/\u200b/g, '');
47
42
 
48
43
  // Remove ReadMe anchor links
49
- markdown = markdown.replace(/\n\[\]\(#.+\)\n/g, "\n");
44
+ markdown = markdown.replace(/\n\[\]\(#.+\)\n/g, '\n');
50
45
 
51
46
  // Reduce unnecessary blank lines
52
- markdown = markdown.replace(/\n\n\n/g, "\n\n");
47
+ markdown = markdown.replace(/\n\n\n/g, '\n\n');
53
48
 
54
49
  // Mintlify doesn't support bolded headers, remove the asterisks
55
- markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, "$1 $2\n");
50
+ markdown = markdown.replace(/(\n#+) \*\*(.*)\*\*\n/g, '$1 $2\n');
56
51
  if (origToWritePath) {
57
52
  markdown = replaceImagePaths(origToWritePath, cliDir, markdown);
58
53
  }
@@ -1,8 +1,9 @@
1
- import cheerio from "cheerio";
2
- import { scrapeReadMePage } from "./scrapeReadMePage.js";
3
- import { scrapeGettingFileNameFromUrl } from "../scrapeGettingFileNameFromUrl.js";
4
- import getLinksRecursively from "./links-per-group/getLinksRecursively.js";
5
- import downloadLogoImage from "../downloadLogoImage.js";
1
+ import cheerio from 'cheerio';
2
+
3
+ import downloadLogoImage from '../downloadLogoImage.js';
4
+ import { scrapeGettingFileNameFromUrl } from '../scrapeGettingFileNameFromUrl.js';
5
+ import getLinksRecursively from './links-per-group/getLinksRecursively.js';
6
+ import { scrapeReadMePage } from './scrapeReadMePage.js';
6
7
 
7
8
  export async function scrapeReadMeSection(
8
9
  html: string,
@@ -15,39 +16,34 @@ export async function scrapeReadMeSection(
15
16
  const $ = cheerio.load(html);
16
17
 
17
18
  // Download the logo
18
- const logoSrc = $(".rm-Logo-img").first().attr("src");
19
+ const logoSrc = $('.rm-Logo-img').first().attr('src');
19
20
  downloadLogoImage(logoSrc, imageBaseDir, origin, overwrite).catch(console.error);
20
21
 
21
22
  // Get all the navigation sections, but only from the first
22
23
  // sidebar found. There are multiple in the HTML for mobile
23
24
  // responsiveness but they all have the same links.
24
- const navigationSections = $(".rm-Sidebar")
25
- .first()
26
- .find(".rm-Sidebar-section");
25
+ const navigationSections = $('.rm-Sidebar').first().find('.rm-Sidebar-section');
27
26
 
28
- const groupsConfig: MintNavigation[] = navigationSections
29
- .toArray()
30
- .map((s: cheerio.Element) => {
31
- const section = $(s);
32
- const sectionTitle = section.find("h3").first().text();
27
+ const groupsConfig: MintNavigation[] = navigationSections.toArray().map((s: cheerio.Element) => {
28
+ const section = $(s);
29
+ const sectionTitle = section.find('h3').first().text();
33
30
 
34
- // Get all links, then use filter to remove duplicates.
35
- // There are duplicates because of nested navigation, eg:
36
- // subgroupTitle -> /first-page
37
- // -- First Page -> /first-page ** DUPLICATE **
38
- // -- Second Page -> /second-page
39
- const linkSections = section.find(".rm-Sidebar-list").first().children();
40
- const pages = getLinksRecursively(linkSections, $).filter(
41
- (value: string, index: number, self: any) =>
42
- self.indexOf(value) === index
43
- );
31
+ // Get all links, then use filter to remove duplicates.
32
+ // There are duplicates because of nested navigation, eg:
33
+ // subgroupTitle -> /first-page
34
+ // -- First Page -> /first-page ** DUPLICATE **
35
+ // -- Second Page -> /second-page
36
+ const linkSections = section.find('.rm-Sidebar-list').first().children();
37
+ const pages = getLinksRecursively(linkSections, $).filter(
38
+ (value: string, index: number, self: any) => self.indexOf(value) === index
39
+ );
44
40
 
45
- // Follows the same structure as mint.json
46
- return {
47
- group: sectionTitle,
48
- pages: pages,
49
- };
50
- });
41
+ // Follows the same structure as mint.json
42
+ return {
43
+ group: sectionTitle,
44
+ pages: pages,
45
+ };
46
+ });
51
47
 
52
48
  // Scrape each link in the navigation.
53
49
  return Promise.all(
@@ -62,7 +58,7 @@ export async function scrapeReadMeSection(
62
58
  scrapeReadMePage,
63
59
  false,
64
60
  version,
65
- "/docs"
61
+ '/docs'
66
62
  );
67
63
  })
68
64
  );
package/src/util.ts CHANGED
@@ -1,8 +1,9 @@
1
- import { mkdirSync, writeFileSync } from "fs";
2
- import Ora, { Ora as OraType } from "ora";
3
- import path from "path";
4
- import shell from "shelljs";
5
- import stopIfInvalidLink from "./validation/stopIfInvalidLink.js";
1
+ import { mkdirSync, writeFileSync } from 'fs';
2
+ import Ora, { Ora as OraType } from 'ora';
3
+ import path from 'path';
4
+ import shell from 'shelljs';
5
+
6
+ import stopIfInvalidLink from './validation/stopIfInvalidLink.js';
6
7
 
7
8
  export const MintConfig = (
8
9
  name: string,
@@ -13,8 +14,8 @@ export const MintConfig = (
13
14
  ) => {
14
15
  return {
15
16
  name,
16
- logo: "",
17
- favicon: "",
17
+ logo: '',
18
+ favicon: '',
18
19
  colors: {
19
20
  primary: color,
20
21
  },
@@ -26,7 +27,7 @@ export const MintConfig = (
26
27
  anchors: [],
27
28
  navigation: [
28
29
  {
29
- group: "Home",
30
+ group: 'Home',
30
31
  pages: [filename],
31
32
  },
32
33
  ],
@@ -34,11 +35,7 @@ export const MintConfig = (
34
35
  };
35
36
  };
36
37
 
37
- export const Page = (
38
- title: string,
39
- description?: string,
40
- markdown?: string
41
- ) => {
38
+ export const Page = (title: string, description?: string, markdown?: string) => {
42
39
  // If we are an empty String we want to add two quotes,
43
40
  // if we added as we went we would detect the first quote
44
41
  // as the closing quote.
@@ -51,9 +48,7 @@ export const Page = (
51
48
  title = title + '"';
52
49
  }
53
50
 
54
- const optionalDescription = description
55
- ? `\ndescription: "${description}"`
56
- : "";
51
+ const optionalDescription = description ? `\ndescription: "${description}"` : '';
57
52
  return `---\ntitle: ${title}${optionalDescription}\n---\n\n${markdown}`;
58
53
  };
59
54
 
@@ -65,24 +60,24 @@ export function getOrigin(url: string) {
65
60
 
66
61
  export function objToReadableString(objs: MintNavigationEntry[]) {
67
62
  // Two spaces as indentation
68
- return objs.map((obj) => JSON.stringify(obj, null, 2)).join(",\n");
63
+ return objs.map((obj) => JSON.stringify(obj, null, 2)).join(',\n');
69
64
  }
70
65
 
71
66
  export const toFilename = (title: string) => {
72
67
  // Gets rid of special characters at the start and end
73
68
  // of the name by converting to spaces then using trim.
74
69
  return title
75
- .replace(/[^a-z0-9]/gi, " ")
70
+ .replace(/[^a-z0-9]/gi, ' ')
76
71
  .trim()
77
- .replace(/ /g, "-")
72
+ .replace(/ /g, '-')
78
73
  .toLowerCase();
79
74
  };
80
75
 
81
76
  export const addMdx = (fileName: string) => {
82
- if (fileName.endsWith(".mdx")) {
77
+ if (fileName.endsWith('.mdx')) {
83
78
  return fileName;
84
79
  }
85
- return fileName + ".mdx";
80
+ return fileName + '.mdx';
86
81
  };
87
82
 
88
83
  export const createPage = (
@@ -90,7 +85,7 @@ export const createPage = (
90
85
  description?: string,
91
86
  markdown?: string,
92
87
  overwrite = false,
93
- rootDir = "",
88
+ rootDir = '',
94
89
  fileName?: string
95
90
  ) => {
96
91
  const writePath = path.join(rootDir, addMdx(fileName || toFilename(title)));
@@ -101,17 +96,17 @@ export const createPage = (
101
96
  // Write the page to memory
102
97
  if (overwrite) {
103
98
  writeFileSync(writePath, Page(title, description, markdown));
104
- console.log("✏️ - " + writePath);
99
+ console.log('✏️ - ' + writePath);
105
100
  } else {
106
101
  try {
107
102
  writeFileSync(writePath, Page(title, description, markdown), {
108
- flag: "wx",
103
+ flag: 'wx',
109
104
  });
110
- console.log("✏️ - " + writePath);
105
+ console.log('✏️ - ' + writePath);
111
106
  } catch (e) {
112
107
  // We do a try-catch instead of an if-statement to avoid a race condition
113
108
  // of the file being created after we started writing.
114
- if ((e as { code: string })?.code === "EEXIST") {
109
+ if ((e as { code: string })?.code === 'EEXIST') {
115
110
  console.log(`❌ Skipping existing file ${writePath}`);
116
111
  } else {
117
112
  console.error(e);
@@ -126,30 +121,24 @@ export function getHrefFromArgs(argv: any) {
126
121
  return href;
127
122
  }
128
123
 
129
- export const buildLogger = (startText = ""): OraType => {
124
+ export const buildLogger = (startText = ''): OraType => {
130
125
  const logger = Ora().start(startText);
131
126
  return logger;
132
127
  };
133
128
 
134
129
  export const getFileExtension = (filename: string) => {
135
- const ext = filename.substring(
136
- filename.lastIndexOf(".") + 1,
137
- filename.length
138
- );
130
+ const ext = filename.substring(filename.lastIndexOf('.') + 1, filename.length);
139
131
  if (filename === ext) return undefined;
140
132
  return ext.toLowerCase();
141
133
  };
142
134
 
143
135
  export const fileBelongsInPagesFolder = (filename: string) => {
144
136
  const extension = getFileExtension(filename);
145
- return (
146
- extension &&
147
- (extension === "mdx" || extension === "md" || extension === "tsx")
148
- );
137
+ return extension && (extension === 'mdx' || extension === 'md' || extension === 'tsx');
149
138
  };
150
139
 
151
140
  export const ensureYarn = (logger: OraType) => {
152
- const yarnInstalled = shell.which("yarn");
141
+ const yarnInstalled = shell.which('yarn');
153
142
  if (!yarnInstalled) {
154
143
  logger.fail(`yarn must be installed, run
155
144
 
@@ -1,9 +1,9 @@
1
- import isValidLink from "./isValidLink.js";
1
+ import isValidLink from './isValidLink.js';
2
2
 
3
3
  export default function stopIfInvalidLink(href: string) {
4
4
  if (!isValidLink(href)) {
5
- console.log("Invalid link: " + href);
6
- console.log("Make sure the link starts with http:// or https://");
5
+ console.log('Invalid link: ' + href);
6
+ console.log('Make sure the link starts with http:// or https://');
7
7
  process.exit(1);
8
8
  }
9
9
  }