@mintlify/scraping 4.0.204 → 4.0.205

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mintlify/scraping",
3
- "version": "4.0.204",
3
+ "version": "4.0.205",
4
4
  "description": "Scrape documentation frameworks to Mintlify docs",
5
5
  "engines": {
6
6
  "node": ">=18.0.0"
@@ -44,13 +44,13 @@
44
44
  "hast-util-to-mdast": "^10.1.0",
45
45
  "js-yaml": "^4.1.0",
46
46
  "mdast-util-mdx-jsx": "^3.1.3",
47
+ "neotraverse": "^0.6.18",
47
48
  "puppeteer": "^22.14.0",
48
49
  "rehype-parse": "^9.0.0",
49
50
  "remark-gfm": "^4.0.0",
50
51
  "remark-mdx": "^3.0.1",
51
52
  "remark-parse": "^11.0.0",
52
53
  "remark-stringify": "^11.0.0",
53
- "traverse": "^0.6.10",
54
54
  "unified": "^11.0.5",
55
55
  "unist-util-visit": "^5.0.0",
56
56
  "yargs": "^17.6.0",
@@ -67,7 +67,6 @@
67
67
  "@types/hast": "^3.0.4",
68
68
  "@types/mdast": "^4.0.4",
69
69
  "@types/node": "^18.7.13",
70
- "@types/traverse": "^0.6.37",
71
70
  "@types/yargs": "^17.0.13",
72
71
  "@typescript-eslint/eslint-plugin": "6.x",
73
72
  "@typescript-eslint/parser": "6.x",
@@ -78,5 +77,5 @@
78
77
  "typescript": "^5.5.3",
79
78
  "vitest": "^2.0.4"
80
79
  },
81
- "gitHead": "d2f72c3ad5d1a63bc43deb817b33d3d0e92f6353"
80
+ "gitHead": "78cfff36cf9563d70d2a9a2033a23a49633c73be"
82
81
  }
package/src/nav/root.ts CHANGED
@@ -23,7 +23,7 @@ export function retrieveRootNavElement(rootNode: HastRoot): Element | undefined
23
23
  rootSelector = 'menu';
24
24
  break;
25
25
  case 'gitbook':
26
- rootSelector = 'page-no-toc:hidden';
26
+ rootSelector = 'page-no-toc:lg:hidden';
27
27
  break;
28
28
  case 'readme':
29
29
  rootSelector = 'rm-Sidebar';
@@ -1,3 +1,4 @@
1
+ import { convertStrToTitle } from '@mintlify/common';
1
2
  import type { Browser } from 'puppeteer';
2
3
 
3
4
  import { OVERVIEW_PAGE_SLUG } from '../constants.js';
@@ -25,7 +26,11 @@ export async function scrapePageGroup(
25
26
  chunk.map(async (url, index) => {
26
27
  try {
27
28
  if (opts.externalLinks) {
28
- const res = scrapePage(`external-link-${index}`, url, { externalLink: true });
29
+ let externalLinkTitle =
30
+ convertStrToTitle(url.pathname.split('/').at(-1) ?? url.pathname) ||
31
+ `external-link-${index}`;
32
+ externalLinkTitle = externalLinkTitle.replace(/\s+/g, '-').toLowerCase();
33
+ const res = scrapePage(externalLinkTitle, url, { externalLink: true });
29
34
  return res;
30
35
  }
31
36
 
@@ -1,6 +1,6 @@
1
1
  import { MintConfig, Navigation, Tab } from '@mintlify/models';
2
2
  import type { Root as HastRoot } from 'hast';
3
- import traverse from 'traverse';
3
+ import traverse from 'neotraverse';
4
4
 
5
5
  import { NAV_FAILURE_MSG } from '../constants.js';
6
6
  import { OVERVIEW_PAGE_SLUG } from '../constants.js';
@@ -23,8 +23,12 @@ export async function scrapeAllSiteTabs(
23
23
 
24
24
  detectFramework(hast);
25
25
 
26
- if (framework.vendor === 'readme' || framework.vendor === 'docusaurus') {
27
- const links = retrieveTabLinks(hast);
26
+ if (
27
+ framework.vendor === 'readme' ||
28
+ framework.vendor === 'docusaurus' ||
29
+ framework.vendor === 'gitbook'
30
+ ) {
31
+ const links = retrieveTabLinks(hast, url);
28
32
  if (
29
33
  !links ||
30
34
  !links.length ||
@@ -5,8 +5,14 @@ import { visit, EXIT, CONTINUE } from 'unist-util-visit';
5
5
  import { framework } from '../utils/detectFramework.js';
6
6
  import { findTitle, getTitleFromLink } from '../utils/title.js';
7
7
 
8
- export function retrieveTabLinks(rootNode: HastRoot): Array<Tab> | undefined {
9
- if (framework.vendor !== 'readme' && framework.vendor !== 'docusaurus') return undefined;
8
+ export function retrieveTabLinks(rootNode: HastRoot, url: URL): Array<Tab> | undefined {
9
+ if (
10
+ framework.vendor !== 'readme' &&
11
+ framework.vendor !== 'docusaurus' &&
12
+ framework.vendor !== 'gitbook'
13
+ ) {
14
+ return undefined;
15
+ }
10
16
 
11
17
  let element: Element | undefined = undefined as Element | undefined;
12
18
  visit(rootNode, 'element', function (node) {
@@ -33,64 +39,102 @@ export function retrieveTabLinks(rootNode: HastRoot): Array<Tab> | undefined {
33
39
  return EXIT;
34
40
  }
35
41
  }
42
+
43
+ if (framework.vendor === 'gitbook') {
44
+ if (
45
+ node.tagName === 'nav' &&
46
+ node.properties.id === 'sections' &&
47
+ node.properties.ariaLabel === 'Sections'
48
+ ) {
49
+ element = node;
50
+ return EXIT;
51
+ }
52
+ }
36
53
  });
37
54
 
38
55
  if (!element) return undefined;
39
56
 
40
57
  const links: Array<Tab> = [];
41
- visit(element as Element, 'element', function (node) {
42
- if (framework.vendor === 'readme') {
43
- if (
44
- node.tagName !== 'nav' &&
45
- !(
46
- node.tagName === 'div' &&
47
- node.properties.className &&
48
- Array.isArray(node.properties.className) &&
49
- node.properties.className.includes('rm-Header-right')
50
- )
51
- )
52
- return CONTINUE;
53
-
54
- visit(node, 'element', function (subNode) {
58
+ visit(element, 'element', function (node) {
59
+ switch (framework.vendor) {
60
+ case 'readme':
55
61
  if (
56
- subNode.tagName !== 'a' ||
57
- !subNode.properties.href ||
58
- typeof subNode.properties.href !== 'string' ||
59
- subNode.properties.href.startsWith('http')
62
+ node.tagName !== 'nav' &&
63
+ !(
64
+ node.tagName === 'div' &&
65
+ node.properties.className &&
66
+ Array.isArray(node.properties.className) &&
67
+ node.properties.className.includes('rm-Header-right')
68
+ )
60
69
  )
61
70
  return CONTINUE;
62
- const title = findTitle(subNode);
63
- links.push({
64
- name: title || getTitleFromLink(subNode.properties.href),
65
- url: subNode.properties.href,
71
+
72
+ visit(node, 'element', function (subNode) {
73
+ if (
74
+ subNode.tagName !== 'a' ||
75
+ !subNode.properties.href ||
76
+ typeof subNode.properties.href !== 'string' ||
77
+ subNode.properties.href.startsWith('http')
78
+ )
79
+ return CONTINUE;
80
+ const title = findTitle(subNode);
81
+ links.push({
82
+ name: title || getTitleFromLink(subNode.properties.href),
83
+ url: subNode.properties.href,
84
+ });
66
85
  });
67
- });
68
- }
86
+ break;
69
87
 
70
- if (framework.vendor === 'docusaurus') {
71
- if (node.tagName !== 'nav') return CONTINUE;
88
+ case 'docusaurus':
89
+ if (node.tagName !== 'nav') return CONTINUE;
72
90
 
73
- visit(node, 'element', function (subNode, _, parent) {
74
- if (
75
- subNode.tagName !== 'a' ||
76
- !subNode.properties.href ||
77
- typeof subNode.properties.href !== 'string' ||
78
- subNode.properties.href.startsWith('http') ||
79
- !parent ||
80
- parent.type !== 'element' ||
81
- !Array.isArray(parent.properties.className) ||
82
- parent.properties.className.length !== 1 ||
83
- parent.properties.className[0] !== 'navbar__items' ||
84
- parent.properties.className.includes('navbar__items--right')
85
- )
86
- return CONTINUE;
91
+ visit(node, 'element', function (subNode, _, parent) {
92
+ if (
93
+ subNode.tagName !== 'a' ||
94
+ !subNode.properties.href ||
95
+ typeof subNode.properties.href !== 'string' ||
96
+ subNode.properties.href.startsWith('http') ||
97
+ !parent ||
98
+ parent.type !== 'element' ||
99
+ !Array.isArray(parent.properties.className) ||
100
+ parent.properties.className.length !== 1 ||
101
+ parent.properties.className[0] !== 'navbar__items' ||
102
+ parent.properties.className.includes('navbar__items--right')
103
+ )
104
+ return CONTINUE;
87
105
 
88
- const title = findTitle(subNode);
89
- links.push({
90
- name: title || getTitleFromLink(subNode.properties.href),
91
- url: subNode.properties.href,
106
+ const title = findTitle(subNode);
107
+ links.push({
108
+ name: title || getTitleFromLink(subNode.properties.href),
109
+ url: subNode.properties.href,
110
+ });
92
111
  });
93
- });
112
+ break;
113
+
114
+ case 'gitbook':
115
+ if (node.tagName !== 'nav') return CONTINUE;
116
+
117
+ visit(node, 'element', function (subNode, _, parent) {
118
+ if (
119
+ subNode.tagName !== 'a' ||
120
+ !subNode.properties.href ||
121
+ typeof subNode.properties.href !== 'string' ||
122
+ !parent ||
123
+ parent.type !== 'element'
124
+ )
125
+ return CONTINUE;
126
+
127
+ const title = findTitle(subNode);
128
+ const link = new URL(subNode.properties.href);
129
+ links.push({
130
+ name: title || getTitleFromLink(subNode.properties.href),
131
+ url: link.origin === url.origin ? link.pathname : link.toString(),
132
+ });
133
+ });
134
+ break;
135
+
136
+ default:
137
+ break;
94
138
  }
95
139
  });
96
140