@mintlify/scraping 4.0.38 → 4.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/pipeline/images.js +7 -5
- package/bin/pipeline/images.js.map +1 -1
- package/bin/pipeline/page.js +4 -0
- package/bin/pipeline/page.js.map +1 -1
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/utils/breadcrumbs.d.ts +8 -0
- package/bin/utils/breadcrumbs.js +26 -0
- package/bin/utils/breadcrumbs.js.map +1 -0
- package/bin/utils/breaks.d.ts +5 -0
- package/bin/utils/breaks.js +5 -3
- package/bin/utils/breaks.js.map +1 -1
- package/bin/utils/toc.d.ts +8 -0
- package/bin/utils/toc.js +26 -0
- package/bin/utils/toc.js.map +1 -0
- package/package.json +5 -5
- package/src/pipeline/images.ts +6 -5
- package/src/pipeline/page.ts +4 -0
- package/src/utils/breadcrumbs.ts +31 -0
- package/src/utils/breaks.ts +5 -3
- package/src/utils/toc.ts +31 -0
package/bin/utils/breadcrumbs.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Element } from 'hast';
|
|
2
|
+
export declare function unifiedRemoveBreadCrumbs(): (node: Element) => undefined;
|
|
3
|
+
/**
|
|
4
|
+
* Docusaurus-specific function since their breadcrumbs
|
|
5
|
+
* are within the content in the `article` itself instead
|
|
6
|
+
* of outside of the `article` element
|
|
7
|
+
*/
|
|
8
|
+
export declare function removeBreadCrumbs(node: Element): undefined;
|
package/bin/utils/breadcrumbs.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { visit } from 'unist-util-visit';
|
|
2
|
+
import { framework } from './detectFramework.js';
|
|
3
|
+
export function unifiedRemoveBreadCrumbs() {
|
|
4
|
+
return function (node) {
|
|
5
|
+
return removeBreadCrumbs(node);
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Docusaurus-specific function since their breadcrumbs
|
|
10
|
+
* are within the content in the `article` itself instead
|
|
11
|
+
* of outside of the `article` element
|
|
12
|
+
*/
|
|
13
|
+
export function removeBreadCrumbs(node) {
|
|
14
|
+
return visit(node, 'element', function (subNode, index, parent) {
|
|
15
|
+
if (framework.vendor === 'docusaurus' &&
|
|
16
|
+
subNode.tagName === 'nav' &&
|
|
17
|
+
subNode.properties.className &&
|
|
18
|
+
Array.isArray(subNode.properties.className) &&
|
|
19
|
+
subNode.properties.className.includes('theme-doc-breadcrumbs') &&
|
|
20
|
+
parent &&
|
|
21
|
+
typeof index === 'number') {
|
|
22
|
+
parent.children.splice(index, 1);
|
|
23
|
+
}
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
//# sourceMappingURL=breadcrumbs.js.map
|
package/bin/utils/breadcrumbs.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"breadcrumbs.js","sourceRoot":"","sources":["../../src/utils/breadcrumbs.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAEzC,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAEjD,MAAM,UAAU,wBAAwB;IACtC,OAAO,UAAU,IAAa;QAC5B,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAa;IAC7C,OAAO,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO,EAAE,KAAK,EAAE,MAAM;QAC5D,IACE,SAAS,CAAC,MAAM,KAAK,YAAY;YACjC,OAAO,CAAC,OAAO,KAAK,KAAK;YACzB,OAAO,CAAC,UAAU,CAAC,SAAS;YAC5B,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,SAAS,CAAC;YAC3C,OAAO,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,uBAAuB,CAAC;YAC9D,MAAM;YACN,OAAO,KAAK,KAAK,QAAQ,EACzB,CAAC;YACD,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACnC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/bin/utils/breaks.d.ts
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
1
|
import type { Element } from 'hast';
|
|
2
2
|
export declare function unifiedRemoveBreaks(): (node: Element) => undefined;
|
|
3
|
+
/**
|
|
4
|
+
* ReadMe-specific function since they use breaks in between
|
|
5
|
+
* every element, but either way our parser adds whitespace
|
|
6
|
+
* automatically
|
|
7
|
+
*/
|
|
3
8
|
export declare function removeBreaks(node: Element): undefined;
|
package/bin/utils/breaks.js
CHANGED
|
@@ -4,9 +4,11 @@ export function unifiedRemoveBreaks() {
|
|
|
4
4
|
return removeBreaks(node);
|
|
5
5
|
};
|
|
6
6
|
}
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
/**
|
|
8
|
+
* ReadMe-specific function since they use breaks in between
|
|
9
|
+
* every element, but either way our parser adds whitespace
|
|
10
|
+
* automatically
|
|
11
|
+
*/
|
|
10
12
|
export function removeBreaks(node) {
|
|
11
13
|
return visit(node, 'element', function (subNode, index, parent) {
|
|
12
14
|
if (subNode.tagName === 'br' && parent && typeof index === 'number') {
|
package/bin/utils/breaks.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"breaks.js","sourceRoot":"","sources":["../../src/utils/breaks.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAEzC,MAAM,UAAU,mBAAmB;IACjC,OAAO,UAAU,IAAa;QAC5B,OAAO,YAAY,CAAC,IAAI,CAAC,CAAC;IAC5B,CAAC,CAAC;AACJ,CAAC;AAED
|
|
1
|
+
{"version":3,"file":"breaks.js","sourceRoot":"","sources":["../../src/utils/breaks.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAEzC,MAAM,UAAU,mBAAmB;IACjC,OAAO,UAAU,IAAa;QAC5B,OAAO,YAAY,CAAC,IAAI,CAAC,CAAC;IAC5B,CAAC,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,YAAY,CAAC,IAAa;IACxC,OAAO,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO,EAAE,KAAK,EAAE,MAAM;QAC5D,IAAI,OAAO,CAAC,OAAO,KAAK,IAAI,IAAI,MAAM,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YACpE,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACnC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/bin/utils/toc.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Element } from 'hast';
|
|
2
|
+
export declare function unifiedRemoveTableOfContents(): (node: Element) => undefined;
|
|
3
|
+
/**
|
|
4
|
+
* Docusaurus-specific function since their mobile ToC
|
|
5
|
+
* is within the content in the `article` itself instead
|
|
6
|
+
* of outside of the `article` element
|
|
7
|
+
*/
|
|
8
|
+
export declare function removeTableOfContents(node: Element): undefined;
|
package/bin/utils/toc.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { visit } from 'unist-util-visit';
|
|
2
|
+
import { framework } from './detectFramework.js';
|
|
3
|
+
export function unifiedRemoveTableOfContents() {
|
|
4
|
+
return function (node) {
|
|
5
|
+
return removeTableOfContents(node);
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Docusaurus-specific function since their mobile ToC
|
|
10
|
+
* is within the content in the `article` itself instead
|
|
11
|
+
* of outside of the `article` element
|
|
12
|
+
*/
|
|
13
|
+
export function removeTableOfContents(node) {
|
|
14
|
+
return visit(node, 'element', function (subNode, index, parent) {
|
|
15
|
+
if (framework.vendor === 'docusaurus' &&
|
|
16
|
+
subNode.tagName === 'div' &&
|
|
17
|
+
subNode.properties.className &&
|
|
18
|
+
Array.isArray(subNode.properties.className) &&
|
|
19
|
+
subNode.properties.className.includes('theme-doc-toc-mobile') &&
|
|
20
|
+
parent &&
|
|
21
|
+
typeof index === 'number') {
|
|
22
|
+
parent.children.splice(index, 1);
|
|
23
|
+
}
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
//# sourceMappingURL=toc.js.map
|
package/bin/utils/toc.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc.js","sourceRoot":"","sources":["../../src/utils/toc.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAEzC,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAEjD,MAAM,UAAU,4BAA4B;IAC1C,OAAO,UAAU,IAAa;QAC5B,OAAO,qBAAqB,CAAC,IAAI,CAAC,CAAC;IACrC,CAAC,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CAAC,IAAa;IACjD,OAAO,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO,EAAE,KAAK,EAAE,MAAM;QAC5D,IACE,SAAS,CAAC,MAAM,KAAK,YAAY;YACjC,OAAO,CAAC,OAAO,KAAK,KAAK;YACzB,OAAO,CAAC,UAAU,CAAC,SAAS;YAC5B,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,SAAS,CAAC;YAC3C,OAAO,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,sBAAsB,CAAC;YAC7D,MAAM;YACN,OAAO,KAAK,KAAK,QAAQ,EACzB,CAAC;YACD,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACnC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mintlify/scraping",
|
|
3
|
-
"version": "4.0.38",
|
|
3
|
+
"version": "4.0.40",
|
|
4
4
|
"description": "Scrape documentation frameworks to Mintlify docs",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=18.0.0"
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"format:check": "prettier . --check"
|
|
39
39
|
},
|
|
40
40
|
"dependencies": {
|
|
41
|
-
"@mintlify/common": "1.0.
|
|
41
|
+
"@mintlify/common": "1.0.196",
|
|
42
42
|
"@mintlify/openapi-parser": "^0.0.7",
|
|
43
43
|
"fs-extra": "^11.1.1",
|
|
44
44
|
"hast": "^1.0.0",
|
|
@@ -60,10 +60,10 @@
|
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@mintlify/eslint-config-typescript": "1.0.13",
|
|
63
|
-
"@mintlify/models": "0.0.
|
|
63
|
+
"@mintlify/models": "0.0.152",
|
|
64
64
|
"@mintlify/prettier-config": "1.0.4",
|
|
65
65
|
"@mintlify/ts-config": "2.0.2",
|
|
66
|
-
"@mintlify/validation": "0.1.
|
|
66
|
+
"@mintlify/validation": "0.1.225",
|
|
67
67
|
"@trivago/prettier-plugin-sort-imports": "^4.2.1",
|
|
68
68
|
"@tsconfig/recommended": "1.x",
|
|
69
69
|
"@types/node": "^18.7.13",
|
|
@@ -78,5 +78,5 @@
|
|
|
78
78
|
"typescript": "^5.5.3",
|
|
79
79
|
"vitest": "^2.0.4"
|
|
80
80
|
},
|
|
81
|
-
"gitHead": "
|
|
81
|
+
"gitHead": "f1d4cc0ef3018924aff0a5c69f4664df24824c37"
|
|
82
82
|
}
|
package/src/pipeline/images.ts
CHANGED
|
@@ -37,10 +37,7 @@ export async function downloadImagesFromFile(
|
|
|
37
37
|
const localRootPath = filename ? dirname(filename) : rootPath;
|
|
38
38
|
|
|
39
39
|
const imageResults = await Promise.all(
|
|
40
|
-
imageUrls.map(async (imageUrl) => {
|
|
41
|
-
const result = await downloadImage(imageUrl, localRootPath);
|
|
42
|
-
return result;
|
|
43
|
-
})
|
|
40
|
+
imageUrls.map(async (imageUrl) => await downloadImage(imageUrl, localRootPath))
|
|
44
41
|
);
|
|
45
42
|
|
|
46
43
|
const imagePathsMap = new Map<string, string>(
|
|
@@ -49,7 +46,11 @@ export async function downloadImagesFromFile(
|
|
|
49
46
|
|
|
50
47
|
visit(root, function (node, index, parent) {
|
|
51
48
|
if (node.type === 'image') {
|
|
52
|
-
node.url
|
|
49
|
+
if (node.url.startsWith('/')) {
|
|
50
|
+
node.url = imagePathsMap.get(new URL(node.url, url.origin).toString()) ?? node.url;
|
|
51
|
+
} else {
|
|
52
|
+
node.url = imagePathsMap.get(node.url) ?? node.url;
|
|
53
|
+
}
|
|
53
54
|
if (parent && typeof index === 'number') parent.children[index] = node;
|
|
54
55
|
} else if (node.type === 'mdxJsxFlowElement') {
|
|
55
56
|
const urlAttr = (node.attributes as Array<MdxJsxAttribute>).find(
|
package/src/pipeline/page.ts
CHANGED
|
@@ -21,6 +21,7 @@ import { rehypeToRemarkCustomComponents } from '../customComponents/plugin.js';
|
|
|
21
21
|
import { selectiveRehypeRemark } from '../customComponents/selective.js';
|
|
22
22
|
import { retrieveRootContent } from '../root/retrieve.js';
|
|
23
23
|
import type { Result } from '../types/result.js';
|
|
24
|
+
import { unifiedRemoveBreadCrumbs } from '../utils/breadcrumbs.js';
|
|
24
25
|
import { unifiedRemoveBreaks } from '../utils/breaks.js';
|
|
25
26
|
import { unifiedRemoveClassNames } from '../utils/className.js';
|
|
26
27
|
import { unifiedRemoveCopyButtons } from '../utils/copyButton.js';
|
|
@@ -39,6 +40,7 @@ import { unifiedRemovePositions } from '../utils/position.js';
|
|
|
39
40
|
import { removeLeadingSlash, removeTrailingSlash } from '../utils/strings.js';
|
|
40
41
|
import { remarkRemoveCodeBlocksInCells } from '../utils/tableCells.js';
|
|
41
42
|
import { getDescriptionFromRoot, getTitleFromHeading } from '../utils/title.js';
|
|
43
|
+
import { unifiedRemoveTableOfContents } from '../utils/toc.js';
|
|
42
44
|
import { remarkRemoveUpdatedAt } from '../utils/updatedAt.js';
|
|
43
45
|
import { downloadImagesFromFile } from './images.js';
|
|
44
46
|
import { htmlToHast } from './root.js';
|
|
@@ -78,6 +80,8 @@ export async function scrapePage(
|
|
|
78
80
|
|
|
79
81
|
const mdastTree: MdastRoot = unified()
|
|
80
82
|
.use(unifiedRemoveBreaks)
|
|
83
|
+
.use(unifiedRemoveBreadCrumbs)
|
|
84
|
+
.use(unifiedRemoveTableOfContents)
|
|
81
85
|
.use(unifiedRemoveCopyButtons)
|
|
82
86
|
.use(createCard)
|
|
83
87
|
.use(createAccordion)
|
package/src/utils/breadcrumbs.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { Element } from 'hast';
|
|
2
|
+
import { visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
import { framework } from './detectFramework.js';
|
|
5
|
+
|
|
6
|
+
export function unifiedRemoveBreadCrumbs() {
|
|
7
|
+
return function (node: Element) {
|
|
8
|
+
return removeBreadCrumbs(node);
|
|
9
|
+
};
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Docusaurus-specific function since their breadcrumbs
|
|
14
|
+
* are within the content in the `article` itself instead
|
|
15
|
+
* of outside of the `article` element
|
|
16
|
+
*/
|
|
17
|
+
export function removeBreadCrumbs(node: Element) {
|
|
18
|
+
return visit(node, 'element', function (subNode, index, parent) {
|
|
19
|
+
if (
|
|
20
|
+
framework.vendor === 'docusaurus' &&
|
|
21
|
+
subNode.tagName === 'nav' &&
|
|
22
|
+
subNode.properties.className &&
|
|
23
|
+
Array.isArray(subNode.properties.className) &&
|
|
24
|
+
subNode.properties.className.includes('theme-doc-breadcrumbs') &&
|
|
25
|
+
parent &&
|
|
26
|
+
typeof index === 'number'
|
|
27
|
+
) {
|
|
28
|
+
parent.children.splice(index, 1);
|
|
29
|
+
}
|
|
30
|
+
});
|
|
31
|
+
}
|
package/src/utils/breaks.ts
CHANGED
|
@@ -7,9 +7,11 @@ export function unifiedRemoveBreaks() {
|
|
|
7
7
|
};
|
|
8
8
|
}
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
/**
|
|
11
|
+
* ReadMe-specific function since they use breaks in between
|
|
12
|
+
* every element, but either way our parser adds whitespace
|
|
13
|
+
* automatically
|
|
14
|
+
*/
|
|
13
15
|
export function removeBreaks(node: Element) {
|
|
14
16
|
return visit(node, 'element', function (subNode, index, parent) {
|
|
15
17
|
if (subNode.tagName === 'br' && parent && typeof index === 'number') {
|
package/src/utils/toc.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { Element } from 'hast';
|
|
2
|
+
import { visit } from 'unist-util-visit';
|
|
3
|
+
|
|
4
|
+
import { framework } from './detectFramework.js';
|
|
5
|
+
|
|
6
|
+
export function unifiedRemoveTableOfContents() {
|
|
7
|
+
return function (node: Element) {
|
|
8
|
+
return removeTableOfContents(node);
|
|
9
|
+
};
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Docusaurus-specific function since their mobile ToC
|
|
14
|
+
* is within the content in the `article` itself instead
|
|
15
|
+
* of outside of the `article` element
|
|
16
|
+
*/
|
|
17
|
+
export function removeTableOfContents(node: Element) {
|
|
18
|
+
return visit(node, 'element', function (subNode, index, parent) {
|
|
19
|
+
if (
|
|
20
|
+
framework.vendor === 'docusaurus' &&
|
|
21
|
+
subNode.tagName === 'div' &&
|
|
22
|
+
subNode.properties.className &&
|
|
23
|
+
Array.isArray(subNode.properties.className) &&
|
|
24
|
+
subNode.properties.className.includes('theme-doc-toc-mobile') &&
|
|
25
|
+
parent &&
|
|
26
|
+
typeof index === 'number'
|
|
27
|
+
) {
|
|
28
|
+
parent.children.splice(index, 1);
|
|
29
|
+
}
|
|
30
|
+
});
|
|
31
|
+
}
|