@mintlify/scraping 4.0.589 → 4.0.591
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +10 -3
- package/bin/cli.js.map +1 -1
- package/bin/constants.d.ts +1 -1
- package/bin/constants.js +1 -4
- package/bin/constants.js.map +1 -1
- package/bin/index.d.ts +9 -0
- package/bin/index.js +9 -0
- package/bin/index.js.map +1 -1
- package/bin/nav/retrieve.js +15 -0
- package/bin/nav/retrieve.js.map +1 -1
- package/bin/nav/root.js +7 -0
- package/bin/nav/root.js.map +1 -1
- package/bin/pipeline/site.d.ts +1 -0
- package/bin/pipeline/site.js +37 -8
- package/bin/pipeline/site.js.map +1 -1
- package/bin/pipeline/tabs.d.ts +3 -1
- package/bin/pipeline/tabs.js +4 -4
- package/bin/pipeline/tabs.js.map +1 -1
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/utils/network.js +46 -0
- package/bin/utils/network.js.map +1 -1
- package/bin/utils/strings.d.ts +1 -0
- package/bin/utils/strings.js +3 -0
- package/bin/utils/strings.js.map +1 -1
- package/bin/utils/text.js +7 -3
- package/bin/utils/text.js.map +1 -1
- package/package.json +4 -4
- package/src/cli.ts +13 -4
- package/src/constants.ts +1 -5
- package/src/index.ts +10 -0
- package/src/nav/retrieve.ts +22 -0
- package/src/nav/root.ts +11 -0
- package/src/pipeline/site.ts +42 -19
- package/src/pipeline/tabs.ts +5 -4
- package/src/utils/network.ts +47 -0
- package/src/utils/strings.ts +4 -0
- package/src/utils/text.ts +5 -3
package/bin/utils/network.js
CHANGED
|
@@ -65,6 +65,52 @@ export async function getHtmlWithPuppeteer(browser, url) {
|
|
|
65
65
|
clickItems(document);
|
|
66
66
|
});
|
|
67
67
|
}
|
|
68
|
+
if (framework.vendor === 'gitbook') {
|
|
69
|
+
for (let round = 0; round < 10; round++) {
|
|
70
|
+
const clickedCount = await page.evaluate(() => {
|
|
71
|
+
const tocEl = document.getElementById('table-of-contents');
|
|
72
|
+
if (!tocEl)
|
|
73
|
+
return 0;
|
|
74
|
+
let count = 0;
|
|
75
|
+
const items = tocEl.querySelectorAll('li.page-document-item');
|
|
76
|
+
items.forEach((li) => {
|
|
77
|
+
const btn = li.querySelector(':scope > a button');
|
|
78
|
+
if (!btn || !(btn instanceof HTMLElement))
|
|
79
|
+
return;
|
|
80
|
+
const anchor = btn.closest('a');
|
|
81
|
+
if (!anchor)
|
|
82
|
+
return;
|
|
83
|
+
const sibling = anchor.nextElementSibling;
|
|
84
|
+
if (sibling instanceof HTMLElement &&
|
|
85
|
+
sibling.style.opacity === '1' &&
|
|
86
|
+
sibling.style.height === 'auto')
|
|
87
|
+
return;
|
|
88
|
+
btn.click();
|
|
89
|
+
count++;
|
|
90
|
+
});
|
|
91
|
+
return count;
|
|
92
|
+
});
|
|
93
|
+
if (clickedCount === 0)
|
|
94
|
+
break;
|
|
95
|
+
await page
|
|
96
|
+
.waitForFunction(() => {
|
|
97
|
+
const tocEl = document.getElementById('table-of-contents');
|
|
98
|
+
if (!tocEl)
|
|
99
|
+
return true;
|
|
100
|
+
const anchors = tocEl.querySelectorAll('li.page-document-item > a');
|
|
101
|
+
return Array.from(anchors).every((a) => {
|
|
102
|
+
const btn = a.querySelector('button');
|
|
103
|
+
if (!btn)
|
|
104
|
+
return true;
|
|
105
|
+
const sibling = a.nextElementSibling;
|
|
106
|
+
if (!sibling || !(sibling instanceof HTMLElement))
|
|
107
|
+
return true;
|
|
108
|
+
return sibling.style.opacity === '1' && sibling.style.height === 'auto';
|
|
109
|
+
});
|
|
110
|
+
}, { timeout: 5000 })
|
|
111
|
+
.catch(() => { });
|
|
112
|
+
}
|
|
113
|
+
}
|
|
68
114
|
const content = await exponentialBackoff(() => page.content());
|
|
69
115
|
await page.close();
|
|
70
116
|
return content;
|
package/bin/utils/network.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAA6B,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACnG,OAAO,IAAI,MAAM,SAAS,CAAC;AAE3B,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,MAAM,UAAU,GAAG;IACjB,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;CAC/G,CAAC;AAEX,MAAM,OAAO,GAAG;IACd,iBAAiB,EAAE,gBAAgB;IACnC,MAAM,EACJ,yIAAyI;IAC3I,YAAY,EACV,uHAAuH;IACzH,iBAAiB,EAAE,yBAAyB;IAC5C,UAAU,EAAE,YAAY;CAChB,CAAC;AAEX,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QAErC,MAAM,IAAI,CAAC,WAAW,CAAC;YACrB,KAAK,EAAE,IAAI;YACX,MAAM,EAAE,IAAI;YACZ,iBAAiB,EAAE,CAAC;YACpB,QAAQ,EAAE,KAAK;YACf,QAAQ,EAAE,KAAK;YACf,WAAW,EAAE,IAAI;SAClB,CAAC,CAAC;QACH,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,YAAY,CACrB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,CAC3E,CAAC;QACF,MAAM,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAEtC,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5B,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YACxB,SAAS,EAAE,cAAc;YACzB,OAAO,EAAE,KAAK;SACf,CAAC,CACH,CAAC;QAEF,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;YACtC,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;gBACvB,QAAQ,CAAC,gBAAgB,CACvB,OAAO,EACP,CAAC,CAAC,EAAE,EAAE;oBACJ,IAAI,CAAC,CAAC,MAAM,YAAY,OAAO,IAAI,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC,qBAAqB,CAAC;wBACnF,CAAC,CAAC,cAAc,EAAE,CAAC;gBACvB,CAAC,EACD,IAAI,CACL,CAAC;gBAEF,SAAS,UAAU,CAAC,EAA0B;oBAC5C,MAAM,SAAS,GAAG,EAAE,CAAC,sBAAsB,CACzC,qBAAqB,CACW,CAAC;oBACnC,KAAK,MAAM,IAAI,IAAI,SAAS,
EAAE,CAAC;wBAC7B,IAAI,CAAC,KAAK,EAAE,CAAC;wBACb,UAAU,CAAC,IAAI,CAAC,CAAC;oBACnB,CAAC;gBACH,CAAC;gBACD,UAAU,CAAC,QAAQ,CAAC,CAAC;YACvB,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,yCAAyC,YAAY,EAAE,CAAC,CAAC;IAC3E,CAAC;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,GAAiB;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,mCAAmC,YAAY,EAAE,CAAC,CAAC;IACrE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,UAA+B,SAAS;IAExC,IAAI,CAAC;QACH,IAAI,GAAG,GAAuB,SAAS,CAAC;QACxC,IAAI,OAAO,EAAE,CAAC;YACZ,GAAG,GAAG,MAAM,oBAAoB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjD,CAAC;aAAM,CAAC;YACN,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/D,CAAC;QACD,IAAI,GAAG;YAAE,OAAO,GAAG,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;IAChF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,GAAW;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAE9C,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,0CAA0C,YAAY,EAAE,CAAC,CAAC;IAClF,CAAC;AACH,CAAC;AAED,MA
AM,CAAC,KAAK,UAAU,YAAY,CAAC,GAAQ;IACzC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC9C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/D,CAAC;YACD,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QAC9B,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAqB,CAAC;IAC7C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,iDAAiD,YAAY,EAAE,CAAC,CAAC;IACzF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,GAAQ;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC9C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/D,CAAC;YACD,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,GAAG,MAAM,gBAAgB,CAAC,IAAI,CAAC,CAAC;QAChE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,uDAAuD,YAAY,EAAE,CAAC,CAAC;QAC/F,CAAC;QACD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,qDAAqD,YAAY,EAAE,CAAC,CAAC;IAC7F,CAAC;AACH,CAAC"}
|
|
1
|
+
{"version":3,"file":"network.js","sourceRoot":"","sources":["../../src/utils/network.ts"],"names":[],"mappings":"AAAA,OAAO,EAA6B,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACnG,OAAO,IAAI,MAAM,SAAS,CAAC;AAE3B,OAAO,EAAW,MAAM,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,MAAM,UAAU,GAAG;IACjB,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;IACvH,uHAAuH;CAC/G,CAAC;AAEX,MAAM,OAAO,GAAG;IACd,iBAAiB,EAAE,gBAAgB;IACnC,MAAM,EACJ,yIAAyI;IAC3I,YAAY,EACV,uHAAuH;IACzH,iBAAiB,EAAE,yBAAyB;IAC5C,UAAU,EAAE,YAAY;CAChB,CAAC;AAEX,MAAM,CAAC,KAAK,UAAU,cAAc;IAClC,IAAI,CAAC;QACH,OAAO,MAAM,MAAM,CAAC;YAClB,QAAQ,EAAE,IAAI;YACd,iBAAiB,EAAE,IAAI;SACxB,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,GAAG,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAgB,EAChB,GAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QAErC,MAAM,IAAI,CAAC,WAAW,CAAC;YACrB,KAAK,EAAE,IAAI;YACX,MAAM,EAAE,IAAI;YACZ,iBAAiB,EAAE,CAAC;YACpB,QAAQ,EAAE,KAAK;YACf,QAAQ,EAAE,KAAK;YACf,WAAW,EAAE,IAAI;SAClB,CAAC,CAAC;QACH,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,YAAY,CACrB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,CAC3E,CAAC;QACF,MAAM,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAEtC,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAC5B,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YACxB,SAAS,EAAE,cAAc;YACzB,OAAO,EAAE,KAAK;SACf,CAAC,CACH,CAAC;QAEF,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;YACtC,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;gBACvB,QAAQ,CAAC,gBAAgB,CACvB,OAAO,EACP,CAAC,CAAC,EAAE,EAAE;oBACJ,IAAI,CAAC,CAAC,MAAM,YAAY,OAAO,IAAI,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC,qBAAqB,CAAC;wBACnF,CAAC,CAAC,cAAc,EAAE,CAAC;gBACvB,CAAC,EACD,IAAI,CACL,CAAC;gBAEF,SAAS,UAAU,CAAC,EAA0B;oBAC5C,MAAM,SAAS,GAAG,EAAE,CAAC,sBAAsB,CACzC,qBAAqB,CACW,CAAC;oBACnC,KAAK,MAAM,IAAI,IAAI,SAAS,
EAAE,CAAC;wBAC7B,IAAI,CAAC,KAAK,EAAE,CAAC;wBACb,UAAU,CAAC,IAAI,CAAC,CAAC;oBACnB,CAAC;gBACH,CAAC;gBACD,UAAU,CAAC,QAAQ,CAAC,CAAC;YACvB,CAAC,CAAC,CAAC;QACL,CAAC;QAED,IAAI,SAAS,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACnC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC;gBACxC,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;oBAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,cAAc,CAAC,mBAAmB,CAAC,CAAC;oBAC3D,IAAI,CAAC,KAAK;wBAAE,OAAO,CAAC,CAAC;oBACrB,IAAI,KAAK,GAAG,CAAC,CAAC;oBACd,MAAM,KAAK,GAAG,KAAK,CAAC,gBAAgB,CAAC,uBAAuB,CAAC,CAAC;oBAC9D,KAAK,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;wBACnB,MAAM,GAAG,GAAG,EAAE,CAAC,aAAa,CAAC,mBAAmB,CAAC,CAAC;wBAClD,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,YAAY,WAAW,CAAC;4BAAE,OAAO;wBAClD,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;wBAChC,IAAI,CAAC,MAAM;4BAAE,OAAO;wBACpB,MAAM,OAAO,GAAG,MAAM,CAAC,kBAAkB,CAAC;wBAC1C,IACE,OAAO,YAAY,WAAW;4BAC9B,OAAO,CAAC,KAAK,CAAC,OAAO,KAAK,GAAG;4BAC7B,OAAO,CAAC,KAAK,CAAC,MAAM,KAAK,MAAM;4BAE/B,OAAO;wBACT,GAAG,CAAC,KAAK,EAAE,CAAC;wBACZ,KAAK,EAAE,CAAC;oBACV,CAAC,CAAC,CAAC;oBACH,OAAO,KAAK,CAAC;gBACf,CAAC,CAAC,CAAC;gBAEH,IAAI,YAAY,KAAK,CAAC;oBAAE,MAAM;gBAE9B,MAAM,IAAI;qBACP,eAAe,CACd,GAAG,EAAE;oBACH,MAAM,KAAK,GAAG,QAAQ,CAAC,cAAc,CAAC,mBAAmB,CAAC,CAAC;oBAC3D,IAAI,CAAC,KAAK;wBAAE,OAAO,IAAI,CAAC;oBACxB,MAAM,OAAO,GAAG,KAAK,CAAC,gBAAgB,CAAC,2BAA2B,CAAC,CAAC;oBACpE,OAAO,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;wBACrC,MAAM,GAAG,GAAG,CAAC,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;wBACtC,IAAI,CAAC,GAAG;4BAAE,OAAO,IAAI,CAAC;wBACtB,MAAM,OAAO,GAAG,CAAC,CAAC,kBAAkB,CAAC;wBACrC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,OAAO,YAAY,WAAW,CAAC;4BAAE,OAAO,IAAI,CAAC;wBAC/D,OAAO,OAAO,CAAC,KAAK,CAAC,OAAO,KAAK,GAAG,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,KAAK,MAAM,CAAC;oBAC1E,CAAC,CAAC,CAAC;gBACL,CAAC,EACD,EAAE,OAAO,EAAE,IAAI,EAAE,CAClB;qBACA,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QAC/D,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,
OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,yCAAyC,YAAY,EAAE,CAAC,CAAC;IAC3E,CAAC;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,GAAiB;IAChD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,mCAAmC,YAAY,EAAE,CAAC,CAAC;IACrE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,UAA+B,SAAS;IAExC,IAAI,CAAC;QACH,IAAI,GAAG,GAAuB,SAAS,CAAC;QACxC,IAAI,OAAO,EAAE,CAAC;YACZ,GAAG,GAAG,MAAM,oBAAoB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjD,CAAC;aAAM,CAAC;YACN,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/D,CAAC;QACD,IAAI,GAAG;YAAE,OAAO,GAAG,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,6BAA6B,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;IAChF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,GAAW;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;QAE9C,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,0CAA0C,YAAY,EAAE,CAAC,CAAC;IAClF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,GAAQ;IACzC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC9C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/D,CAAC;YACD,OAAO,QAAQ,CAAC;QAClB,C
AAC,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QAC9B,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAqB,CAAC;IAC7C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,iDAAiD,YAAY,EAAE,CAAC,CAAC;IACzF,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,GAAQ;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC9C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/D,CAAC;YACD,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,GAAG,MAAM,gBAAgB,CAAC,IAAI,CAAC,CAAC;QAChE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,uDAAuD,YAAY,EAAE,CAAC,CAAC;QAC/F,CAAC;QACD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,GAAG,GAAG,qDAAqD,YAAY,EAAE,CAAC,CAAC;IAC7F,CAAC;AACH,CAAC"}
|
package/bin/utils/strings.d.ts
CHANGED
package/bin/utils/strings.js
CHANGED
|
@@ -4,4 +4,7 @@ export function removeTrailingSlash(str) {
|
|
|
4
4
|
export function removeLeadingSlash(str) {
|
|
5
5
|
return str.startsWith('/') ? str.substring(1) : str;
|
|
6
6
|
}
|
|
7
|
+
export function optionallyAddLeadingSlash(str) {
|
|
8
|
+
return str.startsWith('/') ? str : '/' + str;
|
|
9
|
+
}
|
|
7
10
|
//# sourceMappingURL=strings.js.map
|
package/bin/utils/strings.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"strings.js","sourceRoot":"","sources":["../../src/utils/strings.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,mBAAmB,CAAC,GAAW;IAC7C,OAAO,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AACpE,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,GAAW;IAC5C,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AACtD,CAAC"}
|
|
1
|
+
{"version":3,"file":"strings.js","sourceRoot":"","sources":["../../src/utils/strings.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,mBAAmB,CAAC,GAAW;IAC7C,OAAO,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AACpE,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,GAAW;IAC5C,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AACtD,CAAC;AAED,MAAM,UAAU,yBAAyB,CAAC,GAAW;IACnD,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC;AAC/C,CAAC"}
|
package/bin/utils/text.js
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
import { visit } from 'unist-util-visit';
|
|
1
|
+
import { CONTINUE, SKIP, visit } from 'unist-util-visit';
|
|
2
2
|
export function getText(element) {
|
|
3
3
|
if (!element)
|
|
4
4
|
return '';
|
|
5
5
|
let text = '';
|
|
6
|
-
visit(element,
|
|
7
|
-
|
|
6
|
+
visit(element, function (node) {
|
|
7
|
+
if (node.type === 'element' && node.tagName === 'svg')
|
|
8
|
+
return SKIP;
|
|
9
|
+
if (node.type === 'text')
|
|
10
|
+
text += node.value;
|
|
11
|
+
return CONTINUE;
|
|
8
12
|
});
|
|
9
13
|
return text;
|
|
10
14
|
}
|
package/bin/utils/text.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"text.js","sourceRoot":"","sources":["../../src/utils/text.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;
|
|
1
|
+
{"version":3,"file":"text.js","sourceRoot":"","sources":["../../src/utils/text.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAEzD,MAAM,UAAU,OAAO,CAAC,OAA4B;IAClD,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAC;IACxB,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,KAAK,CAAC,OAAO,EAAE,UAAU,IAAI;QAC3B,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK;YAAE,OAAO,IAAI,CAAC;QACnE,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM;YAAE,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC;QAC7C,OAAO,QAAQ,CAAC;IAClB,CAAC,CAAC,CAAC;IACH,OAAO,IAAI,CAAC;AACd,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mintlify/scraping",
|
|
3
|
-
"version": "4.0.589",
|
|
3
|
+
"version": "4.0.591",
|
|
4
4
|
"description": "Scrape documentation frameworks to Mintlify docs",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=18.0.0"
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
"format:check": "prettier . --check"
|
|
44
44
|
},
|
|
45
45
|
"dependencies": {
|
|
46
|
-
"@mintlify/common": "1.0.
|
|
46
|
+
"@mintlify/common": "1.0.729",
|
|
47
47
|
"@mintlify/openapi-parser": "^0.0.8",
|
|
48
48
|
"fs-extra": "11.1.1",
|
|
49
49
|
"hast-util-to-mdast": "10.1.0",
|
|
@@ -66,7 +66,7 @@
|
|
|
66
66
|
"@mintlify/models": "0.0.272",
|
|
67
67
|
"@mintlify/prettier-config": "1.0.4",
|
|
68
68
|
"@mintlify/ts-config": "2.0.2",
|
|
69
|
-
"@mintlify/validation": "0.1.
|
|
69
|
+
"@mintlify/validation": "0.1.595",
|
|
70
70
|
"@trivago/prettier-plugin-sort-imports": "4.3.0",
|
|
71
71
|
"@tsconfig/recommended": "1.0.2",
|
|
72
72
|
"@types/hast": "3.0.4",
|
|
@@ -82,5 +82,5 @@
|
|
|
82
82
|
"typescript": "5.5.3",
|
|
83
83
|
"vitest": "2.0.4"
|
|
84
84
|
},
|
|
85
|
-
"gitHead": "
|
|
85
|
+
"gitHead": "c40f23d742edb39a24f4259970f9cd5f0a506f3c"
|
|
86
86
|
}
|
package/src/cli.ts
CHANGED
|
@@ -27,8 +27,17 @@ await yargs(hideBin(process.argv))
|
|
|
27
27
|
.command(
|
|
28
28
|
'section <url>',
|
|
29
29
|
'Scrapes the entire docs site based on the URL provided',
|
|
30
|
-
(yargs) =>
|
|
31
|
-
|
|
30
|
+
(yargs) =>
|
|
31
|
+
yargs
|
|
32
|
+
.positional('url', { type: 'string', demandOption: true })
|
|
33
|
+
.option('filter', {
|
|
34
|
+
describe:
|
|
35
|
+
'Only scrape URLs matching this path filter (e.g. /docs will match /docs and /docs/*)',
|
|
36
|
+
type: 'string',
|
|
37
|
+
alias: 'f',
|
|
38
|
+
})
|
|
39
|
+
.check(checkUrl),
|
|
40
|
+
async ({ url, filter }) => await site(url, filter)
|
|
32
41
|
)
|
|
33
42
|
|
|
34
43
|
.command(
|
|
@@ -117,13 +126,13 @@ async function page(url: string) {
|
|
|
117
126
|
}
|
|
118
127
|
}
|
|
119
128
|
|
|
120
|
-
async function site(url: string) {
|
|
129
|
+
async function site(url: string, filter?: string) {
|
|
121
130
|
try {
|
|
122
131
|
const urlObj = new URL(url);
|
|
123
132
|
const html = await fetchPageHtml(urlObj);
|
|
124
133
|
log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
|
|
125
134
|
|
|
126
|
-
const result = await scrapeAllSiteTabs(html, urlObj);
|
|
135
|
+
const result = await scrapeAllSiteTabs(html, urlObj, { filter });
|
|
127
136
|
if (result.success) {
|
|
128
137
|
const mintConfig = result.data as MintConfigType;
|
|
129
138
|
const docsConfig = upgradeToDocsConfig(mintConfig, {
|
package/src/constants.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { activeColors } from './utils/log.js';
|
|
2
|
-
|
|
3
1
|
export const OVERVIEW_PAGE_SLUG = '/mintie_overview';
|
|
4
2
|
|
|
5
3
|
export const SUPPORTED_MEDIA_EXTENSIONS = [
|
|
@@ -50,6 +48,4 @@ ${SPACES}We currently support: ReadMe, GitBook, and Docusaurus`;
|
|
|
50
48
|
|
|
51
49
|
export const MDAST_FAILURE_MSG = 'failed to convert MDAST to Markdown string';
|
|
52
50
|
|
|
53
|
-
export const FINAL_SUCCESS_MESSAGE = `We've successfully scraped your docs site.
|
|
54
|
-
${SPACES}We've downloaded the ${activeColors.cyan}\`navigation\`${activeColors.default} array (and if necessary, the ${activeColors.cyan}\`tabs\`${activeColors.default} array)
|
|
55
|
-
${SPACES}into ${activeColors.blue}\`docs.json\`${activeColors.default}.`;
|
|
51
|
+
export const FINAL_SUCCESS_MESSAGE = `We've successfully scraped your docs site. We've downloaded the \`navigation\` array (and if necessary, the \`tabs\` array) into \`docs.json\`.`;
|
package/src/index.ts
CHANGED
|
@@ -2,3 +2,13 @@ export { generateOpenApiPages } from './openapi/generateOpenApiPages.js';
|
|
|
2
2
|
export { generateOpenApiPagesForDocsConfig } from './openapi/generateOpenApiPagesForDocsConfig.js';
|
|
3
3
|
export * from './utils/log.js';
|
|
4
4
|
export { generateAsyncApiPagesForDocsConfig } from './asyncapi/generateAsyncApiPagesForDocsConfig.js';
|
|
5
|
+
|
|
6
|
+
export { scrapePageGroup } from './pipeline/group.js';
|
|
7
|
+
export { scrapeAllSiteTabs } from './pipeline/tabs.js';
|
|
8
|
+
export { htmlToHast } from './pipeline/root.js';
|
|
9
|
+
export { detectFramework, framework } from './utils/detectFramework.js';
|
|
10
|
+
export { fetchPageHtml } from './utils/network.js';
|
|
11
|
+
export { write } from './utils/file.js';
|
|
12
|
+
export { getErrorMessage } from './utils/errors.js';
|
|
13
|
+
export { checkUrl } from './utils/url.js';
|
|
14
|
+
export { FINAL_SUCCESS_MESSAGE } from './constants.js';
|
package/src/nav/retrieve.ts
CHANGED
|
@@ -39,6 +39,28 @@ export function retrieveNavItems(rootNode: Element): Array<NavigationEntry> {
|
|
|
39
39
|
if (node.tagName === rootSectionTagName) node.tagName = 'li';
|
|
40
40
|
if (node.tagName !== 'li') return CONTINUE;
|
|
41
41
|
|
|
42
|
+
const className = node.properties.className;
|
|
43
|
+
if (
|
|
44
|
+
framework.vendor === 'gitbook' &&
|
|
45
|
+
Array.isArray(className) &&
|
|
46
|
+
className.includes('page-group-item')
|
|
47
|
+
) {
|
|
48
|
+
const titleDiv = node.children.find(
|
|
49
|
+
(child) => child.type === 'element' && child.tagName === 'div'
|
|
50
|
+
);
|
|
51
|
+
const childList = node.children.find(
|
|
52
|
+
(child) => child.type === 'element' && child.tagName === 'ul'
|
|
53
|
+
);
|
|
54
|
+
if (titleDiv && titleDiv.type === 'element' && childList && childList.type === 'element') {
|
|
55
|
+
const title = findTitle(titleDiv, { delete: false });
|
|
56
|
+
const childEntries = retrieveNavItems(childList);
|
|
57
|
+
if (title && childEntries.length > 0) {
|
|
58
|
+
result.push({ group: title, pages: childEntries });
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
return SKIP;
|
|
62
|
+
}
|
|
63
|
+
|
|
42
64
|
let title: string | undefined = undefined;
|
|
43
65
|
if (
|
|
44
66
|
node.children[0] &&
|
package/src/nav/root.ts
CHANGED
|
@@ -38,6 +38,17 @@ export function retrieveRootNavElement(rootNode: HastRoot): Element | undefined
|
|
|
38
38
|
let element: Element | undefined = undefined;
|
|
39
39
|
visit(rootNode, 'element', function (node) {
|
|
40
40
|
const { className } = node.properties;
|
|
41
|
+
|
|
42
|
+
if (
|
|
43
|
+
framework.vendor === 'gitbook' &&
|
|
44
|
+
node.tagName === 'aside' &&
|
|
45
|
+
(node.properties.id === 'table-of-contents' ||
|
|
46
|
+
node.properties.dataTestid === 'table-of-contents')
|
|
47
|
+
) {
|
|
48
|
+
element = node;
|
|
49
|
+
return EXIT;
|
|
50
|
+
}
|
|
51
|
+
|
|
41
52
|
if (
|
|
42
53
|
node.tagName === rootTagName &&
|
|
43
54
|
Array.isArray(className) &&
|
package/src/pipeline/site.ts
CHANGED
|
@@ -12,7 +12,11 @@ import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
|
12
12
|
import { logErrorResults } from '../utils/errors.js';
|
|
13
13
|
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
|
|
14
14
|
import { INDEX_NAMES, iterateThroughReservedNames } from '../utils/reservedNames.js';
|
|
15
|
-
import {
|
|
15
|
+
import {
|
|
16
|
+
removeTrailingSlash,
|
|
17
|
+
removeLeadingSlash,
|
|
18
|
+
optionallyAddLeadingSlash,
|
|
19
|
+
} from '../utils/strings.js';
|
|
16
20
|
import { downloadColors } from './color.js';
|
|
17
21
|
import { scrapePageGroup } from './group.js';
|
|
18
22
|
import { downloadFavicon } from './icon.js';
|
|
@@ -20,10 +24,18 @@ import { downloadLogos } from './logo.js';
|
|
|
20
24
|
import { htmlToHast } from './root.js';
|
|
21
25
|
import { downloadTitle } from './title.js';
|
|
22
26
|
|
|
27
|
+
function matchesFilter(pathname: string, filter: string): boolean {
|
|
28
|
+
const normalizedPathname = removeTrailingSlash(pathname);
|
|
29
|
+
const normalizedFilter = removeTrailingSlash(optionallyAddLeadingSlash(filter));
|
|
30
|
+
return (
|
|
31
|
+
normalizedPathname === normalizedFilter || normalizedPathname.startsWith(normalizedFilter + '/')
|
|
32
|
+
);
|
|
33
|
+
}
|
|
34
|
+
|
|
23
35
|
export async function scrapeSite(
|
|
24
36
|
html: string,
|
|
25
37
|
url: string | URL,
|
|
26
|
-
opts: { hast?: HastRoot; tabs?: Array<Tab
|
|
38
|
+
opts: { hast?: HastRoot; tabs?: Array<Tab>; filter?: string } = {}
|
|
27
39
|
): Promise<Result<MintConfig>> {
|
|
28
40
|
let hast = opts.hast;
|
|
29
41
|
if (!hast) hast = htmlToHast(html);
|
|
@@ -33,7 +45,7 @@ export async function scrapeSite(
|
|
|
33
45
|
|
|
34
46
|
if (!framework.vendor) detectFramework(hast);
|
|
35
47
|
|
|
36
|
-
if (framework.vendor === 'docusaurus') {
|
|
48
|
+
if (framework.vendor === 'docusaurus' || framework.vendor === 'gitbook') {
|
|
37
49
|
const browser = await startPuppeteer();
|
|
38
50
|
html = await fetchPageHtml(url, browser);
|
|
39
51
|
hast = htmlToHast(html);
|
|
@@ -57,12 +69,16 @@ export async function scrapeSite(
|
|
|
57
69
|
const needsBrowser = framework.vendor === 'gitbook';
|
|
58
70
|
|
|
59
71
|
const externalLinks = listOfLinks.filter((url) => url.origin !== origin);
|
|
60
|
-
const internalLinks = listOfLinks.filter(
|
|
61
|
-
(url
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
)
|
|
72
|
+
const internalLinks = listOfLinks.filter((url) => {
|
|
73
|
+
if (url.origin !== origin || removeTrailingSlash(url.toString()) === origin) return false;
|
|
74
|
+
if (opts.filter && !matchesFilter(url.pathname, opts.filter)) return false;
|
|
75
|
+
return true;
|
|
76
|
+
});
|
|
77
|
+
const rootLinks = listOfLinks.filter((url) => {
|
|
78
|
+
if (url.origin !== origin || removeTrailingSlash(url.toString()) !== origin) return false;
|
|
79
|
+
if (opts.filter && !matchesFilter('/', opts.filter)) return false;
|
|
80
|
+
return true;
|
|
81
|
+
});
|
|
66
82
|
|
|
67
83
|
const allPathnames = [
|
|
68
84
|
...internalLinks.map((url) => url.toString()),
|
|
@@ -157,14 +173,24 @@ export async function scrapeSite(
|
|
|
157
173
|
})
|
|
158
174
|
.filter(Boolean);
|
|
159
175
|
|
|
176
|
+
function filterErroredOrFilteredPaths(value: string) {
|
|
177
|
+
if (allErroredPaths.includes(value)) return true;
|
|
178
|
+
if (opts.filter && !matchesFilter('/' + value, opts.filter)) return true;
|
|
179
|
+
return false;
|
|
180
|
+
}
|
|
181
|
+
|
|
160
182
|
traverse(navItems).forEach(function (value) {
|
|
161
|
-
if (
|
|
183
|
+
if (
|
|
184
|
+
typeof value === 'string' &&
|
|
185
|
+
this.key !== 'group' &&
|
|
186
|
+
filterErroredOrFilteredPaths(value)
|
|
187
|
+
) {
|
|
162
188
|
this.remove();
|
|
163
189
|
} else if (Array.isArray(value)) {
|
|
164
190
|
this.update(
|
|
165
191
|
value
|
|
166
192
|
.filter((item) =>
|
|
167
|
-
typeof item === 'string' &&
|
|
193
|
+
typeof item === 'string' && filterErroredOrFilteredPaths(item) ? undefined : item
|
|
168
194
|
)
|
|
169
195
|
.filter(Boolean)
|
|
170
196
|
);
|
|
@@ -199,15 +225,12 @@ export async function scrapeSite(
|
|
|
199
225
|
typeof val === 'string' && (val.startsWith('https://') || val.startsWith('http://'))
|
|
200
226
|
)
|
|
201
227
|
) {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
(val)
|
|
205
|
-
!(
|
|
206
|
-
typeof val === 'string' &&
|
|
207
|
-
(val.startsWith('https://') || val.startsWith('http://'))
|
|
208
|
-
)
|
|
209
|
-
)
|
|
228
|
+
const newPages = value.filter(
|
|
229
|
+
(val) =>
|
|
230
|
+
!(typeof val === 'string' && (val.startsWith('https://') || val.startsWith('http://')))
|
|
210
231
|
);
|
|
232
|
+
if (newPages.length) this.update(newPages);
|
|
233
|
+
else this.parent?.remove();
|
|
211
234
|
}
|
|
212
235
|
});
|
|
213
236
|
|
package/src/pipeline/tabs.ts
CHANGED
|
@@ -16,7 +16,8 @@ import { downloadTitle } from './title.js';
|
|
|
16
16
|
|
|
17
17
|
export async function scrapeAllSiteTabs(
|
|
18
18
|
html: string,
|
|
19
|
-
url: string | URL
|
|
19
|
+
url: string | URL,
|
|
20
|
+
opts: { filter?: string } = {}
|
|
20
21
|
): Promise<Result<MintConfig>> {
|
|
21
22
|
const hast = htmlToHast(html);
|
|
22
23
|
url = new URL(url);
|
|
@@ -34,7 +35,7 @@ export async function scrapeAllSiteTabs(
|
|
|
34
35
|
!links.length ||
|
|
35
36
|
(links.length === 1 && links[0] && links[0].url === url.pathname)
|
|
36
37
|
)
|
|
37
|
-
return scrapeSite(html, url, { hast });
|
|
38
|
+
return scrapeSite(html, url, { hast, filter: opts.filter });
|
|
38
39
|
|
|
39
40
|
if (!links.find((link) => url.pathname.startsWith(link.url))) {
|
|
40
41
|
links.push({
|
|
@@ -49,7 +50,7 @@ export async function scrapeAllSiteTabs(
|
|
|
49
50
|
newUrl.pathname = tabEntry.url;
|
|
50
51
|
try {
|
|
51
52
|
const newHtml = await fetchPageHtml(newUrl, undefined);
|
|
52
|
-
return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry] });
|
|
53
|
+
return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry], filter: opts.filter });
|
|
53
54
|
} catch (error) {
|
|
54
55
|
return { success: false as const, message: getErrorMessage(error) };
|
|
55
56
|
}
|
|
@@ -95,5 +96,5 @@ export async function scrapeAllSiteTabs(
|
|
|
95
96
|
};
|
|
96
97
|
}
|
|
97
98
|
|
|
98
|
-
return scrapeSite(html, url, { hast });
|
|
99
|
+
return scrapeSite(html, url, { hast, filter: opts.filter });
|
|
99
100
|
}
|
package/src/utils/network.ts
CHANGED
|
@@ -90,6 +90,53 @@ export async function getHtmlWithPuppeteer(
|
|
|
90
90
|
});
|
|
91
91
|
}
|
|
92
92
|
|
|
93
|
+
if (framework.vendor === 'gitbook') {
|
|
94
|
+
for (let round = 0; round < 10; round++) {
|
|
95
|
+
const clickedCount = await page.evaluate(() => {
|
|
96
|
+
const tocEl = document.getElementById('table-of-contents');
|
|
97
|
+
if (!tocEl) return 0;
|
|
98
|
+
let count = 0;
|
|
99
|
+
const items = tocEl.querySelectorAll('li.page-document-item');
|
|
100
|
+
items.forEach((li) => {
|
|
101
|
+
const btn = li.querySelector(':scope > a button');
|
|
102
|
+
if (!btn || !(btn instanceof HTMLElement)) return;
|
|
103
|
+
const anchor = btn.closest('a');
|
|
104
|
+
if (!anchor) return;
|
|
105
|
+
const sibling = anchor.nextElementSibling;
|
|
106
|
+
if (
|
|
107
|
+
sibling instanceof HTMLElement &&
|
|
108
|
+
sibling.style.opacity === '1' &&
|
|
109
|
+
sibling.style.height === 'auto'
|
|
110
|
+
)
|
|
111
|
+
return;
|
|
112
|
+
btn.click();
|
|
113
|
+
count++;
|
|
114
|
+
});
|
|
115
|
+
return count;
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
if (clickedCount === 0) break;
|
|
119
|
+
|
|
120
|
+
await page
|
|
121
|
+
.waitForFunction(
|
|
122
|
+
() => {
|
|
123
|
+
const tocEl = document.getElementById('table-of-contents');
|
|
124
|
+
if (!tocEl) return true;
|
|
125
|
+
const anchors = tocEl.querySelectorAll('li.page-document-item > a');
|
|
126
|
+
return Array.from(anchors).every((a) => {
|
|
127
|
+
const btn = a.querySelector('button');
|
|
128
|
+
if (!btn) return true;
|
|
129
|
+
const sibling = a.nextElementSibling;
|
|
130
|
+
if (!sibling || !(sibling instanceof HTMLElement)) return true;
|
|
131
|
+
return sibling.style.opacity === '1' && sibling.style.height === 'auto';
|
|
132
|
+
});
|
|
133
|
+
},
|
|
134
|
+
{ timeout: 5000 }
|
|
135
|
+
)
|
|
136
|
+
.catch(() => {});
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
93
140
|
const content = await exponentialBackoff(() => page.content());
|
|
94
141
|
await page.close();
|
|
95
142
|
return content;
|
package/src/utils/strings.ts
CHANGED
|
@@ -5,3 +5,7 @@ export function removeTrailingSlash(str: string): string {
|
|
|
5
5
|
export function removeLeadingSlash(str: string): string {
|
|
6
6
|
return str.startsWith('/') ? str.substring(1) : str;
|
|
7
7
|
}
|
|
8
|
+
|
|
9
|
+
export function optionallyAddLeadingSlash(str: string): string {
|
|
10
|
+
return str.startsWith('/') ? str : '/' + str;
|
|
11
|
+
}
|
package/src/utils/text.ts
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import type { Element } from 'hast';
|
|
2
|
-
import { visit } from 'unist-util-visit';
|
|
2
|
+
import { CONTINUE, SKIP, visit } from 'unist-util-visit';
|
|
3
3
|
|
|
4
4
|
export function getText(element: Element | undefined): string {
|
|
5
5
|
if (!element) return '';
|
|
6
6
|
let text = '';
|
|
7
|
-
visit(element,
|
|
8
|
-
|
|
7
|
+
visit(element, function (node) {
|
|
8
|
+
if (node.type === 'element' && node.tagName === 'svg') return SKIP;
|
|
9
|
+
if (node.type === 'text') text += node.value;
|
|
10
|
+
return CONTINUE;
|
|
9
11
|
});
|
|
10
12
|
return text;
|
|
11
13
|
}
|