@mintlify/scraping 3.0.189 → 4.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +69 -24
- package/bin/cli.js.map +1 -1
- package/bin/scrapingPipeline/logo.js +10 -2
- package/bin/scrapingPipeline/logo.js.map +1 -1
- package/bin/scrapingPipeline/page.js +2 -0
- package/bin/scrapingPipeline/page.js.map +1 -1
- package/bin/scrapingPipeline/root.js +7 -2
- package/bin/scrapingPipeline/root.js.map +1 -1
- package/bin/scrapingPipeline/tabs.js +9 -3
- package/bin/scrapingPipeline/tabs.js.map +1 -1
- package/bin/tsconfig.build.tsbuildinfo +1 -1
- package/bin/utils/hastComments.d.ts +3 -0
- package/bin/utils/hastComments.js +15 -0
- package/bin/utils/hastComments.js.map +1 -0
- package/bin/utils/network.js +26 -18
- package/bin/utils/network.js.map +1 -1
- package/package.json +5 -5
- package/src/cli.ts +76 -24
- package/src/scrapingPipeline/logo.ts +13 -6
- package/src/scrapingPipeline/page.ts +2 -0
- package/src/scrapingPipeline/root.ts +9 -2
- package/src/scrapingPipeline/tabs.ts +8 -3
- package/src/utils/hastComments.ts +17 -0
- package/src/utils/network.ts +23 -18
package/bin/cli.js
CHANGED
|
@@ -2,10 +2,12 @@
|
|
|
2
2
|
import yargs from 'yargs';
|
|
3
3
|
import { hideBin } from 'yargs/helpers';
|
|
4
4
|
import { FINAL_SUCCESS_MESSAGE } from './constants.js';
|
|
5
|
+
import { generateOpenApiPages } from './openapi/generateOpenApiPages.js';
|
|
5
6
|
import { scrapePageGroup } from './scrapingPipeline/group.js';
|
|
6
7
|
import { htmlToHast } from './scrapingPipeline/root.js';
|
|
7
8
|
import { scrapeAllSiteTabs } from './scrapingPipeline/tabs.js';
|
|
8
9
|
import { detectFramework, framework } from './utils/detectFramework.js';
|
|
10
|
+
import { getErrorMessage } from './utils/errors.js';
|
|
9
11
|
import { write } from './utils/file.js';
|
|
10
12
|
import { log } from './utils/log.js';
|
|
11
13
|
import { fetchPageHtml } from './utils/network.js';
|
|
@@ -13,41 +15,84 @@ import { checkUrl } from './utils/url.js';
|
|
|
13
15
|
await yargs(hideBin(process.argv))
|
|
14
16
|
.command('page <url>', 'Scrapes the docs page for the URL provided', (yargs) => yargs.positional('url', { type: 'string', demandOption: true }).check(checkUrl), async ({ url }) => await page(url))
|
|
15
17
|
.command('section <url>', 'Scrapes the entire docs site based on the URL provided', (yargs) => yargs.positional('url', { type: 'string', demandOption: true }).check(checkUrl), async ({ url }) => await site(url))
|
|
18
|
+
.command('openapi-file <openapiFilename>', 'Creates MDX files from an OpenAPI spec', (yargs) => yargs
|
|
19
|
+
.positional('openapiFilename', {
|
|
20
|
+
describe: 'The filename of the OpenAPI spec',
|
|
21
|
+
type: 'string',
|
|
22
|
+
demandOption: true,
|
|
23
|
+
})
|
|
24
|
+
.option('writeFiles', {
|
|
25
|
+
describe: 'Whether or not to write the frontmatter files',
|
|
26
|
+
default: true,
|
|
27
|
+
type: 'boolean',
|
|
28
|
+
alias: 'w',
|
|
29
|
+
})
|
|
30
|
+
.option('outDir', {
|
|
31
|
+
describe: 'The folder in which to write any created frontmatter files',
|
|
32
|
+
type: 'string',
|
|
33
|
+
alias: 'o',
|
|
34
|
+
}), async (argv) => {
|
|
35
|
+
try {
|
|
36
|
+
const { nav } = await generateOpenApiPages(argv.openapiFilename, argv.writeFiles, argv.outDir);
|
|
37
|
+
console.log('navigation object suggestion:');
|
|
38
|
+
console.log(JSON.stringify(nav, undefined, 2));
|
|
39
|
+
}
|
|
40
|
+
catch (error) {
|
|
41
|
+
if (error instanceof Error) {
|
|
42
|
+
console.error(error.message);
|
|
43
|
+
}
|
|
44
|
+
else {
|
|
45
|
+
console.error(error);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
})
|
|
16
49
|
.strictCommands()
|
|
17
50
|
.demandCommand(1, 'Unknown command. See above for the list of supported commands.')
|
|
18
51
|
.alias('h', 'help')
|
|
19
52
|
.alias('v', 'version')
|
|
20
53
|
.parse();
|
|
21
54
|
async function page(url) {
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
55
|
+
try {
|
|
56
|
+
const urlObj = new URL(url);
|
|
57
|
+
const html = await fetchPageHtml(urlObj);
|
|
58
|
+
log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
|
|
59
|
+
const hast = htmlToHast(html);
|
|
60
|
+
detectFramework(hast);
|
|
61
|
+
const needsBrowser = framework.vendor === 'gitbook';
|
|
62
|
+
const results = await scrapePageGroup([urlObj], needsBrowser);
|
|
63
|
+
const result = results[0] || {
|
|
64
|
+
success: false,
|
|
65
|
+
message: `An unknown error occurred when scraping ${url}`,
|
|
66
|
+
};
|
|
67
|
+
if (result.success) {
|
|
68
|
+
log(`Successfully scraped ${url} ${result.data ? `into ${result.data[1]}` : ''}`);
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
log(result.message);
|
|
72
|
+
}
|
|
35
73
|
}
|
|
36
|
-
|
|
37
|
-
|
|
74
|
+
catch (error) {
|
|
75
|
+
const errorMessage = getErrorMessage(error);
|
|
76
|
+
log(errorMessage);
|
|
38
77
|
}
|
|
39
78
|
}
|
|
40
79
|
async function site(url) {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
80
|
+
try {
|
|
81
|
+
const urlObj = new URL(url);
|
|
82
|
+
const html = await fetchPageHtml(urlObj);
|
|
83
|
+
log('Successfully retrieved initial HTML from src: ' + urlObj.toString());
|
|
84
|
+
const result = await scrapeAllSiteTabs(html, urlObj);
|
|
85
|
+
if (result.success) {
|
|
86
|
+
write('mint.json', JSON.stringify(result.data, undefined, 2));
|
|
87
|
+
log(FINAL_SUCCESS_MESSAGE);
|
|
88
|
+
}
|
|
89
|
+
else {
|
|
90
|
+
log(result.message);
|
|
91
|
+
}
|
|
48
92
|
}
|
|
49
|
-
|
|
50
|
-
|
|
93
|
+
catch (error) {
|
|
94
|
+
const errorMessage = getErrorMessage(error);
|
|
95
|
+
log(errorMessage);
|
|
51
96
|
}
|
|
52
97
|
}
|
|
53
98
|
//# sourceMappingURL=cli.js.map
|
package/bin/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAExC,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AACxD,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,4BAA4B,CAAC;AACxE,OAAO,EAAE,KAAK,EAAE,MAAM,iBAAiB,CAAC;AACxC,OAAO,EAAE,GAAG,EAAE,MAAM,gBAAgB,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AAE1C,MAAM,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;KAC/B,OAAO,CACN,YAAY,EACZ,4CAA4C,EAC5C,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC1F,KAAK,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,GAAG,CAAC,CACnC;KAEA,OAAO,CACN,eAAe,EACf,wDAAwD,EACxD,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC1F,KAAK,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,GAAG,CAAC,CACnC;KAEA,cAAc,EAAE;KAChB,aAAa,CAAC,CAAC,EAAE,gEAAgE,CAAC;KAClF,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC;KAClB,KAAK,CAAC,GAAG,EAAE,SAAS,CAAC;KACrB,KAAK,EAAE,CAAC;AAEX,KAAK,UAAU,IAAI,CAAC,GAAW;IAC7B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAExC,OAAO,EAAE,qBAAqB,EAAE,MAAM,gBAAgB,CAAC;AACvD,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AACxD,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,4BAA4B,CAAC;AACxE,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,KAAK,EAAE,MAAM,iBAAiB,CAAC;AACxC,OAAO,EAAE,GAAG,EAAE,MAAM,gBAAgB,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AAE1C,MAAM,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;KAC/B,OAAO,CACN,YAAY,EACZ,4CAA4C,EAC5C,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC1F,KAAK,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,GAAG,CAAC,CACnC;KAEA,OAAO,CACN,eAAe,EACf,wDAAwD,EACxD,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC1F,KAAK,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,MAAM,IAAI,CAAC,GAAG,CAAC,CACnC;KAEA,OAAO,CACN,gCAAgC,EAChC,wCAAwC,EACxC,CAAC,KAAK,EAAE,EAAE,CACR,KAAK;KACF,UAAU,CAAC,iBAAiB,EAAE;IAC7B,QAAQ,EAAE,kCAAkC;IAC5C,IAAI,EAAE,QAAQ;IACd,YAAY,EAAE,IAAI;CACnB,CAAC;KACD,MAAM,CAAC,YAAY,EAAE;IACpB,QAAQ,EAAE,+CAA+C;IACzD,OAAO,EAAE,IAAI;IACb,IAAI,EAAE,SAAS;IACf,KAAK,EAAE,GAAG;CACX,CAAC;KACD,MAAM,CAAC,QAAQ,EAAE;IAChB,QAAQ,EAAE,4DAA4D;IACtE,IAAI,EAAE,QAAQ;IACd,KAAK,EAAE,GAAG;CACX,CAAC,EACN,KAAK,EAAE,IAAI,EAAE,EAAE;IACb,IAAI,CAAC;QACH,MAAM,EAAE,GAAG,EAAE,GAAG,MAAM,oBAAoB,CACxC,IAAI,CAAC,eAAe,EACpB,IAAI,CAAC,UAAU,EACf,IAAI,CAAC,MAAM,CACZ,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;IACjD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC/B,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;AACH,CAAC,CACF;KAEA,cAAc,EAAE;KAChB,aAAa,CAAC,CAAC,EAAE,gEAAgE,CAAC;KAClF,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC;KAClB,KAAK,CAAC,GAAG,EAAE,SAAS,CAAC;KACrB,KAAK,EAAE,CAAC;AAEX,KAAK,UAAU,IAAI,CAAC,GAAW;IAC7B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,CAAC;QACzC,GAAG,CAAC,gDAAgD,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAE1E,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;QAC9B,eAAe,CAAC,IAAI,CAAC,CAAC;QAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC;QACpD,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,CAAC,MAAM,CAAC,EAAE,YAAY,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI;YAC3B,OAAO,EAAE,KAAK;YACd,OAAO,EAAE,2CAA2C,GAAG,EAAE;SAC1D,CAAC;QAEF,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnB,GAAG,CAAC,wBAAwB,GAAG,IAAI,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACpF,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,GAAG,CAAC,YAAY,CAAC,CAAC;IACpB,CAAC;AACH,CAAC;AAED,KAAK,UAAU,IAAI,CAAC,GAAW;IAC7B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,CAAC;QACzC,GAAG,CAAC,gDAAgD,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAE1E,MAAM,MAAM,GAAG,MAAM,iBAAiB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QACrD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnB,KAAK,CAAC,WAAW,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;YAC9D,GAAG,CAAC,qBAAqB,CAAC,CAAC;QAC7B,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,GAAG,CAAC,YAAY,CAAC,CAAC;IACpB,CAAC;AACH,CAAC"}
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { join } from 'node:path';
|
|
2
2
|
import { EXIT, visit } from 'unist-util-visit';
|
|
3
3
|
import { framework } from '../utils/detectFramework.js';
|
|
4
|
+
import { getErrorMessage } from '../utils/errors.js';
|
|
4
5
|
import { downloadImage } from '../utils/images.js';
|
|
6
|
+
import { log } from '../utils/log.js';
|
|
5
7
|
import { fetchPageHtml } from '../utils/network.js';
|
|
6
8
|
import { htmlToHast } from './root.js';
|
|
7
9
|
function findReadmeLogoNodes(root) {
|
|
@@ -74,8 +76,14 @@ export async function downloadLogos(url, browser) {
|
|
|
74
76
|
}));
|
|
75
77
|
}
|
|
76
78
|
else {
|
|
77
|
-
|
|
78
|
-
|
|
79
|
+
try {
|
|
80
|
+
const html = await fetchPageHtml(url);
|
|
81
|
+
await findLogosFromHtml(html, framework.vendor === 'gitbook' ? findGitBookLogoNodes : findDocusaurusLogoNodes, filepaths);
|
|
82
|
+
}
|
|
83
|
+
catch (error) {
|
|
84
|
+
const errorMessage = getErrorMessage(error);
|
|
85
|
+
log(`Failed to retrieve logo from HTML: ${errorMessage}`);
|
|
86
|
+
}
|
|
79
87
|
}
|
|
80
88
|
if (browser)
|
|
81
89
|
await browser.close();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"logo.js","sourceRoot":"","sources":["../../src/scrapingPipeline/logo.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAE/C,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,SAAS,mBAAmB,CAAC,IAAc;IACzC,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC;YAEjD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAc;IAC1C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,KAAK,MAAM,EAAE,CAAC;YAC7D,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,uBAAuB,CAAC,IAAc;IAC7C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,eAAe,CAAC,EACnD,CAAC;YACD,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO;gBACtC,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK;oBAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACxD,CAAC,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,IAAY,EACZ,UAA0D,EAC1D,SAAwB;IAExB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,QAAQ,EAAE,CAAC;QACb,SAAS,CAAC,IAAI,CACZ,GAAG,CAAC,MAAM,OAAO,CAAC,GAAG,CACnB,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC1B,MAAM,GAAG,GAAG,MAAM,aAAa,CAC7B,IAAI,CAAC,UAAU,CAAC,GAAa,EAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,QAAQ,CAAC,CAC9B,CAAC;YAEF,IAAI,GAAG,CAAC,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;gBAC5B,OAAO,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CACH,CAAC,CACH,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE;QACpC,IAAI,CAAC,QAAQ;YAAE,SAAS,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,OAA4B;IAE5B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACnB,MAAM,SAAS,GAAkB,EAAE,CAAC;IACpC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAkB,EAAE,CAAC;QAEhC,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YAC9B,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QACjC,MAAM,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QAEjC,MAAM,OAAO,CAAC,GAAG,CACf,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YACvB,OAAO,MAAM,iBAAiB,CAAC,IAAI,EAAE,mBAAmB,EAAE,SAAS,CAAC,CAAC;QACvE,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"logo.js","sourceRoot":"","sources":["../../src/scrapingPipeline/logo.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAE/C,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,SAAS,mBAAmB,CAAC,IAAc;IACzC,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC;YAEjD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAc;IAC1C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,KAAK,MAAM,EAAE,CAAC;YAC7D,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,SAAS,uBAAuB,CAAC,IAAc;IAC7C,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,IAAI;QACnC,IACE,IAAI,CAAC,OAAO,KAAK,KAAK;YACtB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YACxC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,CAAC,eAAe,CAAC,EACnD,CAAC;YACD,KAAK,CAAC,IAAI,EAAE,SAAS,EAAE,UAAU,OAAO;gBACtC,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK;oBAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACxD,CAAC,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,IAAY,EACZ,UAA0D,EAC1D,SAAwB;IAExB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,QAAQ,EAAE,CAAC;QACb,SAAS,CAAC,IAAI,CACZ,GAAG,CAAC,MAAM,OAAO,CAAC,GAAG,CACnB,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC1B,MAAM,GAAG,GAAG,MAAM,aAAa,CAC7B,IAAI,CAAC,UAAU,CAAC,GAAa,EAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,QAAQ,CAAC,CAC9B,CAAC;YAEF,IAAI,GAAG,CAAC,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;gBAC5B,OAAO,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACrB,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CACH,CAAC,CACH,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE;QACpC,IAAI,CAAC,QAAQ;YAAE,SAAS,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAiB,EACjB,OAA4B;IAE5B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IACnB,MAAM,SAAS,GAAkB,EAAE,CAAC;IACpC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAkB,EAAE,CAAC;QAEhC,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE;YAC9B,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QACjC,MAAM,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QAEjC,MAAM,OAAO,CAAC,GAAG,CACf,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YACvB,OAAO,MAAM,iBAAiB,CAAC,IAAI,EAAE,mBAAmB,EAAE,SAAS,CAAC,CAAC;QACvE,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,CAAC,CAAC;YACtC,MAAM,iBAAiB,CACrB,IAAI,EACJ,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,uBAAuB,EAC/E,SAAS,CACV,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;YAC5C,GAAG,CAAC,sCAAsC,YAAY,EAAE,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,IAAI,OAAO;QAAE,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IAEnC,MAAM,eAAe,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAEzD,OAAO,eAAe,CAAC,MAAM,KAAK,CAAC;QACjC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC;YAC1B,CAAC,CAAC;gBACE,KAAK,EAAE,eAAe,CAAC,CAAC,CAAW;gBACnC,IAAI,EAAE,eAAe,CAAC,CAAC,CAAW;aACnC;YACH,CAAC,CAAC,SAAS,CAAC;AAClB,CAAC"}
|
|
@@ -14,6 +14,7 @@ import { unifiedRemoveEmptyParagraphs } from '../utils/emptyParagraphs.js';
|
|
|
14
14
|
import { getErrorMessage, logErrorResults } from '../utils/errors.js';
|
|
15
15
|
import { escapeCharactersOutsideCodeBlocks } from '../utils/escape.js';
|
|
16
16
|
import { write, writePage } from '../utils/file.js';
|
|
17
|
+
import { removeHastComments } from '../utils/hastComments.js';
|
|
17
18
|
import { log } from '../utils/log.js';
|
|
18
19
|
import { unifiedRemoveNestedRoots } from '../utils/nestedRoots.js';
|
|
19
20
|
import { unifiedRemovePositions } from '../utils/position.js';
|
|
@@ -30,6 +31,7 @@ export async function scrapePage(html, url, opts = { externalLink: false }) {
|
|
|
30
31
|
return { success: true, data: [url.toString(), filename] };
|
|
31
32
|
}
|
|
32
33
|
const hast = htmlToHast(html);
|
|
34
|
+
removeHastComments(hast);
|
|
33
35
|
if (!framework.vendor)
|
|
34
36
|
detectFramework(hast);
|
|
35
37
|
const urlStr = url.toString();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"page.js","sourceRoot":"","sources":["../../src/scrapingPipeline/page.ts"],"names":[],"mappings":"AAEA,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,eAAe,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,mBAAmB,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EACL,aAAa,EACb,UAAU,EACV,eAAe,EACf,oBAAoB,EACpB,WAAW,EACX,eAAe,EACf,UAAU,EACV,eAAe,GAChB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,8BAA8B,EAAE,MAAM,+BAA+B,CAAC;AAC/E,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AACzE,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAE1D,OAAO,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,4BAA4B,EAAE,MAAM,6BAA6B,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACtE,OAAO,EAAE,iCAAiC,EAAE,MAAM,oBAAoB,CAAC;AACvE,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAC9E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAChF,OAAO,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,IAAY,EACZ,GAAiB,EACjB,OAII,EAAE,YAAY,EAAE,KAAK,EAAE;IAE3B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;QACtB,MAAM,QAAQ,GAAG,IAAI,CAAC;QACtB,MAAM,eAAe,GAAG,GAAG,QAAQ,MAAM,CAAC;QAC1C,SAAS,CAAC,eAAe,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;QACvD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE,QAAQ,CAAC,EAAE,CAAC;IAC7D,CAAC;IAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"page.js","sourceRoot":"","sources":["../../src/scrapingPipeline/page.ts"],"names":[],"mappings":"AAEA,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,eAAe,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACjE,OAAO,EAAE,mBAAmB,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACzE,OAAO,EACL,aAAa,EACb,UAAU,EACV,eAAe,EACf,oBAAoB,EACpB,WAAW,EACX,eAAe,EACf,UAAU,EACV,eAAe,GAChB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,8BAA8B,EAAE,MAAM,+BAA+B,CAAC;AAC/E,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AACzE,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAE1D,OAAO,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,4BAA4B,EAAE,MAAM,6BAA6B,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACtE,OAAO,EAAE,iCAAiC,EAAE,MAAM,oBAAoB,CAAC;AACvE,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAC9D,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AACnE,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAC9E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAChF,OAAO,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,IAAY,EACZ,GAAiB,EACjB,OAII,EAAE,YAAY,EAAE,KAAK,EAAE;IAE3B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;QACtB,MAAM,QAAQ,GAAG,IAAI,CAAC;QACtB,MAAM,eAAe,GAAG,GAAG,QAAQ,MAAM,CAAC;QAC1C,SAAS,CAAC,eAAe,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;QACvD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE,QAAQ,CAAC,EAAE,CAAC;IAC7D,CAAC;IAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAEzB,IAAI,CAAC,SAAS,CAAC,MAAM;QAAE,eAAe,CAAC,IAAI,CAAC,CAAC;IAE7C,MAAM,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;IAC9B,MAAM,OAAO,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,KAAK,mBAAmB,EAAE,EAAE,CAAC;IAEtF,MAAM,aAAa,GAAa;QAC9B,IAAI,EAAE,MAAM;QACZ,QAAQ,EAAE,CAAC,OAAO,CAAC;KACpB,CAAC;IAEF,MAAM,SAAS,GAAc,OAAO,EAAE;SACnC,GAAG,CAAC,UAAU,CAAC;SACf,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,WAAW,CAAC;SAChB,GAAG,CAAC,UAAU,CAAC;SACf,GAAG,CAAC,aAAa,CAAC;SAClB,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,oBAAoB,CAAC;SACzB,GAAG,CAAC,eAAe,CAAC;SACpB,GAAG,CAAC,uBAAuB,CAAC;SAC5B,GAAG,CAAC,sBAAsB,CAAC;SAC3B,GAAG,CAAC,4BAA4B,CAAC;SACjC,GAAG,CAAC,iCAAiC,CAAC;SACtC,GAAG,CAAC,qBAAqB,CAAC;QAE3B,iDAAiD;QACjD,mDAAmD;QACnD,4BAA4B;SAC3B,GAAG,CAAC,8BAA8B,CAAC;SACnC,GAAG,CAAC,wBAAwB,CAAC;SAC7B,GAAG,CAAC,wBAAwB,CAAC;SAC7B,OAAO,CAAC,aAAa,CAAc,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAClE,eAAe,CAAC,wBAAwB,GAAG,CAAC,QAAQ,EAAE,EAAE,EAAE,YAAY,CAAC,CAAC;IAC1E,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,GAAG,CAAC,yDAAyD,GAAG,CAAC,QAAQ,EAAE,GAAG,YAAY,EAAE,CAAC,CAAC;QAC9F,MAAM,KAAK,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,mBAAmB,CAAC,SAAS,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAEtD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,OAAO,EAAE;aACrB,GAAG,CAAC,SAAS,CAAC;aACd,GAAG,CAAC,SAAS,CAAC;YACf,kGAAkG;aACjG,GAAG,CAAC,eAAe,CAAC;aACpB,SAAS,CAAC,SAAS,CAAC,CAAC;QAExB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;QAC3C,CAAC;aAAM,IAAI,GAAG,CAAC,MAAM,KAAK,mBAAmB,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,CAAC;YAC9D,GAAG,GAAG,IAAI,GAAG,CAAC,MAAM,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;QAC7C,CAAC;QAED,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,KAAK,EAAE,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACtF,OAAO;YACL,OAAO,EAAE,IAAI;YACb,IAAI,EAAE,IAAI,CAAC,QAAQ;gBACjB,CAAC,CAAC,CAAC,kBAAkB,CAAC,mBAAmB,CAAC,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,QAAQ,CAAC;gBACpF,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,KAAK,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC5C,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,KAAK,iBAAiB,GAAG,YAAY,EAAE,EAAE,CAAC;IACvF,CAAC;AACH,CAAC"}
|
|
@@ -1,8 +1,13 @@
|
|
|
1
1
|
import rehypeParse from 'rehype-parse';
|
|
2
2
|
import { unified } from 'unified';
|
|
3
|
+
import { rehypeRemoveHastComments } from '../utils/hastComments.js';
|
|
3
4
|
import { unifiedRemovePositions } from '../utils/position.js';
|
|
4
5
|
export function htmlToHast(html) {
|
|
5
|
-
|
|
6
|
-
|
|
6
|
+
return unified()
|
|
7
|
+
// @ts-expect-error remarkStringify errors even if used for valid code from documentation examples
|
|
8
|
+
.use(rehypeParse)
|
|
9
|
+
.use(unifiedRemovePositions)
|
|
10
|
+
.use(rehypeRemoveHastComments)
|
|
11
|
+
.parse(html);
|
|
7
12
|
}
|
|
8
13
|
//# sourceMappingURL=root.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"root.js","sourceRoot":"","sources":["../../src/scrapingPipeline/root.ts"],"names":[],"mappings":"AACA,OAAO,WAAW,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAE9D,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,
|
|
1
|
+
{"version":3,"file":"root.js","sourceRoot":"","sources":["../../src/scrapingPipeline/root.ts"],"names":[],"mappings":"AACA,OAAO,WAAW,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,OAAO,EAAE,wBAAwB,EAAE,MAAM,0BAA0B,CAAC;AACpE,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAE9D,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,OACE,OAAO,EAAE;QACP,kGAAkG;SACjG,GAAG,CAAC,WAAW,CAAC;SAChB,GAAG,CAAC,sBAAsB,CAAC;SAC3B,GAAG,CAAC,wBAAwB,CAAC;SAC7B,KAAK,CAAC,IAAI,CACd,CAAC;AACJ,CAAC"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { retrieveTabLinks } from '../tabs/retrieveReadme.js';
|
|
2
2
|
import { detectFramework, framework } from '../utils/detectFramework.js';
|
|
3
|
+
import { getErrorMessage } from '../utils/errors.js';
|
|
3
4
|
import { log } from '../utils/log.js';
|
|
4
5
|
import { fetchPageHtml, startPuppeteer } from '../utils/network.js';
|
|
5
6
|
import { getTitleFromLink } from '../utils/title.js';
|
|
@@ -30,8 +31,13 @@ export async function scrapeAllSiteTabs(html, url) {
|
|
|
30
31
|
const results = await Promise.all(links.map(async (tabEntry) => {
|
|
31
32
|
const newUrl = new URL(url);
|
|
32
33
|
newUrl.pathname = tabEntry.url;
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
try {
|
|
35
|
+
const newHtml = await fetchPageHtml(newUrl, undefined);
|
|
36
|
+
return await scrapeSite(newHtml, newUrl, { tabs: [tabEntry] });
|
|
37
|
+
}
|
|
38
|
+
catch (error) {
|
|
39
|
+
return { success: false, message: getErrorMessage(error) };
|
|
40
|
+
}
|
|
35
41
|
}));
|
|
36
42
|
const navigations = [];
|
|
37
43
|
const tabs = [];
|
|
@@ -45,7 +51,7 @@ export async function scrapeAllSiteTabs(html, url) {
|
|
|
45
51
|
});
|
|
46
52
|
const failures = results.filter((result) => !result.success);
|
|
47
53
|
failures.forEach((result) => {
|
|
48
|
-
log('Failed to scrape tab
|
|
54
|
+
log('Failed to scrape tab' + result.message);
|
|
49
55
|
});
|
|
50
56
|
return {
|
|
51
57
|
success: true,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tabs.js","sourceRoot":"","sources":["../../src/scrapingPipeline/tabs.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAE7D,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,GAAiB;IAEjB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,eAAe,CAAC,IAAI,CAAC,CAAC;IAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC;IACpD,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAElE,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAE/C,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;QACvE,MAAM,KAAK,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;QACrC,IACE,CAAC,KAAK;YACN,CAAC,KAAK,CAAC,MAAM;YACb,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,QAAQ,CAAC;YAEjE,OAAO,UAAU,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QAEzC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC7D,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,gBAAgB,CAAC,GAAG,CAAC,QAAQ,CAAC;gBACpC,GAAG,EAAE,GAAG,CAAC,QAAQ;aAClB,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;YAC3B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC;YAC/B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"tabs.js","sourceRoot":"","sources":["../../src/scrapingPipeline/tabs.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAE7D,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACzE,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,GAAiB;IAEjB,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC9B,GAAG,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAEnB,eAAe,CAAC,IAAI,CAAC,CAAC;IAEtB,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,KAAK,SAAS,CAAC;IACpD,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAElE,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAE/C,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,IAAI,SAAS,CAAC,MAAM,KAAK,YAAY,EAAE,CAAC;QACvE,MAAM,KAAK,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;QACrC,IACE,CAAC,KAAK;YACN,CAAC,KAAK,CAAC,MAAM;YACb,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,QAAQ,CAAC;YAEjE,OAAO,UAAU,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QAEzC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC7D,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,gBAAgB,CAAC,GAAG,CAAC,QAAQ,CAAC;gBACpC,GAAG,EAAE,GAAG,CAAC,QAAQ;aAClB,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;YAC3B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;gBACvD,OAAO,MAAM,UAAU,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;YACjE,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,EAAE,OAAO,EAAE,KAAc,EAAE,OAAO,EAAE,eAAe,CAAC,KAAK,CAAC,EAAE,CAAC;YACtE,CAAC;QACH,CAAC,CAAC,CACH,CAAC;QAEF,MAAM,WAAW,GAA2B,EAAE,CAAC;QAC/C,MAAM,IAAI,GAAe,EAAE,CAAC;QAE5B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC7D,SAAS,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YAC3B,IAAI,CAAC,MAAM,CAAC,IAAI;gBAAE,OAAO;YACzB,WAAW,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAC5C,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,CAAC,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC7D,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YAC1B,GAAG,CAAC,sBAAsB,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,OAAO,EAAE,IAAI;YACb,IAAI,EAAE;gBACJ,OAAO,EAAE,kCAAkC;gBAC3C,IAAI,EAAE,EAAE;gBACR,IAAI;gBACJ,MAAM,EAAE;oBACN,OAAO,EAAE,EAAE;iBACZ;gBACD,OAAO,EAAE,OAAO,IAAI,EAAE;gBACtB,UAAU,EAAE,WAAyB;gBACrC,IAAI;aACL;SACF,CAAC;IACJ,CAAC;IAED,OAAO,UAAU,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;AACzC,CAAC"}
|