@opentermsarchive/engine 10.3.0 → 10.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -35,15 +35,16 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
35
35
|
const filteredDOM = await filter(webPageDOM, sourceDocument);
|
|
36
36
|
const cleanedDOM = filteredDOM.remove(insignificantContentSelectors);
|
|
37
37
|
const selectedDOM = cleanedDOM.select(contentSelectors);
|
|
38
|
+
const contentSelectorsDisplay = typeof contentSelectors === 'object' ? JSON.stringify(contentSelectors) : contentSelectors;
|
|
38
39
|
|
|
39
40
|
if (!selectedDOM?.children.length) {
|
|
40
|
-
throw new Error(`The provided selector "${
|
|
41
|
+
throw new Error(`The provided selector "${contentSelectorsDisplay}" has no match in the web page at '${location}'. This could be due to elements being removed before content selection if "remove" and "select" selectors match the same content.`);
|
|
41
42
|
}
|
|
42
43
|
|
|
43
44
|
const markdownContent = transformFromHTML(selectedDOM);
|
|
44
45
|
|
|
45
46
|
if (!markdownContent) {
|
|
46
|
-
throw new Error(`The provided selector "${
|
|
47
|
+
throw new Error(`The provided selector "${contentSelectorsDisplay}" matches an empty content in the web page at '${location}'`);
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
return markdownContent;
|
|
@@ -3,13 +3,14 @@ import stealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
|
3
3
|
|
|
4
4
|
import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';
|
|
5
5
|
|
|
6
|
-
puppeteer.use(stealthPlugin());
|
|
7
|
-
|
|
8
6
|
let browser;
|
|
9
7
|
|
|
10
8
|
export default async function fetch(url, cssSelectors, config) {
|
|
9
|
+
puppeteer.use(stealthPlugin({ locale: config.language }));
|
|
10
|
+
|
|
11
11
|
let context;
|
|
12
12
|
let page;
|
|
13
|
+
let client;
|
|
13
14
|
let response;
|
|
14
15
|
const selectors = [].concat(cssSelectors);
|
|
15
16
|
|
|
@@ -25,6 +26,14 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
25
26
|
await page.setDefaultNavigationTimeout(config.navigationTimeout);
|
|
26
27
|
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
|
|
27
28
|
|
|
29
|
+
// Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment)
|
|
30
|
+
client = await page.createCDPSession();
|
|
31
|
+
|
|
32
|
+
await client.send('Network.setUserAgentOverride', {
|
|
33
|
+
userAgent: await browser.userAgent(),
|
|
34
|
+
acceptLanguage: config.language,
|
|
35
|
+
});
|
|
36
|
+
|
|
28
37
|
if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
|
|
29
38
|
await page.authenticate(browser.proxyCredentials);
|
|
30
39
|
}
|
|
@@ -73,6 +82,9 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
73
82
|
}
|
|
74
83
|
throw new Error(error.message);
|
|
75
84
|
} finally {
|
|
85
|
+
if (client) {
|
|
86
|
+
await client.detach();
|
|
87
|
+
}
|
|
76
88
|
if (page) {
|
|
77
89
|
await page.close();
|
|
78
90
|
}
|