@opentermsarchive/engine 10.4.0 → 10.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -8,37 +8,64 @@ let browser;
|
|
|
8
8
|
export default async function fetch(url, cssSelectors, config) {
|
|
9
9
|
puppeteer.use(stealthPlugin({ locale: config.language }));
|
|
10
10
|
|
|
11
|
-
let context;
|
|
12
|
-
let page;
|
|
13
|
-
let client;
|
|
14
|
-
let response;
|
|
15
|
-
const selectors = [].concat(cssSelectors);
|
|
16
|
-
|
|
17
11
|
if (!browser) {
|
|
18
12
|
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
|
|
19
13
|
}
|
|
20
14
|
|
|
15
|
+
let context;
|
|
16
|
+
let page;
|
|
17
|
+
let client;
|
|
18
|
+
|
|
21
19
|
try {
|
|
22
20
|
context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache)
|
|
23
21
|
page = await context.newPage();
|
|
22
|
+
client = await page.createCDPSession();
|
|
24
23
|
|
|
25
|
-
await page
|
|
26
|
-
await page.setDefaultNavigationTimeout(config.navigationTimeout);
|
|
27
|
-
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
|
|
24
|
+
await configurePage(page, client, config);
|
|
28
25
|
|
|
29
|
-
|
|
30
|
-
client = await page.createCDPSession();
|
|
26
|
+
const selectors = [].concat(cssSelectors).filter(Boolean);
|
|
31
27
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
28
|
+
let pdf = {};
|
|
29
|
+
let handled = null;
|
|
30
|
+
|
|
31
|
+
if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF
|
|
32
|
+
({ pdf, handled } = setupPdfInterception(client));
|
|
33
|
+
}
|
|
36
34
|
|
|
37
|
-
|
|
38
|
-
|
|
35
|
+
let response;
|
|
36
|
+
let navigationAborted = false;
|
|
37
|
+
|
|
38
|
+
try {
|
|
39
|
+
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
|
|
40
|
+
} catch (error) {
|
|
41
|
+
if (error.message.includes('net::ERR_ABORTED')) {
|
|
42
|
+
// Chrome may sometimes abort navigation for files such as PDFs.
|
|
43
|
+
// Do not throw for now; wait for the PDF interception handler to finish processing the response.
|
|
44
|
+
navigationAborted = true;
|
|
45
|
+
} else {
|
|
46
|
+
throw error;
|
|
47
|
+
}
|
|
39
48
|
}
|
|
40
49
|
|
|
41
|
-
|
|
50
|
+
// PDF interception handling
|
|
51
|
+
if (handled) {
|
|
52
|
+
await handled; // Wait for the interception callback to finish processing the response
|
|
53
|
+
|
|
54
|
+
if (pdf.content) {
|
|
55
|
+
return {
|
|
56
|
+
mimeType: 'application/pdf',
|
|
57
|
+
content: pdf.content,
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
if (pdf.status) { // Status captured by CDP interception
|
|
62
|
+
throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (navigationAborted) {
|
|
67
|
+
throw new Error(`Navigation aborted when trying to fetch '${url}'`);
|
|
68
|
+
}
|
|
42
69
|
|
|
43
70
|
if (!response) {
|
|
44
71
|
throw new Error(`Response is empty when trying to fetch '${url}'`);
|
|
@@ -46,31 +73,11 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
46
73
|
|
|
47
74
|
const statusCode = response.status();
|
|
48
75
|
|
|
49
|
-
if (
|
|
76
|
+
if (!isValidHttpStatus(statusCode)) {
|
|
50
77
|
throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
|
|
51
78
|
}
|
|
52
79
|
|
|
53
|
-
|
|
54
|
-
page.waitForFunction(
|
|
55
|
-
cssSelector => {
|
|
56
|
-
const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
|
|
57
|
-
|
|
58
|
-
return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
|
|
59
|
-
},
|
|
60
|
-
{ timeout: config.waitForElementsTimeout },
|
|
61
|
-
selector,
|
|
62
|
-
));
|
|
63
|
-
|
|
64
|
-
// We expect all elements to be present on the page…
|
|
65
|
-
await Promise.all(waitForSelectorsPromises).catch(error => {
|
|
66
|
-
if (error.name == 'TimeoutError') {
|
|
67
|
-
// however, if they are not, this is not considered as an error since selectors may be out of date
|
|
68
|
-
// and the whole content of the page should still be returned.
|
|
69
|
-
return;
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
throw error;
|
|
73
|
-
});
|
|
80
|
+
await waitForSelectors(page, selectors, config.waitForElementsTimeout);
|
|
74
81
|
|
|
75
82
|
return {
|
|
76
83
|
mimeType: 'text/html',
|
|
@@ -80,17 +87,10 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
80
87
|
if (error.name === 'TimeoutError') {
|
|
81
88
|
throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
|
|
82
89
|
}
|
|
90
|
+
|
|
83
91
|
throw new Error(error.message);
|
|
84
92
|
} finally {
|
|
85
|
-
|
|
86
|
-
await client.detach();
|
|
87
|
-
}
|
|
88
|
-
if (page) {
|
|
89
|
-
await page.close();
|
|
90
|
-
}
|
|
91
|
-
if (context) {
|
|
92
|
-
await context.close(); // Close the isolated context to free resources and ensure complete cleanup
|
|
93
|
-
}
|
|
93
|
+
await cleanupPage(client, page, context);
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
|
|
@@ -151,3 +151,103 @@ export async function stopHeadlessBrowser() {
|
|
|
151
151
|
await browser.close();
|
|
152
152
|
browser = null;
|
|
153
153
|
}
|
|
154
|
+
|
|
155
|
+
/**
 * Tells whether an HTTP status code is one for which a fetched response is accepted.
 *
 * @param {number} status - HTTP status code to check.
 * @returns {boolean} `true` for any code in the 200–299 range and for 304, `false` otherwise.
 */
function isValidHttpStatus(status) {
  // 304 is the only accepted code outside of the 2xx range
  if (status === 304) {
    return true;
  }

  return status >= 200 && status < 300;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Prepares a page before navigation: viewport, navigation timeout, language and proxy authentication.
 *
 * Reads the module-level `browser` for the user agent and the optional proxy credentials.
 *
 * @param {object} page - Puppeteer page to configure.
 * @param {object} client - CDP session used to send the `Network.setUserAgentOverride` command.
 * @param {object} config - Fetch configuration; `navigationTimeout` and `language` are read.
 * @returns {Promise<void>} Resolves once every setting has been applied.
 */
async function configurePage(page, client, config) {
  // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
  const viewport = { width: 1920, height: 1080 };

  await page.setViewport(viewport);
  await page.setDefaultNavigationTimeout(config.navigationTimeout);
  await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });

  // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
  const userAgent = await browser.userAgent();

  await client.send('Network.setUserAgentOverride', { userAgent, acceptLanguage: config.language });

  // Proxy authentication is only attempted when both a username and a password are available
  const credentials = browser.proxyCredentials;

  if (!credentials?.username || !credentials?.password) {
    return;
  }

  await page.authenticate(credentials);
}
|
|
174
|
+
|
|
175
|
+
/**
 * Enables response interception on a CDP session in order to capture a PDF before Chrome hands it to its PDF viewer.
 *
 * The function returns synchronously; the returned `pdf` object is filled in later by the interception callback.
 *
 * @param {object} client - CDP session of the page about to navigate; must expose `send` and `on`.
 * @returns {{ pdf: { content: (Buffer|null), status: (number|null) }, handled: Promise<void> }}
 *   `pdf.status` is the status code of the intercepted PDF response and `pdf.content` its body (only set when the
 *   status is valid and the body could be read). `handled` resolves once a paused request of type `Document` has
 *   been processed, or when interception could not be enabled at all.
 */
function setupPdfInterception(client) {
  const pdf = { content: null, status: null };
  let onHandled;
  const handled = new Promise(resolve => { onHandled = resolve; });

  // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer
  client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }).catch(() => {
    // Interception could not be enabled (for example because the session is already closed): no request will ever be
    // paused, so release the caller instead of leaving an unhandled rejection and a `handled` promise that never settles
    onHandled();
  });

  client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => {
    try {
      const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value;

      // Media types are case-insensitive, so compare on the lowercased header value
      if (!contentType?.toLowerCase().includes('application/pdf')) {
        return;
      }

      pdf.status = responseStatusCode;

      if (!isValidHttpStatus(responseStatusCode)) {
        return; // Keep the status only, so that the caller can report the HTTP error
      }

      try {
        const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId });

        pdf.content = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
      } catch {
        // Response body may be unavailable due to network error or connection interruption
      }
    } finally {
      // Always let the paused request go on, whether or not it was a PDF, otherwise the navigation would stall
      try {
        await client.send('Fetch.continueResponse', { requestId });
      } catch {
        // Client may have been closed by cleanupPage() in fetch() while this async callback was still running
      }

      // NOTE(review): any paused request of type `Document` resolves `handled`; confirm whether redirects or sub-frame documents can reach this point before the main response
      if (resourceType === 'Document') { // Signal that the main navigation request has been processed
        onHandled();
      }
    }
  });

  return { pdf, handled };
}
|
|
218
|
+
|
|
219
|
+
/**
 * Waits until every given CSS selector matches an element with non-empty text on the page.
 *
 * A selector that does not show up before the timeout is tolerated, since selectors may be out of date and the
 * whole content of the page should still be returned. Any other failure is propagated.
 *
 * @param {object} page - Puppeteer page on which to wait.
 * @param {Array<string>} selectors - CSS selectors to wait for; falsy entries are ignored.
 * @param {number} timeout - Passed as the `timeout` option of each `page.waitForFunction` call.
 * @returns {Promise<void>} Resolves once every wait has either succeeded or timed out.
 * @throws Rethrows any error from `page.waitForFunction` whose name is not `TimeoutError`.
 */
async function waitForSelectors(page, selectors, timeout) {
  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
    page.waitForFunction(
      cssSelector => {
        const element = document.querySelector(cssSelector); // eslint-disable-line no-undef

        return element?.textContent.trim().length; // Ensures element exists and has non-empty text
      },
      { timeout },
      selector,
    ).catch(error => {
      // Timeouts are tolerated per selector, so that the outcome does not depend on which selector settles first:
      // a timeout on one selector must not hide a genuine error raised for another one
      if (error?.name === 'TimeoutError') {
        return;
      }

      throw error;
    }));

  // We expect all elements to be present on the page; the ones that are missing have been tolerated above
  await Promise.all(waitForSelectorsPromises);
}
|
|
242
|
+
|
|
243
|
+
/**
 * Releases the resources opened for a fetch, in order: CDP session, then page, then browser context.
 *
 * Each step is best-effort: a missing resource is skipped and a rejected release is ignored, so that one failure
 * never prevents the following resources from being released.
 *
 * @param {object} [client] - CDP session to detach.
 * @param {object} [page] - Page to close.
 * @param {object} [context] - Isolated browser context to close, to free resources and ensure complete cleanup.
 * @returns {Promise<void>} Resolves once every release attempt has settled.
 */
async function cleanupPage(client, page, context) {
  const ignoreFailure = () => {};
  const releaseSteps = [
    [ client, 'detach' ],
    [ page, 'close' ],
    [ context, 'close' ],
  ];

  for (const [ resource, release ] of releaseSteps) {
    if (!resource) {
      continue;
    }

    await resource[release]().catch(ignoreFailure);
  }
}
|
|
@@ -1,10 +1,15 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
1
2
|
import http from 'http';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
2
5
|
|
|
3
6
|
import { expect, use } from 'chai';
|
|
4
7
|
import chaiAsPromised from 'chai-as-promised';
|
|
5
8
|
|
|
6
9
|
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
|
|
7
10
|
|
|
11
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
|
|
8
13
|
const SERVER_PORT = 8977;
|
|
9
14
|
|
|
10
15
|
use(chaiAsPromised);
|
|
@@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () {
|
|
|
16
21
|
this.timeout(60000);
|
|
17
22
|
|
|
18
23
|
let temporaryServer;
|
|
24
|
+
let expectedPDFContent;
|
|
19
25
|
|
|
20
26
|
before(async () => {
|
|
21
27
|
await launchHeadlessBrowser();
|
|
@@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () {
|
|
|
27
33
|
if (request.url === '/delayed-content') {
|
|
28
34
|
response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
|
|
29
35
|
}
|
|
36
|
+
if (request.url === '/terms.pdf') {
|
|
37
|
+
expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
|
|
38
|
+
response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
|
|
39
|
+
}
|
|
30
40
|
|
|
31
41
|
return response.end();
|
|
32
42
|
}).listen(SERVER_PORT);
|
|
@@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () {
|
|
|
85
95
|
await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
|
|
86
96
|
});
|
|
87
97
|
});
|
|
98
|
+
|
|
99
|
+
context('when URL targets a PDF file', () => {
|
|
100
|
+
let content;
|
|
101
|
+
let mimeType;
|
|
102
|
+
const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
|
|
103
|
+
|
|
104
|
+
before(async () => {
|
|
105
|
+
({ content, mimeType } = await fetch(pdfUrl, [], config));
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
it('returns a buffer for PDF content', () => {
|
|
109
|
+
expect(content).to.be.an.instanceOf(Buffer);
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
it('returns the correct MIME type', () => {
|
|
113
|
+
expect(mimeType).to.equal('application/pdf');
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
it('returns the PDF file content', () => {
|
|
117
|
+
expect(content.equals(expectedPDFContent)).to.be.true;
|
|
118
|
+
});
|
|
119
|
+
});
|
|
88
120
|
});
|
|
89
121
|
});
|