@opentermsarchive/engine 10.4.0 → 10.5.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "10.4.0",
3
+ "version": "10.5.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -8,37 +8,64 @@ let browser;
8
8
  export default async function fetch(url, cssSelectors, config) {
9
9
  puppeteer.use(stealthPlugin({ locale: config.language }));
10
10
 
11
- let context;
12
- let page;
13
- let client;
14
- let response;
15
- const selectors = [].concat(cssSelectors);
16
-
17
11
  if (!browser) {
18
12
  throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
19
13
  }
20
14
 
15
+ let context;
16
+ let page;
17
+ let client;
18
+
21
19
  try {
22
20
  context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache)
23
21
  page = await context.newPage();
22
+ client = await page.createCDPSession();
24
23
 
25
- await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600)
26
- await page.setDefaultNavigationTimeout(config.navigationTimeout);
27
- await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
24
+ await configurePage(page, client, config);
28
25
 
29
- // Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment)
30
- client = await page.createCDPSession();
26
+ const selectors = [].concat(cssSelectors).filter(Boolean);
31
27
 
32
- await client.send('Network.setUserAgentOverride', {
33
- userAgent: await browser.userAgent(),
34
- acceptLanguage: config.language,
35
- });
28
+ let pdf = {};
29
+ let handled = null;
30
+
31
+ if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF
32
+ ({ pdf, handled } = setupPdfInterception(client));
33
+ }
36
34
 
37
- if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
38
- await page.authenticate(browser.proxyCredentials);
35
+ let response;
36
+ let navigationAborted = false;
37
+
38
+ try {
39
+ response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
40
+ } catch (error) {
41
+ if (error.message.includes('net::ERR_ABORTED')) {
42
+ // Chrome may sometimes abort navigation for files such as PDFs.
43
+ // Do not throw for now; wait for the PDF interception handler to finish processing the response.
44
+ navigationAborted = true;
45
+ } else {
46
+ throw error;
47
+ }
39
48
  }
40
49
 
41
- response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
50
+ // PDF interception handling
51
+ if (handled) {
52
+ await handled; // Wait for the interception callback to finish processing the response
53
+
54
+ if (pdf.content) {
55
+ return {
56
+ mimeType: 'application/pdf',
57
+ content: pdf.content,
58
+ };
59
+ }
60
+
61
+ if (pdf.status) { // Status captured by CDP interception
62
+ throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
63
+ }
64
+ }
65
+
66
+ if (navigationAborted) {
67
+ throw new Error(`Navigation aborted when trying to fetch '${url}'`);
68
+ }
42
69
 
43
70
  if (!response) {
44
71
  throw new Error(`Response is empty when trying to fetch '${url}'`);
@@ -46,31 +73,11 @@ export default async function fetch(url, cssSelectors, config) {
46
73
 
47
74
  const statusCode = response.status();
48
75
 
49
- if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
76
+ if (!isValidHttpStatus(statusCode)) {
50
77
  throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
51
78
  }
52
79
 
53
- const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
54
- page.waitForFunction(
55
- cssSelector => {
56
- const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
57
-
58
- return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
59
- },
60
- { timeout: config.waitForElementsTimeout },
61
- selector,
62
- ));
63
-
64
- // We expect all elements to be present on the page…
65
- await Promise.all(waitForSelectorsPromises).catch(error => {
66
- if (error.name == 'TimeoutError') {
67
- // however, if they are not, this is not considered as an error since selectors may be out of date
68
- // and the whole content of the page should still be returned.
69
- return;
70
- }
71
-
72
- throw error;
73
- });
80
+ await waitForSelectors(page, selectors, config.waitForElementsTimeout);
74
81
 
75
82
  return {
76
83
  mimeType: 'text/html',
@@ -80,17 +87,10 @@ export default async function fetch(url, cssSelectors, config) {
80
87
  if (error.name === 'TimeoutError') {
81
88
  throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
82
89
  }
90
+
83
91
  throw new Error(error.message);
84
92
  } finally {
85
- if (client) {
86
- await client.detach();
87
- }
88
- if (page) {
89
- await page.close();
90
- }
91
- if (context) {
92
- await context.close(); // Close the isolated context to free resources and ensure complete cleanup
93
- }
93
+ await cleanupPage(client, page, context);
94
94
  }
95
95
  }
96
96
 
@@ -151,3 +151,103 @@ export async function stopHeadlessBrowser() {
151
151
  await browser.close();
152
152
  browser = null;
153
153
  }
154
+
155
+ function isValidHttpStatus(status) {
156
+ return (status >= 200 && status < 300) || status === 304;
157
+ }
158
+
159
+ async function configurePage(page, client, config) {
160
+ await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
161
+ await page.setDefaultNavigationTimeout(config.navigationTimeout);
162
+ await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
163
+
164
+ // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
165
+ await client.send('Network.setUserAgentOverride', {
166
+ userAgent: await browser.userAgent(),
167
+ acceptLanguage: config.language,
168
+ });
169
+
170
+ if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
171
+ await page.authenticate(browser.proxyCredentials);
172
+ }
173
+ }
174
+
175
+ function setupPdfInterception(client) {
176
+ const pdf = { content: null, status: null };
177
+ let onHandled;
178
+ const handled = new Promise(resolve => { onHandled = resolve; });
179
+
180
+ client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }); // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer
181
+
182
+ client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => {
183
+ try {
184
+ const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value;
185
+
186
+ if (!contentType?.includes('application/pdf')) {
187
+ return;
188
+ }
189
+
190
+ pdf.status = responseStatusCode;
191
+
192
+ if (!isValidHttpStatus(responseStatusCode)) {
193
+ return;
194
+ }
195
+
196
+ try {
197
+ const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId });
198
+
199
+ pdf.content = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
200
+ } catch {
201
+ // Response body may be unavailable due to network error or connection interruption
202
+ }
203
+ } finally {
204
+ try {
205
+ await client.send('Fetch.continueResponse', { requestId });
206
+ } catch {
207
+ // Client may have been closed by cleanupPage() in fetch() while this async callback was still running
208
+ }
209
+
210
+ if (resourceType === 'Document') { // Signal that the main navigation request has been processed
211
+ onHandled();
212
+ }
213
+ }
214
+ });
215
+
216
+ return { pdf, handled };
217
+ }
218
+
219
+ async function waitForSelectors(page, selectors, timeout) {
220
+ const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
221
+ page.waitForFunction(
222
+ cssSelector => {
223
+ const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
224
+
225
+ return element?.textContent.trim().length; // Ensures element exists and has non-empty text
226
+ },
227
+ { timeout },
228
+ selector,
229
+ ));
230
+
231
+ // We expect all elements to be present on the page…
232
+ await Promise.all(waitForSelectorsPromises).catch(error => {
233
+ if (error.name == 'TimeoutError') {
234
+ // however, if they are not, this is not considered as an error since selectors may be out of date
235
+ // and the whole content of the page should still be returned.
236
+ return;
237
+ }
238
+
239
+ throw error;
240
+ });
241
+ }
242
+
243
+ async function cleanupPage(client, page, context) {
244
+ if (client) {
245
+ await client.detach().catch(() => {});
246
+ }
247
+ if (page) {
248
+ await page.close().catch(() => {});
249
+ }
250
+ if (context) {
251
+ await context.close().catch(() => {}); // Close the isolated context to free resources and ensure complete cleanup
252
+ }
253
+ }
@@ -1,10 +1,15 @@
1
+ import fs from 'fs';
1
2
  import http from 'http';
3
+ import path from 'path';
4
+ import { fileURLToPath } from 'url';
2
5
 
3
6
  import { expect, use } from 'chai';
4
7
  import chaiAsPromised from 'chai-as-promised';
5
8
 
6
9
  import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
7
10
 
11
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
12
+
8
13
  const SERVER_PORT = 8977;
9
14
 
10
15
  use(chaiAsPromised);
@@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () {
16
21
  this.timeout(60000);
17
22
 
18
23
  let temporaryServer;
24
+ let expectedPDFContent;
19
25
 
20
26
  before(async () => {
21
27
  await launchHeadlessBrowser();
@@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () {
27
33
  if (request.url === '/delayed-content') {
28
34
  response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
29
35
  }
36
+ if (request.url === '/terms.pdf') {
37
+ expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
38
+ response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
39
+ }
30
40
 
31
41
  return response.end();
32
42
  }).listen(SERVER_PORT);
@@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () {
85
95
  await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
86
96
  });
87
97
  });
98
+
99
+ context('when URL targets a PDF file', () => {
100
+ let content;
101
+ let mimeType;
102
+ const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
103
+
104
+ before(async () => {
105
+ ({ content, mimeType } = await fetch(pdfUrl, [], config));
106
+ });
107
+
108
+ it('returns a buffer for PDF content', () => {
109
+ expect(content).to.be.an.instanceOf(Buffer);
110
+ });
111
+
112
+ it('returns the correct MIME type', () => {
113
+ expect(mimeType).to.equal('application/pdf');
114
+ });
115
+
116
+ it('returns the PDF file content', () => {
117
+ expect(content.equals(expectedPDFContent)).to.be.true;
118
+ });
119
+ });
88
120
  });
89
121
  });