@opentermsarchive/engine 5.4.2 → 5.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "5.4.2",
3
+ "version": "5.6.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -1,18 +1,25 @@
1
1
  export class FetchDocumentError extends Error {
2
+ static LIKELY_BOT_BLOCKING_ERRORS = [
3
+ 'HTTP code 403',
4
+ 'HTTP code 406',
5
+ 'HTTP code 502',
6
+ 'ECONNRESET',
7
+ ];
8
+
2
9
  static LIKELY_TRANSIENT_ERRORS = [
3
10
  'EAI_AGAIN', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
4
11
  'ETIMEDOUT', // Connection timeout - network latency or server load issues
5
- 'ECONNRESET', // Connection reset - connection was forcibly closed, often due to network issues
6
12
  'ERR_NAME_NOT_RESOLVED', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
7
13
  'HTTP code 500', // Internal Server Error - server encountered an error while processing the request
8
- 'HTTP code 502', // Bad Gateway - upstream server returned invalid response, often temporary
9
14
  'HTTP code 503', // Service Unavailable - server is temporarily overloaded or down for maintenance
10
15
  'HTTP code 504', // Gateway Timeout - upstream server took too long to respond, might be temporary
16
+ ...FetchDocumentError.LIKELY_BOT_BLOCKING_ERRORS,
11
17
  ];
12
18
 
13
19
  constructor(message) {
14
20
  super(`Fetch failed: ${message}`);
15
21
  this.name = 'FetchDocumentError';
16
22
  this.mayBeTransient = FetchDocumentError.LIKELY_TRANSIENT_ERRORS.some(err => message.includes(err));
23
+ this.mayBeBotBlocking = FetchDocumentError.LIKELY_BOT_BLOCKING_ERRORS.some(err => message.includes(err));
17
24
  }
18
25
  }
@@ -30,7 +30,6 @@ describe('FetchDocumentError', () => {
30
30
 
31
31
  describe('non-transient errors', () => {
32
32
  [
33
- 'HTTP code 403',
34
33
  'HTTP code 404',
35
34
  'HTTP code 429',
36
35
  ].forEach(errorMessage => {
@@ -1,4 +1,3 @@
1
- import { TimeoutError } from 'puppeteer';
2
1
  import puppeteer from 'puppeteer-extra';
3
2
  import stealthPlugin from 'puppeteer-extra-plugin-stealth';
4
3
 
@@ -33,7 +32,16 @@ export default async function fetch(url, cssSelectors, config) {
33
32
  throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
34
33
  }
35
34
 
36
- const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
35
+ const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
36
+ page.waitForFunction(
37
+ cssSelector => {
38
+ const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
39
+
40
+ return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
41
+ },
42
+ { timeout: config.waitForElementsTimeout },
43
+ selector,
44
+ ));
37
45
 
38
46
  // We expect all elements to be present on the page…
39
47
  await Promise.all(waitForSelectorsPromises).catch(error => {
@@ -51,7 +59,7 @@ export default async function fetch(url, cssSelectors, config) {
51
59
  content: await page.content(),
52
60
  };
53
61
  } catch (error) {
54
- if (error instanceof TimeoutError) {
62
+ if (error.name === 'TimeoutError') {
55
63
  throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
56
64
  }
57
65
  throw new Error(error.message);
@@ -0,0 +1,90 @@
1
+ import http from 'http';
2
+
3
+ import chai from 'chai';
4
+ import chaiAsPromised from 'chai-as-promised';
5
+
6
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
7
+
8
+ const { expect } = chai;
9
+ const SERVER_PORT = 8977;
10
+
11
+ chai.use(chaiAsPromised);
12
+
13
+ const dynamicHTML = '<!DOCTYPE html><html><head><title>Dynamic Page</title><script>setTimeout(() => { document.body.innerHTML += "<div class=\'dynamic\'>Loaded</div>"; }, 100);</script></head><body></body></html>';
14
+ const delayedContentHTML = '<!DOCTYPE html><html><head><title>Delayed Content</title><script>setTimeout(() => { document.querySelector(".content").textContent = "Final content"; }, 100);</script></head><body><div class="content"></div></body></html>';
15
+
16
+ describe('Full DOM Fetcher', function () {
17
+ this.timeout(60000);
18
+
19
+ let temporaryServer;
20
+
21
+ before(async () => {
22
+ await launchHeadlessBrowser();
23
+
24
+ temporaryServer = http.createServer((request, response) => {
25
+ if (request.url === '/dynamic') {
26
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(dynamicHTML);
27
+ }
28
+ if (request.url === '/delayed-content') {
29
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
30
+ }
31
+
32
+ return response.end();
33
+ }).listen(SERVER_PORT);
34
+ });
35
+
36
+ after(async () => {
37
+ if (temporaryServer) {
38
+ temporaryServer.close();
39
+ }
40
+ await stopHeadlessBrowser();
41
+ });
42
+
43
+ describe('Browser lifecycle', () => {
44
+ it('throws error when trying to fetch without launching browser', async () => {
45
+ await stopHeadlessBrowser();
46
+ await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }))
47
+ .to.be.rejectedWith('The headless browser should be controlled manually');
48
+ await launchHeadlessBrowser();
49
+ });
50
+
51
+ it('reuses existing browser instance', async () => {
52
+ const browser1 = await launchHeadlessBrowser();
53
+ const browser2 = await launchHeadlessBrowser();
54
+
55
+ expect(browser1).to.equal(browser2);
56
+ });
57
+ });
58
+
59
+ describe('#fetch', () => {
60
+ const config = { navigationTimeout: 1000, waitForElementsTimeout: 1000, language: 'en' };
61
+
62
+ it('waits for dynamically injected elements to appear in the DOM', async () => {
63
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/dynamic`, ['.dynamic'], config);
64
+
65
+ expect(result.content).to.match(/<body[^>]*>.*<div class="dynamic">Loaded<\/div>.*<\/body>/s);
66
+ });
67
+
68
+ it('fails when waiting for non-existent elements exceeds timeout', async () => {
69
+ const url = `http://127.0.0.1:${SERVER_PORT}/dynamic`;
70
+ const timeout = 10;
71
+
72
+ await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
73
+ });
74
+
75
+ context('when a DOM element exists but its content is loaded asynchronously', () => {
76
+ it('waits for the element content to be fully loaded', async () => {
77
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config);
78
+
79
+ expect(result.content).to.match(/<div class="content">Final content<\/div>/);
80
+ });
81
+
82
+ it('fails when content loading exceeds navigation timeout', async () => {
83
+ const url = `http://127.0.0.1:${SERVER_PORT}/delayed-content`;
84
+ const timeout = 10;
85
+
86
+ await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
87
+ });
88
+ });
89
+ });
90
+ });
@@ -12,13 +12,6 @@ export const FETCHER_TYPES = {
12
12
  HTML_ONLY: 'htmlOnly',
13
13
  };
14
14
 
15
- const LIKELY_BOT_BLOCKING_ERRORS = [
16
- 'HTTP code 403',
17
- 'HTTP code 406',
18
- 'HTTP code 502',
19
- 'ECONNRESET',
20
- ];
21
-
22
15
  /**
23
16
  * Fetch a resource from the network, returning a promise which is fulfilled once the response is available
24
17
  * @function fetch
@@ -70,9 +63,7 @@ async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
70
63
  try {
71
64
  return await fetchWithHtmlOnly(url, fetcherConfig);
72
65
  } catch (error) {
73
- const isBotBlockingError = LIKELY_BOT_BLOCKING_ERRORS.some(code => error.message.includes(code));
74
-
75
- if (!isBotBlockingError || fetcherConfig.executeClientScripts === false) {
66
+ if (!error.mayBeBotBlocking || fetcherConfig.executeClientScripts === false) {
76
67
  throw error;
77
68
  }
78
69
 
@@ -81,15 +72,23 @@ async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
81
72
  }
82
73
 
83
74
  async function fetchWithFullDom(url, cssSelectors, fetcherConfig) {
84
- return {
85
- ...await fetchFullDom(url, cssSelectors, fetcherConfig),
86
- fetcher: FETCHER_TYPES.FULL_DOM,
87
- };
75
+ try {
76
+ return {
77
+ ...await fetchFullDom(url, cssSelectors, fetcherConfig),
78
+ fetcher: FETCHER_TYPES.FULL_DOM,
79
+ };
80
+ } catch (error) {
81
+ throw new FetchDocumentError(error.message);
82
+ }
88
83
  }
89
84
 
90
85
  async function fetchWithHtmlOnly(url, fetcherConfig) {
91
- return {
92
- ...await fetchHtmlOnly(url, fetcherConfig),
93
- fetcher: FETCHER_TYPES.HTML_ONLY,
94
- };
86
+ try {
87
+ return {
88
+ ...await fetchHtmlOnly(url, fetcherConfig),
89
+ fetcher: FETCHER_TYPES.HTML_ONLY,
90
+ };
91
+ } catch (error) {
92
+ throw new FetchDocumentError(error.message);
93
+ }
95
94
  }