@opentermsarchive/engine 5.4.2 → 5.5.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "5.4.2",
3
+ "version": "5.5.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -1,4 +1,3 @@
1
- import { TimeoutError } from 'puppeteer';
2
1
  import puppeteer from 'puppeteer-extra';
3
2
  import stealthPlugin from 'puppeteer-extra-plugin-stealth';
4
3
 
@@ -33,7 +32,16 @@ export default async function fetch(url, cssSelectors, config) {
33
32
  throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
34
33
  }
35
34
 
36
- const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
35
+ const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
36
+ page.waitForFunction(
37
+ cssSelector => {
38
+ const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
39
+
40
+ return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
41
+ },
42
+ { timeout: config.waitForElementsTimeout },
43
+ selector,
44
+ ));
37
45
 
38
46
  // We expect all elements to be present on the page…
39
47
  await Promise.all(waitForSelectorsPromises).catch(error => {
@@ -51,7 +59,7 @@ export default async function fetch(url, cssSelectors, config) {
51
59
  content: await page.content(),
52
60
  };
53
61
  } catch (error) {
54
- if (error instanceof TimeoutError) {
62
+ if (error.name === 'TimeoutError') {
55
63
  throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
56
64
  }
57
65
  throw new Error(error.message);
@@ -0,0 +1,90 @@
1
+ import http from 'http';
2
+
3
+ import chai from 'chai';
4
+ import chaiAsPromised from 'chai-as-promised';
5
+
6
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
7
+
8
+ const { expect } = chai;
9
+ const SERVER_PORT = 8977;
10
+
11
+ chai.use(chaiAsPromised);
12
+
13
+ const dynamicHTML = '<!DOCTYPE html><html><head><title>Dynamic Page</title><script>setTimeout(() => { document.body.innerHTML += "<div class=\'dynamic\'>Loaded</div>"; }, 100);</script></head><body></body></html>';
14
+ const delayedContentHTML = '<!DOCTYPE html><html><head><title>Delayed Content</title><script>setTimeout(() => { document.querySelector(".content").textContent = "Final content"; }, 100);</script></head><body><div class="content"></div></body></html>';
15
+
16
+ describe('Full DOM Fetcher', function () {
17
+ this.timeout(60000);
18
+
19
+ let temporaryServer;
20
+
21
+ before(async () => {
22
+ await launchHeadlessBrowser();
23
+
24
+ temporaryServer = http.createServer((request, response) => {
25
+ if (request.url === '/dynamic') {
26
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(dynamicHTML);
27
+ }
28
+ if (request.url === '/delayed-content') {
29
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
30
+ }
31
+
32
+ return response.end();
33
+ }).listen(SERVER_PORT);
34
+ });
35
+
36
+ after(async () => {
37
+ if (temporaryServer) {
38
+ temporaryServer.close();
39
+ }
40
+ await stopHeadlessBrowser();
41
+ });
42
+
43
+ describe('Browser lifecycle', () => {
44
+ it('throws error when trying to fetch without launching browser', async () => {
45
+ await stopHeadlessBrowser();
46
+ await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }))
47
+ .to.be.rejectedWith('The headless browser should be controlled manually');
48
+ await launchHeadlessBrowser();
49
+ });
50
+
51
+ it('reuses existing browser instance', async () => {
52
+ const browser1 = await launchHeadlessBrowser();
53
+ const browser2 = await launchHeadlessBrowser();
54
+
55
+ expect(browser1).to.equal(browser2);
56
+ });
57
+ });
58
+
59
+ describe('#fetch', () => {
60
+ const config = { navigationTimeout: 1000, waitForElementsTimeout: 1000, language: 'en' };
61
+
62
+ it('waits for dynamically injected elements to appear in the DOM', async () => {
63
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/dynamic`, ['.dynamic'], config);
64
+
65
+ expect(result.content).to.match(/<body[^>]*>.*<div class="dynamic">Loaded<\/div>.*<\/body>/s);
66
+ });
67
+
68
+ it('fails when waiting for non-existent elements exceeds timeout', async () => {
69
+ const url = `http://127.0.0.1:${SERVER_PORT}/dynamic`;
70
+ const timeout = 10;
71
+
72
+ await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
73
+ });
74
+
75
+ context('when a DOM element exists but its content is loaded asynchronously', () => {
76
+ it('waits for the element content to be fully loaded', async () => {
77
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config);
78
+
79
+ expect(result.content).to.match(/<div class="content">Final content<\/div>/);
80
+ });
81
+
82
+ it('fails when content loading exceeds navigation timeout', async () => {
83
+ const url = `http://127.0.0.1:${SERVER_PORT}/delayed-content`;
84
+ const timeout = 10;
85
+
86
+ await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
87
+ });
88
+ });
89
+ });
90
+ });