@opentermsarchive/engine 5.4.1 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "5.4.1",
3
+ "version": "5.5.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -1,4 +1,3 @@
1
- import { TimeoutError } from 'puppeteer';
2
1
  import puppeteer from 'puppeteer-extra';
3
2
  import stealthPlugin from 'puppeteer-extra-plugin-stealth';
4
3
 
@@ -33,7 +32,16 @@ export default async function fetch(url, cssSelectors, config) {
33
32
  throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
34
33
  }
35
34
 
36
- const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
35
+ const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
36
+ page.waitForFunction(
37
+ cssSelector => {
38
+ const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
39
+
40
+ return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
41
+ },
42
+ { timeout: config.waitForElementsTimeout },
43
+ selector,
44
+ ));
37
45
 
38
46
  // We expect all elements to be present on the page…
39
47
  await Promise.all(waitForSelectorsPromises).catch(error => {
@@ -51,7 +59,7 @@ export default async function fetch(url, cssSelectors, config) {
51
59
  content: await page.content(),
52
60
  };
53
61
  } catch (error) {
54
- if (error instanceof TimeoutError) {
62
+ if (error.name === 'TimeoutError') {
55
63
  throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
56
64
  }
57
65
  throw new Error(error.message);
@@ -0,0 +1,90 @@
1
+ import http from 'http';
2
+
3
+ import chai from 'chai';
4
+ import chaiAsPromised from 'chai-as-promised';
5
+
6
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
7
+
8
+ const { expect } = chai;
9
+ const SERVER_PORT = 8977;
10
+
11
+ chai.use(chaiAsPromised);
12
+
13
+ const dynamicHTML = '<!DOCTYPE html><html><head><title>Dynamic Page</title><script>setTimeout(() => { document.body.innerHTML += "<div class=\'dynamic\'>Loaded</div>"; }, 100);</script></head><body></body></html>';
14
+ const delayedContentHTML = '<!DOCTYPE html><html><head><title>Delayed Content</title><script>setTimeout(() => { document.querySelector(".content").textContent = "Final content"; }, 100);</script></head><body><div class="content"></div></body></html>';
15
+
16
+ describe('Full DOM Fetcher', function () {
17
+ this.timeout(60000);
18
+
19
+ let temporaryServer;
20
+
21
+ before(async () => {
22
+ await launchHeadlessBrowser();
23
+
24
+ temporaryServer = http.createServer((request, response) => {
25
+ if (request.url === '/dynamic') {
26
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(dynamicHTML);
27
+ }
28
+ if (request.url === '/delayed-content') {
29
+ response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
30
+ }
31
+
32
+ return response.end();
33
+ }).listen(SERVER_PORT);
34
+ });
35
+
36
+ after(async () => {
37
+ if (temporaryServer) {
38
+ temporaryServer.close();
39
+ }
40
+ await stopHeadlessBrowser();
41
+ });
42
+
43
+ describe('Browser lifecycle', () => {
44
+ it('throws error when trying to fetch without launching browser', async () => {
45
+ await stopHeadlessBrowser();
46
+ await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }))
47
+ .to.be.rejectedWith('The headless browser should be controlled manually');
48
+ await launchHeadlessBrowser();
49
+ });
50
+
51
+ it('reuses existing browser instance', async () => {
52
+ const browser1 = await launchHeadlessBrowser();
53
+ const browser2 = await launchHeadlessBrowser();
54
+
55
+ expect(browser1).to.equal(browser2);
56
+ });
57
+ });
58
+
59
+ describe('#fetch', () => {
60
+ const config = { navigationTimeout: 1000, waitForElementsTimeout: 1000, language: 'en' };
61
+
62
+ it('waits for dynamically injected elements to appear in the DOM', async () => {
63
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/dynamic`, ['.dynamic'], config);
64
+
65
+ expect(result.content).to.match(/<body[^>]*>.*<div class="dynamic">Loaded<\/div>.*<\/body>/s);
66
+ });
67
+
68
+ it('fails when waiting for non-existent elements exceeds timeout', async () => {
69
+ const url = `http://127.0.0.1:${SERVER_PORT}/dynamic`;
70
+ const timeout = 10;
71
+
72
+ await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
73
+ });
74
+
75
+ context('when a DOM element exists but its content is loaded asynchronously', () => {
76
+ it('waits for the element content to be fully loaded', async () => {
77
+ const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config);
78
+
79
+ expect(result.content).to.match(/<div class="content">Final content<\/div>/);
80
+ });
81
+
82
+ it('fails when content loading exceeds navigation timeout', async () => {
83
+ const url = `http://127.0.0.1:${SERVER_PORT}/delayed-content`;
84
+ const timeout = 10;
85
+
86
+ await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
87
+ });
88
+ });
89
+ });
90
+ });
@@ -1,4 +1,5 @@
1
1
  import events from 'events';
2
+ import { createRequire } from 'module';
2
3
 
3
4
  import async from 'async';
4
5
 
@@ -11,6 +12,9 @@ import Version from './recorder/version.js';
11
12
  import * as services from './services/index.js';
12
13
  import Service from './services/service.js';
13
14
 
15
+ const require = createRequire(import.meta.url);
16
+ const { version: PACKAGE_VERSION } = require('../../package.json');
17
+
14
18
  // The parallel handling feature is currently set to a parallelism of 1 on terms tracking
15
19
  // because when it's higher there are two issues:
16
20
  // - too many requests on the same endpoint yield 403
@@ -249,7 +253,7 @@ export default class Archivist extends events.EventEmitter {
249
253
  termsType: terms.type,
250
254
  fetchDate: terms.fetchDate,
251
255
  isExtractOnly: extractOnly,
252
- metadata: { 'x-engine-version': process.env.npm_package_version },
256
+ metadata: { 'x-engine-version': PACKAGE_VERSION },
253
257
  });
254
258
 
255
259
  await this.recorder.record(record);
@@ -275,7 +279,7 @@ export default class Archivist extends events.EventEmitter {
275
279
  content: sourceDocument.content,
276
280
  mimeType: sourceDocument.mimeType,
277
281
  metadata: {
278
- 'x-engine-version': process.env.npm_package_version,
282
+ 'x-engine-version': PACKAGE_VERSION,
279
283
  'x-fetcher': sourceDocument.fetcher,
280
284
  'x-source-document-location': sourceDocument.location,
281
285
  },
@@ -1,7 +1,12 @@
1
+ import { createRequire } from 'module';
2
+
1
3
  import express from 'express';
2
4
 
3
5
  import Service from '../../archivist/services/service.js';
4
6
 
7
+ const require = createRequire(import.meta.url);
8
+ const { version: PACKAGE_VERSION } = require('../../../package.json');
9
+
5
10
  /**
6
11
  * @param {object} collection The collection
7
12
  * @param {object} services The services of the collection
@@ -170,7 +175,6 @@ import Service from '../../archivist/services/service.js';
170
175
  */
171
176
  export default function metadataRouter(collection, services) {
172
177
  const router = express.Router();
173
- const engineVersion = process.env.npm_package_version;
174
178
 
175
179
  /**
176
180
  * @swagger
@@ -188,7 +192,7 @@ export default function metadataRouter(collection, services) {
188
192
  const dynamicMetadata = {
189
193
  totalServices: Object.keys(services).length,
190
194
  totalTerms: Service.getNumberOfTerms(services),
191
- engineVersion,
195
+ engineVersion: PACKAGE_VERSION,
192
196
  };
193
197
 
194
198
  res.json({
@@ -1,11 +1,15 @@
1
+ import { createRequire } from 'module';
2
+
1
3
  import { expect } from 'chai';
2
4
  import config from 'config';
3
5
  import request from 'supertest';
4
6
 
5
7
  import app from '../server.js';
6
8
 
9
+ const require = createRequire(import.meta.url);
10
+ const { version: PACKAGE_VERSION } = require('../../../package.json');
11
+
7
12
  const basePath = config.get('@opentermsarchive/engine.collection-api.basePath');
8
- const engineVersion = process.env.npm_package_version;
9
13
 
10
14
  const EXPECTED_RESPONSE = {
11
15
  totalServices: 7,
@@ -80,7 +84,7 @@ describe('Metadata API', () => {
80
84
  it('returns expected metadata object', () => {
81
85
  expect(response.body).to.deep.equal({
82
86
  ...EXPECTED_RESPONSE,
83
- engineVersion,
87
+ engineVersion: PACKAGE_VERSION,
84
88
  });
85
89
  });
86
90
  });
package/src/index.js CHANGED
@@ -1,3 +1,5 @@
1
+ import { createRequire } from 'module';
2
+
1
3
  import config from 'config';
2
4
  import cron from 'croner';
3
5
  import cronstrue from 'cronstrue';
@@ -8,6 +10,9 @@ import logger from './logger/index.js';
8
10
  import Notifier from './notifier/index.js';
9
11
  import Reporter from './reporter/index.js';
10
12
 
13
+ const require = createRequire(import.meta.url);
14
+ const { version: PACKAGE_VERSION } = require('../package.json');
15
+
11
16
  export default async function track({ services, types, extractOnly, schedule }) {
12
17
  const archivist = new Archivist({
13
18
  recorderConfig: config.get('@opentermsarchive/engine.recorder'),
@@ -21,7 +26,7 @@ export default async function track({ services, types, extractOnly, schedule })
21
26
  const collection = await getCollection();
22
27
  const collectionName = collection?.name ? ` with ${collection.name} collection` : '';
23
28
 
24
- logger.info(`Start engine v${process.env.npm_package_version}${collectionName}\n`);
29
+ logger.info(`Start engine v${PACKAGE_VERSION}${collectionName}\n`);
25
30
 
26
31
  if (services?.length) {
27
32
  services = services.filter(serviceId => {