@opentermsarchive/engine 5.4.1 → 5.5.0
This diff shows the changes between publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- package/package.json +1 -1
- package/src/archivist/fetcher/fullDomFetcher.js +11 -3
- package/src/archivist/fetcher/fullDomFetcher.test.js +90 -0
- package/src/archivist/index.js +6 -2
- package/src/collection-api/routes/metadata.js +6 -2
- package/src/collection-api/routes/metadata.test.js +6 -2
- package/src/index.js +6 -1
package/package.json
CHANGED

package/src/archivist/fetcher/fullDomFetcher.js
CHANGED

@@ -1,4 +1,3 @@
-import { TimeoutError } from 'puppeteer';
 import puppeteer from 'puppeteer-extra';
 import stealthPlugin from 'puppeteer-extra-plugin-stealth';
 
@@ -33,7 +32,16 @@ export default async function fetch(url, cssSelectors, config) {
       throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
     }
 
-    const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
+    const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
+      page.waitForFunction(
+        cssSelector => {
+          const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
+
+          return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
+        },
+        { timeout: config.waitForElementsTimeout },
+        selector,
+      ));
 
     // We expect all elements to be present on the page…
     await Promise.all(waitForSelectorsPromises).catch(error => {
@@ -51,7 +59,7 @@ export default async function fetch(url, cssSelectors, config) {
       content: await page.content(),
     };
   } catch (error) {
-    if (error
+    if (error.name === 'TimeoutError') {
       throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
     }
     throw new Error(error.message);
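
Aside: the hunk above makes the per-selector wait resolve only once the targeted element both exists and contains non-empty text, so an element that is present but still empty no longer counts as loaded. A minimal standalone sketch of that wait, with an illustrative selector and timeout in place of the values the engine takes from the terms declaration and config.waitForElementsTimeout:

  // Resolves once '.content' exists AND holds non-empty text; an element that is
  // present but still empty keeps the fetcher waiting instead of capturing a blank page.
  await page.waitForFunction(
    cssSelector => {
      const element = document.querySelector(cssSelector);

      return element?.textContent.trim().length;
    },
    { timeout: 10000 }, // illustrative; the engine passes config.waitForElementsTimeout
    '.content', // illustrative selector
  );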

package/src/archivist/fetcher/fullDomFetcher.test.js
ADDED

@@ -0,0 +1,90 @@
+import http from 'http';
+
+import chai from 'chai';
+import chaiAsPromised from 'chai-as-promised';
+
+import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
+
+const { expect } = chai;
+const SERVER_PORT = 8977;
+
+chai.use(chaiAsPromised);
+
+const dynamicHTML = '<!DOCTYPE html><html><head><title>Dynamic Page</title><script>setTimeout(() => { document.body.innerHTML += "<div class=\'dynamic\'>Loaded</div>"; }, 100);</script></head><body></body></html>';
+const delayedContentHTML = '<!DOCTYPE html><html><head><title>Delayed Content</title><script>setTimeout(() => { document.querySelector(".content").textContent = "Final content"; }, 100);</script></head><body><div class="content"></div></body></html>';
+
+describe('Full DOM Fetcher', function () {
+  this.timeout(60000);
+
+  let temporaryServer;
+
+  before(async () => {
+    await launchHeadlessBrowser();
+
+    temporaryServer = http.createServer((request, response) => {
+      if (request.url === '/dynamic') {
+        response.writeHead(200, { 'Content-Type': 'text/html' }).write(dynamicHTML);
+      }
+      if (request.url === '/delayed-content') {
+        response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
+      }
+
+      return response.end();
+    }).listen(SERVER_PORT);
+  });
+
+  after(async () => {
+    if (temporaryServer) {
+      temporaryServer.close();
+    }
+    await stopHeadlessBrowser();
+  });
+
+  describe('Browser lifecycle', () => {
+    it('throws error when trying to fetch without launching browser', async () => {
+      await stopHeadlessBrowser();
+      await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }))
+        .to.be.rejectedWith('The headless browser should be controlled manually');
+      await launchHeadlessBrowser();
+    });
+
+    it('reuses existing browser instance', async () => {
+      const browser1 = await launchHeadlessBrowser();
+      const browser2 = await launchHeadlessBrowser();
+
+      expect(browser1).to.equal(browser2);
+    });
+  });
+
+  describe('#fetch', () => {
+    const config = { navigationTimeout: 1000, waitForElementsTimeout: 1000, language: 'en' };
+
+    it('waits for dynamically injected elements to appear in the DOM', async () => {
+      const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/dynamic`, ['.dynamic'], config);
+
+      expect(result.content).to.match(/<body[^>]*>.*<div class="dynamic">Loaded<\/div>.*<\/body>/s);
+    });
+
+    it('fails when waiting for non-existent elements exceeds timeout', async () => {
+      const url = `http://127.0.0.1:${SERVER_PORT}/dynamic`;
+      const timeout = 10;
+
+      await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
+    });
+
+    context('when a DOM element exists but its content is loaded asynchronously', () => {
+      it('waits for the element content to be fully loaded', async () => {
+        const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config);
+
+        expect(result.content).to.match(/<div class="content">Final content<\/div>/);
+      });
+
+      it('fails when content loading exceeds navigation timeout', async () => {
+        const url = `http://127.0.0.1:${SERVER_PORT}/delayed-content`;
+        const timeout = 10;
+
+        await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
+      });
+    });
+  });
+});
package/src/archivist/index.js
CHANGED

@@ -1,4 +1,5 @@
 import events from 'events';
+import { createRequire } from 'module';
 
 import async from 'async';
 
@@ -11,6 +12,9 @@ import Version from './recorder/version.js';
 import * as services from './services/index.js';
 import Service from './services/service.js';
 
+const require = createRequire(import.meta.url);
+const { version: PACKAGE_VERSION } = require('../../package.json');
+
 // The parallel handling feature is currently set to a parallelism of 1 on terms tracking
 // because when it's higher there are two issues:
 // - too many requests on the same endpoint yield 403
@@ -249,7 +253,7 @@ export default class Archivist extends events.EventEmitter {
       termsType: terms.type,
       fetchDate: terms.fetchDate,
       isExtractOnly: extractOnly,
-      metadata: { 'x-engine-version':
+      metadata: { 'x-engine-version': PACKAGE_VERSION },
     });
 
     await this.recorder.record(record);
@@ -275,7 +279,7 @@ export default class Archivist extends events.EventEmitter {
      content: sourceDocument.content,
      mimeType: sourceDocument.mimeType,
      metadata: {
-        'x-engine-version':
+        'x-engine-version': PACKAGE_VERSION,
        'x-fetcher': sourceDocument.fetcher,
        'x-source-document-location': sourceDocument.location,
      },
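
Aside: PACKAGE_VERSION is read straight from package.json with Node's createRequire, which lets an ES module load a JSON file without JSON import assertions. A minimal sketch of the pattern; the relative path depends on where the importing file lives:

  import { createRequire } from 'module';

  // createRequire builds a CommonJS-style require bound to this module's URL,
  // so the JSON path is resolved relative to this file rather than the working directory.
  const require = createRequire(import.meta.url);
  const { version: PACKAGE_VERSION } = require('../../package.json');

  console.log(PACKAGE_VERSION); // e.g. '5.5.0'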

package/src/collection-api/routes/metadata.js
CHANGED

@@ -1,7 +1,12 @@
+import { createRequire } from 'module';
+
 import express from 'express';
 
 import Service from '../../archivist/services/service.js';
 
+const require = createRequire(import.meta.url);
+const { version: PACKAGE_VERSION } = require('../../../package.json');
+
 /**
  * @param {object} collection The collection
  * @param {object} services The services of the collection
@@ -170,7 +175,6 @@ import Service from '../../archivist/services/service.js';
  */
 export default function metadataRouter(collection, services) {
   const router = express.Router();
-  const engineVersion = process.env.npm_package_version;
 
   /**
    * @swagger
@@ -188,7 +192,7 @@ export default function metadataRouter(collection, services) {
    const dynamicMetadata = {
      totalServices: Object.keys(services).length,
      totalTerms: Service.getNumberOfTerms(services),
-      engineVersion,
+      engineVersion: PACKAGE_VERSION,
    };
 
    res.json({
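
Aside: process.env.npm_package_version, dropped above, is only populated when the process is started through an npm script, which is likely why the version is now read from package.json instead. A quick illustration:

  // Started via an npm script (e.g. `npm start`): '5.5.0'
  // Started directly (e.g. `node src/index.js`): undefined
  console.log(process.env.npm_package_version);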

package/src/collection-api/routes/metadata.test.js
CHANGED

@@ -1,11 +1,15 @@
+import { createRequire } from 'module';
+
 import { expect } from 'chai';
 import config from 'config';
 import request from 'supertest';
 
 import app from '../server.js';
 
+const require = createRequire(import.meta.url);
+const { version: PACKAGE_VERSION } = require('../../../package.json');
+
 const basePath = config.get('@opentermsarchive/engine.collection-api.basePath');
-const engineVersion = process.env.npm_package_version;
 
 const EXPECTED_RESPONSE = {
   totalServices: 7,
@@ -80,7 +84,7 @@ describe('Metadata API', () => {
    it('returns expected metadata object', () => {
      expect(response.body).to.deep.equal({
        ...EXPECTED_RESPONSE,
-        engineVersion,
+        engineVersion: PACKAGE_VERSION,
      });
    });
  });
package/src/index.js
CHANGED

@@ -1,3 +1,5 @@
+import { createRequire } from 'module';
+
 import config from 'config';
 import cron from 'croner';
 import cronstrue from 'cronstrue';
@@ -8,6 +10,9 @@ import logger from './logger/index.js';
 import Notifier from './notifier/index.js';
 import Reporter from './reporter/index.js';
 
+const require = createRequire(import.meta.url);
+const { version: PACKAGE_VERSION } = require('../package.json');
+
 export default async function track({ services, types, extractOnly, schedule }) {
   const archivist = new Archivist({
     recorderConfig: config.get('@opentermsarchive/engine.recorder'),
@@ -21,7 +26,7 @@ export default async function track({ services, types, extractOnly, schedule })
   const collection = await getCollection();
   const collectionName = collection?.name ? ` with ${collection.name} collection` : '';
 
-  logger.info(`Start engine v${
+  logger.info(`Start engine v${PACKAGE_VERSION}${collectionName}\n`);
 
   if (services?.length) {
     services = services.filter(serviceId => {