@opentermsarchive/engine 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  export class FetchDocumentError extends Error {
2
2
  constructor(message) {
3
- super(message);
3
+ super(`Fetch failed: ${message}`);
4
4
  this.name = 'FetchDocumentError';
5
5
  }
6
6
  }
@@ -2,8 +2,6 @@ import puppeteer from 'puppeteer';
2
2
  import puppeteerExtra from 'puppeteer-extra';
3
3
  import stealthPlugin from 'puppeteer-extra-plugin-stealth';
4
4
 
5
- import { FetchDocumentError } from './errors.js';
6
-
7
5
  puppeteerExtra.use(stealthPlugin());
8
6
 
9
7
  let browser;
@@ -26,13 +24,13 @@ export default async function fetch(url, cssSelectors, config) {
26
24
  response = await page.goto(url, { waitUntil: 'networkidle0' });
27
25
 
28
26
  if (!response) {
29
- throw new FetchDocumentError(`Response is empty when trying to fetch '${url}'`);
27
+ throw new Error(`Response is empty when trying to fetch '${url}'`);
30
28
  }
31
29
 
32
30
  const statusCode = response.status();
33
31
 
34
32
  if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
35
- throw new FetchDocumentError(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
33
+ throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
36
34
  }
37
35
 
38
36
  const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
@@ -54,9 +52,9 @@ export default async function fetch(url, cssSelectors, config) {
54
52
  };
55
53
  } catch (error) {
56
54
  if (error instanceof puppeteer.errors.TimeoutError) {
57
- throw new FetchDocumentError(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
55
+ throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
58
56
  }
59
- throw new FetchDocumentError(error.message);
57
+ throw new Error(error.message);
60
58
  } finally {
61
59
  if (page) {
62
60
  await page.close();
@@ -5,8 +5,6 @@ import HttpProxyAgent from 'http-proxy-agent';
5
5
  import HttpsProxyAgent from 'https-proxy-agent';
6
6
  import nodeFetch, { AbortError } from 'node-fetch';
7
7
 
8
- import { FetchDocumentError } from './errors.js';
9
-
10
8
  export default async function fetch(url, configuration) {
11
9
  const controller = new AbortController();
12
10
  const timeout = setTimeout(() => controller.abort(), configuration.navigationTimeout);
@@ -29,10 +27,11 @@ export default async function fetch(url, configuration) {
29
27
  response = await nodeFetch(url, nodeFetchOptions);
30
28
 
31
29
  if (!response.ok) {
32
- throw new FetchDocumentError(`Received HTTP code ${response.status} when trying to fetch '${url}'`);
30
+ throw new Error(`Received HTTP code ${response.status} when trying to fetch '${url}'`);
33
31
  }
34
32
 
35
33
  const mimeType = response.headers.get('content-type');
34
+ const contentLength = response.headers.get('content-length');
36
35
  const responseBuffer = await response.arrayBuffer();
37
36
  let content;
38
37
 
@@ -42,8 +41,8 @@ export default async function fetch(url, configuration) {
42
41
  content = Buffer.from(responseBuffer);
43
42
  }
44
43
 
45
- if (!content) {
46
- throw new FetchDocumentError(`Received an empty content when fetching '${url}'`);
44
+ if (contentLength == 0 || !content) {
45
+ throw new Error(`Received an empty content when fetching '${url}'`);
47
46
  }
48
47
 
49
48
  return {
@@ -52,10 +51,10 @@ export default async function fetch(url, configuration) {
52
51
  };
53
52
  } catch (error) {
54
53
  if (error instanceof AbortError) {
55
- throw new FetchDocumentError(`Timed out after ${configuration.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
54
+ throw new Error(`Timed out after ${configuration.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
56
55
  }
57
56
 
58
- throw new FetchDocumentError(error.message);
57
+ throw new Error(error.message);
59
58
  } finally {
60
59
  clearTimeout(timeout);
61
60
  }
@@ -1,5 +1,6 @@
1
1
  import config from 'config';
2
2
 
3
+ import { FetchDocumentError } from './errors.js';
3
4
  import fetchFullDom from './fullDomFetcher.js';
4
5
  import fetchHtmlOnly from './htmlOnlyFetcher.js';
5
6
 
@@ -28,9 +29,13 @@ export default async function fetch({
28
29
  waitForElementsTimeout = config.get('fetcher.waitForElementsTimeout'),
29
30
  } = {},
30
31
  }) {
31
- if (executeClientScripts) {
32
- return fetchFullDom(url, cssSelectors, { navigationTimeout, language, waitForElementsTimeout });
33
- }
32
+ try {
33
+ if (executeClientScripts) {
34
+ return await fetchFullDom(url, cssSelectors, { navigationTimeout, language, waitForElementsTimeout });
35
+ }
34
36
 
35
- return fetchHtmlOnly(url, { navigationTimeout, language });
37
+ return await fetchHtmlOnly(url, { navigationTimeout, language });
38
+ } catch (error) {
39
+ throw new FetchDocumentError(error.message);
40
+ }
36
41
  }
@@ -41,6 +41,9 @@ describe('Fetcher', function () {
41
41
  if (request.url == '/404') {
42
42
  response.writeHead(404, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>404</body></html>');
43
43
  }
44
+ if (request.url === '/zero-content') {
45
+ response.writeHead(200, { 'Content-Type': 'text/html', 'Content-Length': '0' }).write('');
46
+ }
44
47
  if (request.url == '/terms.pdf') {
45
48
  expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
46
49
 
@@ -70,11 +73,11 @@ describe('Fetcher', function () {
70
73
  ({ content, mimeType } = await fetch({ url, selectors: 'body' }));
71
74
  });
72
75
 
73
- it('returns the web page content of the given URL', async () => {
76
+ it('returns the web page content of the given URL', () => {
74
77
  expect(content).to.equal(termsHTML);
75
78
  });
76
79
 
77
- it('returns the MIME type of the given URL', async () => {
80
+ it('returns the MIME type of the given URL', () => {
78
81
  expect(mimeType).to.equal('text/html');
79
82
  });
80
83
 
@@ -83,11 +86,11 @@ describe('Fetcher', function () {
83
86
  ({ content, mimeType } = await fetch({ url, selectors: 'body', executeClientScripts: true }));
84
87
  });
85
88
 
86
- it('returns the web page content of the given URL', async () => {
89
+ it('returns the web page content of the given URL', () => {
87
90
  expect(content).to.equal(termsHTML);
88
91
  });
89
92
 
90
- it('returns the MIME type of the given URL', async () => {
93
+ it('returns the MIME type of the given URL', () => {
91
94
  expect(mimeType).to.equal('text/html');
92
95
  });
93
96
  });
@@ -100,11 +103,11 @@ describe('Fetcher', function () {
100
103
  ({ content, mimeType } = await fetch({ url, selectors: NOT_PRESENT_SELECTOR }));
101
104
  });
102
105
 
103
- it('returns the web page content of the given URL', async () => {
106
+ it('returns the web page content of the given URL', () => {
104
107
  expect(content).to.equal(termsHTML);
105
108
  });
106
109
 
107
- it('returns the MIME type of the given URL', async () => {
110
+ it('returns the MIME type of the given URL', () => {
108
111
  expect(mimeType).to.equal('text/html');
109
112
  });
110
113
 
@@ -113,11 +116,11 @@ describe('Fetcher', function () {
113
116
  ({ content, mimeType } = await fetch({ url, selectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
114
117
  });
115
118
 
116
- it('returns the web page content of the given URL', async () => {
119
+ it('returns the web page content of the given URL', () => {
117
120
  expect(content).to.equal(termsHTML);
118
121
  });
119
122
 
120
- it('returns the MIME type of the given URL', async () => {
123
+ it('returns the MIME type of the given URL', () => {
121
124
  expect(mimeType).to.equal('text/html');
122
125
  });
123
126
  });
@@ -133,7 +136,7 @@ describe('Fetcher', function () {
133
136
  ({ content } = await fetch({ url, selectors: 'body' }));
134
137
  });
135
138
 
136
- it('returns the web page content of the given URL', async () => {
139
+ it('returns the web page content of the given URL', () => {
137
140
  expect(content).to.equal(termsWithOtherCharsetHTML);
138
141
  });
139
142
  });
@@ -148,18 +151,26 @@ describe('Fetcher', function () {
148
151
  ({ content, mimeType } = await fetch({ url: pdfUrl }));
149
152
  });
150
153
 
151
- it('returns a buffer for PDF content', async () => {
154
+ it('returns a buffer for PDF content', () => {
152
155
  expect(content).to.be.an.instanceOf(Buffer);
153
156
  });
154
157
 
155
- it('returns a blob with the file type', async () => {
158
+ it('returns a blob with the file type', () => {
156
159
  expect(mimeType).to.equal('application/pdf');
157
160
  });
158
161
 
159
- it('returns a blob with the file content', async () => {
162
+ it('returns a blob with the file content', () => {
160
163
  expect(content.equals(expectedPDFContent)).to.be.true;
161
164
  });
162
165
  });
166
+
167
+ context('when server responds with empty content', () => {
168
+ const zeroContentUrl = `http://127.0.0.1:${SERVER_PORT}/zero-content`;
169
+
170
+ it('throws a FetchDocumentError error', async () => {
171
+ await expect(fetch({ url: zeroContentUrl })).to.be.rejectedWith(FetchDocumentError, /empty content/);
172
+ });
173
+ });
163
174
  });
164
175
 
165
176
  describe('Error handling', () => {
@@ -3,7 +3,7 @@ import events from 'events';
3
3
  import async from 'async';
4
4
 
5
5
  import { InaccessibleContentError } from './errors.js';
6
- import extract from './extract/index.js';
6
+ import extract, { ExtractDocumentError } from './extract/index.js';
7
7
  import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './fetcher/index.js';
8
8
  import Recorder from './recorder/index.js';
9
9
  import Snapshot from './recorder/snapshot.js';
@@ -72,7 +72,7 @@ export default class Archivist extends events.EventEmitter {
72
72
 
73
73
  initQueue() {
74
74
  this.trackingQueue = async.queue(this.trackTermsChanges.bind(this), MAX_PARALLEL_TRACKING);
75
- this.trackingQueue.error(async (error, { terms }) => {
75
+ this.trackingQueue.error((error, { terms }) => {
76
76
  if (error instanceof InaccessibleContentError) {
77
77
  this.emit('inaccessibleContent', error, terms);
78
78
 
@@ -99,8 +99,11 @@ export default class Archivist extends events.EventEmitter {
99
99
  });
100
100
  }
101
101
 
102
- async track({ services: servicesIds = this.servicesIds, terms: termsTypes = [], extractOnly = false }) {
103
- this.emit('trackingStarted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
102
+ async track({ services: servicesIds = this.servicesIds, types: termsTypes = [], extractOnly = false } = {}) {
103
+ const numberOfTerms = Service.getNumberOfTerms(this.services, servicesIds, termsTypes);
104
+
105
+ this.emit('trackingStarted', servicesIds.length, numberOfTerms, extractOnly);
106
+
104
107
  await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
105
108
 
106
109
  this.trackingQueue.concurrency = extractOnly ? MAX_PARALLEL_EXTRACTING : MAX_PARALLEL_TRACKING;
@@ -120,7 +123,8 @@ export default class Archivist extends events.EventEmitter {
120
123
  }
121
124
 
122
125
  await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
123
- this.emit('trackingCompleted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
126
+
127
+ this.emit('trackingCompleted', servicesIds.length, numberOfTerms, extractOnly);
124
128
  }
125
129
 
126
130
  async trackTermsChanges({ terms, extractOnly = false }) {
@@ -142,7 +146,7 @@ export default class Archivist extends events.EventEmitter {
142
146
  async fetchSourceDocuments(terms) {
143
147
  terms.fetchDate = new Date();
144
148
 
145
- const inaccessibleContentErrors = [];
149
+ const fetchDocumentErrors = [];
146
150
 
147
151
  await Promise.all(terms.sourceDocuments.map(async sourceDocument => {
148
152
  const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
@@ -157,16 +161,16 @@ export default class Archivist extends events.EventEmitter {
157
161
  throw error;
158
162
  }
159
163
 
160
- inaccessibleContentErrors.push(error.message);
164
+ fetchDocumentErrors.push(error.message);
161
165
  }
162
166
  }));
163
167
 
164
- if (inaccessibleContentErrors.length) {
165
- throw new InaccessibleContentError(inaccessibleContentErrors);
168
+ if (fetchDocumentErrors.length) {
169
+ throw new InaccessibleContentError(fetchDocumentErrors);
166
170
  }
167
171
  }
168
172
 
169
- async loadSourceDocumentsFromSnapshots(terms) {
173
+ loadSourceDocumentsFromSnapshots(terms) {
170
174
  return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
171
175
  const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
172
176
 
@@ -182,12 +186,32 @@ export default class Archivist extends events.EventEmitter {
182
186
  }
183
187
 
184
188
  async extractVersionContent(sourceDocuments) {
185
- return (await Promise.all(sourceDocuments.map(async sourceDocument => this.extract(sourceDocument)))).join(Version.SOURCE_DOCUMENTS_SEPARATOR);
189
+ const extractDocumentErrors = [];
190
+
191
+ const result = await Promise.all(sourceDocuments.map(async sourceDocument => {
192
+ try {
193
+ return await this.extract(sourceDocument);
194
+ } catch (error) {
195
+ if (!(error instanceof ExtractDocumentError)) {
196
+ throw error;
197
+ }
198
+
199
+ extractDocumentErrors.push(error.message);
200
+ }
201
+ }));
202
+
203
+ if (extractDocumentErrors.length) {
204
+ throw new InaccessibleContentError(extractDocumentErrors);
205
+ }
206
+
207
+ return result.join(Version.SOURCE_DOCUMENTS_SEPARATOR);
186
208
  }
187
209
 
188
210
  async recordVersion(terms, extractOnly) {
211
+ const content = await this.extractVersionContent(terms.sourceDocuments);
212
+
189
213
  const record = new Version({
190
- content: await this.extractVersionContent(terms.sourceDocuments),
214
+ content,
191
215
  snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
192
216
  serviceId: terms.service.id,
193
217
  termsType: terms.type,
@@ -208,7 +232,7 @@ export default class Archivist extends events.EventEmitter {
208
232
  return record;
209
233
  }
210
234
 
211
- async recordSnapshots(terms) {
235
+ recordSnapshots(terms) {
212
236
  return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
213
237
  const record = new Snapshot({
214
238
  serviceId: terms.service.id,
@@ -390,7 +390,7 @@ describe('Archivist', function () {
390
390
  before(async () => {
391
391
  await app.recordSnapshots(terms);
392
392
  resetSpiesHistory();
393
- terms.sourceDocuments.forEach(async sourceDocument => {
393
+ terms.sourceDocuments.forEach(sourceDocument => {
394
394
  sourceDocument.content = serviceBSnapshotExpectedContent;
395
395
  });
396
396
  [changedSnapshot] = await app.recordSnapshots(terms);
@@ -402,7 +402,7 @@ describe('Archivist', function () {
402
402
  return resetGitRepositories();
403
403
  });
404
404
 
405
- it('emits "snapshotRecorded" event', async () => {
405
+ it('emits "snapshotRecorded" event', () => {
406
406
  expect(spies.onSnapshotRecorded).to.have.been.calledWith(changedSnapshot);
407
407
  });
408
408
 
@@ -437,10 +437,10 @@ describe('Archivist', function () {
437
437
  let terms;
438
438
  let version;
439
439
 
440
- before(async () => {
440
+ before(() => {
441
441
  terms = app.services.service·A.getTerms({ type: SERVICE_A_TYPE });
442
442
  terms.fetchDate = FETCH_DATE;
443
- terms.sourceDocuments.forEach(async sourceDocument => {
443
+ terms.sourceDocuments.forEach(sourceDocument => {
444
444
  sourceDocument.content = serviceASnapshotExpectedContent;
445
445
  sourceDocument.mimeType = MIME_TYPE;
446
446
  });
@@ -458,7 +458,7 @@ describe('Archivist', function () {
458
458
  return resetGitRepositories();
459
459
  });
460
460
 
461
- it('emits "firstVersionRecorded" event', async () => {
461
+ it('emits "firstVersionRecorded" event', () => {
462
462
  expect(spies.onFirstVersionRecorded).to.have.been.calledWith(version);
463
463
  });
464
464
 
@@ -529,23 +529,23 @@ describe('Archivist', function () {
529
529
  return resetGitRepositories();
530
530
  });
531
531
 
532
- it('emits "trackingStarted" event', async () => {
532
+ it('emits "trackingStarted" event', () => {
533
533
  expect(spies.onTrackingStarted).to.have.been.calledOnce;
534
534
  });
535
535
 
536
- it('emits "firstSnapshotRecorded" events', async () => {
536
+ it('emits "firstSnapshotRecorded" events', () => {
537
537
  expect(spies.onFirstSnapshotRecorded).to.have.been.calledTwice;
538
538
  });
539
539
 
540
- it('emits "firstVersionRecorded" events', async () => {
540
+ it('emits "firstVersionRecorded" events', () => {
541
541
  expect(spies.onFirstVersionRecorded).to.have.been.calledTwice;
542
542
  });
543
543
 
544
- it('emits "firstVersionRecorded" events after "firstSnapshotRecorded" events', async () => {
544
+ it('emits "firstVersionRecorded" events after "firstSnapshotRecorded" events', () => {
545
545
  expect(spies.onFirstVersionRecorded).to.have.been.calledAfter(spies.onFirstSnapshotRecorded);
546
546
  });
547
547
 
548
- it('emits "trackingCompleted" event', async () => {
548
+ it('emits "trackingCompleted" event', () => {
549
549
  expect(spies.onTrackingCompleted).to.have.been.calledAfter(spies.onTrackingStarted);
550
550
  });
551
551
 
@@ -579,23 +579,23 @@ describe('Archivist', function () {
579
579
  return resetGitRepositories();
580
580
  });
581
581
 
582
- it('emits "trackingStarted" event', async () => {
582
+ it('emits "trackingStarted" event', () => {
583
583
  expect(spies.onTrackingStarted).to.have.been.calledOnce;
584
584
  });
585
585
 
586
- it('emits "snapshotNotChanged" events', async () => {
586
+ it('emits "snapshotNotChanged" events', () => {
587
587
  expect(spies.onSnapshotNotChanged).to.have.been.calledTwice;
588
588
  });
589
589
 
590
- it('emits "versionNotChanged" events', async () => {
590
+ it('emits "versionNotChanged" events', () => {
591
591
  expect(spies.onVersionNotChanged).to.have.been.calledTwice;
592
592
  });
593
593
 
594
- it('emits "versionNotChanged" events after "snapshotRecorded" events', async () => {
594
+ it('emits "versionNotChanged" events after "snapshotRecorded" events', () => {
595
595
  expect(spies.onVersionNotChanged).to.have.been.calledAfter(spies.onSnapshotNotChanged);
596
596
  });
597
597
 
598
- it('emits "trackingCompleted" event', async () => {
598
+ it('emits "trackingCompleted" event', () => {
599
599
  expect(spies.onTrackingCompleted).to.have.been.calledAfter(spies.onTrackingStarted);
600
600
  });
601
601
 
@@ -644,31 +644,31 @@ describe('Archivist', function () {
644
644
  return resetGitRepositories();
645
645
  });
646
646
 
647
- it('emits "trackingStarted" event', async () => {
647
+ it('emits "trackingStarted" event', () => {
648
648
  expect(spies.onTrackingStarted).to.have.been.calledOnce;
649
649
  });
650
650
 
651
- it('emits "snapshotNotChanged" event for the service that was not changed', async () => {
651
+ it('emits "snapshotNotChanged" event for the service that was not changed', () => {
652
652
  expect(spies.onSnapshotNotChanged).to.have.been.calledOnceWith(snapshotB);
653
653
  });
654
654
 
655
- it('emits "snapshotRecorded" event for the service that was changed', async () => {
655
+ it('emits "snapshotRecorded" event for the service that was changed', () => {
656
656
  expect(spies.onSnapshotRecorded).to.have.been.calledOnceWith(snapshotA);
657
657
  });
658
658
 
659
- it('emits "versionNotChanged" events for the service that was not changed', async () => {
659
+ it('emits "versionNotChanged" events for the service that was not changed', () => {
660
660
  expect(spies.onVersionNotChanged).to.have.been.calledOnceWith(versionB);
661
661
  });
662
662
 
663
- it('emits "versionRecorded" event for the service that was changed', async () => {
663
+ it('emits "versionRecorded" event for the service that was changed', () => {
664
664
  expect(spies.onVersionRecorded).to.have.been.calledOnceWith(versionA);
665
665
  });
666
666
 
667
- it('emits "snapshotRecorded" events after "versionRecorded" events', async () => {
667
+ it('emits "snapshotRecorded" events after "versionRecorded" events', () => {
668
668
  expect(spies.onVersionRecorded).to.have.been.calledAfter(spies.onSnapshotRecorded);
669
669
  });
670
670
 
671
- it('emits "trackingCompleted" event', async () => {
671
+ it('emits "trackingCompleted" event', () => {
672
672
  expect(spies.onTrackingCompleted).to.have.been.calledAfter(spies.onTrackingStarted);
673
673
  });
674
674
 
@@ -28,8 +28,14 @@ export default class Service {
28
28
  return history?.find(entry => new Date(date) <= new Date(entry.validUntil)) || currentlyValidTerms;
29
29
  }
30
30
 
31
- getTermsTypes() {
32
- return Object.keys(this.terms);
31
+ getTermsTypes(termsTypes) {
32
+ let result = Object.keys(this.terms);
33
+
34
+ if (termsTypes) {
35
+ result = result.filter(item => termsTypes.includes(item));
36
+ }
37
+
38
+ return result;
33
39
  }
34
40
 
35
41
  addTerms(terms) {
@@ -54,8 +60,8 @@ export default class Service {
54
60
  return this.terms[termsType].history.map(entry => entry.validUntil);
55
61
  }
56
62
 
57
- getNumberOfTerms() {
58
- return this.getTermsTypes().length;
63
+ getNumberOfTerms(termsTypes) {
64
+ return this.getTermsTypes(termsTypes).length;
59
65
  }
60
66
 
61
67
  hasHistory() {
@@ -63,7 +69,7 @@ export default class Service {
63
69
  return Boolean(Object.keys(this.terms).find(termsType => this.terms[termsType].history));
64
70
  }
65
71
 
66
- static getNumberOfTerms(services, servicesIds) {
67
- return servicesIds.reduce((acc, serviceId) => acc + services[serviceId].getNumberOfTerms(), 0);
72
+ static getNumberOfTerms(services, servicesIds, termsTypes) {
73
+ return servicesIds.reduce((acc, serviceId) => acc + services[serviceId].getNumberOfTerms(termsTypes), 0);
68
74
  }
69
75
  }