@opentermsarchive/engine 9.0.0 → 9.1.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions exactly as they appear in the public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "9.0.0",
3
+ "version": "9.1.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -158,46 +158,30 @@ export default class Archivist extends events.EventEmitter {
158
158
 
159
159
  async trackTermsChanges({ terms, extractOnly = false }) {
160
160
  if (!extractOnly) {
161
- await this.fetchSourceDocuments(terms);
162
- await this.recordSnapshots(terms);
161
+ await this.fetchAndRecordSnapshots(terms);
163
162
  }
164
163
 
165
- await this.loadSourceDocumentsFromSnapshots(terms);
164
+ const contents = await this.extractContentsFromSnapshots(terms);
166
165
 
167
- if (terms.sourceDocuments.filter(sourceDocument => !sourceDocument.content).length) {
168
- // If some source documents do not have associated snapshots, it is not possible to generate a fully valid version
166
+ if (contents.filter(Boolean).length !== terms.sourceDocuments.length) { // If there is not content for all source documents, it is not possible to generate a fully valid version
169
167
  return;
170
168
  }
171
169
 
172
- await this.recordVersion(terms, extractOnly);
173
-
174
- terms.sourceDocuments.forEach(sourceDocument => {
175
- sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
176
- sourceDocument.mimeType = null; // …and associated MIME type
177
- sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
178
- });
170
+ await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR), extractOnly);
179
171
  }
180
172
 
181
- async fetchSourceDocuments(terms) {
173
+ async fetchAndRecordSnapshots(terms) {
182
174
  terms.fetchDate = new Date();
183
-
184
175
  const fetchDocumentErrors = [];
185
176
 
186
177
  for (const sourceDocument of terms.sourceDocuments) {
187
- const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
188
-
189
- try {
190
- const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
191
-
192
- sourceDocument.content = content;
193
- sourceDocument.mimeType = mimeType;
194
- sourceDocument.fetcher = fetcher;
195
- } catch (error) {
196
- if (!(error instanceof FetchDocumentError)) {
197
- throw error;
198
- }
178
+ const error = await this.fetchSourceDocument(sourceDocument);
199
179
 
180
+ if (error) {
200
181
  fetchDocumentErrors.push(error);
182
+ } else {
183
+ await this.recordSnapshot(terms, sourceDocument);
184
+ sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
201
185
  }
202
186
  }
203
187
 
@@ -206,27 +190,49 @@ export default class Archivist extends events.EventEmitter {
206
190
  }
207
191
  }
208
192
 
209
- loadSourceDocumentsFromSnapshots(terms) {
210
- return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
211
- const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
193
+ async fetchSourceDocument(sourceDocument) {
194
+ const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
212
195
 
213
- if (!snapshot) { // This can happen if one of the source documents for a terms has not yet been fetched
214
- return;
196
+ try {
197
+ const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
198
+
199
+ sourceDocument.content = content;
200
+ sourceDocument.mimeType = mimeType;
201
+ sourceDocument.fetcher = fetcher;
202
+ } catch (error) {
203
+ if (!(error instanceof FetchDocumentError)) {
204
+ throw error;
215
205
  }
216
206
 
217
- sourceDocument.content = snapshot.content;
218
- sourceDocument.mimeType = snapshot.mimeType;
219
- sourceDocument.snapshotId = snapshot.id;
220
- terms.fetchDate = snapshot.fetchDate;
221
- }));
207
+ return error;
208
+ }
222
209
  }
223
210
 
224
- async extractVersionContent(sourceDocuments) {
211
+ async extractContentsFromSnapshots(terms) {
225
212
  const extractDocumentErrors = [];
226
213
 
227
- const result = await Promise.all(sourceDocuments.map(async sourceDocument => {
214
+ const contents = await Promise.all(terms.sourceDocuments.map(async sourceDocument => {
215
+ const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
216
+
228
217
  try {
229
- return await this.extract(sourceDocument);
218
+ if (!snapshot) { // This can happen if one of the source documents for a terms has not yet been fetched
219
+ return;
220
+ }
221
+
222
+ sourceDocument.content = snapshot.content;
223
+ sourceDocument.mimeType = snapshot.mimeType;
224
+ sourceDocument.snapshotId = snapshot.id;
225
+ terms.fetchDate = snapshot.fetchDate;
226
+
227
+ if (!sourceDocument.content) {
228
+ throw new ExtractDocumentError(`Empty content for source document ${sourceDocument.location} in snapshot ${snapshot.id}`);
229
+ }
230
+
231
+ const content = await this.extract(sourceDocument);
232
+
233
+ sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
234
+
235
+ return content;
230
236
  } catch (error) {
231
237
  if (!(error instanceof ExtractDocumentError)) {
232
238
  throw error;
@@ -240,12 +246,10 @@ export default class Archivist extends events.EventEmitter {
240
246
  throw new InaccessibleContentError(extractDocumentErrors);
241
247
  }
242
248
 
243
- return result.join(Version.SOURCE_DOCUMENTS_SEPARATOR);
249
+ return contents;
244
250
  }
245
251
 
246
- async recordVersion(terms, extractOnly) {
247
- const content = await this.extractVersionContent(terms.sourceDocuments);
248
-
252
+ async recordVersion(terms, content, extractOnly) {
249
253
  const record = new Version({
250
254
  content,
251
255
  snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
@@ -269,35 +273,33 @@ export default class Archivist extends events.EventEmitter {
269
273
  return record;
270
274
  }
271
275
 
272
- recordSnapshots(terms) {
273
- return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
274
- const record = new Snapshot({
275
- serviceId: terms.service.id,
276
- termsType: terms.type,
277
- documentId: terms.hasMultipleSourceDocuments && sourceDocument.id,
278
- fetchDate: terms.fetchDate,
279
- content: sourceDocument.content,
280
- mimeType: sourceDocument.mimeType,
281
- metadata: {
282
- 'x-engine-version': PACKAGE_VERSION,
283
- 'x-fetcher': sourceDocument.fetcher,
284
- 'x-source-document-location': sourceDocument.location,
285
- },
286
- });
276
+ async recordSnapshot(terms, sourceDocument) {
277
+ const record = new Snapshot({
278
+ serviceId: terms.service.id,
279
+ termsType: terms.type,
280
+ documentId: terms.hasMultipleSourceDocuments && sourceDocument.id,
281
+ fetchDate: terms.fetchDate,
282
+ content: sourceDocument.content,
283
+ mimeType: sourceDocument.mimeType,
284
+ metadata: {
285
+ 'x-engine-version': PACKAGE_VERSION,
286
+ 'x-fetcher': sourceDocument.fetcher,
287
+ 'x-source-document-location': sourceDocument.location,
288
+ },
289
+ });
287
290
 
288
- await this.recorder.record(record);
291
+ await this.recorder.record(record);
289
292
 
290
- if (!record.id) {
291
- this.emit('snapshotNotChanged', record);
293
+ if (!record.id) {
294
+ this.emit('snapshotNotChanged', record);
292
295
 
293
- return record;
294
- }
296
+ return record;
297
+ }
295
298
 
296
- sourceDocument.snapshotId = record.id;
299
+ sourceDocument.snapshotId = record.id;
297
300
 
298
- this.emit(record.isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', record);
301
+ this.emit(record.isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', record);
299
302
 
300
- return record;
301
- }));
303
+ return record;
302
304
  }
303
305
  }
@@ -276,8 +276,8 @@ describe('Archivist', function () {
276
276
  service: { id: 'test-service' },
277
277
  type: 'test-type',
278
278
  sourceDocuments: [
279
- { location: 'https://example.com/doc1' },
280
- { location: 'https://example.com/doc2' },
279
+ { location: 'https://example.com/doc1', content: 'test', mimeType: 'text/html' },
280
+ { location: 'https://example.com/doc2', content: 'test', mimeType: 'text/html' },
281
281
  ],
282
282
  };
283
283
  });
@@ -446,7 +446,7 @@ describe('Archivist', function () {
446
446
  });
447
447
  });
448
448
 
449
- describe('#recordSnapshots', () => {
449
+ describe('#recordSnapshot', () => {
450
450
  let terms;
451
451
  let snapshot;
452
452
 
@@ -462,7 +462,7 @@ describe('Archivist', function () {
462
462
 
463
463
  context('when it is the first record', () => {
464
464
  before(async () => {
465
- [snapshot] = await app.recordSnapshots(terms);
465
+ snapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
466
466
  });
467
467
 
468
468
  after(() => {
@@ -483,12 +483,12 @@ describe('Archivist', function () {
483
483
  let changedSnapshot;
484
484
 
485
485
  before(async () => {
486
- await app.recordSnapshots(terms);
486
+ await app.recordSnapshot(terms, terms.sourceDocuments[0]);
487
487
  resetSpiesHistory();
488
488
  terms.sourceDocuments.forEach(sourceDocument => {
489
489
  sourceDocument.content = serviceBSnapshotExpectedContent;
490
490
  });
491
- [changedSnapshot] = await app.recordSnapshots(terms);
491
+ changedSnapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
492
492
  });
493
493
 
494
494
  after(() => {
@@ -508,9 +508,9 @@ describe('Archivist', function () {
508
508
  let snapshot;
509
509
 
510
510
  before(async () => {
511
- await app.recordSnapshots(terms);
511
+ await app.recordSnapshot(terms, terms.sourceDocuments[0]);
512
512
  resetSpiesHistory();
513
- [snapshot] = await app.recordSnapshots(terms);
513
+ snapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
514
514
  });
515
515
 
516
516
  after(() => {
@@ -544,7 +544,7 @@ describe('Archivist', function () {
544
544
 
545
545
  context('when it is the first record', () => {
546
546
  before(async () => {
547
- version = await app.recordVersion(terms);
547
+ version = await app.recordVersion(terms, 'content');
548
548
  });
549
549
 
550
550
  after(() => {
@@ -565,12 +565,12 @@ describe('Archivist', function () {
565
565
  let changedVersion;
566
566
 
567
567
  before(async () => {
568
- await app.recordVersion(terms);
568
+ await app.recordVersion(terms, 'content');
569
569
  resetSpiesHistory();
570
570
  terms.sourceDocuments.forEach(sourceDocument => {
571
571
  sourceDocument.content = serviceBSnapshotExpectedContent;
572
572
  });
573
- changedVersion = await app.recordVersion(terms);
573
+ changedVersion = await app.recordVersion(terms, 'content updated');
574
574
  });
575
575
 
576
576
  after(() => {
@@ -590,9 +590,9 @@ describe('Archivist', function () {
590
590
  let version;
591
591
 
592
592
  before(async () => {
593
- await app.recordVersion(terms);
593
+ await app.recordVersion(terms, 'content');
594
594
  resetSpiesHistory();
595
- version = await app.recordVersion(terms);
595
+ version = await app.recordVersion(terms, 'content');
596
596
  });
597
597
 
598
598
  after(() => {
@@ -35,6 +35,11 @@ export default class SourceDocument {
35
35
  return result.filter(selector => selector);
36
36
  }
37
37
 
38
+ clearContent() {
39
+ this.content = null;
40
+ this.mimeType = null;
41
+ }
42
+
38
43
  static extractCssSelectorsFromProperty(property) {
39
44
  if (Array.isArray(property)) {
40
45
  return []