@opentermsarchive/engine 8.0.0 → 9.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/archivist/index.js
CHANGED
|
@@ -158,46 +158,30 @@ export default class Archivist extends events.EventEmitter {
|
|
|
158
158
|
|
|
159
159
|
async trackTermsChanges({ terms, extractOnly = false }) {
|
|
160
160
|
if (!extractOnly) {
|
|
161
|
-
await this.
|
|
162
|
-
await this.recordSnapshots(terms);
|
|
161
|
+
await this.fetchAndRecordSnapshots(terms);
|
|
163
162
|
}
|
|
164
163
|
|
|
165
|
-
await this.
|
|
164
|
+
const contents = await this.extractContentsFromSnapshots(terms);
|
|
166
165
|
|
|
167
|
-
if (
|
|
168
|
-
// If some source documents do not have associated snapshots, it is not possible to generate a fully valid version
|
|
166
|
+
if (contents.filter(Boolean).length !== terms.sourceDocuments.length) { // If there is not content for all source documents, it is not possible to generate a fully valid version
|
|
169
167
|
return;
|
|
170
168
|
}
|
|
171
169
|
|
|
172
|
-
await this.recordVersion(terms, extractOnly);
|
|
173
|
-
|
|
174
|
-
terms.sourceDocuments.forEach(sourceDocument => {
|
|
175
|
-
sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
|
|
176
|
-
sourceDocument.mimeType = null; // …and associated MIME type
|
|
177
|
-
sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
|
|
178
|
-
});
|
|
170
|
+
await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR), extractOnly);
|
|
179
171
|
}
|
|
180
172
|
|
|
181
|
-
async
|
|
173
|
+
async fetchAndRecordSnapshots(terms) {
|
|
182
174
|
terms.fetchDate = new Date();
|
|
183
|
-
|
|
184
175
|
const fetchDocumentErrors = [];
|
|
185
176
|
|
|
186
177
|
for (const sourceDocument of terms.sourceDocuments) {
|
|
187
|
-
const
|
|
188
|
-
|
|
189
|
-
try {
|
|
190
|
-
const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
|
|
191
|
-
|
|
192
|
-
sourceDocument.content = content;
|
|
193
|
-
sourceDocument.mimeType = mimeType;
|
|
194
|
-
sourceDocument.fetcher = fetcher;
|
|
195
|
-
} catch (error) {
|
|
196
|
-
if (!(error instanceof FetchDocumentError)) {
|
|
197
|
-
throw error;
|
|
198
|
-
}
|
|
178
|
+
const error = await this.fetchSourceDocument(sourceDocument);
|
|
199
179
|
|
|
180
|
+
if (error) {
|
|
200
181
|
fetchDocumentErrors.push(error);
|
|
182
|
+
} else {
|
|
183
|
+
await this.recordSnapshot(terms, sourceDocument);
|
|
184
|
+
sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
|
|
201
185
|
}
|
|
202
186
|
}
|
|
203
187
|
|
|
@@ -206,27 +190,49 @@ export default class Archivist extends events.EventEmitter {
|
|
|
206
190
|
}
|
|
207
191
|
}
|
|
208
192
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
|
|
193
|
+
async fetchSourceDocument(sourceDocument) {
|
|
194
|
+
const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
|
|
212
195
|
|
|
213
|
-
|
|
214
|
-
|
|
196
|
+
try {
|
|
197
|
+
const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
|
|
198
|
+
|
|
199
|
+
sourceDocument.content = content;
|
|
200
|
+
sourceDocument.mimeType = mimeType;
|
|
201
|
+
sourceDocument.fetcher = fetcher;
|
|
202
|
+
} catch (error) {
|
|
203
|
+
if (!(error instanceof FetchDocumentError)) {
|
|
204
|
+
throw error;
|
|
215
205
|
}
|
|
216
206
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
sourceDocument.snapshotId = snapshot.id;
|
|
220
|
-
terms.fetchDate = snapshot.fetchDate;
|
|
221
|
-
}));
|
|
207
|
+
return error;
|
|
208
|
+
}
|
|
222
209
|
}
|
|
223
210
|
|
|
224
|
-
async
|
|
211
|
+
async extractContentsFromSnapshots(terms) {
|
|
225
212
|
const extractDocumentErrors = [];
|
|
226
213
|
|
|
227
|
-
const
|
|
214
|
+
const contents = await Promise.all(terms.sourceDocuments.map(async sourceDocument => {
|
|
215
|
+
const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
|
|
216
|
+
|
|
228
217
|
try {
|
|
229
|
-
|
|
218
|
+
if (!snapshot) { // This can happen if one of the source documents for a terms has not yet been fetched
|
|
219
|
+
return;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
sourceDocument.content = snapshot.content;
|
|
223
|
+
sourceDocument.mimeType = snapshot.mimeType;
|
|
224
|
+
sourceDocument.snapshotId = snapshot.id;
|
|
225
|
+
terms.fetchDate = snapshot.fetchDate;
|
|
226
|
+
|
|
227
|
+
if (!sourceDocument.content) {
|
|
228
|
+
throw new ExtractDocumentError(`Empty content for source document ${sourceDocument.location} in snapshot ${snapshot.id}`);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
const content = await this.extract(sourceDocument);
|
|
232
|
+
|
|
233
|
+
sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
|
|
234
|
+
|
|
235
|
+
return content;
|
|
230
236
|
} catch (error) {
|
|
231
237
|
if (!(error instanceof ExtractDocumentError)) {
|
|
232
238
|
throw error;
|
|
@@ -240,12 +246,10 @@ export default class Archivist extends events.EventEmitter {
|
|
|
240
246
|
throw new InaccessibleContentError(extractDocumentErrors);
|
|
241
247
|
}
|
|
242
248
|
|
|
243
|
-
return
|
|
249
|
+
return contents;
|
|
244
250
|
}
|
|
245
251
|
|
|
246
|
-
async recordVersion(terms, extractOnly) {
|
|
247
|
-
const content = await this.extractVersionContent(terms.sourceDocuments);
|
|
248
|
-
|
|
252
|
+
async recordVersion(terms, content, extractOnly) {
|
|
249
253
|
const record = new Version({
|
|
250
254
|
content,
|
|
251
255
|
snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
|
|
@@ -269,35 +273,33 @@ export default class Archivist extends events.EventEmitter {
|
|
|
269
273
|
return record;
|
|
270
274
|
}
|
|
271
275
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
});
|
|
276
|
+
async recordSnapshot(terms, sourceDocument) {
|
|
277
|
+
const record = new Snapshot({
|
|
278
|
+
serviceId: terms.service.id,
|
|
279
|
+
termsType: terms.type,
|
|
280
|
+
documentId: terms.hasMultipleSourceDocuments && sourceDocument.id,
|
|
281
|
+
fetchDate: terms.fetchDate,
|
|
282
|
+
content: sourceDocument.content,
|
|
283
|
+
mimeType: sourceDocument.mimeType,
|
|
284
|
+
metadata: {
|
|
285
|
+
'x-engine-version': PACKAGE_VERSION,
|
|
286
|
+
'x-fetcher': sourceDocument.fetcher,
|
|
287
|
+
'x-source-document-location': sourceDocument.location,
|
|
288
|
+
},
|
|
289
|
+
});
|
|
287
290
|
|
|
288
|
-
|
|
291
|
+
await this.recorder.record(record);
|
|
289
292
|
|
|
290
|
-
|
|
291
|
-
|
|
293
|
+
if (!record.id) {
|
|
294
|
+
this.emit('snapshotNotChanged', record);
|
|
292
295
|
|
|
293
|
-
|
|
294
|
-
|
|
296
|
+
return record;
|
|
297
|
+
}
|
|
295
298
|
|
|
296
|
-
|
|
299
|
+
sourceDocument.snapshotId = record.id;
|
|
297
300
|
|
|
298
|
-
|
|
301
|
+
this.emit(record.isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', record);
|
|
299
302
|
|
|
300
|
-
|
|
301
|
-
}));
|
|
303
|
+
return record;
|
|
302
304
|
}
|
|
303
305
|
}
|
|
@@ -276,8 +276,8 @@ describe('Archivist', function () {
|
|
|
276
276
|
service: { id: 'test-service' },
|
|
277
277
|
type: 'test-type',
|
|
278
278
|
sourceDocuments: [
|
|
279
|
-
{ location: 'https://example.com/doc1' },
|
|
280
|
-
{ location: 'https://example.com/doc2' },
|
|
279
|
+
{ location: 'https://example.com/doc1', content: 'test', mimeType: 'text/html' },
|
|
280
|
+
{ location: 'https://example.com/doc2', content: 'test', mimeType: 'text/html' },
|
|
281
281
|
],
|
|
282
282
|
};
|
|
283
283
|
});
|
|
@@ -446,7 +446,7 @@ describe('Archivist', function () {
|
|
|
446
446
|
});
|
|
447
447
|
});
|
|
448
448
|
|
|
449
|
-
describe('#
|
|
449
|
+
describe('#recordSnapshot', () => {
|
|
450
450
|
let terms;
|
|
451
451
|
let snapshot;
|
|
452
452
|
|
|
@@ -462,7 +462,7 @@ describe('Archivist', function () {
|
|
|
462
462
|
|
|
463
463
|
context('when it is the first record', () => {
|
|
464
464
|
before(async () => {
|
|
465
|
-
|
|
465
|
+
snapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
|
|
466
466
|
});
|
|
467
467
|
|
|
468
468
|
after(() => {
|
|
@@ -483,12 +483,12 @@ describe('Archivist', function () {
|
|
|
483
483
|
let changedSnapshot;
|
|
484
484
|
|
|
485
485
|
before(async () => {
|
|
486
|
-
await app.
|
|
486
|
+
await app.recordSnapshot(terms, terms.sourceDocuments[0]);
|
|
487
487
|
resetSpiesHistory();
|
|
488
488
|
terms.sourceDocuments.forEach(sourceDocument => {
|
|
489
489
|
sourceDocument.content = serviceBSnapshotExpectedContent;
|
|
490
490
|
});
|
|
491
|
-
|
|
491
|
+
changedSnapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
|
|
492
492
|
});
|
|
493
493
|
|
|
494
494
|
after(() => {
|
|
@@ -508,9 +508,9 @@ describe('Archivist', function () {
|
|
|
508
508
|
let snapshot;
|
|
509
509
|
|
|
510
510
|
before(async () => {
|
|
511
|
-
await app.
|
|
511
|
+
await app.recordSnapshot(terms, terms.sourceDocuments[0]);
|
|
512
512
|
resetSpiesHistory();
|
|
513
|
-
|
|
513
|
+
snapshot = await app.recordSnapshot(terms, terms.sourceDocuments[0]);
|
|
514
514
|
});
|
|
515
515
|
|
|
516
516
|
after(() => {
|
|
@@ -544,7 +544,7 @@ describe('Archivist', function () {
|
|
|
544
544
|
|
|
545
545
|
context('when it is the first record', () => {
|
|
546
546
|
before(async () => {
|
|
547
|
-
version = await app.recordVersion(terms);
|
|
547
|
+
version = await app.recordVersion(terms, 'content');
|
|
548
548
|
});
|
|
549
549
|
|
|
550
550
|
after(() => {
|
|
@@ -565,12 +565,12 @@ describe('Archivist', function () {
|
|
|
565
565
|
let changedVersion;
|
|
566
566
|
|
|
567
567
|
before(async () => {
|
|
568
|
-
await app.recordVersion(terms);
|
|
568
|
+
await app.recordVersion(terms, 'content');
|
|
569
569
|
resetSpiesHistory();
|
|
570
570
|
terms.sourceDocuments.forEach(sourceDocument => {
|
|
571
571
|
sourceDocument.content = serviceBSnapshotExpectedContent;
|
|
572
572
|
});
|
|
573
|
-
changedVersion = await app.recordVersion(terms);
|
|
573
|
+
changedVersion = await app.recordVersion(terms, 'content updated');
|
|
574
574
|
});
|
|
575
575
|
|
|
576
576
|
after(() => {
|
|
@@ -590,9 +590,9 @@ describe('Archivist', function () {
|
|
|
590
590
|
let version;
|
|
591
591
|
|
|
592
592
|
before(async () => {
|
|
593
|
-
await app.recordVersion(terms);
|
|
593
|
+
await app.recordVersion(terms, 'content');
|
|
594
594
|
resetSpiesHistory();
|
|
595
|
-
version = await app.recordVersion(terms);
|
|
595
|
+
version = await app.recordVersion(terms, 'content');
|
|
596
596
|
});
|
|
597
597
|
|
|
598
598
|
after(() => {
|
|
@@ -35,6 +35,11 @@ export default class SourceDocument {
|
|
|
35
35
|
return result.filter(selector => selector);
|
|
36
36
|
}
|
|
37
37
|
|
|
38
|
+
clearContent() {
|
|
39
|
+
this.content = null;
|
|
40
|
+
this.mimeType = null;
|
|
41
|
+
}
|
|
42
|
+
|
|
38
43
|
static extractCssSelectorsFromProperty(property) {
|
|
39
44
|
if (Array.isArray(property)) {
|
|
40
45
|
return []
|
|
@@ -155,10 +155,10 @@ const { version: PACKAGE_VERSION } = require('../../../package.json');
|
|
|
155
155
|
* roles:
|
|
156
156
|
* type: array
|
|
157
157
|
* description: Roles of the entity within the governance, see [collection governance](https://docs.opentermsarchive.org/collections/reference/governance/)
|
|
158
|
-
* example: [host,
|
|
158
|
+
* example: [host, sysadmin]
|
|
159
159
|
* items:
|
|
160
160
|
* type: string
|
|
161
|
-
* enum: [host,
|
|
161
|
+
* enum: [host, sysadmin, curator, maintainer, contributor, analyst, diffuser, sponsor]
|
|
162
162
|
* i18n:
|
|
163
163
|
* type: object
|
|
164
164
|
* description: Internationalization of any of the Metadata properties (except i18n itself) for different language codes
|
|
@@ -30,38 +30,28 @@ const EXPECTED_RESPONSE = {
|
|
|
30
30
|
jurisdictions: [
|
|
31
31
|
'EU',
|
|
32
32
|
],
|
|
33
|
+
trackingPeriods: [
|
|
34
|
+
{
|
|
35
|
+
startDate: '2025-07-17',
|
|
36
|
+
schedule: '30 */12 * * *',
|
|
37
|
+
serverLocation: 'Strasbourg, FR',
|
|
38
|
+
},
|
|
39
|
+
],
|
|
33
40
|
governance: {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
logo: 'https://opentermsarchive.org/images/logo/logo-open-terms-archive-black.png',
|
|
49
|
-
},
|
|
50
|
-
],
|
|
51
|
-
maintainers: [
|
|
52
|
-
{
|
|
53
|
-
name: 'Open Terms Archive',
|
|
54
|
-
url: 'https://opentermsarchive.org/',
|
|
55
|
-
logo: 'https://opentermsarchive.org/images/logo/logo-open-terms-archive-black.png',
|
|
56
|
-
},
|
|
57
|
-
],
|
|
58
|
-
sponsors: [
|
|
59
|
-
{
|
|
60
|
-
name: 'Open Terms Archive',
|
|
61
|
-
url: 'https://opentermsarchive.org/',
|
|
62
|
-
logo: 'https://opentermsarchive.org/images/logo/logo-open-terms-archive-black.png',
|
|
63
|
-
},
|
|
64
|
-
],
|
|
41
|
+
'Open Terms Archive': {
|
|
42
|
+
url: 'https://opentermsarchive.org',
|
|
43
|
+
logo: 'https://opentermsarchive.org/images/logo/logo-open-terms-archive-black.png',
|
|
44
|
+
roles: [
|
|
45
|
+
'host',
|
|
46
|
+
'sysadmin',
|
|
47
|
+
'curator',
|
|
48
|
+
'maintainer',
|
|
49
|
+
'contributor',
|
|
50
|
+
'analyst',
|
|
51
|
+
'diffuser',
|
|
52
|
+
'sponsor',
|
|
53
|
+
],
|
|
54
|
+
},
|
|
65
55
|
},
|
|
66
56
|
};
|
|
67
57
|
|