@opentermsarchive/engine 0.26.1 → 0.27.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -3
- package/bin/ota-track.js +3 -3
- package/bin/ota-validate.js +2 -2
- package/bin/ota.js +1 -1
- package/config/default.json +1 -1
- package/package.json +3 -4
- package/scripts/dataset/export/index.js +4 -4
- package/scripts/dataset/export/index.test.js +11 -17
- package/scripts/declarations/lint/index.mocha.js +1 -1
- package/scripts/declarations/utils/index.js +12 -12
- package/scripts/declarations/validate/definitions.js +1 -1
- package/scripts/declarations/validate/index.mocha.js +30 -34
- package/scripts/declarations/validate/service.history.schema.js +11 -11
- package/scripts/declarations/validate/service.schema.js +13 -13
- package/scripts/history/migrate-services.js +4 -4
- package/scripts/history/update-to-full-hash.js +2 -2
- package/scripts/import/index.js +14 -14
- package/scripts/rewrite/rewrite-snapshots.js +3 -3
- package/scripts/rewrite/rewrite-versions.js +14 -14
- package/scripts/utils/renamer/README.md +3 -3
- package/scripts/utils/renamer/index.js +13 -13
- package/src/archivist/errors.js +1 -1
- package/src/archivist/extract/exports.js +3 -0
- package/src/archivist/{filter → extract}/index.js +23 -27
- package/src/archivist/extract/index.test.js +516 -0
- package/src/archivist/index.js +101 -140
- package/src/archivist/index.test.js +178 -166
- package/src/archivist/recorder/index.js +11 -55
- package/src/archivist/recorder/index.test.js +310 -356
- package/src/archivist/recorder/record.js +18 -7
- package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
- package/src/archivist/recorder/repositories/git/index.js +11 -15
- package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
- package/src/archivist/recorder/repositories/interface.js +8 -6
- package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
- package/src/archivist/recorder/repositories/mongo/index.js +8 -8
- package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
- package/src/archivist/recorder/snapshot.js +5 -0
- package/src/archivist/recorder/snapshot.test.js +65 -0
- package/src/archivist/recorder/version.js +14 -0
- package/src/archivist/recorder/version.test.js +65 -0
- package/src/archivist/services/index.js +60 -51
- package/src/archivist/services/index.test.js +63 -83
- package/src/archivist/services/service.js +26 -22
- package/src/archivist/services/service.test.js +46 -68
- package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
- package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
- package/src/archivist/services/terms.js +26 -0
- package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
- package/src/exports.js +2 -2
- package/src/index.js +16 -13
- package/src/logger/index.js +35 -36
- package/src/notifier/index.js +8 -8
- package/src/tracker/index.js +6 -6
- package/src/archivist/filter/exports.js +0 -3
- package/src/archivist/filter/index.test.js +0 -564
- package/src/archivist/recorder/record.test.js +0 -91
- package/src/archivist/services/documentDeclaration.js +0 -26
- package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
- package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
package/src/archivist/index.js
CHANGED
|
@@ -1,30 +1,30 @@
|
|
|
1
1
|
import events from 'events';
|
|
2
2
|
|
|
3
3
|
import async from 'async';
|
|
4
|
-
import config from 'config';
|
|
5
4
|
|
|
6
5
|
import { InaccessibleContentError } from './errors.js';
|
|
6
|
+
import extract from './extract/index.js';
|
|
7
7
|
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './fetcher/index.js';
|
|
8
|
-
import filter from './filter/index.js';
|
|
9
8
|
import Recorder from './recorder/index.js';
|
|
9
|
+
import Snapshot from './recorder/snapshot.js';
|
|
10
|
+
import Version from './recorder/version.js';
|
|
10
11
|
import * as services from './services/index.js';
|
|
12
|
+
import Service from './services/service.js';
|
|
11
13
|
|
|
12
|
-
// The parallel handling feature is currently set to a parallelism of 1 on
|
|
14
|
+
// The parallel handling feature is currently set to a parallelism of 1 on terms tracking
|
|
13
15
|
// because when it's higher there are two issues:
|
|
14
16
|
// - too many requests on the same endpoint yield 403
|
|
15
17
|
// - sometimes when creating a commit no SHA are returned for unknown reasons
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
+
const MAX_PARALLEL_TRACKING = 1;
|
|
19
|
+
const MAX_PARALLEL_EXTRACTING = 10;
|
|
18
20
|
|
|
19
|
-
export const AVAILABLE_EVENTS = [
|
|
21
|
+
export const EVENTS = [
|
|
20
22
|
'snapshotRecorded',
|
|
21
23
|
'firstSnapshotRecorded',
|
|
22
24
|
'snapshotNotChanged',
|
|
23
25
|
'versionRecorded',
|
|
24
26
|
'firstVersionRecorded',
|
|
25
27
|
'versionNotChanged',
|
|
26
|
-
'refilteringStarted',
|
|
27
|
-
'refilteringCompleted',
|
|
28
28
|
'trackingStarted',
|
|
29
29
|
'trackingCompleted',
|
|
30
30
|
'inaccessibleContent',
|
|
@@ -32,19 +32,15 @@ export const AVAILABLE_EVENTS = [
|
|
|
32
32
|
];
|
|
33
33
|
|
|
34
34
|
export default class Archivist extends events.EventEmitter {
|
|
35
|
-
get
|
|
36
|
-
return this.services;
|
|
35
|
+
get servicesIds() {
|
|
36
|
+
return Object.keys(this.services).sort((a, b) => a.localeCompare(b)); // Sort service IDs by lowercase name to have more intuitive logs;
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
return Object.keys(this.services);
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
constructor({ recorderConfig }) {
|
|
39
|
+
constructor({ recorderConfig, fetcherConfig }) {
|
|
44
40
|
super();
|
|
45
41
|
this.recorder = new Recorder(recorderConfig);
|
|
46
|
-
this.fetch = params => fetch({ ...params, config:
|
|
47
|
-
this.
|
|
42
|
+
this.fetch = params => fetch({ ...params, config: fetcherConfig });
|
|
43
|
+
this.extract = extract;
|
|
48
44
|
}
|
|
49
45
|
|
|
50
46
|
async initialize() {
|
|
@@ -53,7 +49,7 @@ export default class Archivist extends events.EventEmitter {
|
|
|
53
49
|
}
|
|
54
50
|
|
|
55
51
|
await this.recorder.initialize();
|
|
56
|
-
this.
|
|
52
|
+
this.initQueue();
|
|
57
53
|
this.services = await services.load();
|
|
58
54
|
|
|
59
55
|
this.on('error', async () => {
|
|
@@ -64,40 +60,32 @@ export default class Archivist extends events.EventEmitter {
|
|
|
64
60
|
process.exit(2);
|
|
65
61
|
}, 60 * 1000);
|
|
66
62
|
|
|
67
|
-
this.
|
|
68
|
-
this.trackDocumentChangesQueue.kill();
|
|
63
|
+
this.trackingQueue.kill();
|
|
69
64
|
await stopHeadlessBrowser().then(() => console.log('Headless browser stopped'));
|
|
70
65
|
await this.recorder.finalize().then(() => console.log('Recorder finalized'));
|
|
71
66
|
process.exit(1);
|
|
72
67
|
});
|
|
73
68
|
}
|
|
74
69
|
|
|
75
|
-
|
|
76
|
-
this.
|
|
77
|
-
this.
|
|
78
|
-
|
|
79
|
-
const queueErrorHandler = async (error, documentDeclaration) => {
|
|
80
|
-
const { service, type } = documentDeclaration;
|
|
81
|
-
|
|
70
|
+
initQueue() {
|
|
71
|
+
this.trackingQueue = async.queue(this.trackTermsChanges.bind(this), MAX_PARALLEL_TRACKING);
|
|
72
|
+
this.trackingQueue.error(async (error, { terms }) => {
|
|
82
73
|
if (error.toString().includes('HttpError: API rate limit exceeded for user ID')) {
|
|
83
74
|
return; // This is an error due to SendInBlue quota, bypass
|
|
84
75
|
}
|
|
85
76
|
|
|
86
77
|
if (error instanceof InaccessibleContentError) {
|
|
87
|
-
this.emit('inaccessibleContent', error,
|
|
78
|
+
this.emit('inaccessibleContent', error, terms);
|
|
88
79
|
|
|
89
80
|
return;
|
|
90
81
|
}
|
|
91
82
|
|
|
92
|
-
this.emit('error', error,
|
|
93
|
-
};
|
|
94
|
-
|
|
95
|
-
this.trackDocumentChangesQueue.error(queueErrorHandler);
|
|
96
|
-
this.refilterDocumentsQueue.error(queueErrorHandler);
|
|
83
|
+
this.emit('error', error, terms);
|
|
84
|
+
});
|
|
97
85
|
}
|
|
98
86
|
|
|
99
87
|
attach(listener) {
|
|
100
|
-
|
|
88
|
+
EVENTS.forEach(event => {
|
|
101
89
|
const handlerName = `on${event[0].toUpperCase()}${event.substring(1)}`;
|
|
102
90
|
|
|
103
91
|
if (listener[handlerName]) {
|
|
@@ -106,78 +94,57 @@ export default class Archivist extends events.EventEmitter {
|
|
|
106
94
|
});
|
|
107
95
|
}
|
|
108
96
|
|
|
109
|
-
async
|
|
110
|
-
this.emit('trackingStarted', servicesIds.length, this.
|
|
111
|
-
|
|
97
|
+
async track({ services: servicesIds = this.servicesIds, terms: termsTypes = [], extractOnly = false }) {
|
|
98
|
+
this.emit('trackingStarted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
|
|
112
99
|
await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
|
|
113
100
|
|
|
114
|
-
this
|
|
115
|
-
|
|
116
|
-
await this.trackDocumentChangesQueue.drain();
|
|
117
|
-
await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
|
|
118
|
-
|
|
119
|
-
this.emit('trackingCompleted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
async refilterAndRecord(servicesIds = this.serviceIds, documentTypes = []) {
|
|
123
|
-
this.emit('refilteringStarted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
|
|
124
|
-
|
|
125
|
-
await this.recorder.initialize();
|
|
126
|
-
|
|
127
|
-
this.#forEachDocumentOf(servicesIds, documentTypes, documentDeclaration => this.refilterDocumentsQueue.push(documentDeclaration));
|
|
101
|
+
this.trackingQueue.concurrency = extractOnly ? MAX_PARALLEL_EXTRACTING : MAX_PARALLEL_TRACKING;
|
|
128
102
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
103
|
+
servicesIds.forEach(serviceId => {
|
|
104
|
+
this.services[serviceId].getTermsTypes().forEach(termsType => {
|
|
105
|
+
if (termsTypes.length && !termsTypes.includes(termsType)) {
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
134
108
|
|
|
135
|
-
|
|
136
|
-
|
|
109
|
+
this.trackingQueue.push({ terms: this.services[serviceId].getTerms(termsType), extractOnly });
|
|
110
|
+
});
|
|
111
|
+
});
|
|
137
112
|
|
|
138
|
-
|
|
139
|
-
}
|
|
113
|
+
await this.trackingQueue.drain();
|
|
140
114
|
|
|
141
|
-
|
|
142
|
-
|
|
115
|
+
await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
|
|
116
|
+
this.emit('trackingCompleted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
|
|
143
117
|
}
|
|
144
118
|
|
|
145
|
-
async
|
|
146
|
-
|
|
119
|
+
async trackTermsChanges({ terms, extractOnly = false }) {
|
|
120
|
+
if (!extractOnly) {
|
|
121
|
+
await this.fetchSourceDocuments(terms);
|
|
122
|
+
await this.recordSnapshots(terms);
|
|
123
|
+
}
|
|
147
124
|
|
|
148
|
-
|
|
125
|
+
await this.loadSourceDocumentsFromSnapshots(terms);
|
|
149
126
|
|
|
150
|
-
if (!
|
|
127
|
+
if (terms.sourceDocuments.filter(sourceDocument => !sourceDocument.content).length) {
|
|
128
|
+
// If some source documents do not have associated snapshots, it is not possible to generate a fully valid version
|
|
151
129
|
return;
|
|
152
130
|
}
|
|
153
131
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
return this.recordVersion({
|
|
157
|
-
content: await this.generateDocumentFilteredContent(snapshots, pages),
|
|
158
|
-
snapshotIds: snapshots.map(({ id }) => id),
|
|
159
|
-
serviceId,
|
|
160
|
-
documentType,
|
|
161
|
-
fetchDate,
|
|
162
|
-
isRefiltering,
|
|
163
|
-
});
|
|
132
|
+
return this.recordVersion(terms, extractOnly);
|
|
164
133
|
}
|
|
165
134
|
|
|
166
|
-
async
|
|
135
|
+
async fetchSourceDocuments(terms) {
|
|
136
|
+
terms.fetchDate = new Date();
|
|
137
|
+
|
|
167
138
|
const inaccessibleContentErrors = [];
|
|
168
139
|
|
|
169
|
-
|
|
140
|
+
await Promise.all(terms.sourceDocuments.map(async sourceDocument => {
|
|
141
|
+
const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
|
|
142
|
+
|
|
170
143
|
try {
|
|
171
144
|
const { mimeType, content } = await this.fetch({ url, executeClientScripts, cssSelectors });
|
|
172
145
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
mimeType,
|
|
176
|
-
serviceId,
|
|
177
|
-
documentType,
|
|
178
|
-
pageId: isMultiPage && pageId,
|
|
179
|
-
fetchDate: new Date(),
|
|
180
|
-
};
|
|
146
|
+
sourceDocument.content = content;
|
|
147
|
+
sourceDocument.mimeType = mimeType;
|
|
181
148
|
} catch (error) {
|
|
182
149
|
if (!(error instanceof FetchDocumentError)) {
|
|
183
150
|
throw error;
|
|
@@ -198,79 +165,73 @@ export default class Archivist extends events.EventEmitter {
|
|
|
198
165
|
if (inaccessibleContentErrors.length) {
|
|
199
166
|
throw new InaccessibleContentError(inaccessibleContentErrors);
|
|
200
167
|
}
|
|
201
|
-
|
|
202
|
-
return result;
|
|
203
168
|
}
|
|
204
169
|
|
|
205
|
-
async
|
|
206
|
-
return
|
|
207
|
-
|
|
170
|
+
async loadSourceDocumentsFromSnapshots(terms) {
|
|
171
|
+
return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
|
|
172
|
+
const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
|
|
208
173
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
const pageDeclaration = pageId ? pages.find(({ id }) => pageId == id) : pages[0];
|
|
174
|
+
if (!snapshot) { // This can happen if one of the source documents for a terms has not yet been fetched
|
|
175
|
+
return;
|
|
176
|
+
}
|
|
213
177
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
178
|
+
sourceDocument.content = snapshot.content;
|
|
179
|
+
sourceDocument.mimeType = snapshot.mimeType;
|
|
180
|
+
terms.fetchDate = snapshot.fetchDate;
|
|
181
|
+
}));
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
async extractVersionContent(sourceDocuments) {
|
|
185
|
+
return (await Promise.all(sourceDocuments.map(async sourceDocument => this.extract(sourceDocument)))).join(Version.SOURCE_DOCUMENTS_SEPARATOR);
|
|
217
186
|
}
|
|
218
187
|
|
|
219
|
-
async
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
188
|
+
async recordVersion(terms, extractOnly) {
|
|
189
|
+
const record = new Version({
|
|
190
|
+
content: await this.extractVersionContent(terms.sourceDocuments),
|
|
191
|
+
snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
|
|
192
|
+
serviceId: terms.service.id,
|
|
193
|
+
termsType: terms.type,
|
|
194
|
+
fetchDate: terms.fetchDate,
|
|
195
|
+
isExtractOnly: extractOnly,
|
|
227
196
|
});
|
|
228
197
|
|
|
229
|
-
|
|
230
|
-
this.emit('snapshotNotChanged', serviceId, documentType, pageId);
|
|
198
|
+
await this.recorder.record(record);
|
|
231
199
|
|
|
232
|
-
|
|
200
|
+
if (!record.id) {
|
|
201
|
+
this.emit('versionNotChanged', record);
|
|
202
|
+
|
|
203
|
+
return record;
|
|
233
204
|
}
|
|
234
205
|
|
|
235
|
-
this.emit(isFirstRecord ? '
|
|
206
|
+
this.emit(record.isFirstRecord ? 'firstVersionRecorded' : 'versionRecorded', record);
|
|
236
207
|
|
|
237
|
-
return
|
|
208
|
+
return record;
|
|
238
209
|
}
|
|
239
210
|
|
|
240
|
-
async
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
211
|
+
async recordSnapshots(terms) {
|
|
212
|
+
return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
|
|
213
|
+
const record = new Snapshot({
|
|
214
|
+
serviceId: terms.service.id,
|
|
215
|
+
termsType: terms.type,
|
|
216
|
+
documentId: terms.hasMultipleSourceDocuments && sourceDocument.id,
|
|
217
|
+
fetchDate: terms.fetchDate,
|
|
218
|
+
content: sourceDocument.content,
|
|
219
|
+
mimeType: sourceDocument.mimeType,
|
|
220
|
+
});
|
|
250
221
|
|
|
251
|
-
|
|
252
|
-
this.emit('versionNotChanged', serviceId, documentType);
|
|
222
|
+
await this.recorder.record(record);
|
|
253
223
|
|
|
254
|
-
|
|
255
|
-
|
|
224
|
+
if (!record.id) {
|
|
225
|
+
this.emit('snapshotNotChanged', record);
|
|
256
226
|
|
|
257
|
-
|
|
258
|
-
|
|
227
|
+
return record;
|
|
228
|
+
}
|
|
259
229
|
|
|
260
|
-
|
|
261
|
-
return serviceIds.reduce((acc, serviceId) => acc + this.services[serviceId].getNumberOfDocuments(), 0);
|
|
262
|
-
}
|
|
230
|
+
sourceDocument.snapshotId = record.id;
|
|
263
231
|
|
|
264
|
-
|
|
265
|
-
servicesIds.sort((a, b) => a.localeCompare(b)); // Sort service IDs by lowercase name to have more intuitive logs
|
|
266
|
-
servicesIds.forEach(serviceId => {
|
|
267
|
-
this.services[serviceId].getDocumentTypes().forEach(documentType => {
|
|
268
|
-
if (documentTypes.length && !documentTypes.includes(documentType)) {
|
|
269
|
-
return;
|
|
270
|
-
}
|
|
232
|
+
this.emit(record.isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', record);
|
|
271
233
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
});
|
|
234
|
+
return record;
|
|
235
|
+
}));
|
|
275
236
|
}
|
|
276
237
|
}
|