@opentermsarchive/engine 0.26.0 → 0.27.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +1 -469
  2. package/bin/ota-track.js +3 -3
  3. package/bin/ota-validate.js +2 -2
  4. package/bin/ota.js +1 -1
  5. package/config/default.json +1 -1
  6. package/config/test.json +2 -2
  7. package/package.json +6 -7
  8. package/scripts/dataset/export/index.js +4 -4
  9. package/scripts/dataset/export/index.test.js +11 -17
  10. package/scripts/dataset/export/test/fixtures/dataset/README.md +1 -1
  11. package/scripts/declarations/lint/index.mocha.js +1 -1
  12. package/scripts/declarations/utils/index.js +12 -12
  13. package/scripts/declarations/validate/definitions.js +1 -1
  14. package/scripts/declarations/validate/index.mocha.js +30 -34
  15. package/scripts/declarations/validate/service.history.schema.js +11 -11
  16. package/scripts/declarations/validate/service.schema.js +13 -13
  17. package/scripts/history/migrate-services.js +4 -4
  18. package/scripts/history/update-to-full-hash.js +2 -2
  19. package/scripts/import/index.js +14 -14
  20. package/scripts/rewrite/config/rewrite-snapshots.json +1 -1
  21. package/scripts/rewrite/config/rewrite-versions.json +1 -1
  22. package/scripts/rewrite/rewrite-snapshots.js +3 -3
  23. package/scripts/rewrite/rewrite-versions.js +14 -14
  24. package/scripts/utils/renamer/README.md +3 -3
  25. package/scripts/utils/renamer/index.js +13 -13
  26. package/src/archivist/errors.js +1 -1
  27. package/src/archivist/extract/exports.js +3 -0
  28. package/src/archivist/{filter → extract}/index.js +23 -27
  29. package/src/archivist/extract/index.test.js +516 -0
  30. package/src/archivist/index.js +101 -140
  31. package/src/archivist/index.test.js +178 -166
  32. package/src/archivist/recorder/index.js +11 -55
  33. package/src/archivist/recorder/index.test.js +310 -356
  34. package/src/archivist/recorder/record.js +18 -7
  35. package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
  36. package/src/archivist/recorder/repositories/git/index.js +11 -15
  37. package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
  38. package/src/archivist/recorder/repositories/interface.js +8 -6
  39. package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
  40. package/src/archivist/recorder/repositories/mongo/index.js +8 -8
  41. package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
  42. package/src/archivist/recorder/snapshot.js +5 -0
  43. package/src/archivist/recorder/snapshot.test.js +65 -0
  44. package/src/archivist/recorder/version.js +14 -0
  45. package/src/archivist/recorder/version.test.js +65 -0
  46. package/src/archivist/services/index.js +60 -51
  47. package/src/archivist/services/index.test.js +63 -83
  48. package/src/archivist/services/service.js +26 -22
  49. package/src/archivist/services/service.test.js +46 -68
  50. package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
  51. package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
  52. package/src/archivist/services/terms.js +26 -0
  53. package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
  54. package/src/exports.js +2 -2
  55. package/src/index.js +16 -13
  56. package/src/logger/index.js +35 -36
  57. package/src/notifier/index.js +8 -8
  58. package/src/tracker/index.js +6 -6
  59. package/src/archivist/filter/exports.js +0 -3
  60. package/src/archivist/filter/index.test.js +0 -564
  61. package/src/archivist/recorder/record.test.js +0 -91
  62. package/src/archivist/services/documentDeclaration.js +0 -26
  63. /package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
  64. /package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
@@ -1,30 +1,30 @@
1
1
  import events from 'events';
2
2
 
3
3
  import async from 'async';
4
- import config from 'config';
5
4
 
6
5
  import { InaccessibleContentError } from './errors.js';
6
+ import extract from './extract/index.js';
7
7
  import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './fetcher/index.js';
8
- import filter from './filter/index.js';
9
8
  import Recorder from './recorder/index.js';
9
+ import Snapshot from './recorder/snapshot.js';
10
+ import Version from './recorder/version.js';
10
11
  import * as services from './services/index.js';
12
+ import Service from './services/service.js';
11
13
 
12
- // The parallel handling feature is currently set to a parallelism of 1 on document tracking
14
+ // The parallel handling feature is currently set to a parallelism of 1 on terms tracking
13
15
  // because when it's higher there are two issues:
14
16
  // - too many requests on the same endpoint yield 403
15
17
  // - sometimes when creating a commit, no SHA is returned for unknown reasons
16
- const MAX_PARALLEL_DOCUMENTS_TRACKS = 1;
17
- const MAX_PARALLEL_REFILTERS = 10;
18
+ const MAX_PARALLEL_TRACKING = 1;
19
+ const MAX_PARALLEL_EXTRACTING = 10;
18
20
 
19
- export const AVAILABLE_EVENTS = [
21
+ export const EVENTS = [
20
22
  'snapshotRecorded',
21
23
  'firstSnapshotRecorded',
22
24
  'snapshotNotChanged',
23
25
  'versionRecorded',
24
26
  'firstVersionRecorded',
25
27
  'versionNotChanged',
26
- 'refilteringStarted',
27
- 'refilteringCompleted',
28
28
  'trackingStarted',
29
29
  'trackingCompleted',
30
30
  'inaccessibleContent',
@@ -32,19 +32,15 @@ export const AVAILABLE_EVENTS = [
32
32
  ];
33
33
 
34
34
  export default class Archivist extends events.EventEmitter {
35
- get serviceDeclarations() {
36
- return this.services;
35
+ get servicesIds() {
36
+ return Object.keys(this.services).sort((a, b) => a.localeCompare(b)); // Sort service IDs in locale-aware alphabetical order to have more intuitive logs
37
37
  }
38
38
 
39
- get serviceIds() {
40
- return Object.keys(this.services);
41
- }
42
-
43
- constructor({ recorderConfig }) {
39
+ constructor({ recorderConfig, fetcherConfig }) {
44
40
  super();
45
41
  this.recorder = new Recorder(recorderConfig);
46
- this.fetch = params => fetch({ ...params, config: config.get('fetcher') });
47
- this.filter = filter;
42
+ this.fetch = params => fetch({ ...params, config: fetcherConfig });
43
+ this.extract = extract;
48
44
  }
49
45
 
50
46
  async initialize() {
@@ -53,7 +49,7 @@ export default class Archivist extends events.EventEmitter {
53
49
  }
54
50
 
55
51
  await this.recorder.initialize();
56
- this.initQueues();
52
+ this.initQueue();
57
53
  this.services = await services.load();
58
54
 
59
55
  this.on('error', async () => {
@@ -64,40 +60,32 @@ export default class Archivist extends events.EventEmitter {
64
60
  process.exit(2);
65
61
  }, 60 * 1000);
66
62
 
67
- this.refilterDocumentsQueue.kill();
68
- this.trackDocumentChangesQueue.kill();
63
+ this.trackingQueue.kill();
69
64
  await stopHeadlessBrowser().then(() => console.log('Headless browser stopped'));
70
65
  await this.recorder.finalize().then(() => console.log('Recorder finalized'));
71
66
  process.exit(1);
72
67
  });
73
68
  }
74
69
 
75
- initQueues() {
76
- this.trackDocumentChangesQueue = async.queue(async documentDeclaration => this.trackDocumentChanges(documentDeclaration), MAX_PARALLEL_DOCUMENTS_TRACKS);
77
- this.refilterDocumentsQueue = async.queue(async documentDeclaration => this.refilterAndRecordDocument(documentDeclaration), MAX_PARALLEL_REFILTERS);
78
-
79
- const queueErrorHandler = async (error, documentDeclaration) => {
80
- const { service, type } = documentDeclaration;
81
-
70
+ initQueue() {
71
+ this.trackingQueue = async.queue(this.trackTermsChanges.bind(this), MAX_PARALLEL_TRACKING);
72
+ this.trackingQueue.error(async (error, { terms }) => {
82
73
  if (error.toString().includes('HttpError: API rate limit exceeded for user ID')) {
83
74
  return; // This is an error due to SendInBlue quota, bypass
84
75
  }
85
76
 
86
77
  if (error instanceof InaccessibleContentError) {
87
- this.emit('inaccessibleContent', error, service.id, type, documentDeclaration);
78
+ this.emit('inaccessibleContent', error, terms);
88
79
 
89
80
  return;
90
81
  }
91
82
 
92
- this.emit('error', error, service.id, type);
93
- };
94
-
95
- this.trackDocumentChangesQueue.error(queueErrorHandler);
96
- this.refilterDocumentsQueue.error(queueErrorHandler);
83
+ this.emit('error', error, terms);
84
+ });
97
85
  }
98
86
 
99
87
  attach(listener) {
100
- AVAILABLE_EVENTS.forEach(event => {
88
+ EVENTS.forEach(event => {
101
89
  const handlerName = `on${event[0].toUpperCase()}${event.substring(1)}`;
102
90
 
103
91
  if (listener[handlerName]) {
@@ -106,78 +94,57 @@ export default class Archivist extends events.EventEmitter {
106
94
  });
107
95
  }
108
96
 
109
- async trackChanges(servicesIds = this.serviceIds, documentTypes = []) {
110
- this.emit('trackingStarted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
111
-
97
+ async track({ services: servicesIds = this.servicesIds, terms: termsTypes = [], extractOnly = false }) {
98
+ this.emit('trackingStarted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
112
99
  await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
113
100
 
114
- this.#forEachDocumentOf(servicesIds, documentTypes, documentDeclaration => this.trackDocumentChangesQueue.push(documentDeclaration));
115
-
116
- await this.trackDocumentChangesQueue.drain();
117
- await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
118
-
119
- this.emit('trackingCompleted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
120
- }
121
-
122
- async refilterAndRecord(servicesIds = this.serviceIds, documentTypes = []) {
123
- this.emit('refilteringStarted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
124
-
125
- await this.recorder.initialize();
126
-
127
- this.#forEachDocumentOf(servicesIds, documentTypes, documentDeclaration => this.refilterDocumentsQueue.push(documentDeclaration));
101
+ this.trackingQueue.concurrency = extractOnly ? MAX_PARALLEL_EXTRACTING : MAX_PARALLEL_TRACKING;
128
102
 
129
- await this.refilterDocumentsQueue.drain();
130
- await this.recorder.finalize();
131
-
132
- this.emit('refilteringCompleted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
133
- }
103
+ servicesIds.forEach(serviceId => {
104
+ this.services[serviceId].getTermsTypes().forEach(termsType => {
105
+ if (termsTypes.length && !termsTypes.includes(termsType)) {
106
+ return;
107
+ }
134
108
 
135
- async trackDocumentChanges(documentDeclaration) {
136
- await Promise.all((await this.fetchDocumentPages(documentDeclaration)).map(params => this.recordSnapshot(params)));
109
+ this.trackingQueue.push({ terms: this.services[serviceId].getTerms(termsType), extractOnly });
110
+ });
111
+ });
137
112
 
138
- return this.generateDocumentVersion(documentDeclaration);
139
- }
113
+ await this.trackingQueue.drain();
140
114
 
141
- async refilterAndRecordDocument(documentDeclaration) {
142
- return this.generateDocumentVersion(documentDeclaration, { isRefiltering: true });
115
+ await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
116
+ this.emit('trackingCompleted', servicesIds.length, Service.getNumberOfTerms(this.services, servicesIds), extractOnly);
143
117
  }
144
118
 
145
- async generateDocumentVersion(documentDeclaration, { isRefiltering = false } = {}) {
146
- const { service: { id: serviceId }, type: documentType, pages } = documentDeclaration;
119
+ async trackTermsChanges({ terms, extractOnly = false }) {
120
+ if (!extractOnly) {
121
+ await this.fetchSourceDocuments(terms);
122
+ await this.recordSnapshots(terms);
123
+ }
147
124
 
148
- const snapshots = await this.getDocumentSnapshots(documentDeclaration);
125
+ await this.loadSourceDocumentsFromSnapshots(terms);
149
126
 
150
- if (!snapshots.length) {
127
+ if (terms.sourceDocuments.filter(sourceDocument => !sourceDocument.content).length) {
128
+ // If some source documents do not have associated snapshots, it is not possible to generate a fully valid version
151
129
  return;
152
130
  }
153
131
 
154
- const [{ fetchDate }] = snapshots; // In case of multipage document, use the first snapshot fetch date
155
-
156
- return this.recordVersion({
157
- content: await this.generateDocumentFilteredContent(snapshots, pages),
158
- snapshotIds: snapshots.map(({ id }) => id),
159
- serviceId,
160
- documentType,
161
- fetchDate,
162
- isRefiltering,
163
- });
132
+ return this.recordVersion(terms, extractOnly);
164
133
  }
165
134
 
166
- async fetchDocumentPages({ service: { id: serviceId }, type: documentType, pages, isMultiPage }) {
135
+ async fetchSourceDocuments(terms) {
136
+ terms.fetchDate = new Date();
137
+
167
138
  const inaccessibleContentErrors = [];
168
139
 
169
- const result = await Promise.all(pages.map(async ({ location: url, executeClientScripts, cssSelectors, id: pageId }) => {
140
+ await Promise.all(terms.sourceDocuments.map(async sourceDocument => {
141
+ const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
142
+
170
143
  try {
171
144
  const { mimeType, content } = await this.fetch({ url, executeClientScripts, cssSelectors });
172
145
 
173
- return {
174
- content,
175
- mimeType,
176
- serviceId,
177
- documentType,
178
- pageId: isMultiPage && pageId,
179
- fetchDate: new Date(),
180
- };
146
+ sourceDocument.content = content;
147
+ sourceDocument.mimeType = mimeType;
181
148
  } catch (error) {
182
149
  if (!(error instanceof FetchDocumentError)) {
183
150
  throw error;
@@ -198,79 +165,73 @@ export default class Archivist extends events.EventEmitter {
198
165
  if (inaccessibleContentErrors.length) {
199
166
  throw new InaccessibleContentError(inaccessibleContentErrors);
200
167
  }
201
-
202
- return result;
203
168
  }
204
169
 
205
- async getDocumentSnapshots({ service: { id: serviceId }, type: documentType, pages, isMultiPage }) {
206
- return (await Promise.all(pages.map(async page => this.recorder.getLatestSnapshot(serviceId, documentType, isMultiPage && page.id)))).filter(Boolean);
207
- }
170
+ async loadSourceDocumentsFromSnapshots(terms) {
171
+ return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
172
+ const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
208
173
 
209
- async generateDocumentFilteredContent(snapshots, pages) {
210
- return (
211
- await Promise.all(snapshots.map(async ({ pageId, content, mimeType }) => {
212
- const pageDeclaration = pageId ? pages.find(({ id }) => pageId == id) : pages[0];
174
+ if (!snapshot) { // This can happen if one of the source documents for a terms has not yet been fetched
175
+ return;
176
+ }
213
177
 
214
- return this.filter({ content, mimeType, pageDeclaration });
215
- }))
216
- ).join('\n\n');
178
+ sourceDocument.content = snapshot.content;
179
+ sourceDocument.mimeType = snapshot.mimeType;
180
+ terms.fetchDate = snapshot.fetchDate;
181
+ }));
182
+ }
183
+
184
+ async extractVersionContent(sourceDocuments) {
185
+ return (await Promise.all(sourceDocuments.map(async sourceDocument => this.extract(sourceDocument)))).join(Version.SOURCE_DOCUMENTS_SEPARATOR);
217
186
  }
218
187
 
219
- async recordSnapshot({ content, mimeType, fetchDate, serviceId, documentType, pageId }) {
220
- const { id: snapshotId, isFirstRecord } = await this.recorder.recordSnapshot({
221
- serviceId,
222
- documentType,
223
- pageId,
224
- content,
225
- mimeType,
226
- fetchDate,
188
+ async recordVersion(terms, extractOnly) {
189
+ const record = new Version({
190
+ content: await this.extractVersionContent(terms.sourceDocuments),
191
+ snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
192
+ serviceId: terms.service.id,
193
+ termsType: terms.type,
194
+ fetchDate: terms.fetchDate,
195
+ isExtractOnly: extractOnly,
227
196
  });
228
197
 
229
- if (!snapshotId) {
230
- this.emit('snapshotNotChanged', serviceId, documentType, pageId);
198
+ await this.recorder.record(record);
231
199
 
232
- return;
200
+ if (!record.id) {
201
+ this.emit('versionNotChanged', record);
202
+
203
+ return record;
233
204
  }
234
205
 
235
- this.emit(isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', serviceId, documentType, pageId, snapshotId);
206
+ this.emit(record.isFirstRecord ? 'firstVersionRecorded' : 'versionRecorded', record);
236
207
 
237
- return snapshotId;
208
+ return record;
238
209
  }
239
210
 
240
- async recordVersion({ content, fetchDate, snapshotIds, serviceId, documentType, isRefiltering }) {
241
- const recordFunction = !isRefiltering ? 'recordVersion' : 'recordRefilter';
242
-
243
- const { id: versionId, isFirstRecord } = await this.recorder[recordFunction]({
244
- serviceId,
245
- documentType,
246
- content,
247
- fetchDate,
248
- snapshotIds,
249
- });
211
+ async recordSnapshots(terms) {
212
+ return Promise.all(terms.sourceDocuments.map(async sourceDocument => {
213
+ const record = new Snapshot({
214
+ serviceId: terms.service.id,
215
+ termsType: terms.type,
216
+ documentId: terms.hasMultipleSourceDocuments && sourceDocument.id,
217
+ fetchDate: terms.fetchDate,
218
+ content: sourceDocument.content,
219
+ mimeType: sourceDocument.mimeType,
220
+ });
250
221
 
251
- if (!versionId) {
252
- this.emit('versionNotChanged', serviceId, documentType);
222
+ await this.recorder.record(record);
253
223
 
254
- return;
255
- }
224
+ if (!record.id) {
225
+ this.emit('snapshotNotChanged', record);
256
226
 
257
- this.emit(isFirstRecord ? 'firstVersionRecorded' : 'versionRecorded', serviceId, documentType, versionId);
258
- }
227
+ return record;
228
+ }
259
229
 
260
- getNumberOfDocuments(serviceIds = this.serviceIds) {
261
- return serviceIds.reduce((acc, serviceId) => acc + this.services[serviceId].getNumberOfDocuments(), 0);
262
- }
230
+ sourceDocument.snapshotId = record.id;
263
231
 
264
- async #forEachDocumentOf(servicesIds = [], documentTypes = [], callback) { // eslint-disable-line default-param-last
265
- servicesIds.sort((a, b) => a.localeCompare(b)); // Sort service IDs by lowercase name to have more intuitive logs
266
- servicesIds.forEach(serviceId => {
267
- this.services[serviceId].getDocumentTypes().forEach(documentType => {
268
- if (documentTypes.length && !documentTypes.includes(documentType)) {
269
- return;
270
- }
232
+ this.emit(record.isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', record);
271
233
 
272
- callback(this.services[serviceId].getDocumentDeclaration(documentType));
273
- });
274
- });
234
+ return record;
235
+ }));
275
236
  }
276
237
  }