@opentermsarchive/engine 9.2.3 → 10.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ota-apply-technical-upgrades.js +19 -0
- package/bin/ota-dataset.js +2 -2
- package/bin/ota-track.js +0 -1
- package/bin/ota.js +1 -0
- package/config/default.json +1 -1
- package/config/development.json +60 -0
- package/package.json +1 -1
- package/scripts/dataset/assets/README.template.js +1 -1
- package/scripts/dataset/export/test/fixtures/dataset/README.md +1 -1
- package/scripts/dataset/index.js +8 -3
- package/scripts/dataset/logger/index.js +25 -3
- package/scripts/dataset/publish/datagouv/dataset.js +234 -0
- package/scripts/dataset/publish/datagouv/index.js +82 -0
- package/scripts/dataset/publish/github/index.js +11 -2
- package/scripts/dataset/publish/gitlab/index.js +3 -1
- package/scripts/dataset/publish/index.js +39 -5
- package/src/archivist/index.js +75 -11
- package/src/archivist/index.test.js +345 -96
- package/src/archivist/recorder/index.js +2 -5
- package/src/archivist/recorder/index.test.js +18 -9
- package/src/archivist/recorder/repositories/git/dataMapper.js +4 -4
- package/src/archivist/recorder/repositories/git/index.test.js +16 -16
- package/src/archivist/recorder/repositories/mongo/dataMapper.js +2 -2
- package/src/archivist/recorder/repositories/mongo/index.test.js +22 -11
- package/src/collection-api/server.js +1 -1
- package/src/index.js +19 -9
- package/src/logger/index.js +6 -6
- package/src/reporter/gitlab/index.js +2 -2
package/src/archivist/index.js
CHANGED
|
@@ -20,7 +20,7 @@ const { version: PACKAGE_VERSION } = require('../../package.json');
|
|
|
20
20
|
// - too many requests on the same endpoint yield 403
|
|
21
21
|
// - sometimes when creating a commit no SHA are returned for unknown reasons
|
|
22
22
|
const MAX_PARALLEL_TRACKING = 1;
|
|
23
|
-
const
|
|
23
|
+
const MAX_PARALLEL_TECHNICAL_UPGRADES = 10;
|
|
24
24
|
|
|
25
25
|
export const EVENTS = [
|
|
26
26
|
'snapshotRecorded',
|
|
@@ -128,14 +128,32 @@ export default class Archivist extends events.EventEmitter {
|
|
|
128
128
|
});
|
|
129
129
|
}
|
|
130
130
|
|
|
131
|
-
async track({ services: servicesIds = this.servicesIds, types: termsTypes = []
|
|
131
|
+
async track({ services: servicesIds = this.servicesIds, types: termsTypes = [] } = {}) {
|
|
132
|
+
await this.processTerms({
|
|
133
|
+
servicesIds,
|
|
134
|
+
termsTypes,
|
|
135
|
+
technicalUpgradeOnly: false,
|
|
136
|
+
concurrency: MAX_PARALLEL_TRACKING,
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
async applyTechnicalUpgrades({ services: servicesIds = this.servicesIds, types: termsTypes = [] } = {}) {
|
|
141
|
+
await this.processTerms({
|
|
142
|
+
servicesIds,
|
|
143
|
+
termsTypes,
|
|
144
|
+
technicalUpgradeOnly: true,
|
|
145
|
+
concurrency: MAX_PARALLEL_TECHNICAL_UPGRADES,
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
async processTerms({ servicesIds, termsTypes, technicalUpgradeOnly, concurrency }) {
|
|
132
150
|
const numberOfTerms = Service.getNumberOfTerms(this.services, servicesIds, termsTypes);
|
|
133
151
|
|
|
134
|
-
this.emit('trackingStarted', servicesIds.length, numberOfTerms,
|
|
152
|
+
this.emit('trackingStarted', servicesIds.length, numberOfTerms, technicalUpgradeOnly);
|
|
135
153
|
|
|
136
154
|
await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
|
|
137
155
|
|
|
138
|
-
this.trackingQueue.concurrency =
|
|
156
|
+
this.trackingQueue.concurrency = concurrency;
|
|
139
157
|
|
|
140
158
|
servicesIds.forEach(serviceId => {
|
|
141
159
|
this.services[serviceId].getTermsTypes().forEach(termsType => {
|
|
@@ -143,7 +161,7 @@ export default class Archivist extends events.EventEmitter {
|
|
|
143
161
|
return;
|
|
144
162
|
}
|
|
145
163
|
|
|
146
|
-
this.trackingQueue.push({ terms: this.services[serviceId].getTerms({ type: termsType }),
|
|
164
|
+
this.trackingQueue.push({ terms: this.services[serviceId].getTerms({ type: termsType }), technicalUpgradeOnly });
|
|
147
165
|
});
|
|
148
166
|
});
|
|
149
167
|
|
|
@@ -153,12 +171,14 @@ export default class Archivist extends events.EventEmitter {
|
|
|
153
171
|
|
|
154
172
|
await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
|
|
155
173
|
|
|
156
|
-
this.emit('trackingCompleted', servicesIds.length, numberOfTerms,
|
|
174
|
+
this.emit('trackingCompleted', servicesIds.length, numberOfTerms, technicalUpgradeOnly);
|
|
157
175
|
}
|
|
158
176
|
|
|
159
|
-
async trackTermsChanges({ terms,
|
|
160
|
-
if (!
|
|
177
|
+
async trackTermsChanges({ terms, technicalUpgradeOnly = false }) {
|
|
178
|
+
if (!technicalUpgradeOnly) {
|
|
161
179
|
await this.fetchAndRecordSnapshots(terms);
|
|
180
|
+
} else {
|
|
181
|
+
await this.fetchAndRecordNewSourceDocuments(terms); // In technical upgrade mode, fetch and record snapshots only for new source documents that don't have existing snapshots yet (e.g., when a declaration is updated to add a new source document)
|
|
162
182
|
}
|
|
163
183
|
|
|
164
184
|
const contents = await this.extractContentsFromSnapshots(terms);
|
|
@@ -167,7 +187,7 @@ export default class Archivist extends events.EventEmitter {
|
|
|
167
187
|
return;
|
|
168
188
|
}
|
|
169
189
|
|
|
170
|
-
await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR),
|
|
190
|
+
await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR), technicalUpgradeOnly);
|
|
171
191
|
}
|
|
172
192
|
|
|
173
193
|
async fetchAndRecordSnapshots(terms) {
|
|
@@ -190,6 +210,50 @@ export default class Archivist extends events.EventEmitter {
|
|
|
190
210
|
}
|
|
191
211
|
}
|
|
192
212
|
|
|
213
|
+
async fetchAndRecordNewSourceDocuments(terms) {
|
|
214
|
+
if (!terms.hasMultipleSourceDocuments) { // If the terms has only one source document, there is nothing to do
|
|
215
|
+
return;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
const existingVersion = await this.recorder.versionsRepository.findLatest(terms.service.id, terms.type);
|
|
219
|
+
|
|
220
|
+
if (!existingVersion) { // If the terms does not have a version recorded, skip this step as the next version will be tagged as "First record…" anyway
|
|
221
|
+
return;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
const missingSourceDocuments = [];
|
|
225
|
+
|
|
226
|
+
for (const sourceDocument of terms.sourceDocuments) {
|
|
227
|
+
const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
|
|
228
|
+
|
|
229
|
+
if (!snapshot) {
|
|
230
|
+
missingSourceDocuments.push(sourceDocument);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if (!missingSourceDocuments.length) {
|
|
235
|
+
return;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
terms.fetchDate = new Date();
|
|
239
|
+
const fetchDocumentErrors = [];
|
|
240
|
+
|
|
241
|
+
for (const sourceDocument of missingSourceDocuments) {
|
|
242
|
+
const error = await this.fetchSourceDocument(sourceDocument);
|
|
243
|
+
|
|
244
|
+
if (error) {
|
|
245
|
+
fetchDocumentErrors.push(error);
|
|
246
|
+
} else {
|
|
247
|
+
await this.recordSnapshot(terms, sourceDocument);
|
|
248
|
+
sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if (fetchDocumentErrors.length) {
|
|
253
|
+
throw new InaccessibleContentError(fetchDocumentErrors);
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
193
257
|
async fetchSourceDocument(sourceDocument) {
|
|
194
258
|
const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
|
|
195
259
|
|
|
@@ -249,14 +313,14 @@ export default class Archivist extends events.EventEmitter {
|
|
|
249
313
|
return contents;
|
|
250
314
|
}
|
|
251
315
|
|
|
252
|
-
async recordVersion(terms, content,
|
|
316
|
+
async recordVersion(terms, content, technicalUpgradeOnly) {
|
|
253
317
|
const record = new Version({
|
|
254
318
|
content,
|
|
255
319
|
snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
|
|
256
320
|
serviceId: terms.service.id,
|
|
257
321
|
termsType: terms.type,
|
|
258
322
|
fetchDate: terms.fetchDate,
|
|
259
|
-
|
|
323
|
+
isTechnicalUpgrade: technicalUpgradeOnly,
|
|
260
324
|
metadata: { 'x-engine-version': PACKAGE_VERSION },
|
|
261
325
|
});
|
|
262
326
|
|