@opentermsarchive/engine 9.2.3 → 10.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ #! /usr/bin/env node
2
+ import './env.js';
3
+
4
+ import path from 'path';
5
+ import { fileURLToPath, pathToFileURL } from 'url';
6
+
7
+ import { program } from 'commander';
8
+
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+
11
+ const { applyTechnicalUpgrades } = await import(pathToFileURL(path.resolve(__dirname, '../src/index.js'))); // load asynchronously to ensure env.js is loaded before
12
+
13
+ program
14
+ .name('ota apply-technical-upgrades')
15
+ .description('Apply technical upgrades by generating new versions from the latest snapshots using updated declarations, engine logic, or dependencies, and by retrieving any missing snapshots for newly added source documents')
16
+ .option('-s, --services [serviceId...]', 'service IDs to apply technical upgrades to')
17
+ .option('-t, --types [termsType...]', 'terms types to apply technical upgrades to');
18
+
19
+ applyTechnicalUpgrades(program.parse(process.argv).opts());
package/bin/ota-track.js CHANGED
@@ -15,7 +15,6 @@ program
15
15
  .description('Retrieve declared documents, record snapshots, extract versions and publish the resulting records')
16
16
  .option('-s, --services [serviceId...]', 'service IDs of services to track')
17
17
  .option('-t, --types [termsType...]', 'terms types to track')
18
- .option('-e, --extract-only', 'extract versions from existing snapshots with latest declarations and engine, without recording new snapshots')
19
18
  .option('--schedule', 'track automatically at a regular interval');
20
19
 
21
20
  track(program.parse(process.argv).opts());
package/bin/ota.js CHANGED
@@ -11,6 +11,7 @@ program
11
11
  .description(description)
12
12
  .version(version)
13
13
  .command('track', 'Track the current terms of services according to provided declarations')
14
+ .command('apply-technical-upgrades', 'Apply technical upgrades by generating new versions from the latest snapshots using updated declarations, engine logic, or dependencies')
14
15
  .command('validate', 'Run a series of tests to check the validity of terms declarations')
15
16
  .command('lint', 'Check format and stylistic errors in declarations and auto fix them')
16
17
  .command('dataset', 'Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "9.2.3",
3
+ "version": "10.0.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -20,7 +20,7 @@ const { version: PACKAGE_VERSION } = require('../../package.json');
20
20
  // - too many requests on the same endpoint yield 403
21
21
  // - sometimes when creating a commit no SHA are returned for unknown reasons
22
22
  const MAX_PARALLEL_TRACKING = 1;
23
- const MAX_PARALLEL_EXTRACTING = 10;
23
+ const MAX_PARALLEL_TECHNICAL_UPGRADES = 10;
24
24
 
25
25
  export const EVENTS = [
26
26
  'snapshotRecorded',
@@ -128,14 +128,32 @@ export default class Archivist extends events.EventEmitter {
128
128
  });
129
129
  }
130
130
 
131
- async track({ services: servicesIds = this.servicesIds, types: termsTypes = [], extractOnly = false } = {}) {
131
+ async track({ services: servicesIds = this.servicesIds, types: termsTypes = [] } = {}) {
132
+ await this.processTerms({
133
+ servicesIds,
134
+ termsTypes,
135
+ technicalUpgradeOnly: false,
136
+ concurrency: MAX_PARALLEL_TRACKING,
137
+ });
138
+ }
139
+
140
+ async applyTechnicalUpgrades({ services: servicesIds = this.servicesIds, types: termsTypes = [] } = {}) {
141
+ await this.processTerms({
142
+ servicesIds,
143
+ termsTypes,
144
+ technicalUpgradeOnly: true,
145
+ concurrency: MAX_PARALLEL_TECHNICAL_UPGRADES,
146
+ });
147
+ }
148
+
149
+ async processTerms({ servicesIds, termsTypes, technicalUpgradeOnly, concurrency }) {
132
150
  const numberOfTerms = Service.getNumberOfTerms(this.services, servicesIds, termsTypes);
133
151
 
134
- this.emit('trackingStarted', servicesIds.length, numberOfTerms, extractOnly);
152
+ this.emit('trackingStarted', servicesIds.length, numberOfTerms, technicalUpgradeOnly);
135
153
 
136
154
  await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
137
155
 
138
- this.trackingQueue.concurrency = extractOnly ? MAX_PARALLEL_EXTRACTING : MAX_PARALLEL_TRACKING;
156
+ this.trackingQueue.concurrency = concurrency;
139
157
 
140
158
  servicesIds.forEach(serviceId => {
141
159
  this.services[serviceId].getTermsTypes().forEach(termsType => {
@@ -143,7 +161,7 @@ export default class Archivist extends events.EventEmitter {
143
161
  return;
144
162
  }
145
163
 
146
- this.trackingQueue.push({ terms: this.services[serviceId].getTerms({ type: termsType }), extractOnly });
164
+ this.trackingQueue.push({ terms: this.services[serviceId].getTerms({ type: termsType }), technicalUpgradeOnly });
147
165
  });
148
166
  });
149
167
 
@@ -153,12 +171,14 @@ export default class Archivist extends events.EventEmitter {
153
171
 
154
172
  await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);
155
173
 
156
- this.emit('trackingCompleted', servicesIds.length, numberOfTerms, extractOnly);
174
+ this.emit('trackingCompleted', servicesIds.length, numberOfTerms, technicalUpgradeOnly);
157
175
  }
158
176
 
159
- async trackTermsChanges({ terms, extractOnly = false }) {
160
- if (!extractOnly) {
177
+ async trackTermsChanges({ terms, technicalUpgradeOnly = false }) {
178
+ if (!technicalUpgradeOnly) {
161
179
  await this.fetchAndRecordSnapshots(terms);
180
+ } else {
181
+ await this.fetchAndRecordNewSourceDocuments(terms); // In technical upgrade mode, fetch and record snapshots only for new source documents that don't have existing snapshots yet (e.g., when a declaration is updated to add a new source document)
162
182
  }
163
183
 
164
184
  const contents = await this.extractContentsFromSnapshots(terms);
@@ -167,7 +187,7 @@ export default class Archivist extends events.EventEmitter {
167
187
  return;
168
188
  }
169
189
 
170
- await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR), extractOnly);
190
+ await this.recordVersion(terms, contents.join(Version.SOURCE_DOCUMENTS_SEPARATOR), technicalUpgradeOnly);
171
191
  }
172
192
 
173
193
  async fetchAndRecordSnapshots(terms) {
@@ -190,6 +210,50 @@ export default class Archivist extends events.EventEmitter {
190
210
  }
191
211
  }
192
212
 
213
+ async fetchAndRecordNewSourceDocuments(terms) {
214
+ if (!terms.hasMultipleSourceDocuments) { // If the terms has only one source document, there is nothing to do
215
+ return;
216
+ }
217
+
218
+ const existingVersion = await this.recorder.versionsRepository.findLatest(terms.service.id, terms.type);
219
+
220
+ if (!existingVersion) { // If the terms does not have a version recorded, skip this step as the next version will be tagged as "First record…" anyway
221
+ return;
222
+ }
223
+
224
+ const missingSourceDocuments = [];
225
+
226
+ for (const sourceDocument of terms.sourceDocuments) {
227
+ const snapshot = await this.recorder.getLatestSnapshot(terms, sourceDocument.id);
228
+
229
+ if (!snapshot) {
230
+ missingSourceDocuments.push(sourceDocument);
231
+ }
232
+ }
233
+
234
+ if (!missingSourceDocuments.length) {
235
+ return;
236
+ }
237
+
238
+ terms.fetchDate = new Date();
239
+ const fetchDocumentErrors = [];
240
+
241
+ for (const sourceDocument of missingSourceDocuments) {
242
+ const error = await this.fetchSourceDocument(sourceDocument);
243
+
244
+ if (error) {
245
+ fetchDocumentErrors.push(error);
246
+ } else {
247
+ await this.recordSnapshot(terms, sourceDocument);
248
+ sourceDocument.clearContent(); // Reduce memory usage by clearing no longer needed large content strings
249
+ }
250
+ }
251
+
252
+ if (fetchDocumentErrors.length) {
253
+ throw new InaccessibleContentError(fetchDocumentErrors);
254
+ }
255
+ }
256
+
193
257
  async fetchSourceDocument(sourceDocument) {
194
258
  const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
195
259
 
@@ -249,14 +313,14 @@ export default class Archivist extends events.EventEmitter {
249
313
  return contents;
250
314
  }
251
315
 
252
- async recordVersion(terms, content, extractOnly) {
316
+ async recordVersion(terms, content, technicalUpgradeOnly) {
253
317
  const record = new Version({
254
318
  content,
255
319
  snapshotIds: terms.sourceDocuments.map(sourceDocuments => sourceDocuments.snapshotId),
256
320
  serviceId: terms.service.id,
257
321
  termsType: terms.type,
258
322
  fetchDate: terms.fetchDate,
259
- isExtractOnly: extractOnly,
323
+ isTechnicalUpgrade: technicalUpgradeOnly,
260
324
  metadata: { 'x-engine-version': PACKAGE_VERSION },
261
325
  });
262
326
 
@@ -11,6 +11,7 @@ import sinonChai from 'sinon-chai';
11
11
  import { InaccessibleContentError } from './errors.js';
12
12
  import { FetchDocumentError } from './fetcher/index.js';
13
13
  import Git from './recorder/repositories/git/git.js';
14
+ import SourceDocument from './services/sourceDocument.js';
14
15
 
15
16
  import Archivist, { EVENTS } from './index.js';
16
17
 
@@ -52,6 +53,31 @@ describe('Archivist', function () {
52
53
 
53
54
  const services = [ 'service·A', 'Service B!' ];
54
55
 
56
+ function setupNockForServices({ serviceA = true, serviceB = true } = {}) {
57
+ nock.cleanAll();
58
+ if (serviceA) {
59
+ nock('https://www.servicea.example')
60
+ .get('/tos')
61
+ .reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' });
62
+ }
63
+ if (serviceB) {
64
+ nock('https://www.serviceb.example')
65
+ .get('/privacy')
66
+ .reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' });
67
+ }
68
+ }
69
+
70
+ async function createAndInitializeArchivist() {
71
+ const archivist = new Archivist({
72
+ recorderConfig: config.get('@opentermsarchive/engine.recorder'),
73
+ fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
74
+ });
75
+
76
+ await archivist.initialize();
77
+
78
+ return archivist;
79
+ }
80
+
55
81
  before(async () => {
56
82
  gitVersion = new Git({
57
83
  path: VERSIONS_PATH,
@@ -70,13 +96,8 @@ describe('Archivist', function () {
70
96
 
71
97
  describe('#track', () => {
72
98
  before(async () => {
73
- nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' });
74
- nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' });
75
- app = new Archivist({
76
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
77
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
78
- });
79
- await app.initialize();
99
+ setupNockForServices();
100
+ app = await createAndInitializeArchivist();
80
101
  });
81
102
 
82
103
  context('when everything works fine', () => {
@@ -112,8 +133,7 @@ describe('Archivist', function () {
112
133
  context('when there is an operational error with service A', () => {
113
134
  before(async () => {
114
135
  // as there is no more HTTP request mocks for service A, it should throw an `ENOTFOUND` error which is considered as an expected error in our workflow
115
- nock.cleanAll();
116
- nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' });
136
+ setupNockForServices({ serviceA: false, serviceB: true });
117
137
  await app.track({ services });
118
138
  });
119
139
 
@@ -139,107 +159,353 @@ describe('Archivist', function () {
139
159
  expect(resultingTerms).to.equal(serviceBVersionExpectedContent);
140
160
  });
141
161
  });
162
+ });
163
+
164
+ describe('#applyTechnicalUpgrades', () => {
165
+ context('when a service’s filter declaration changes', () => {
166
+ context('when everything works fine', () => {
167
+ let originalSnapshotId;
168
+ let firstVersionId;
169
+ let reExtractedVersionId;
170
+ let reExtractedVersionMessageBody;
171
+ let serviceBCommits;
172
+
173
+ before(async () => {
174
+ setupNockForServices();
175
+ app = await createAndInitializeArchivist();
176
+ await app.track({ services });
177
+
178
+ ({ id: originalSnapshotId } = await app.recorder.snapshotsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE));
179
+ ({ id: firstVersionId } = await app.recorder.versionsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE));
180
+
181
+ serviceBCommits = await gitVersion.log({ file: SERVICE_B_EXPECTED_VERSION_FILE_PATH });
182
+
183
+ app.services[SERVICE_A_ID].getTerms({ type: SERVICE_A_TYPE }).sourceDocuments[0].contentSelectors = 'h1';
184
+
185
+ await app.applyTechnicalUpgrades({ services: [ 'service·A', 'Service B!' ] });
186
+
187
+ const [reExtractedVersionCommit] = await gitVersion.log({ file: SERVICE_A_EXPECTED_VERSION_FILE_PATH });
188
+
189
+ reExtractedVersionId = reExtractedVersionCommit.hash;
190
+ reExtractedVersionMessageBody = reExtractedVersionCommit.body;
191
+ });
192
+
193
+ after(resetGitRepositories);
194
+
195
+ it('updates the version of the changed service', async () => {
196
+ const serviceAContent = await fs.readFile(path.resolve(__dirname, SERVICE_A_EXPECTED_VERSION_FILE_PATH), { encoding: 'utf8' });
197
+
198
+ expect(serviceAContent).to.equal('Terms of service with UTF-8 \'çhãràčtęrs"\n========================================');
199
+ });
200
+
201
+ it('generates a new version id', () => {
202
+ expect(reExtractedVersionId).to.not.equal(firstVersionId);
203
+ });
204
+
205
+ it('mentions the snapshot id in the changelog', () => {
206
+ expect(reExtractedVersionMessageBody).to.include(originalSnapshotId);
207
+ });
208
+
209
+ it('does not change other services', async () => {
210
+ const serviceBVersion = await fs.readFile(path.resolve(__dirname, SERVICE_B_EXPECTED_VERSION_FILE_PATH), { encoding: 'utf8' });
211
+
212
+ expect(serviceBVersion).to.equal(serviceBVersionExpectedContent);
213
+ });
214
+
215
+ it('does not generate a new id for other services', async () => {
216
+ const serviceBCommitsAfterExtraction = await gitVersion.log({ file: SERVICE_B_EXPECTED_VERSION_FILE_PATH });
217
+
218
+ expect(serviceBCommitsAfterExtraction.map(commit => commit.hash)).to.deep.equal(serviceBCommits.map(commit => commit.hash));
219
+ });
220
+ });
221
+
222
+ context('when there is an operational error with service A', () => {
223
+ let inaccessibleContentSpy;
224
+ let versionNotChangedSpy;
225
+ let versionB;
226
+
227
+ before(async () => {
228
+ setupNockForServices();
229
+ app = await createAndInitializeArchivist();
230
+ await app.track({ services });
231
+ app.services[SERVICE_A_ID].getTerms({ type: SERVICE_A_TYPE }).sourceDocuments[0].contentSelectors = 'inexistant-selector';
232
+ inaccessibleContentSpy = sinon.spy();
233
+ versionNotChangedSpy = sinon.spy();
234
+ app.on('inaccessibleContent', inaccessibleContentSpy);
235
+ app.on('versionNotChanged', record => {
236
+ if (record.serviceId == 'Service B!') {
237
+ versionB = record;
238
+ }
239
+ versionNotChangedSpy(record);
240
+ });
241
+ await app.applyTechnicalUpgrades({ services });
242
+ });
243
+
244
+ after(resetGitRepositories);
245
+
246
+ it('emits an inaccessibleContent event', () => {
247
+ expect(inaccessibleContentSpy).to.have.been.called;
248
+ });
249
+
250
+ it('still extracts the terms of other services', () => {
251
+ expect(versionNotChangedSpy).to.have.been.calledWith(versionB);
252
+ });
253
+ });
254
+
255
+ describe('with combined source documents', () => {
256
+ const MULTI_SOURCE_DOCS = {
257
+ SERVICE_ID: 'service_with_multiple_source_documents_terms',
258
+ TERMS_TYPE: 'Community Guidelines',
259
+ BASE_URL: 'https://www.service-with-multiple-source-documents-terms.example',
260
+ CONTENT: {
261
+ COMMUNITY_STANDARDS: '<html><body id="main"><h1>Community Standards</h1><p>Community Standards content</p></body></html>',
262
+ HATE_SPEECH: '<html><body><p>Hate speech content</p><div id="footer">Footer</div></body></html>',
263
+ VIOLENCE_INCITEMENT: '<html><body><p>Violence incitement content</p><button class="share">Share</button><button class="print">Print</button></body></html>',
264
+ NEW_POLICY: '<html><body><p>New additional policy</p></body></html>',
265
+ },
266
+ PATHS: {
267
+ COMMUNITY_STANDARDS: '/community-standards',
268
+ HATE_SPEECH: '/community-standards/hate-speech/',
269
+ VIOLENCE_INCITEMENT: '/community-standards/violence-incitement/',
270
+ NEW_POLICY: '/community-standards/new-policy/',
271
+ },
272
+ EXPECTED_TEXTS: {
273
+ COMMUNITY_STANDARDS: 'Community Standards',
274
+ HATE_SPEECH: 'Hate speech content',
275
+ VIOLENCE_INCITEMENT: 'Violence incitement content',
276
+ NEW_POLICY: 'New additional policy',
277
+ },
278
+ };
279
+
280
+ const { SERVICE_ID, TERMS_TYPE } = MULTI_SOURCE_DOCS;
281
+
282
+ function setupNockForMultiSourceDocs(pathKeys) {
283
+ pathKeys.forEach(pathKey => {
284
+ nock(MULTI_SOURCE_DOCS.BASE_URL)
285
+ .persist()
286
+ .get(MULTI_SOURCE_DOCS.PATHS[pathKey])
287
+ .reply(200, MULTI_SOURCE_DOCS.CONTENT[pathKey], { 'Content-Type': 'text/html' });
288
+ });
289
+ }
290
+
291
+ function disableClientScriptsForTerms(terms) {
292
+ terms.sourceDocuments.forEach(doc => {
293
+ doc.executeClientScripts = false;
294
+ });
295
+ }
142
296
 
143
- context('extracting only', () => {
144
- context('when a service’s filter declaration changes', () => {
145
- context('when everything works fine', () => {
146
- let originalSnapshotId;
147
- let firstVersionId;
148
- let reExtractedVersionId;
149
- let reExtractedVersionMessageBody;
150
- let serviceBCommits;
297
+ context('when a source document is added to existing combined terms', () => {
298
+ let initialVersion;
299
+ let upgradeVersion;
151
300
 
152
301
  before(async () => {
153
- nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' });
154
- nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' });
155
- app = new Archivist({
156
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
157
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
158
- });
302
+ setupNockForMultiSourceDocs([ 'COMMUNITY_STANDARDS', 'HATE_SPEECH', 'VIOLENCE_INCITEMENT', 'NEW_POLICY' ]);
159
303
 
160
- await app.initialize();
161
- await app.track({ services });
304
+ app = await createAndInitializeArchivist();
162
305
 
163
- ({ id: originalSnapshotId } = await app.recorder.snapshotsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE));
164
- ({ id: firstVersionId } = await app.recorder.versionsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE));
306
+ let terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
165
307
 
166
- serviceBCommits = await gitVersion.log({ file: SERVICE_B_EXPECTED_VERSION_FILE_PATH });
308
+ disableClientScriptsForTerms(terms);
167
309
 
168
- app.services[SERVICE_A_ID].getTerms({ type: SERVICE_A_TYPE }).sourceDocuments[0].contentSelectors = 'h1';
310
+ // First, track the terms normally to create initial version
311
+ await app.track({ services: [SERVICE_ID], types: [TERMS_TYPE] });
312
+ initialVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
169
313
 
170
- await app.track({ services: [ 'service·A', 'Service B!' ], extractOnly: true });
314
+ // Modify the declaration to add a new source document
315
+ terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
171
316
 
172
- const [reExtractedVersionCommit] = await gitVersion.log({ file: SERVICE_A_EXPECTED_VERSION_FILE_PATH });
317
+ terms.sourceDocuments.push(new SourceDocument({
318
+ id: 'new-policy',
319
+ location: `${MULTI_SOURCE_DOCS.BASE_URL}${MULTI_SOURCE_DOCS.PATHS.NEW_POLICY}`,
320
+ contentSelectors: 'body',
321
+ executeClientScripts: false,
322
+ filters: [],
323
+ }));
173
324
 
174
- reExtractedVersionId = reExtractedVersionCommit.hash;
175
- reExtractedVersionMessageBody = reExtractedVersionCommit.body;
325
+ // Apply technical upgrades
326
+ await app.applyTechnicalUpgrades({ services: [SERVICE_ID], types: [TERMS_TYPE] });
327
+ upgradeVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
176
328
  });
177
329
 
178
- after(resetGitRepositories);
330
+ after(async () => {
331
+ await resetGitRepositories();
332
+ nock.cleanAll();
333
+ });
179
334
 
180
- it('updates the version of the changed service', async () => {
181
- const serviceAContent = await fs.readFile(path.resolve(__dirname, SERVICE_A_EXPECTED_VERSION_FILE_PATH), { encoding: 'utf8' });
335
+ it('creates a new version', () => {
336
+ expect(upgradeVersion.id).to.not.equal(initialVersion.id);
337
+ });
182
338
 
183
- expect(serviceAContent).to.equal('Terms of service with UTF-8 \'çhãràčtęrs"\n========================================');
339
+ it('marks the new version as technical upgrade', () => {
340
+ expect(upgradeVersion.isTechnicalUpgrade).to.be.true;
184
341
  });
185
342
 
186
- it('generates a new version id', () => {
187
- expect(reExtractedVersionId).to.not.equal(firstVersionId);
343
+ it('fetches and includes the new source document in the version', async () => {
344
+ const versionContent = await upgradeVersion.content;
345
+
346
+ expect(versionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.NEW_POLICY);
188
347
  });
189
348
 
190
- it('mentions the snapshot id in the changelog', () => {
191
- expect(reExtractedVersionMessageBody).to.include(originalSnapshotId);
349
+ it('includes all source documents in version', async () => {
350
+ const versionContent = await upgradeVersion.content;
351
+
352
+ expect(versionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.COMMUNITY_STANDARDS);
353
+ expect(versionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.HATE_SPEECH);
354
+ expect(versionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.VIOLENCE_INCITEMENT);
355
+ expect(versionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.NEW_POLICY);
192
356
  });
357
+ });
358
+
359
+ context('when a source document location is modified in combined terms', () => {
360
+ let initialVersion;
361
+ let latestVersion;
362
+ let newLocationScope;
363
+
364
+ before(async () => {
365
+ setupNockForMultiSourceDocs([ 'COMMUNITY_STANDARDS', 'HATE_SPEECH', 'VIOLENCE_INCITEMENT' ]);
366
+
367
+ app = await createAndInitializeArchivist();
368
+
369
+ let terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
370
+
371
+ disableClientScriptsForTerms(terms);
372
+
373
+ // First, track the terms normally
374
+ await app.track({ services: [SERVICE_ID], types: [TERMS_TYPE] });
375
+ initialVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
376
+
377
+ // Mock new location (but it won't be fetched during technical upgrade)
378
+ newLocationScope = nock(MULTI_SOURCE_DOCS.BASE_URL)
379
+ .persist()
380
+ .get('/community-standards/hate-speech-updated/')
381
+ .reply(200, '<html><body><p>Updated hate speech policy</p></body></html>', { 'Content-Type': 'text/html' });
193
382
 
194
- it('does not change other services', async () => {
195
- const serviceBVersion = await fs.readFile(path.resolve(__dirname, SERVICE_B_EXPECTED_VERSION_FILE_PATH), { encoding: 'utf8' });
383
+ // Modify the declaration to change location
384
+ terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
196
385
 
197
- expect(serviceBVersion).to.equal(serviceBVersionExpectedContent);
386
+ terms.sourceDocuments[1].location = `${MULTI_SOURCE_DOCS.BASE_URL}/community-standards/hate-speech-updated/`;
387
+
388
+ // Apply technical upgrades
389
+ await app.applyTechnicalUpgrades({ services: [SERVICE_ID], types: [TERMS_TYPE] });
390
+ latestVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
391
+ });
392
+
393
+ after(async () => {
394
+ await resetGitRepositories();
395
+ nock.cleanAll();
396
+ });
397
+
398
+ it('does not create a new version', () => {
399
+ expect(latestVersion.id).to.equal(initialVersion.id);
400
+ });
401
+
402
+ it('does not fetch from new location', () => {
403
+ expect(newLocationScope.isDone()).to.be.false;
198
404
  });
199
405
 
200
- it('does not generate a new id for other services', async () => {
201
- const serviceBCommitsAfterExtraction = await gitVersion.log({ file: SERVICE_B_EXPECTED_VERSION_FILE_PATH });
406
+ it('does not include content from the new location', async () => {
407
+ const versionContent = await latestVersion.content;
202
408
 
203
- expect(serviceBCommitsAfterExtraction.map(commit => commit.hash)).to.deep.equal(serviceBCommits.map(commit => commit.hash));
409
+ expect(versionContent).to.not.include('Updated hate speech policy');
204
410
  });
205
411
  });
206
412
 
207
- context('when there is an operational error with service A', () => {
208
- let inaccessibleContentSpy;
209
- let versionNotChangedSpy;
210
- let versionB;
413
+ context('when a source document selector is modified in combined terms', () => {
414
+ let initialVersion;
415
+ let latestVersion;
416
+ let initialVersionContent;
417
+ let upgradeVersionContent;
211
418
 
212
419
  before(async () => {
213
- nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' });
214
- nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' });
215
- app = new Archivist({
216
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
217
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
218
- });
420
+ setupNockForMultiSourceDocs([ 'COMMUNITY_STANDARDS', 'HATE_SPEECH', 'VIOLENCE_INCITEMENT' ]);
421
+
422
+ app = await createAndInitializeArchivist();
423
+
424
+ let terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
425
+
426
+ disableClientScriptsForTerms(terms);
427
+
428
+ // First, track the terms normally
429
+ await app.track({ services: [SERVICE_ID], types: [TERMS_TYPE] });
430
+ initialVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
431
+ initialVersionContent = await initialVersion.content;
432
+
433
+ // Modify the declaration to change selector
434
+ terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
219
435
 
220
- await app.initialize();
221
- await app.track({ services });
222
- app.services[SERVICE_A_ID].getTerms({ type: SERVICE_A_TYPE }).sourceDocuments[0].contentSelectors = 'inexistant-selector';
223
- inaccessibleContentSpy = sinon.spy();
224
- versionNotChangedSpy = sinon.spy();
225
- app.on('inaccessibleContent', inaccessibleContentSpy);
226
- app.on('versionNotChanged', record => {
227
- if (record.serviceId == 'Service B!') {
228
- versionB = record;
229
- }
230
- versionNotChangedSpy(record);
436
+ // Change from 'body' to 'h1' for the first source document
437
+ terms.sourceDocuments[0].contentSelectors = 'h1';
438
+
439
+ // Apply technical upgrades
440
+ await app.applyTechnicalUpgrades({ services: [SERVICE_ID], types: [TERMS_TYPE] });
441
+ latestVersion = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
442
+ upgradeVersionContent = await latestVersion.content;
443
+ });
444
+
445
+ after(async () => {
446
+ await resetGitRepositories();
447
+ nock.cleanAll();
448
+ });
449
+
450
+ it('creates a new version', () => {
451
+ expect(latestVersion.id).to.not.equal(initialVersion.id);
452
+ });
453
+
454
+ it('marks the new version as technical upgrade', () => {
455
+ expect(latestVersion.isTechnicalUpgrade).to.be.true;
456
+ });
457
+
458
+ it('extracts content with the new selector from existing snapshot', () => {
459
+ // With new selector 'h1', should only extract the heading
460
+ expect(upgradeVersionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.COMMUNITY_STANDARDS);
461
+ // The rest should be from other source documents
462
+ expect(upgradeVersionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.HATE_SPEECH);
463
+ expect(upgradeVersionContent).to.include(MULTI_SOURCE_DOCS.EXPECTED_TEXTS.VIOLENCE_INCITEMENT);
464
+ });
465
+
466
+ it('regenerates version with updated extraction logic', () => {
467
+ expect(upgradeVersionContent).to.not.equal(initialVersionContent);
468
+ });
469
+ });
470
+
471
+ context('when adding source document but no version exists yet', () => {
472
+ let newSourceScope;
473
+
474
+ before(async () => {
475
+ newSourceScope = nock(MULTI_SOURCE_DOCS.BASE_URL)
476
+ .get(MULTI_SOURCE_DOCS.PATHS.NEW_POLICY)
477
+ .reply(200, MULTI_SOURCE_DOCS.CONTENT.NEW_POLICY, { 'Content-Type': 'text/html' });
478
+
479
+ app = await createAndInitializeArchivist();
480
+
481
+ // Modify declaration before any tracking
482
+ const terms = app.services[SERVICE_ID].getTerms({ type: TERMS_TYPE });
483
+
484
+ terms.sourceDocuments.push({
485
+ id: 'new-policy',
486
+ location: `${MULTI_SOURCE_DOCS.BASE_URL}${MULTI_SOURCE_DOCS.PATHS.NEW_POLICY}`,
487
+ contentSelectors: 'body',
488
+ executeClientScripts: false,
489
+ filters: [],
231
490
  });
232
- await app.track({ services, extractOnly: true });
491
+
492
+ // Apply technical upgrades (should skip because no version exists)
493
+ await app.applyTechnicalUpgrades({ services: [SERVICE_ID], types: [TERMS_TYPE] });
233
494
  });
234
495
 
235
- after(resetGitRepositories);
496
+ after(async () => {
497
+ await resetGitRepositories();
498
+ nock.cleanAll();
499
+ });
236
500
 
237
- it('emits an inaccessibleContent event', () => {
238
- expect(inaccessibleContentSpy).to.have.been.called;
501
+ it('does not create a version when none existed before', async () => {
502
+ const version = await app.recorder.versionsRepository.findLatest(SERVICE_ID, TERMS_TYPE);
503
+
504
+ expect(version).to.be.null;
239
505
  });
240
506
 
241
- it('still extracts the terms of other services', () => {
242
- expect(versionNotChangedSpy).to.have.been.calledWith(versionB);
507
+ it('does not fetch the new source document', () => {
508
+ expect(newSourceScope.isDone()).to.be.false;
243
509
  });
244
510
  });
245
511
  });
@@ -256,11 +522,7 @@ describe('Archivist', function () {
256
522
  const retryableError = new FetchDocumentError(FetchDocumentError.LIKELY_TRANSIENT_ERRORS[0]);
257
523
 
258
524
  before(async () => {
259
- app = new Archivist({
260
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
261
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
262
- });
263
- await app.initialize();
525
+ app = await createAndInitializeArchivist();
264
526
  });
265
527
 
266
528
  beforeEach(() => {
@@ -345,11 +607,7 @@ describe('Archivist', function () {
345
607
 
346
608
  describe('#attach', () => {
347
609
  before(async () => {
348
- app = new Archivist({
349
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
350
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
351
- });
352
- await app.initialize();
610
+ app = await createAndInitializeArchivist();
353
611
 
354
612
  EVENTS.forEach(event => {
355
613
  const handlerName = `on${event[0].toUpperCase()}${event.substring(1)}`;
@@ -378,14 +636,9 @@ describe('Archivist', function () {
378
636
  let plugin;
379
637
 
380
638
  before(async () => {
381
- nock.cleanAll();
382
- nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' });
639
+ setupNockForServices({ serviceA: true, serviceB: false });
383
640
 
384
- app = new Archivist({
385
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
386
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
387
- });
388
- await app.initialize();
641
+ app = await createAndInitializeArchivist();
389
642
 
390
643
  plugin = { onFirstVersionRecorded: () => { throw new Error('Plugin error'); } };
391
644
 
@@ -432,11 +685,7 @@ describe('Archivist', function () {
432
685
  }
433
686
 
434
687
  before(async () => {
435
- app = new Archivist({
436
- recorderConfig: config.get('@opentermsarchive/engine.recorder'),
437
- fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
438
- });
439
- await app.initialize();
688
+ app = await createAndInitializeArchivist();
440
689
 
441
690
  EVENTS.forEach(event => {
442
691
  const handlerName = `on${event[0].toUpperCase()}${event.substr(1)}`;
@@ -12,11 +12,8 @@ export default class Recorder {
12
12
  return Promise.all([ this.versionsRepository.initialize(), this.snapshotsRepository.initialize() ]);
13
13
  }
14
14
 
15
- async finalize() {
16
- // Close repositories sequentially to avoid race conditions when both repositories use the same MongoDB connection (same server/database).
17
- // Parallel closing can cause "Operation interrupted because client was closed" errors, especially on Windows.
18
- await this.versionsRepository.finalize();
19
- await this.snapshotsRepository.finalize();
15
+ finalize() {
16
+ return Promise.all([ this.versionsRepository.finalize(), this.snapshotsRepository.finalize() ]);
20
17
  }
21
18
 
22
19
  getLatestSnapshot(terms, sourceDocumentId) {
@@ -6,6 +6,8 @@ import Version from './version.js';
6
6
 
7
7
  import Recorder from './index.js';
8
8
 
9
+ const isWindows = process.platform === 'win32';
10
+
9
11
  const MIME_TYPE = 'text/html';
10
12
  const FETCH_DATE = new Date('2000-01-01T12:00:00.000Z');
11
13
  const FETCH_DATE_LATER = new Date('2000-01-02T12:00:00.000Z');
@@ -18,7 +20,14 @@ describe('Recorder', () => {
18
20
  describe(repositoryType, () => {
19
21
  let recorder;
20
22
 
21
- before(async () => {
23
+ before(async function () {
24
+ if (repositoryType == 'mongo' && isWindows) {
25
+ console.log('MongoDB tests are unstable on Windows due to race condition in connection cleanup.');
26
+ console.log('Lacking a production use case for Mongo on Windows, we skip tests. Please reach out if you have a use case.');
27
+ // On Windows, when multiple repositories connect to the same MongoDB server and are closed in parallel or even sequentially, unhandled "Operation interrupted because client was closed" errors occur after all tests pass.
28
+ // The issue does not occur on Linux or macOS, so it appears to be a platform-specific difference in how the MongoDB driver handles connection pool cleanup during client.close().
29
+ this.skip();
30
+ }
22
31
  const options = config.util.cloneDeep(config.get('@opentermsarchive/engine.recorder'));
23
32
 
24
33
  options.versions.storage.type = repositoryType;
@@ -28,7 +37,7 @@ describe('Recorder', () => {
28
37
  await recorder.initialize();
29
38
  });
30
39
 
31
- after(() => recorder.finalize());
40
+ after(() => recorder?.finalize());
32
41
 
33
42
  context('Snapshot', () => {
34
43
  describe('#record', () => {
@@ -258,8 +267,8 @@ describe('Recorder', () => {
258
267
  expect(await record.content).to.equal(UPDATED_CONTENT);
259
268
  });
260
269
 
261
- it('records in the version that it is not an extracted only version', () => {
262
- expect(record.isExtractOnly).to.equal(false);
270
+ it('records in the version that it is not a technical upgrade version', () => {
271
+ expect(record.isTechnicalUpgrade).to.equal(false);
263
272
  });
264
273
 
265
274
  it('returns the record id', () => {
@@ -315,7 +324,7 @@ describe('Recorder', () => {
315
324
  content: CONTENT,
316
325
  snapshotIds: [SNAPSHOT_ID],
317
326
  fetchDate: FETCH_DATE,
318
- isExtractOnly: true,
327
+ isTechnicalUpgrade: true,
319
328
  })));
320
329
 
321
330
  record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE);
@@ -354,7 +363,7 @@ describe('Recorder', () => {
354
363
  content: UPDATED_CONTENT,
355
364
  snapshotIds: [SNAPSHOT_ID],
356
365
  fetchDate: FETCH_DATE_LATER,
357
- isExtractOnly: true,
366
+ isTechnicalUpgrade: true,
358
367
  })));
359
368
 
360
369
  record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE);
@@ -366,8 +375,8 @@ describe('Recorder', () => {
366
375
  expect(await record.content).to.equal(UPDATED_CONTENT);
367
376
  });
368
377
 
369
- it('records in the version that it is an extracted only version', () => {
370
- expect(record.isExtractOnly).to.equal(true);
378
+ it('records in the version that it is an technical upgrade version', () => {
379
+ expect(record.isTechnicalUpgrade).to.equal(true);
371
380
  });
372
381
 
373
382
  it('returns the record id', () => {
@@ -395,7 +404,7 @@ describe('Recorder', () => {
395
404
  content: CONTENT,
396
405
  snapshotIds: [SNAPSHOT_ID],
397
406
  fetchDate: FETCH_DATE_LATER,
398
- isExtractOnly: true,
407
+ isTechnicalUpgrade: true,
399
408
  })));
400
409
 
401
410
  record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE);
@@ -7,7 +7,7 @@ import Version from '../../version.js';
7
7
 
8
8
  export const COMMIT_MESSAGE_PREFIXES = {
9
9
  startTracking: 'First record of',
10
- extractOnly: 'Apply technical or declaration upgrade on',
10
+ technicalUpgrade: 'Apply technical or declaration upgrade on',
11
11
  update: 'Record new changes of',
12
12
  deprecated_startTracking: 'Start tracking',
13
13
  deprecated_refilter: 'Refilter',
@@ -22,9 +22,9 @@ const MULTIPLE_SOURCE_DOCUMENTS_PREFIX = 'This version was recorded after extrac
22
22
  export const COMMIT_MESSAGE_PREFIXES_REGEXP = new RegExp(`^(${Object.values(COMMIT_MESSAGE_PREFIXES).join('|')})`);
23
23
 
24
24
  export function toPersistence(record, snapshotIdentiferTemplate) {
25
- const { serviceId, termsType, documentId, isExtractOnly, snapshotIds = [], mimeType, isFirstRecord, metadata } = record;
25
+ const { serviceId, termsType, documentId, isTechnicalUpgrade, snapshotIds = [], mimeType, isFirstRecord, metadata } = record;
26
26
 
27
- let prefix = isExtractOnly ? COMMIT_MESSAGE_PREFIXES.extractOnly : COMMIT_MESSAGE_PREFIXES.update;
27
+ let prefix = isTechnicalUpgrade ? COMMIT_MESSAGE_PREFIXES.technicalUpgrade : COMMIT_MESSAGE_PREFIXES.update;
28
28
 
29
29
  prefix = isFirstRecord ? COMMIT_MESSAGE_PREFIXES.startTracking : prefix;
30
30
 
@@ -75,7 +75,7 @@ export function toDomain(commit) {
75
75
  const mimeTypeValue = mime.getType(relativeFilePath);
76
76
 
77
77
  if (mimeTypeValue == mime.getType('markdown')) {
78
- attributes.isExtractOnly = message.startsWith(COMMIT_MESSAGE_PREFIXES.extractOnly) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter);
78
+ attributes.isTechnicalUpgrade = message.startsWith(COMMIT_MESSAGE_PREFIXES.technicalUpgrade) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter);
79
79
  attributes.snapshotIds = snapshotIdsMatch;
80
80
 
81
81
  return new Version(attributes);
@@ -208,7 +208,7 @@ describe('GitRepository', () => {
208
208
  });
209
209
  });
210
210
 
211
- context('when it is an extracted only version', () => {
211
+ context('when it is an technical upgrade version', () => {
212
212
  const EXTRACTED_ONLY_CONTENT = `${CONTENT} extracted only`;
213
213
 
214
214
  before(async () => {
@@ -217,7 +217,7 @@ describe('GitRepository', () => {
217
217
  termsType: TERMS_TYPE,
218
218
  content: CONTENT,
219
219
  fetchDate: FETCH_DATE_EARLIER,
220
- })); // An extracted only version cannot be the first record
220
+ })); // A technical upgrade version cannot be the first record
221
221
 
222
222
  numberOfRecordsBefore = (await git.log()).length;
223
223
 
@@ -226,7 +226,7 @@ describe('GitRepository', () => {
226
226
  termsType: TERMS_TYPE,
227
227
  content: EXTRACTED_ONLY_CONTENT,
228
228
  fetchDate: FETCH_DATE,
229
- isExtractOnly: true,
229
+ isTechnicalUpgrade: true,
230
230
  snapshotIds: [SNAPSHOT_ID],
231
231
  })));
232
232
 
@@ -245,8 +245,8 @@ describe('GitRepository', () => {
245
245
  expect(commit.hash).to.include(id);
246
246
  });
247
247
 
248
- it('stores information that it is an extracted only version', () => {
249
- expect(commit.message).to.include(COMMIT_MESSAGE_PREFIXES.extractOnly);
248
+ it('stores information that it is an technical upgrade version', () => {
249
+ expect(commit.message).to.include(COMMIT_MESSAGE_PREFIXES.technicalUpgrade);
250
250
  });
251
251
  });
252
252
 
@@ -518,7 +518,7 @@ describe('GitRepository', () => {
518
518
  serviceId: SERVICE_PROVIDER_ID,
519
519
  termsType: TERMS_TYPE,
520
520
  content: `${CONTENT} - updated 2`,
521
- isExtractOnly: true,
521
+ isTechnicalUpgrade: true,
522
522
  fetchDate: FETCH_DATE_EARLIER,
523
523
  snapshotIds: [SNAPSHOT_ID],
524
524
  }));
@@ -569,7 +569,7 @@ describe('GitRepository', () => {
569
569
  serviceId: SERVICE_PROVIDER_ID,
570
570
  termsType: TERMS_TYPE,
571
571
  content: `${CONTENT} - updated 2`,
572
- isExtractOnly: true,
572
+ isTechnicalUpgrade: true,
573
573
  fetchDate: FETCH_DATE_EARLIER,
574
574
  snapshotIds: [SNAPSHOT_ID],
575
575
  }));
@@ -678,7 +678,7 @@ describe('GitRepository', () => {
678
678
  serviceId: SERVICE_PROVIDER_ID,
679
679
  termsType: TERMS_TYPE,
680
680
  content: `${CONTENT} - updated 2`,
681
- isExtractOnly: true,
681
+ isTechnicalUpgrade: true,
682
682
  fetchDate: FETCH_DATE_EARLIER,
683
683
  snapshotIds: [SNAPSHOT_ID],
684
684
  mimeType: HTML_MIME_TYPE,
@@ -1079,7 +1079,7 @@ describe('GitRepository', () => {
1079
1079
  serviceId: SERVICE_PROVIDER_ID,
1080
1080
  termsType: TERMS_TYPE,
1081
1081
  content: `${CONTENT} - updated 2`,
1082
- isExtractOnly: true,
1082
+ isTechnicalUpgrade: true,
1083
1083
  fetchDate: FETCH_DATE_EARLIER,
1084
1084
  mimeType: HTML_MIME_TYPE,
1085
1085
  }));
@@ -1130,7 +1130,7 @@ describe('GitRepository', () => {
1130
1130
  serviceId: SERVICE_PROVIDER_ID,
1131
1131
  termsType: TERMS_TYPE,
1132
1132
  content: `${CONTENT} - updated 2`,
1133
- isExtractOnly: true,
1133
+ isTechnicalUpgrade: true,
1134
1134
  fetchDate: FETCH_DATE_EARLIER,
1135
1135
  mimeType: HTML_MIME_TYPE,
1136
1136
  }));
@@ -1269,7 +1269,7 @@ describe('GitRepository', () => {
1269
1269
  serviceId: SERVICE_PROVIDER_ID,
1270
1270
  termsType: TERMS_TYPE,
1271
1271
  content: `${CONTENT} - updated 2`,
1272
- isExtractOnly: true,
1272
+ isTechnicalUpgrade: true,
1273
1273
  fetchDate: FETCH_DATE_EARLIER,
1274
1274
  mimeType: HTML_MIME_TYPE,
1275
1275
  }));
@@ -1398,24 +1398,24 @@ describe('GitRepository', () => {
1398
1398
  after(() => subject.removeAll());
1399
1399
 
1400
1400
  describe('Records attributes', () => {
1401
- describe('#isExtractOnly', () => {
1401
+ describe('#isTechnicalUpgrade', () => {
1402
1402
  context('records with deprecated message', () => {
1403
1403
  it('returns the proper value', async () => {
1404
- expect((await subject.findById(commits.deprecatedRefilter.id)).isExtractOnly).to.be.true;
1404
+ expect((await subject.findById(commits.deprecatedRefilter.id)).isTechnicalUpgrade).to.be.true;
1405
1405
  });
1406
1406
 
1407
1407
  it('returns the proper value', async () => {
1408
- expect((await subject.findById(commits.deprecatedFirstRecord.id)).isExtractOnly).to.be.false;
1408
+ expect((await subject.findById(commits.deprecatedFirstRecord.id)).isTechnicalUpgrade).to.be.false;
1409
1409
  });
1410
1410
  });
1411
1411
 
1412
1412
  context('record with current message', () => {
1413
1413
  it('returns the proper value', async () => {
1414
- expect((await subject.findById(commits.currentExtractOnly.id)).isExtractOnly).to.be.true;
1414
+ expect((await subject.findById(commits.currentExtractOnly.id)).isTechnicalUpgrade).to.be.true;
1415
1415
  });
1416
1416
 
1417
1417
  it('returns the proper value', async () => {
1418
- expect((await subject.findById(commits.currentFirstRecord.id)).isExtractOnly).to.be.false;
1418
+ expect((await subject.findById(commits.currentFirstRecord.id)).isTechnicalUpgrade).to.be.false;
1419
1419
  });
1420
1420
  });
1421
1421
  });
@@ -17,7 +17,7 @@ export function toPersistence(record) {
17
17
  }
18
18
 
19
19
  export function toDomain(mongoDocument) {
20
- const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isExtractOnly, isRefilter, isFirstRecord, snapshotIds, metadata } = mongoDocument;
20
+ const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isTechnicalUpgrade, isExtractOnly, isRefilter, isFirstRecord, snapshotIds, metadata } = mongoDocument;
21
21
 
22
22
  const attributes = {
23
23
  id: _id.toString(),
@@ -27,7 +27,7 @@ export function toDomain(mongoDocument) {
27
27
  mimeType,
28
28
  fetchDate: new Date(fetchDate),
29
29
  isFirstRecord: Boolean(isFirstRecord),
30
- isExtractOnly: Boolean(isExtractOnly) || Boolean(isRefilter),
30
+ isTechnicalUpgrade: Boolean(isTechnicalUpgrade) || Boolean(isExtractOnly) || Boolean(isRefilter),
31
31
  snapshotIds: snapshotIds?.map(snapshotId => snapshotId.toString()) || [],
32
32
  metadata,
33
33
  };
@@ -16,6 +16,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
16
16
 
17
17
  const { connectionURI } = config.get('@opentermsarchive/engine.recorder.snapshots.storage.mongo');
18
18
  const client = new MongoClient(connectionURI);
19
+ const isWindows = process.platform === 'win32';
19
20
 
20
21
  const SERVICE_PROVIDER_ID = 'test_service';
21
22
  const TERMS_TYPE = 'Terms of Service';
@@ -41,6 +42,16 @@ const METADATA = {
41
42
  let collection;
42
43
 
43
44
  describe('MongoRepository', () => {
45
+ before(function () {
46
+ if (isWindows) {
47
+ console.log('MongoDB tests are unstable on Windows due to race condition in connection cleanup.');
48
+ console.log('Lacking a production use case for Mongo on Windows, we skip tests. Please reach out if you have a use case.');
49
+ // On Windows, when multiple repositories connect to the same MongoDB server and are closed in parallel or even sequentially, unhandled "Operation interrupted because client was closed" errors occur after all tests pass.
50
+ // The issue does not occur on Linux or macOS, so it appears to be a platform-specific difference in how the MongoDB driver handles connection pool cleanup during client.close().
51
+ this.skip();
52
+ }
53
+ });
54
+
44
55
  let subject;
45
56
 
46
57
  context('Version', () => {
@@ -220,7 +231,7 @@ describe('MongoRepository', () => {
220
231
  });
221
232
  });
222
233
 
223
- context('when it is an extracted only version', () => {
234
+ context('when it is an technical upgrade version', () => {
224
235
  const EXTRACTED_ONLY_CONTENT = `${CONTENT} extracted only`;
225
236
 
226
237
  before(async () => {
@@ -230,7 +241,7 @@ describe('MongoRepository', () => {
230
241
  content: CONTENT,
231
242
  fetchDate: FETCH_DATE_EARLIER,
232
243
  snapshotIds: [SNAPSHOT_ID],
233
- })); // An extracted only version cannot be the first record
244
+ })); // A technical upgrade version cannot be the first record
234
245
 
235
246
  numberOfRecordsBefore = await collection.countDocuments({
236
247
  serviceId: SERVICE_PROVIDER_ID,
@@ -243,7 +254,7 @@ describe('MongoRepository', () => {
243
254
  content: EXTRACTED_ONLY_CONTENT,
244
255
  fetchDate: FETCH_DATE,
245
256
  snapshotIds: [SNAPSHOT_ID],
246
- isExtractOnly: true,
257
+ isTechnicalUpgrade: true,
247
258
  })));
248
259
 
249
260
  numberOfRecordsAfter = await collection.countDocuments({
@@ -267,8 +278,8 @@ describe('MongoRepository', () => {
267
278
  expect(mongoDocument._id.toString()).to.equal(record.id);
268
279
  });
269
280
 
270
- it('stores information that it is an extracted only version', () => {
271
- expect(mongoDocument.isExtractOnly).to.be.true;
281
+ it('stores information that it is an technical upgrade version', () => {
282
+ expect(mongoDocument.isTechnicalUpgrade).to.be.true;
272
283
  });
273
284
  });
274
285
 
@@ -596,7 +607,7 @@ describe('MongoRepository', () => {
596
607
  serviceId: SERVICE_PROVIDER_ID,
597
608
  termsType: TERMS_TYPE,
598
609
  content: `${CONTENT} - updated 2`,
599
- isExtractOnly: true,
610
+ isTechnicalUpgrade: true,
600
611
  fetchDate: FETCH_DATE_EARLIER,
601
612
  snapshotIds: [SNAPSHOT_ID],
602
613
  }));
@@ -645,7 +656,7 @@ describe('MongoRepository', () => {
645
656
  serviceId: SERVICE_PROVIDER_ID,
646
657
  termsType: TERMS_TYPE,
647
658
  content: `${CONTENT} - updated 2`,
648
- isExtractOnly: true,
659
+ isTechnicalUpgrade: true,
649
660
  fetchDate: FETCH_DATE_EARLIER,
650
661
  snapshotIds: [SNAPSHOT_ID],
651
662
  }));
@@ -810,7 +821,7 @@ describe('MongoRepository', () => {
810
821
  serviceId: SERVICE_PROVIDER_ID,
811
822
  termsType: TERMS_TYPE,
812
823
  content: `${CONTENT} - updated 2`,
813
- isExtractOnly: true,
824
+ isTechnicalUpgrade: true,
814
825
  fetchDate: FETCH_DATE_EARLIER,
815
826
  snapshotIds: [SNAPSHOT_ID],
816
827
  }));
@@ -1164,7 +1175,7 @@ describe('MongoRepository', () => {
1164
1175
  serviceId: SERVICE_PROVIDER_ID,
1165
1176
  termsType: TERMS_TYPE,
1166
1177
  content: `${CONTENT} - updated 2`,
1167
- isExtractOnly: true,
1178
+ isTechnicalUpgrade: true,
1168
1179
  fetchDate: FETCH_DATE_EARLIER,
1169
1180
  mimeType: HTML_MIME_TYPE,
1170
1181
  }));
@@ -1213,7 +1224,7 @@ describe('MongoRepository', () => {
1213
1224
  serviceId: SERVICE_PROVIDER_ID,
1214
1225
  termsType: TERMS_TYPE,
1215
1226
  content: `${CONTENT} - updated 2`,
1216
- isExtractOnly: true,
1227
+ isTechnicalUpgrade: true,
1217
1228
  fetchDate: FETCH_DATE_EARLIER,
1218
1229
  mimeType: HTML_MIME_TYPE,
1219
1230
  }));
@@ -1421,7 +1432,7 @@ describe('MongoRepository', () => {
1421
1432
  serviceId: SERVICE_PROVIDER_ID,
1422
1433
  termsType: TERMS_TYPE,
1423
1434
  content: `${CONTENT} - updated 2`,
1424
- isExtractOnly: true,
1435
+ isTechnicalUpgrade: true,
1425
1436
  fetchDate: FETCH_DATE_EARLIER,
1426
1437
  mimeType: HTML_MIME_TYPE,
1427
1438
  }));
package/src/index.js CHANGED
@@ -13,7 +13,7 @@ import Reporter from './reporter/index.js';
13
13
  const require = createRequire(import.meta.url);
14
14
  const { version: PACKAGE_VERSION } = require('../package.json');
15
15
 
16
- export default async function track({ services, types, extractOnly, schedule }) {
16
+ async function initialize(services) {
17
17
  const archivist = new Archivist({
18
18
  recorderConfig: config.get('@opentermsarchive/engine.recorder'),
19
19
  fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
@@ -40,13 +40,17 @@ export default async function track({ services, types, extractOnly, schedule })
40
40
  });
41
41
  }
42
42
 
43
- // The result of the extraction step that generates the version from the snapshots may depend on changes to the engine or its dependencies.
44
- // The process thus starts by only performing the extraction process so that any version following such changes can be labelled (to avoid sending notifications, for example)
45
- await archivist.track({ services, types, extractOnly: true });
43
+ return { archivist, services };
44
+ }
46
45
 
47
- if (extractOnly) {
48
- return;
49
- }
46
+ export default async function track({ services, types, schedule }) {
47
+ const { archivist, services: filteredServices } = await initialize(services);
48
+
49
+ // Technical upgrade pass: apply changes from engine, dependency, or declaration upgrades.
50
+ // This regenerates versions from existing snapshots with updated extraction logic.
51
+ // For terms with combined source documents, if a new document was added to the declaration, it will be fetched and combined with existing snapshots to regenerate the complete version.
52
+ // All versions from this pass are labeled as technical upgrades to avoid false notifications about content changes.
53
+ await archivist.applyTechnicalUpgrades({ services: filteredServices, types });
50
54
 
51
55
  if (process.env.OTA_ENGINE_SENDINBLUE_API_KEY) {
52
56
  try {
@@ -72,7 +76,7 @@ export default async function track({ services, types, extractOnly, schedule })
72
76
  }
73
77
 
74
78
  if (!schedule) {
75
- await archivist.track({ services, types });
79
+ await archivist.track({ services: filteredServices, types });
76
80
 
77
81
  return;
78
82
  }
@@ -86,6 +90,12 @@ export default async function track({ services, types, extractOnly, schedule })
86
90
  new Cron( // eslint-disable-line no-new
87
91
  trackingSchedule,
88
92
  { protect: job => logger.warn(`Tracking scheduled at ${new Date().toISOString()} were blocked by an unfinished tracking started at ${job.currentRun().toISOString()}`) },
89
- () => archivist.track({ services, types }),
93
+ () => archivist.track({ services: filteredServices, types }),
90
94
  );
91
95
  }
96
+
97
+ export async function applyTechnicalUpgrades({ services, types }) {
98
+ const { archivist, services: filteredServices } = await initialize(services);
99
+
100
+ await archivist.applyTechnicalUpgrades({ services: filteredServices, types });
101
+ }
@@ -195,9 +195,9 @@ logger.onVersionNotChanged = ({ serviceId, termsType }) => {
195
195
  logger.info({ message: 'No changes after filtering, did not record version', serviceId, termsType });
196
196
  };
197
197
 
198
- logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
199
- if (extractOnly) {
200
- logger.info(`Examining ${numberOfTerms} terms from ${numberOfServices} services for extraction…`);
198
+ logger.onTrackingStarted = (numberOfServices, numberOfTerms, technicalUpgradeOnly) => {
199
+ if (technicalUpgradeOnly) {
200
+ logger.info(`Applying technical upgrades to ${numberOfTerms} terms from ${numberOfServices} services…`);
201
201
  } else {
202
202
  logger.info(`Tracking changes of ${numberOfTerms} terms from ${numberOfServices} services…`);
203
203
  }
@@ -206,11 +206,11 @@ logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
206
206
  trackingStartTime = Date.now();
207
207
  };
208
208
 
209
- logger.onTrackingCompleted = (numberOfServices, numberOfTerms, extractOnly) => {
209
+ logger.onTrackingCompleted = (numberOfServices, numberOfTerms, technicalUpgradeOnly) => {
210
210
  const duration = formatDuration(Date.now() - trackingStartTime);
211
211
 
212
- if (extractOnly) {
213
- logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction in ${duration}`);
212
+ if (technicalUpgradeOnly) {
213
+ logger.info(`Applied technical upgrades to ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);
214
214
  logger.info(`Recorded ${recordedVersionsCount} new versions\n`);
215
215
  } else {
216
216
  logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);