@opentermsarchive/engine 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/.env.example +3 -0
  2. package/.eslintrc.yaml +116 -0
  3. package/.github/workflows/deploy.yml +50 -0
  4. package/.github/workflows/release.yml +64 -0
  5. package/.github/workflows/test.yml +77 -0
  6. package/CHANGELOG.md +14 -0
  7. package/CODE_OF_CONDUCT.md +128 -0
  8. package/CONTRIBUTING.md +143 -0
  9. package/LICENSE +153 -0
  10. package/MIGRATING.md +42 -0
  11. package/README.fr.md +110 -0
  12. package/README.md +438 -0
  13. package/Vagrantfile +38 -0
  14. package/ansible.cfg +13 -0
  15. package/bin/.env.js +1 -0
  16. package/bin/lint-declarations.js +31 -0
  17. package/bin/track.js +26 -0
  18. package/bin/validate-declarations.js +68 -0
  19. package/config/ci.json +5 -0
  20. package/config/contrib.json +35 -0
  21. package/config/dating.json +37 -0
  22. package/config/default.json +71 -0
  23. package/config/france.json +40 -0
  24. package/config/p2b-compliance.json +40 -0
  25. package/config/pga.json +40 -0
  26. package/config/production.json +27 -0
  27. package/config/test.json +49 -0
  28. package/config/vagrant.json +24 -0
  29. package/decision-records/0001-service-name-and-id.md +73 -0
  30. package/decision-records/0002-service-history.md +212 -0
  31. package/decision-records/0003-snapshots-database.md +123 -0
  32. package/ops/README.md +280 -0
  33. package/ops/app.yml +5 -0
  34. package/ops/infra.yml +6 -0
  35. package/ops/inventories/dev.yml +7 -0
  36. package/ops/inventories/production.yml +27 -0
  37. package/ops/roles/infra/defaults/main.yml +2 -0
  38. package/ops/roles/infra/files/.gitconfig +3 -0
  39. package/ops/roles/infra/files/mongod.conf +18 -0
  40. package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
  41. package/ops/roles/infra/tasks/main.yml +78 -0
  42. package/ops/roles/infra/tasks/mongo.yml +40 -0
  43. package/ops/roles/infra/templates/ssh_config.j2 +5 -0
  44. package/ops/roles/ota/defaults/main.yml +14 -0
  45. package/ops/roles/ota/files/.env +21 -0
  46. package/ops/roles/ota/tasks/database.yml +65 -0
  47. package/ops/roles/ota/tasks/main.yml +110 -0
  48. package/ops/site.yml +6 -0
  49. package/package.json +101 -0
  50. package/pm2.config.cjs +20 -0
  51. package/scripts/dataset/README.md +37 -0
  52. package/scripts/dataset/assets/LICENSE +540 -0
  53. package/scripts/dataset/assets/README.template.js +65 -0
  54. package/scripts/dataset/export/index.js +106 -0
  55. package/scripts/dataset/export/index.test.js +155 -0
  56. package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
  57. package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
  58. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
  59. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
  60. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
  61. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
  62. package/scripts/dataset/index.js +40 -0
  63. package/scripts/dataset/logger/index.js +17 -0
  64. package/scripts/dataset/main.js +25 -0
  65. package/scripts/dataset/publish/index.js +39 -0
  66. package/scripts/declarations/lint/index.js +36 -0
  67. package/scripts/declarations/utils/index.js +81 -0
  68. package/scripts/declarations/validate/definitions.js +63 -0
  69. package/scripts/declarations/validate/index.mocha.js +262 -0
  70. package/scripts/declarations/validate/service.history.schema.js +86 -0
  71. package/scripts/declarations/validate/service.schema.js +91 -0
  72. package/scripts/history/logger/index.js +39 -0
  73. package/scripts/history/migrate-services.js +212 -0
  74. package/scripts/history/update-to-full-hash.js +61 -0
  75. package/scripts/history/utils/index.js +23 -0
  76. package/scripts/import/README.md +59 -0
  77. package/scripts/import/config/import.json +12 -0
  78. package/scripts/import/index.js +224 -0
  79. package/scripts/import/loadCommits.js +66 -0
  80. package/scripts/import/logger/index.js +43 -0
  81. package/scripts/rewrite/README.md +131 -0
  82. package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
  83. package/scripts/rewrite/config/rewrite-versions.json +32 -0
  84. package/scripts/rewrite/initializer/files/license +428 -0
  85. package/scripts/rewrite/initializer/files/readme.md +8 -0
  86. package/scripts/rewrite/initializer/index.js +44 -0
  87. package/scripts/rewrite/rewrite-snapshots.js +108 -0
  88. package/scripts/rewrite/rewrite-versions.js +160 -0
  89. package/scripts/rewrite/utils.js +33 -0
  90. package/scripts/utils/renamer/README.md +49 -0
  91. package/scripts/utils/renamer/index.js +45 -0
  92. package/scripts/utils/renamer/rules/documentTypes.json +25 -0
  93. package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
  94. package/scripts/utils/renamer/rules/serviceNames.json +92 -0
  95. package/src/archivist/errors.js +9 -0
  96. package/src/archivist/fetcher/errors.js +6 -0
  97. package/src/archivist/fetcher/exports.js +18 -0
  98. package/src/archivist/fetcher/fullDomFetcher.js +84 -0
  99. package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
  100. package/src/archivist/fetcher/index.js +35 -0
  101. package/src/archivist/fetcher/index.test.js +239 -0
  102. package/src/archivist/filter/exports.js +3 -0
  103. package/src/archivist/filter/index.js +178 -0
  104. package/src/archivist/filter/index.test.js +561 -0
  105. package/src/archivist/index.js +276 -0
  106. package/src/archivist/index.test.js +600 -0
  107. package/src/archivist/recorder/index.js +77 -0
  108. package/src/archivist/recorder/index.test.js +463 -0
  109. package/src/archivist/recorder/record.js +35 -0
  110. package/src/archivist/recorder/record.test.js +91 -0
  111. package/src/archivist/recorder/repositories/factory.js +23 -0
  112. package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
  113. package/src/archivist/recorder/repositories/git/git.js +122 -0
  114. package/src/archivist/recorder/repositories/git/git.test.js +86 -0
  115. package/src/archivist/recorder/repositories/git/index.js +182 -0
  116. package/src/archivist/recorder/repositories/git/index.test.js +714 -0
  117. package/src/archivist/recorder/repositories/interface.js +108 -0
  118. package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
  119. package/src/archivist/recorder/repositories/mongo/index.js +121 -0
  120. package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
  121. package/src/archivist/services/documentDeclaration.js +26 -0
  122. package/src/archivist/services/documentDeclaration.test.js +85 -0
  123. package/src/archivist/services/documentTypes.json +386 -0
  124. package/src/archivist/services/index.js +255 -0
  125. package/src/archivist/services/index.test.js +327 -0
  126. package/src/archivist/services/pageDeclaration.js +51 -0
  127. package/src/archivist/services/pageDeclaration.test.js +224 -0
  128. package/src/archivist/services/service.js +60 -0
  129. package/src/archivist/services/service.test.js +164 -0
  130. package/src/exports.js +3 -0
  131. package/src/index.js +59 -0
  132. package/src/logger/README.md +1 -0
  133. package/src/logger/index.js +131 -0
  134. package/src/main.js +18 -0
  135. package/src/notifier/README.md +1 -0
  136. package/src/notifier/index.js +150 -0
  137. package/src/tracker/README.md +1 -0
  138. package/src/tracker/index.js +215 -0
  139. package/test/fixtures/service_A.js +22 -0
  140. package/test/fixtures/service_A_terms.md +10 -0
  141. package/test/fixtures/service_A_terms_snapshot.html +14 -0
  142. package/test/fixtures/service_B.js +22 -0
  143. package/test/fixtures/service_with_declaration_history.js +65 -0
  144. package/test/fixtures/service_with_filters_history.js +155 -0
  145. package/test/fixtures/service_with_history.js +188 -0
  146. package/test/fixtures/service_with_multipage_document.js +100 -0
  147. package/test/fixtures/service_without_history.js +31 -0
  148. package/test/fixtures/services.js +19 -0
  149. package/test/fixtures/terms.pdf +0 -0
  150. package/test/fixtures/termsFromPDF.md +25 -0
  151. package/test/fixtures/termsModified.pdf +0 -0
  152. package/test/services/service_A.json +9 -0
  153. package/test/services/service_B.json +9 -0
  154. package/test/services/service_with_declaration_history.filters.js +7 -0
  155. package/test/services/service_with_declaration_history.history.json +17 -0
  156. package/test/services/service_with_declaration_history.json +13 -0
  157. package/test/services/service_with_filters_history.filters.history.js +29 -0
  158. package/test/services/service_with_filters_history.filters.js +7 -0
  159. package/test/services/service_with_filters_history.json +13 -0
  160. package/test/services/service_with_history.filters.history.js +29 -0
  161. package/test/services/service_with_history.filters.js +7 -0
  162. package/test/services/service_with_history.history.json +26 -0
  163. package/test/services/service_with_history.json +17 -0
  164. package/test/services/service_with_multipage_document.filters.js +7 -0
  165. package/test/services/service_with_multipage_document.history.json +37 -0
  166. package/test/services/service_with_multipage_document.json +28 -0
  167. package/test/services/service_without_history.filters.js +7 -0
  168. package/test/services/service_without_history.json +13 -0
@@ -0,0 +1,276 @@
1
+ import events from 'events';
2
+
3
+ import async from 'async';
4
+ import config from 'config';
5
+
6
+ import { InaccessibleContentError } from './errors.js';
7
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './fetcher/index.js';
8
+ import filter from './filter/index.js';
9
+ import Recorder from './recorder/index.js';
10
+ import * as services from './services/index.js';
11
+
12
/**
 * Number of documents tracked concurrently.
 *
 * Deliberately kept at 1: with a higher parallelism, two issues were observed:
 * - too many requests on the same endpoint yield 403 responses;
 * - sometimes, when creating a commit, no SHA is returned for unknown reasons.
 */
const MAX_PARALLEL_DOCUMENTS_TRACKS = 1;

/** Number of documents refiltered concurrently. */
const MAX_PARALLEL_REFILTERS = 10;

/**
 * Names of the events listeners can subscribe to.
 * The order of this list is preserved as it is part of the exported value.
 */
export const AVAILABLE_EVENTS = [
  // Snapshot recording outcomes
  'snapshotRecorded',
  'firstSnapshotRecorded',
  'snapshotNotChanged',

  // Version recording outcomes
  'versionRecorded',
  'firstVersionRecorded',
  'versionNotChanged',

  // Lifecycle of a refiltering or tracking run
  'refilteringStarted',
  'refilteringCompleted',
  'trackingStarted',
  'trackingCompleted',

  // Failures
  'inaccessibleContent',
  'error',
];
33
+
34
/**
 * Event emitter that coordinates fetching document pages, recording their
 * snapshots, filtering them and recording the resulting versions.
 *
 * Work is dispatched through two queues created by `initQueues()`:
 * one for tracking documents and one for refiltering them.
 * Progress and failures are reported through the events listed in `AVAILABLE_EVENTS`.
 */
export default class Archivist extends events.EventEmitter {
  /** Service declarations as loaded by `initialize()`; `undefined` before that. */
  get serviceDeclarations() {
    return this.services;
  }

  /** IDs (own keys) of the loaded service declarations. */
  get serviceIds() {
    return Object.keys(this.services);
  }

  /**
   * @param {object} params
   * @param {object} params.recorderConfig - Passed as is to the `Recorder` constructor.
   */
  constructor({ recorderConfig }) {
    super();
    this.recorder = new Recorder(recorderConfig);
    // The fetcher configuration is read at call time, on every fetch.
    this.fetch = params => fetch({ ...params, config: config.get('fetcher') });
    this.filter = filter;
  }

  /**
   * Initializes the recorder, creates the queues, loads the services and
   * registers the `error` handler. Does nothing when services are already loaded.
   */
  async initialize() {
    if (this.services) {
      return;
    }

    await this.recorder.initialize();
    this.initQueues();
    this.services = await services.load();

    this.on('error', async () => {
      console.log('Abort and clean up operations before exiting…');

      // Safety net: if the cleanup below does not finish within a minute, exit anyway.
      setTimeout(() => {
        console.log('Cleaning timed out, force process to exit');
        process.exit(2);
      }, 60 * 1000);

      this.refilterDocumentsQueue.kill();
      this.trackDocumentChangesQueue.kill();

      await stopHeadlessBrowser();
      console.log('Headless browser stopped');

      await this.recorder.finalize();
      console.log('Recorder finalized');

      process.exit(1);
    });
  }

  /**
   * Creates the tracking and refiltering queues and wires their shared error handler.
   */
  initQueues() {
    this.trackDocumentChangesQueue = async.queue(async documentDeclaration => this.trackDocumentChanges(documentDeclaration), MAX_PARALLEL_DOCUMENTS_TRACKS);
    this.refilterDocumentsQueue = async.queue(async documentDeclaration => this.refilterAndRecordDocument(documentDeclaration), MAX_PARALLEL_REFILTERS);

    const queueErrorHandler = async (error, documentDeclaration) => {
      const { service, type } = documentDeclaration;

      if (error.toString().includes('HttpError: API rate limit exceeded for user ID')) {
        return; // This is an error due to SendInBlue quota, bypass
      }

      if (error instanceof InaccessibleContentError) {
        this.emit('inaccessibleContent', error, service.id, type, documentDeclaration);

        return;
      }

      this.emit('error', error, service.id, type);
    };

    this.trackDocumentChangesQueue.error(queueErrorHandler);
    this.refilterDocumentsQueue.error(queueErrorHandler);
  }

  /**
   * Subscribes a listener object to every available event for which it
   * exposes a handler named `on<EventName>` (e.g. `onVersionRecorded`).
   *
   * @param {object} listener - Object whose matching handlers are bound to itself and registered.
   */
  attach(listener) {
    AVAILABLE_EVENTS.forEach(event => {
      const handlerName = `on${event[0].toUpperCase()}${event.substring(1)}`;

      if (listener[handlerName]) {
        this.on(event, listener[handlerName].bind(listener));
      }
    });
  }

  /**
   * Queues every matching document for tracking and waits for the queue to be processed.
   * Emits `trackingStarted` before and `trackingCompleted` after.
   *
   * @param {string[]} [servicesIds] - Services to track; defaults to all loaded services. Not mutated.
   * @param {string[]} [documentTypes] - Document types to track; an empty list means all types.
   */
  async trackChanges(servicesIds = this.serviceIds, documentTypes = []) {
    this.emit('trackingStarted', servicesIds.length, this.getNumberOfDocuments(servicesIds));

    await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);

    this.#forEachDocumentOf(servicesIds, documentTypes, documentDeclaration => this.trackDocumentChangesQueue.push(documentDeclaration));

    await this.#waitUntilProcessed(this.trackDocumentChangesQueue);
    await Promise.all([ stopHeadlessBrowser(), this.recorder.finalize() ]);

    this.emit('trackingCompleted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
  }

  /**
   * Queues every matching document for refiltering and waits for the queue to be processed.
   * Emits `refilteringStarted` before and `refilteringCompleted` after.
   *
   * @param {string[]} [servicesIds] - Services to refilter; defaults to all loaded services. Not mutated.
   * @param {string[]} [documentTypes] - Document types to refilter; an empty list means all types.
   */
  async refilterAndRecord(servicesIds = this.serviceIds, documentTypes = []) {
    this.emit('refilteringStarted', servicesIds.length, this.getNumberOfDocuments(servicesIds));

    await this.recorder.initialize();

    this.#forEachDocumentOf(servicesIds, documentTypes, documentDeclaration => this.refilterDocumentsQueue.push(documentDeclaration));

    await this.#waitUntilProcessed(this.refilterDocumentsQueue);
    await this.recorder.finalize();

    this.emit('refilteringCompleted', servicesIds.length, this.getNumberOfDocuments(servicesIds));
  }

  /**
   * Fetches all pages of a document, records a snapshot for each of them,
   * then generates and records the document version.
   */
  async trackDocumentChanges(documentDeclaration) {
    const fetchedPages = await this.fetchDocumentPages(documentDeclaration);

    await Promise.all(fetchedPages.map(params => this.recordSnapshot(params)));

    return this.generateDocumentVersion(documentDeclaration);
  }

  /** Generates and records a version from the latest snapshots, flagged as a refilter. */
  async refilterAndRecordDocument(documentDeclaration) {
    return this.generateDocumentVersion(documentDeclaration, { isRefiltering: true });
  }

  /**
   * Builds the filtered content of a document from its latest snapshots and records it.
   * Does nothing when no snapshot is available.
   *
   * @param {object} documentDeclaration - Provides `service.id`, `type` and `pages`.
   * @param {object} [options]
   * @param {boolean} [options.isRefiltering=false] - Record as a refilter instead of a regular version.
   */
  async generateDocumentVersion(documentDeclaration, { isRefiltering = false } = {}) {
    const { service: { id: serviceId }, type: documentType, pages } = documentDeclaration;

    const snapshots = await this.getDocumentSnapshots(documentDeclaration);

    if (!snapshots.length) {
      return;
    }

    const [{ fetchDate }] = snapshots; // In case of multipage document, use the first snapshot fetch date

    return this.recordVersion({
      content: await this.generateDocumentFilteredContent(snapshots, pages),
      snapshotIds: snapshots.map(({ id }) => id),
      serviceId,
      documentType,
      fetchDate,
      isRefiltering,
    });
  }

  /**
   * Fetches every page of a document in parallel.
   *
   * @returns {Promise<object[]>} One entry per page with `content`, `mimeType`, `serviceId`,
   *   `documentType`, `pageId` (falsy for single-page documents) and `fetchDate`.
   * @throws {InaccessibleContentError} When at least one page failed with a `FetchDocumentError`
   *   that is not an `EAI_AGAIN` error; built from the collected error messages.
   * @throws Any error that is not a `FetchDocumentError`, or an `EAI_AGAIN` fetch error, unchanged.
   */
  async fetchDocumentPages({ service: { id: serviceId }, type: documentType, pages, isMultiPage }) {
    const inaccessibleContentErrors = [];

    const result = await Promise.all(pages.map(async ({ location: url, executeClientScripts, cssSelectors, id: pageId }) => {
      try {
        const { mimeType, content } = await this.fetch({ url, executeClientScripts, cssSelectors });

        return {
          content,
          mimeType,
          serviceId,
          documentType,
          pageId: isMultiPage && pageId,
          fetchDate: new Date(),
        };
      } catch (error) {
        if (!(error instanceof FetchDocumentError)) {
          throw error;
        }

        if (error.message.includes('EAI_AGAIN')) {
          // EAI_AGAIN is a DNS lookup timed out error, which means it is a network connectivity error or proxy related error.
          // This operational error is mostly transient and should be handled by retrying the operation.
          // As there is no retry mechanism in the engine yet, crash the engine and leave it to the process
          // manager to handle the retries and the delay between them.
          throw error;
        }

        inaccessibleContentErrors.push(error.message);
      }
    }));

    if (inaccessibleContentErrors.length) {
      throw new InaccessibleContentError(inaccessibleContentErrors);
    }

    return result;
  }

  /**
   * Retrieves the latest recorded snapshot of each page of a document.
   * Pages for which the recorder returns a falsy value are left out.
   */
  async getDocumentSnapshots({ service: { id: serviceId }, type: documentType, pages, isMultiPage }) {
    const latestSnapshots = await Promise.all(pages.map(async page => this.recorder.getLatestSnapshot(serviceId, documentType, isMultiPage && page.id)));

    return latestSnapshots.filter(Boolean);
  }

  /**
   * Filters each snapshot with its page declaration and joins the results with a blank line.
   * A snapshot without `pageId` is filtered with the first page declaration.
   */
  async generateDocumentFilteredContent(snapshots, pages) {
    const filteredContents = await Promise.all(snapshots.map(async ({ pageId, content, mimeType }) => {
      // NOTE(review): loose equality kept on purpose as the types of stored and declared page IDs
      // cannot be confirmed from this file; switch to `===` once both are known to be strings.
      const pageDeclaration = pageId ? pages.find(({ id }) => pageId == id) : pages[0]; // eslint-disable-line eqeqeq

      return this.filter({ content, mimeType, pageDeclaration });
    }));

    return filteredContents.join('\n\n');
  }

  /**
   * Records a snapshot and emits the matching event:
   * `snapshotNotChanged` when the recorder returns no ID,
   * otherwise `firstSnapshotRecorded` or `snapshotRecorded`.
   *
   * @returns {Promise<*>} The snapshot ID, or `undefined` when nothing was recorded.
   */
  async recordSnapshot({ content, mimeType, fetchDate, serviceId, documentType, pageId }) {
    const { id: snapshotId, isFirstRecord } = await this.recorder.recordSnapshot({
      serviceId,
      documentType,
      pageId,
      content,
      mimeType,
      fetchDate,
    });

    if (!snapshotId) {
      this.emit('snapshotNotChanged', serviceId, documentType, pageId);

      return;
    }

    this.emit(isFirstRecord ? 'firstSnapshotRecorded' : 'snapshotRecorded', serviceId, documentType, pageId, snapshotId);

    return snapshotId;
  }

  /**
   * Records a version (or a refilter when `isRefiltering` is truthy) and emits the matching event:
   * `versionNotChanged` when the recorder returns no ID,
   * otherwise `firstVersionRecorded` or `versionRecorded`.
   */
  async recordVersion({ content, fetchDate, snapshotIds, serviceId, documentType, isRefiltering }) {
    const recordFunction = !isRefiltering ? 'recordVersion' : 'recordRefilter';

    const { id: versionId, isFirstRecord } = await this.recorder[recordFunction]({
      serviceId,
      documentType,
      content,
      fetchDate,
      snapshotIds,
    });

    if (!versionId) {
      this.emit('versionNotChanged', serviceId, documentType);

      return;
    }

    this.emit(isFirstRecord ? 'firstVersionRecorded' : 'versionRecorded', serviceId, documentType, versionId);
  }

  /**
   * Sums the number of documents declared by the given services.
   *
   * @param {string[]} [serviceIds] - Defaults to all loaded services.
   * @returns {number}
   */
  getNumberOfDocuments(serviceIds = this.serviceIds) {
    return serviceIds.reduce((acc, serviceId) => acc + this.services[serviceId].getNumberOfDocuments(), 0);
  }

  /**
   * Waits for a queue to have processed all its tasks.
   *
   * The promise returned by `queue.drain()` only settles on the next `drain` event,
   * which never fires when nothing was pushed; so return right away when the queue is idle
   * instead of waiting forever.
   */
  async #waitUntilProcessed(queue) {
    if (queue.idle()) {
      return;
    }

    await queue.drain();
  }

  /**
   * Synchronously calls `callback` with the declaration of each document of the given services,
   * restricted to `documentTypes` when that list is not empty.
   * Services are visited in `localeCompare` order of their IDs to have more intuitive logs.
   */
  #forEachDocumentOf(servicesIds = [], documentTypes = [], callback) { // eslint-disable-line default-param-last
    // Sort a copy: the array belongs to the caller and must not be reordered under its feet.
    const sortedServicesIds = [...servicesIds].sort((a, b) => a.localeCompare(b));

    sortedServicesIds.forEach(serviceId => {
      this.services[serviceId].getDocumentTypes().forEach(documentType => {
        if (documentTypes.length && !documentTypes.includes(documentType)) {
          return;
        }

        callback(this.services[serviceId].getDocumentDeclaration(documentType));
      });
    });
  }
}