@opentermsarchive/engine 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/.env.example +3 -0
  2. package/.eslintrc.yaml +116 -0
  3. package/.github/workflows/deploy.yml +50 -0
  4. package/.github/workflows/release.yml +64 -0
  5. package/.github/workflows/test.yml +77 -0
  6. package/CHANGELOG.md +14 -0
  7. package/CODE_OF_CONDUCT.md +128 -0
  8. package/CONTRIBUTING.md +143 -0
  9. package/LICENSE +153 -0
  10. package/MIGRATING.md +42 -0
  11. package/README.fr.md +110 -0
  12. package/README.md +438 -0
  13. package/Vagrantfile +38 -0
  14. package/ansible.cfg +13 -0
  15. package/bin/.env.js +1 -0
  16. package/bin/lint-declarations.js +31 -0
  17. package/bin/track.js +26 -0
  18. package/bin/validate-declarations.js +68 -0
  19. package/config/ci.json +5 -0
  20. package/config/contrib.json +35 -0
  21. package/config/dating.json +37 -0
  22. package/config/default.json +71 -0
  23. package/config/france.json +40 -0
  24. package/config/p2b-compliance.json +40 -0
  25. package/config/pga.json +40 -0
  26. package/config/production.json +27 -0
  27. package/config/test.json +49 -0
  28. package/config/vagrant.json +24 -0
  29. package/decision-records/0001-service-name-and-id.md +73 -0
  30. package/decision-records/0002-service-history.md +212 -0
  31. package/decision-records/0003-snapshots-database.md +123 -0
  32. package/ops/README.md +280 -0
  33. package/ops/app.yml +5 -0
  34. package/ops/infra.yml +6 -0
  35. package/ops/inventories/dev.yml +7 -0
  36. package/ops/inventories/production.yml +27 -0
  37. package/ops/roles/infra/defaults/main.yml +2 -0
  38. package/ops/roles/infra/files/.gitconfig +3 -0
  39. package/ops/roles/infra/files/mongod.conf +18 -0
  40. package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
  41. package/ops/roles/infra/tasks/main.yml +78 -0
  42. package/ops/roles/infra/tasks/mongo.yml +40 -0
  43. package/ops/roles/infra/templates/ssh_config.j2 +5 -0
  44. package/ops/roles/ota/defaults/main.yml +14 -0
  45. package/ops/roles/ota/files/.env +21 -0
  46. package/ops/roles/ota/tasks/database.yml +65 -0
  47. package/ops/roles/ota/tasks/main.yml +110 -0
  48. package/ops/site.yml +6 -0
  49. package/package.json +101 -0
  50. package/pm2.config.cjs +20 -0
  51. package/scripts/dataset/README.md +37 -0
  52. package/scripts/dataset/assets/LICENSE +540 -0
  53. package/scripts/dataset/assets/README.template.js +65 -0
  54. package/scripts/dataset/export/index.js +106 -0
  55. package/scripts/dataset/export/index.test.js +155 -0
  56. package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
  57. package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
  58. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
  59. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
  60. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
  61. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
  62. package/scripts/dataset/index.js +40 -0
  63. package/scripts/dataset/logger/index.js +17 -0
  64. package/scripts/dataset/main.js +25 -0
  65. package/scripts/dataset/publish/index.js +39 -0
  66. package/scripts/declarations/lint/index.js +36 -0
  67. package/scripts/declarations/utils/index.js +81 -0
  68. package/scripts/declarations/validate/definitions.js +63 -0
  69. package/scripts/declarations/validate/index.mocha.js +262 -0
  70. package/scripts/declarations/validate/service.history.schema.js +86 -0
  71. package/scripts/declarations/validate/service.schema.js +91 -0
  72. package/scripts/history/logger/index.js +39 -0
  73. package/scripts/history/migrate-services.js +212 -0
  74. package/scripts/history/update-to-full-hash.js +61 -0
  75. package/scripts/history/utils/index.js +23 -0
  76. package/scripts/import/README.md +59 -0
  77. package/scripts/import/config/import.json +12 -0
  78. package/scripts/import/index.js +224 -0
  79. package/scripts/import/loadCommits.js +66 -0
  80. package/scripts/import/logger/index.js +43 -0
  81. package/scripts/rewrite/README.md +131 -0
  82. package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
  83. package/scripts/rewrite/config/rewrite-versions.json +32 -0
  84. package/scripts/rewrite/initializer/files/license +428 -0
  85. package/scripts/rewrite/initializer/files/readme.md +8 -0
  86. package/scripts/rewrite/initializer/index.js +44 -0
  87. package/scripts/rewrite/rewrite-snapshots.js +108 -0
  88. package/scripts/rewrite/rewrite-versions.js +160 -0
  89. package/scripts/rewrite/utils.js +33 -0
  90. package/scripts/utils/renamer/README.md +49 -0
  91. package/scripts/utils/renamer/index.js +45 -0
  92. package/scripts/utils/renamer/rules/documentTypes.json +25 -0
  93. package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
  94. package/scripts/utils/renamer/rules/serviceNames.json +92 -0
  95. package/src/archivist/errors.js +9 -0
  96. package/src/archivist/fetcher/errors.js +6 -0
  97. package/src/archivist/fetcher/exports.js +18 -0
  98. package/src/archivist/fetcher/fullDomFetcher.js +84 -0
  99. package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
  100. package/src/archivist/fetcher/index.js +35 -0
  101. package/src/archivist/fetcher/index.test.js +239 -0
  102. package/src/archivist/filter/exports.js +3 -0
  103. package/src/archivist/filter/index.js +178 -0
  104. package/src/archivist/filter/index.test.js +561 -0
  105. package/src/archivist/index.js +276 -0
  106. package/src/archivist/index.test.js +600 -0
  107. package/src/archivist/recorder/index.js +77 -0
  108. package/src/archivist/recorder/index.test.js +463 -0
  109. package/src/archivist/recorder/record.js +35 -0
  110. package/src/archivist/recorder/record.test.js +91 -0
  111. package/src/archivist/recorder/repositories/factory.js +23 -0
  112. package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
  113. package/src/archivist/recorder/repositories/git/git.js +122 -0
  114. package/src/archivist/recorder/repositories/git/git.test.js +86 -0
  115. package/src/archivist/recorder/repositories/git/index.js +182 -0
  116. package/src/archivist/recorder/repositories/git/index.test.js +714 -0
  117. package/src/archivist/recorder/repositories/interface.js +108 -0
  118. package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
  119. package/src/archivist/recorder/repositories/mongo/index.js +121 -0
  120. package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
  121. package/src/archivist/services/documentDeclaration.js +26 -0
  122. package/src/archivist/services/documentDeclaration.test.js +85 -0
  123. package/src/archivist/services/documentTypes.json +386 -0
  124. package/src/archivist/services/index.js +255 -0
  125. package/src/archivist/services/index.test.js +327 -0
  126. package/src/archivist/services/pageDeclaration.js +51 -0
  127. package/src/archivist/services/pageDeclaration.test.js +224 -0
  128. package/src/archivist/services/service.js +60 -0
  129. package/src/archivist/services/service.test.js +164 -0
  130. package/src/exports.js +3 -0
  131. package/src/index.js +59 -0
  132. package/src/logger/README.md +1 -0
  133. package/src/logger/index.js +131 -0
  134. package/src/main.js +18 -0
  135. package/src/notifier/README.md +1 -0
  136. package/src/notifier/index.js +150 -0
  137. package/src/tracker/README.md +1 -0
  138. package/src/tracker/index.js +215 -0
  139. package/test/fixtures/service_A.js +22 -0
  140. package/test/fixtures/service_A_terms.md +10 -0
  141. package/test/fixtures/service_A_terms_snapshot.html +14 -0
  142. package/test/fixtures/service_B.js +22 -0
  143. package/test/fixtures/service_with_declaration_history.js +65 -0
  144. package/test/fixtures/service_with_filters_history.js +155 -0
  145. package/test/fixtures/service_with_history.js +188 -0
  146. package/test/fixtures/service_with_multipage_document.js +100 -0
  147. package/test/fixtures/service_without_history.js +31 -0
  148. package/test/fixtures/services.js +19 -0
  149. package/test/fixtures/terms.pdf +0 -0
  150. package/test/fixtures/termsFromPDF.md +25 -0
  151. package/test/fixtures/termsModified.pdf +0 -0
  152. package/test/services/service_A.json +9 -0
  153. package/test/services/service_B.json +9 -0
  154. package/test/services/service_with_declaration_history.filters.js +7 -0
  155. package/test/services/service_with_declaration_history.history.json +17 -0
  156. package/test/services/service_with_declaration_history.json +13 -0
  157. package/test/services/service_with_filters_history.filters.history.js +29 -0
  158. package/test/services/service_with_filters_history.filters.js +7 -0
  159. package/test/services/service_with_filters_history.json +13 -0
  160. package/test/services/service_with_history.filters.history.js +29 -0
  161. package/test/services/service_with_history.filters.js +7 -0
  162. package/test/services/service_with_history.history.json +26 -0
  163. package/test/services/service_with_history.json +17 -0
  164. package/test/services/service_with_multipage_document.filters.js +7 -0
  165. package/test/services/service_with_multipage_document.history.json +37 -0
  166. package/test/services/service_with_multipage_document.json +28 -0
  167. package/test/services/service_without_history.filters.js +7 -0
  168. package/test/services/service_without_history.json +13 -0
@@ -0,0 +1,66 @@
1
+ import path from 'path';
2
+ import { performance } from 'perf_hooks';
3
+ import { fileURLToPath } from 'url';
4
+
5
+ import config from 'config';
6
+ import { MongoClient } from 'mongodb';
7
+
8
+ import Git from '../../src/archivist/recorder/repositories/git/git.js';
9
+
10
+ import logger from './logger/index.js';
11
+
12
+ const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory of this module, derived from its URL
13
+
14
+ const ROOT_PATH = path.resolve(__dirname, '../../'); // two directories above this script; base for the configured source path
15
+
16
+ let sourceRepository; // Git wrapper around the source repository; assigned in initialize()
17
+ let collection; // MongoDB collection receiving the commits; assigned in initialize()
18
+ let client; // MongoClient; kept at module scope so main() can close it once the import is done
19
+
20
+ (async function main() {
21
+ console.time('Total time');
22
+ logger.info({ message: 'Start importing commits…' });
23
+
24
+ await initialize();
25
+
26
+ logger.info({ message: 'Waiting for git log… (this can take a while)' });
27
+ const start = performance.now();
28
+ const commits = (await sourceRepository.log(['--stat=4096'])).sort((a, b) => new Date(a.date) - new Date(b.date));
29
+ const end = performance.now();
30
+ const filteredCommits = commits.filter(({ message }) => message.match(/^(Start tracking|Update)/));
31
+
32
+ logger.info({ message: `Loaded git log in ${Number((end - start) / 1000).toFixed(2)} s` });
33
+ logger.info({ message: `Source repo contains ${commits.length} commits` });
34
+
35
+ const totalCommitToLoad = filteredCommits.length;
36
+ const numberOfSkippedCommits = commits.length - totalCommitToLoad;
37
+
38
+ if (numberOfSkippedCommits) {
39
+ logger.info({ message: `Skipped ${numberOfSkippedCommits} commits that do not need to be imported (README, LICENCE…).` });
40
+ }
41
+
42
+ let counter = 1;
43
+
44
+ for (const commit of filteredCommits.reverse()) { // reverse array to insert most recent commits first
45
+ await collection.updateOne({ hash: commit.hash }, { $set: { ...commit } }, { upsert: true }); // eslint-disable-line no-await-in-loop
46
+
47
+ if (counter % 1000 == 0) {
48
+ logger.info({ message: ' ', current: counter, total: totalCommitToLoad });
49
+ }
50
+ counter++;
51
+ }
52
+
53
+ await client.close();
54
+ }());
55
+
56
+ async function initialize() {
57
+ client = new MongoClient(config.get('import.mongo.connectionURI'));
58
+
59
+ await client.connect();
60
+ const db = client.db(config.get('import.mongo.database'));
61
+
62
+ collection = db.collection(config.get('import.mongo.commitsCollection'));
63
+ sourceRepository = new Git({ path: path.resolve(ROOT_PATH, config.get('import.sourcePath')) });
64
+
65
+ await sourceRepository.initialize();
66
+ }
@@ -0,0 +1,43 @@
1
+ import winston from 'winston';
2
+
3
+ const { combine, timestamp, printf, colorize } = winston.format;
4
+
5
+ const alignedWithColorsAndTime = combine(
6
+ colorize(),
7
+ timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
8
+ printf(({ level, message, timestamp, serviceId, type, sha, current, total }) => {
9
+ let prefix = ''.padEnd(8);
10
+
11
+ if (current && total) {
12
+ prefix = `${Number(((current) / total) * 100).toFixed(2)}%`.padEnd(8);
13
+ }
14
+
15
+ if (serviceId) {
16
+ prefix += `${serviceId}`.padEnd(30);
17
+ }
18
+
19
+ if (type) {
20
+ prefix += `${type}`.padEnd(50);
21
+ }
22
+
23
+ if (sha) {
24
+ prefix += `${sha}`.padEnd(42);
25
+ } else {
26
+ prefix += ''.padEnd(42);
27
+ }
28
+
29
+ return `${timestamp} ${level.padEnd(15)} ${prefix} ${message}`;
30
+ }),
31
+ );
32
+
33
+ const consoleTransport = new winston.transports.Console();
34
+
35
+ const transports = [consoleTransport];
36
+
37
+ const logger = winston.createLogger({
38
+ format: alignedWithColorsAndTime,
39
+ transports,
40
+ rejectionHandlers: transports,
41
+ });
42
+
43
+ export default logger;
@@ -0,0 +1,131 @@
1
+ __:warning: These scripts are no longer up-to-date with the codebase and are not guaranteed to work.__
2
+
3
+ # Rewrite history
4
+
5
+ Document types and service names can change over time, and we sometimes need to import history from other tools, provided they have a history with the same structure as Open Terms Archive. We therefore need a way to rewrite, reorder and apply changes to the snapshots or versions history.
6
+
7
+ The script works by reading commits from a **source** repository, applying changes and then committing the result in another, empty or not, **target** repository. So a source repository with commits is required.
8
+
9
+ When rewriting versions, filters are re-applied to snapshots, so service declarations and their history are required.
10
+
11
+ :warning: Currently, history rewriting only works with Git storage.
12
+
13
+ ## Rewrite snapshots
14
+
15
+ ### Configuring
16
+
17
+ You can change the **source** and **target** repository in `config/rewrite-snapshots.json`. We use the `recorder` module to write to the **target** repository, so to configure the **target** repo, change the `recorder.snapshots.storage.git.path` value:
18
+
19
+ ```json
20
+ {
21
+
22
+ "recorder": {
23
+ "snapshots": {
24
+ "storage": {
25
+ "git": {
26
+ "path": "<Target repository>"
27
+
28
+ }
29
+ }
30
+ }
31
+ },
32
+ "rewrite": {
33
+ "snapshotsSourcePath": "<Source repository>"
34
+ }
35
+ }
36
+ ```
37
+
38
+ Other configuration elements are inherited from the default `recorder` config.
39
+
40
+ ### Running
41
+
42
+ Run every command by setting `NODE_ENV` to `rewrite-snapshots`.
43
+
44
+ Run the script by running:
45
+
46
+ ```sh
47
+ cd scripts/rewrite
48
+ NODE_ENV=rewrite-snapshots node rewrite-snapshots.js
49
+ ```
50
+
51
+ You can write to an empty target repository and initialize it by passing the `--init` option:
52
+
53
+ ```sh
54
+ NODE_ENV=rewrite-snapshots node rewrite-snapshots.js --init
55
+ ```
56
+
57
+ This option will create the repository if it does not exist.
58
+
59
+ :warning: **If the repository already exists, it will be deleted and reinitialized by this option.**
60
+
61
+ The resulting rewritten history can be found in the configured target repository or by default in the `data/snapshots-rewritten` repository.
62
+
63
+ ## Rewrite versions
64
+
65
+ ### Configuring
66
+
67
+ You can change the **source** and **target** repository in `config/rewrite-versions.json`. We use the `recorder` module to write to the **target** repository, so to configure the **target** repo, change the `recorder.versions.storage.git.path` value:
68
+
69
+ ```json
70
+ {
71
+
72
+ "recorder": {
73
+ "versions": {
74
+ "storage": {
75
+ "git": {
76
+ "path": "<Target repository>"
77
+
78
+ }
79
+ }
80
+ }
81
+ },
82
+ "rewrite": {
83
+ "snapshotsSourcePath": "<Source repository>"
84
+ }
85
+ }
86
+ ```
87
+
88
+ Other configuration elements are inherited from the default `recorder` config.
89
+
90
+ ### Running
91
+
92
+ Run every command by setting `NODE_ENV` to `rewrite-versions`.
93
+
94
+ Run the script by running:
95
+
96
+ ```sh
97
+ cd scripts/rewrite
98
+ NODE_ENV=rewrite-versions node rewrite-versions.js
99
+ ```
100
+
101
+ You can write to an empty target repository and initialize it by passing the `--init` option:
102
+
103
+ ```sh
104
+ NODE_ENV=rewrite-versions node rewrite-versions.js --init
105
+ ```
106
+
107
+ This option will create the repository if it does not exist.
108
+
109
+ :warning: **If the repository already exists, it will be deleted and reinitialized by this option.**
110
+
111
+ The resulting rewritten history can be found in the configured target repository or by default in the `data/versions-rewritten` repository.
112
+
113
+ ### Important notes
114
+
115
+ - Your source repository will be read as is, so check out the proper branch or commit before running the script.
116
+ - If you kill the script during its run, your source repository will probably be left on a commit in the middle of the history; you need to manually check out the wanted commit or branch before re-running it.
117
+
118
+ ## Adding renaming rules
119
+
120
+ See the [renamer module documentation](../utils/renamer/README.md).
121
+
122
+ ### Currently handled cases
123
+
124
+ Currently, the script will:
125
+
126
+ - Ignore commits which are not a document snapshot (like renaming or documentation commits)
127
+ - Reorder commits according to their author date
128
+ - Rename document types according to declared rules
129
+ - Rename services according to declared rules
130
+ - Skip commits with empty content
131
+ - Skip commits which do not change the document
@@ -0,0 +1,32 @@
1
+ {
2
+ "recorder": {
3
+ "versions": {
4
+ "storage": {
5
+ "git": {
6
+ "path": "./data/versions",
7
+ "publish": false,
8
+ "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/ambanum/OpenTermsArchive-snapshots/commit/",
9
+ "author": {
10
+ "name": "Open Terms Archive Bot",
11
+ "email": "bot@opentermsarchive.org"
12
+ }
13
+ }
14
+ }
15
+ },
16
+ "snapshots": {
17
+ "storage": {
18
+ "git": {
19
+ "path": "./data/snapshots-rewritten",
20
+ "publish": false,
21
+ "author": {
22
+ "name": "Open Terms Archive Bot",
23
+ "email": "bot@opentermsarchive.org"
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "rewrite": {
30
+ "snapshotsSourcePath": "./data/snapshots"
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "recorder": {
3
+ "versions": {
4
+ "storage": {
5
+ "git": {
6
+ "path": "./data/versions-rewritten",
7
+ "publish": false,
8
+ "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/ambanum/OpenTermsArchive-snapshots/commit/",
9
+ "author": {
10
+ "name": "Open Terms Archive Bot",
11
+ "email": "bot@opentermsarchive.org"
12
+ }
13
+ }
14
+ }
15
+ },
16
+ "snapshots": {
17
+ "storage": {
18
+ "git": {
19
+ "path": "./data/snapshots",
20
+ "publish": false,
21
+ "author": {
22
+ "name": "Open Terms Archive Bot",
23
+ "email": "bot@opentermsarchive.org"
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "rewrite": {
30
+ "snapshotsSourcePath": "./data/snapshots"
31
+ }
32
+ }