@opentermsarchive/engine 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/.env.example +3 -0
  2. package/.eslintrc.yaml +116 -0
  3. package/.github/workflows/deploy.yml +50 -0
  4. package/.github/workflows/release.yml +64 -0
  5. package/.github/workflows/test.yml +77 -0
  6. package/CHANGELOG.md +14 -0
  7. package/CODE_OF_CONDUCT.md +128 -0
  8. package/CONTRIBUTING.md +143 -0
  9. package/LICENSE +153 -0
  10. package/MIGRATING.md +42 -0
  11. package/README.fr.md +110 -0
  12. package/README.md +438 -0
  13. package/Vagrantfile +38 -0
  14. package/ansible.cfg +13 -0
  15. package/bin/.env.js +1 -0
  16. package/bin/lint-declarations.js +31 -0
  17. package/bin/track.js +26 -0
  18. package/bin/validate-declarations.js +68 -0
  19. package/config/ci.json +5 -0
  20. package/config/contrib.json +35 -0
  21. package/config/dating.json +37 -0
  22. package/config/default.json +71 -0
  23. package/config/france.json +40 -0
  24. package/config/p2b-compliance.json +40 -0
  25. package/config/pga.json +40 -0
  26. package/config/production.json +27 -0
  27. package/config/test.json +49 -0
  28. package/config/vagrant.json +24 -0
  29. package/decision-records/0001-service-name-and-id.md +73 -0
  30. package/decision-records/0002-service-history.md +212 -0
  31. package/decision-records/0003-snapshots-database.md +123 -0
  32. package/ops/README.md +280 -0
  33. package/ops/app.yml +5 -0
  34. package/ops/infra.yml +6 -0
  35. package/ops/inventories/dev.yml +7 -0
  36. package/ops/inventories/production.yml +27 -0
  37. package/ops/roles/infra/defaults/main.yml +2 -0
  38. package/ops/roles/infra/files/.gitconfig +3 -0
  39. package/ops/roles/infra/files/mongod.conf +18 -0
  40. package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
  41. package/ops/roles/infra/tasks/main.yml +78 -0
  42. package/ops/roles/infra/tasks/mongo.yml +40 -0
  43. package/ops/roles/infra/templates/ssh_config.j2 +5 -0
  44. package/ops/roles/ota/defaults/main.yml +14 -0
  45. package/ops/roles/ota/files/.env +21 -0
  46. package/ops/roles/ota/tasks/database.yml +65 -0
  47. package/ops/roles/ota/tasks/main.yml +110 -0
  48. package/ops/site.yml +6 -0
  49. package/package.json +101 -0
  50. package/pm2.config.cjs +20 -0
  51. package/scripts/dataset/README.md +37 -0
  52. package/scripts/dataset/assets/LICENSE +540 -0
  53. package/scripts/dataset/assets/README.template.js +65 -0
  54. package/scripts/dataset/export/index.js +106 -0
  55. package/scripts/dataset/export/index.test.js +155 -0
  56. package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
  57. package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
  58. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
  59. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
  60. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
  61. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
  62. package/scripts/dataset/index.js +40 -0
  63. package/scripts/dataset/logger/index.js +17 -0
  64. package/scripts/dataset/main.js +25 -0
  65. package/scripts/dataset/publish/index.js +39 -0
  66. package/scripts/declarations/lint/index.js +36 -0
  67. package/scripts/declarations/utils/index.js +81 -0
  68. package/scripts/declarations/validate/definitions.js +63 -0
  69. package/scripts/declarations/validate/index.mocha.js +262 -0
  70. package/scripts/declarations/validate/service.history.schema.js +86 -0
  71. package/scripts/declarations/validate/service.schema.js +91 -0
  72. package/scripts/history/logger/index.js +39 -0
  73. package/scripts/history/migrate-services.js +212 -0
  74. package/scripts/history/update-to-full-hash.js +61 -0
  75. package/scripts/history/utils/index.js +23 -0
  76. package/scripts/import/README.md +59 -0
  77. package/scripts/import/config/import.json +12 -0
  78. package/scripts/import/index.js +224 -0
  79. package/scripts/import/loadCommits.js +66 -0
  80. package/scripts/import/logger/index.js +43 -0
  81. package/scripts/rewrite/README.md +131 -0
  82. package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
  83. package/scripts/rewrite/config/rewrite-versions.json +32 -0
  84. package/scripts/rewrite/initializer/files/license +428 -0
  85. package/scripts/rewrite/initializer/files/readme.md +8 -0
  86. package/scripts/rewrite/initializer/index.js +44 -0
  87. package/scripts/rewrite/rewrite-snapshots.js +108 -0
  88. package/scripts/rewrite/rewrite-versions.js +160 -0
  89. package/scripts/rewrite/utils.js +33 -0
  90. package/scripts/utils/renamer/README.md +49 -0
  91. package/scripts/utils/renamer/index.js +45 -0
  92. package/scripts/utils/renamer/rules/documentTypes.json +25 -0
  93. package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
  94. package/scripts/utils/renamer/rules/serviceNames.json +92 -0
  95. package/src/archivist/errors.js +9 -0
  96. package/src/archivist/fetcher/errors.js +6 -0
  97. package/src/archivist/fetcher/exports.js +18 -0
  98. package/src/archivist/fetcher/fullDomFetcher.js +84 -0
  99. package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
  100. package/src/archivist/fetcher/index.js +35 -0
  101. package/src/archivist/fetcher/index.test.js +239 -0
  102. package/src/archivist/filter/exports.js +3 -0
  103. package/src/archivist/filter/index.js +178 -0
  104. package/src/archivist/filter/index.test.js +561 -0
  105. package/src/archivist/index.js +276 -0
  106. package/src/archivist/index.test.js +600 -0
  107. package/src/archivist/recorder/index.js +77 -0
  108. package/src/archivist/recorder/index.test.js +463 -0
  109. package/src/archivist/recorder/record.js +35 -0
  110. package/src/archivist/recorder/record.test.js +91 -0
  111. package/src/archivist/recorder/repositories/factory.js +23 -0
  112. package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
  113. package/src/archivist/recorder/repositories/git/git.js +122 -0
  114. package/src/archivist/recorder/repositories/git/git.test.js +86 -0
  115. package/src/archivist/recorder/repositories/git/index.js +182 -0
  116. package/src/archivist/recorder/repositories/git/index.test.js +714 -0
  117. package/src/archivist/recorder/repositories/interface.js +108 -0
  118. package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
  119. package/src/archivist/recorder/repositories/mongo/index.js +121 -0
  120. package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
  121. package/src/archivist/services/documentDeclaration.js +26 -0
  122. package/src/archivist/services/documentDeclaration.test.js +85 -0
  123. package/src/archivist/services/documentTypes.json +386 -0
  124. package/src/archivist/services/index.js +255 -0
  125. package/src/archivist/services/index.test.js +327 -0
  126. package/src/archivist/services/pageDeclaration.js +51 -0
  127. package/src/archivist/services/pageDeclaration.test.js +224 -0
  128. package/src/archivist/services/service.js +60 -0
  129. package/src/archivist/services/service.test.js +164 -0
  130. package/src/exports.js +3 -0
  131. package/src/index.js +59 -0
  132. package/src/logger/README.md +1 -0
  133. package/src/logger/index.js +131 -0
  134. package/src/main.js +18 -0
  135. package/src/notifier/README.md +1 -0
  136. package/src/notifier/index.js +150 -0
  137. package/src/tracker/README.md +1 -0
  138. package/src/tracker/index.js +215 -0
  139. package/test/fixtures/service_A.js +22 -0
  140. package/test/fixtures/service_A_terms.md +10 -0
  141. package/test/fixtures/service_A_terms_snapshot.html +14 -0
  142. package/test/fixtures/service_B.js +22 -0
  143. package/test/fixtures/service_with_declaration_history.js +65 -0
  144. package/test/fixtures/service_with_filters_history.js +155 -0
  145. package/test/fixtures/service_with_history.js +188 -0
  146. package/test/fixtures/service_with_multipage_document.js +100 -0
  147. package/test/fixtures/service_without_history.js +31 -0
  148. package/test/fixtures/services.js +19 -0
  149. package/test/fixtures/terms.pdf +0 -0
  150. package/test/fixtures/termsFromPDF.md +25 -0
  151. package/test/fixtures/termsModified.pdf +0 -0
  152. package/test/services/service_A.json +9 -0
  153. package/test/services/service_B.json +9 -0
  154. package/test/services/service_with_declaration_history.filters.js +7 -0
  155. package/test/services/service_with_declaration_history.history.json +17 -0
  156. package/test/services/service_with_declaration_history.json +13 -0
  157. package/test/services/service_with_filters_history.filters.history.js +29 -0
  158. package/test/services/service_with_filters_history.filters.js +7 -0
  159. package/test/services/service_with_filters_history.json +13 -0
  160. package/test/services/service_with_history.filters.history.js +29 -0
  161. package/test/services/service_with_history.filters.js +7 -0
  162. package/test/services/service_with_history.history.json +26 -0
  163. package/test/services/service_with_history.json +17 -0
  164. package/test/services/service_with_multipage_document.filters.js +7 -0
  165. package/test/services/service_with_multipage_document.history.json +37 -0
  166. package/test/services/service_with_multipage_document.json +28 -0
  167. package/test/services/service_without_history.filters.js +7 -0
  168. package/test/services/service_without_history.json +13 -0
package/README.md ADDED
@@ -0,0 +1,438 @@
1
+ # Open Terms Archive
2
+
3
+ **Services** have **terms** that can change over time. _Open Terms Archive_ enables users' rights advocates, regulatory bodies and any interested citizen to follow the **changes** to these **terms** by being **notified** whenever a new **version** is published, and exploring their entire **history**.
4
+
5
+ > Les services ont des conditions générales qui évoluent dans le temps. _Open Terms Archive_ permet aux défenseurs des droits des utilisateurs, aux régulateurs et à toute personne intéressée de suivre les évolutions de ces conditions générales en étant notifiée à chaque publication d'une nouvelle version, et en explorant leur historique.
6
+
7
+ [🇫🇷 Manuel en français](README.fr.md).
8
+
9
+ ## Table of Contents
10
+
11
+ - [How it works](#how-it-works)
12
+ - [Exploring the versions history](#exploring-the-versions-history)
13
+ - [Be notified](#be-notified)
14
+ - [By email](#by-email)
15
+ - [By RSS](#by-rss)
16
+ - [Importing as a module](#importing-as-a-module)
17
+ - [CLI](#cli)
18
+ - [Features exposed](#features-exposed)
19
+ - [fetch](#fetch)
20
+ - [filter](#filter)
21
+ - [Using locally](#using-locally)
22
+ - [Installing](#installing)
23
+ - [Declarations repository](#declarations-repository)
24
+ - [Core](#core)
25
+ - [Configuring](#configuring)
26
+ - [Configuration file](#configuration-file)
27
+ - [Storage repositories](#storage-repositories)
28
+ - [Environment variables](#environment-variables)
29
+ - [Running](#running)
30
+ - [Deploying](#deploying)
31
+ - [Publishing](#publishing)
32
+ - [Contributing](#contributing)
33
+ - [Adding or updating a service](#adding-a-new-service-or-updating-an-existing-service)
34
+ - [Core engine](#core-engine)
35
+ - [Funding and partnerships](#funding-and-partnerships)
36
+ - [License](#license)
37
+
38
+ ## How it works
39
+
40
+ _Note: Words in bold are [business domain names](https://en.wikipedia.org/wiki/Domain-driven_design)._
41
+
42
+ **Services** are **declared** within _Open Terms Archive_ with a **declaration file** listing all the **documents** that, together, constitute the **terms** under which this **service** can be used. These **documents** all have a **type**, such as “terms and conditions”, “privacy policy”, “developer agreement”…
43
+
44
+ In order to **track** their **changes**, **documents** are periodically obtained by **fetching** a web **location** and **selecting content** within the **web page** to remove the **noise** (ads, navigation menu, login fields…). Beyond selecting a subset of a page, some **documents** have additional **noise** (hashes in links, CSRF tokens…) that would be false positives for **changes**. _Open Terms Archive_ thus supports specific **filters** for each **document**.
45
+
46
+ However, the shape of that **noise** can change over time. In order to recover in case of information loss during the **noise filtering** step, a **snapshot** is **recorded** every time there is a **change**. After the **noise** is **filtered out** from the **snapshot**, if there are **changes** in the resulting **document**, a new **version** of the **document** is **recorded**.
47
+
48
+ Anyone can run their own **private** instance and track changes on their own. However, we also **publish** each **version** on a [**public** instance](https://github.com/OpenTermsArchive/contrib-versions) that makes it easy to explore the entire **history** and enables **notifying** over email whenever a new **version** is **recorded**.
49
+ Users can [**subscribe** to **notifications**](#be-notified).
50
+
51
+ _Note: For now, when multiple versions coexist, **terms** are only **tracked** in their English version and for the European jurisdiction._
52
+
53
+ ## Exploring the versions history
54
+
55
+ We offer a public database of versions recorded each time there is a change in the terms of service and other contractual documents of tracked services: [contrib-versions](https://github.com/OpenTermsArchive/contrib-versions).
56
+
57
+ From the **repository homepage** [contrib-versions](https://github.com/OpenTermsArchive/contrib-versions), open the folder of the **service of your choice** (e.g. [WhatsApp](https://github.com/OpenTermsArchive/contrib-versions/tree/main/WhatsApp)).
58
+
59
+ You will see the **set of documents tracked** for that service, now click **on the document of your choice** (e.g. [WhatsApp's Privacy Policy](https://github.com/OpenTermsArchive/contrib-versions/blob/main/WhatsApp/Privacy%20Policy.md)). The **latest version** (updated hourly) will be displayed.
60
+
61
+ To view the **history of changes** made to this document, click on **History** at the top right of the document (for our previous [example](https://github.com/OpenTermsArchive/contrib-versions/commits/main/WhatsApp/Privacy%20Policy.md)). The **changes** are ordered **by date**, with the latest first.
62
+
63
+ Click on a change to see what it consists of (for example [this one](https://github.com/OpenTermsArchive/contrib-versions/commit/58a1d2ae4187a3260ac58f3f3c7dcd3aeacaebcd)). There are **two types of display** you can choose from the icons in the gray bar above the document.
64
+
65
+ - The first one, named _source diff_ (button with chevrons) allows you to **display the old version and the new one side by side** (for our [example](https://github.com/OpenTermsArchive/contrib-versions/commit/58a1d2ae4187a3260ac58f3f3c7dcd3aeacaebcd#diff-e8bdae8692561f60aeac9d27a55e84fc)). This display has the merit of **explicitly showing** all additions and deletions.
66
+ - The second one, named _rich diff_ (button with a document icon) allows you to **unify all the changes in a single document** (for our [example](https://github.com/OpenTermsArchive/contrib-versions/commit/58a1d2ae4187a3260ac58f3f3c7dcd3aeacaebcd?short_path=e8bdae8#diff-e8bdae8692561f60aeac9d27a55e84fc)). The **red** color shows **deleted** elements, the **yellow** color shows **modified** paragraphs, and the **green** color shows **added** elements. Be careful: this display **does not show some changes**, such as changes to hyperlinks and text styles.
67
+
68
+ ### Notes
69
+
70
+ - For long documents, unchanged **paragraphs will not be displayed by default**. You can manually make them appear by clicking on the small arrows just above or just below the displayed paragraphs.
71
+ - You can use the **History button anywhere** in the repository contrib-versions, which will then display the **history of changes made to all documents in the folder** where you are (including sub-folders).
72
+
73
+ ## Be notified
74
+
75
+ ### By email
76
+
77
+ #### Document per document
78
+
79
+ You can go on the official front website [opentermsarchive.org](https://opentermsarchive.org). From there, you can select a service and then the corresponding document type.
80
+ After you enter your email and click on subscribe, we will add your email to the corresponding mailing list in [SendInBlue](https://www.sendinblue.com/) and will not store your email anywhere else.
81
+ Then, every time a modification is found on the corresponding document, we will send you an email.
82
+
83
+ You can unsubscribe at any moment by clicking on the `unsubscribe` link at the bottom of the received email.
84
+
85
+ #### For all documents at once
86
+
87
+ You can [subscribe](https://59692a77.sibforms.com/serve/MUIEAKuTv3y67e27PkjAiw7UkHCn0qVrcD188cQb-ofHVBGpvdUWQ6EraZ5AIb6vJqz3L8LDvYhEzPb2SE6eGWP35zXrpwEFVJCpGuER9DKPBUrifKScpF_ENMqwE_OiOZ3FdCV2ra-TXQNxB2sTEL13Zj8HU7U0vbbeF7TnbFiW8gGbcOa5liqmMvw_rghnEB2htMQRCk6A3eyj) to receive an email whenever a document is updated in the database.
88
+
89
+ **Beware, you are likely to receive a large amount of notifications!** You can unsubscribe by replying to any email you will receive.
90
+
91
+ ### By RSS
92
+
93
+ You can receive notifications for a specific service or document by subscribing to RSS feeds.
94
+
95
+ > An RSS feed is a type of web page that contains information about the latest content published by a website, such as the date of publication and the address where you can view it. When this resource is updated, a feed reader app automatically notifies you and you can see the update.
96
+
97
+ To find out the address of the RSS feed you want to subscribe to:
98
+
99
+ 1. [Navigate](#exploring-the-versions-history) to the page with the history of changes you are interested in. _In the WhatsApp example above, this would be [this page](https://github.com/OpenTermsArchive/contrib-versions/commits/main/WhatsApp/Privacy%20Policy.md)._
100
+ 2. Copy the address of that page from your browser’s address bar. _In the WhatsApp example, this would be `https://github.com/OpenTermsArchive/contrib-versions/commits/main/WhatsApp/Privacy%20Policy.md`._
101
+ 3. Append `.atom` at the end of this address. _In the WhatsApp example, this would become `https://github.com/OpenTermsArchive/contrib-versions/commits/main/WhatsApp/Privacy%20Policy.md.atom`._
102
+ 4. Subscribe your RSS feed reader to the resulting address.
103
+
104
+ #### Recap of available RSS feeds
105
+
106
+ | Updated for | URL |
107
+ | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
108
+ | all services and documents | `https://github.com/OpenTermsArchive/contrib-versions/commits.atom` |
109
+ | all the documents of a service | Replace `$serviceId` with the service ID:<br>`https://github.com/OpenTermsArchive/contrib-versions/commits/main/$serviceId.atom` |
110
+ | a specific document of a service | Replace `$serviceId` with the service ID and `$documentType` with the document type:<br>`https://github.com/OpenTermsArchive/contrib-versions/commits/main/$serviceId/$documentType.md.atom` |
111
+
112
+ For example:
113
+
114
+ - To receive all updates of `Facebook` documents, the URL is `https://github.com/OpenTermsArchive/contrib-versions/commits/main/Facebook.atom`.
115
+ - To receive all updates of the `Privacy Policy` from `Google`, the URL is `https://github.com/OpenTermsArchive/contrib-versions/commits/main/Google/Privacy%20Policy.md.atom`.
116
+
117
+ ## Importing as a module
118
+
119
+ Open Terms Archive exposes a JavaScript API to make some of its capabilities available in NodeJS. You can install it as an NPM module:
120
+
121
+ ```
122
+ npm install "ambanum/OpenTermsArchive#main"
123
+ ```
124
+
125
+ ### CLI
126
+
127
+ The following commands are available where the package is installed:
128
+
129
+ - `./node_modules/.bin/ota-lint-declarations`: check and normalise the format of declarations.
130
+ - `./node_modules/.bin/ota-validate-declarations`: validate declarations.
131
+ - `./node_modules/.bin/ota-track`: track services. Recorded snapshots and versions will be stored in the `data` folder at the root of the module where the package is installed.
132
+
133
+ In order to have them available globally in your command line, install it with the `--global` option.
134
+
135
+ ### Features exposed
136
+
137
+ #### fetch
138
+
139
+ The `fetch` module gets the MIME type and content of a document from its URL.
140
+
141
+ You can use it in your code by using `import fetch from 'open-terms-archive/fetch';`.
142
+
143
+ Documentation on how to use `fetch` is provided as JSDoc within [./src/archivist/fetcher/index.js](./src/archivist/fetcher/index.js).
144
+
145
+ If you plan to use `executeClientScripts` as a parameter of `fetch`, the fetching will be done using a headless browser.
146
+ In order to not instantiate this browser at each fetch, the starting and stopping of the browser is your responsibility.
147
+
148
+ Here is an example on how to use it:
149
+
150
+ ```js
151
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from 'open-terms-archive/fetch';
152
+
153
+ await launchHeadlessBrowser();
154
+ await fetch({ executeClientScripts: true, ... });
155
+ await fetch({ executeClientScripts: true, ... });
156
+ await fetch({ executeClientScripts: true, ... });
157
+ await stopHeadlessBrowser();
158
+ ```
159
+
160
+ The `fetch` module can also be configured as a [`node-config` submodule](https://github.com/node-config/node-config/wiki/Sub-Module-Configuration).
161
+ If [`node-config`](https://github.com/node-config/node-config) is used in the project, the default `fetcher` configuration can be overridden by adding a `fetcher` object to the local config. See [Configuration file](#configuration-file) for full reference.
162
+
163
+ #### filter
164
+
165
+ The `filter` module transforms HTML or PDF content into a Markdown string.
166
+ It will filter content based on the [document declaration](https://github.com/OpenTermsArchive/contrib-declarations/blob/main/CONTRIBUTING.md#declaring-a-new-service).
167
+
168
+ You can use the filter in your code by using `import filter from 'open-terms-archive/filter';`.
169
+
170
+ The `filter` function documentation is available as JSDoc within [./src/archivist/filter/index.js](./src/archivist/filter/index.js).
171
+
172
+ #### page-declaration
173
+
174
+ PageDeclaration object is used to describe a page to be tracked by Open Terms Archive.
175
+
176
+ You can use the page-declaration in your code by using `import pageDeclaration from 'open-terms-archive/page-declaration';`.
177
+
178
+ ## Using locally
179
+
180
+ ### Installing
181
+
182
+ This module is built with [Node](https://nodejs.org/en/) and is tested on macOS, UNIX and Windows. You will need to [install Node >= v16.x](https://nodejs.org/en/download/) to run it.
183
+
184
+ #### Declarations repository
185
+
186
+ 1. Locally clone your declarations repository, e.g., `git@github.com:OpenTermsArchive/contrib-declarations.git`.
187
+ 2. Go into your folder and initialize it, e.g., `cd contrib-declarations; npm install`.
188
+ 3. You can now modify your declarations in the `./declarations/` folder, following [these instructions](https://github.com/OpenTermsArchive/contrib-declarations/blob/main/CONTRIBUTING.md).
189
+ 4. When you want to test:
190
+ - If you want to test every declaration, run `npm test`.
191
+ - If you want to test a specific declaration, run `npm test $serviceId`, e.g., `npm test HER`.
192
+ - If you want to have faster feedback on the structure of a specific declaration, run `npm run test:schema $serviceId`, e.g., `npm run test:schema HER`.
193
+ 5. Once you have done that, if you have any error, it will be prompted and detailed at the end of the test.
194
+ - E.g., `InaccessibleContentError`: Your selector is wrong and should be fixed.
195
+ - E.g., `TypeError`: The file declaration is invalid.
196
+ - E.g., if you have a weird error, you may want to contact OTA, as it may be a bug.
197
+
198
+ ##### Note: Testing
199
+
200
+ Testing runs multiple checks (e.g., checking the validity of the file, that the URL is correct and reachable, that the content is correctly gathered, etc.). As this may take a bit of time, you may want to use `npm run test:schema` for faster feedback on the structure of a declaration.
201
+
202
+ #### Core
203
+
204
+ When referring to the base folder, it means the folder where you will be `git pull`ing everything.
205
+
206
+ 1. If not done already, follow the previous part with the repo of your choice.
207
+ 2. In the base folder of the previous step (i.e., not _in_ the previous folder, but _where the previous folder is_), clone the core engine: `git clone git@github.com:ambanum/OpenTermsArchive.git`.
208
+ 3. Go into the cloned folder and install dependencies: `cd OpenTermsArchive; npm install`.
209
+ 4. If you are using the main repo, you are done, go to step 6.
210
+ 5. If you are using a special repo instance (e.g., `dating-declarations`), create a new [config file](#configuring), `config/development.json`, and add:
211
+ ```json
212
+ {
213
+
214
+ "services": {
215
+ "declarationsPath": "../<name of the repo>/declarations"
216
+ }
217
+ }
218
+ ```
219
+ e.g.,
220
+ ```json
221
+ {
222
+ "services": {
223
+ "declarationsPath": "../dating-declarations/declarations"
224
+ }
225
+ }
226
+ ```
227
+ 6. In the folder of the repo (i.e., `OpenTermsArchive`), use `npm start`.
228
+ - It will first do a refiltering to check whether everything works properly.
229
+ - You will then start to see everything being downloaded under `data/`.
230
+ - More details in [Running](#running).
231
+
232
+ ##### Notes: Tips
233
+
234
+ - You may want to regularly `git pull` to have the latest updates, both in the core engine and in the declarations repos.
235
+ - You have to `npm install` in the declarations repo at least once, and at least once each time `package.json` changes.
236
+ - Be careful, it doesn't download the history! If you want that, you need to git clone `snapshots` and `versions` in `data/`.
237
+
238
+ You can clone as many declarations repositories as you want. The one that will be loaded at execution will be defined through configuration.
239
+
240
+ ### Configuring
241
+
242
+ #### Configuration file
243
+
244
+ The default configuration can be found in `config/default.json`. The full reference is given below. You are unlikely to want to edit all of these elements.
245
+
246
+ ```js
247
+ {
248
+ "services": {
249
+ "declarationsPath": "Directory containing services declarations and associated filters"
250
+ },
251
+ "recorder": {
252
+ "versions": {
253
+ "storage": {
254
+ "<storage-repository>": "Storage repository configuration object; see below"
255
+ }
256
+ },
257
+ "snapshots": {
258
+ "storage": {
259
+ "<storage-repository>": "Storage repository configuration object; see below"
260
+ }
261
+ }
262
+ },
263
+ "fetcher": {
264
+ "waitForElementsTimeout": "Maximum time (in milliseconds) to wait for elements to be present in the page when fetching document in a headless browser",
265
+ "navigationTimeout": "Maximum time (in milliseconds) to wait for page to load",
266
+ "language": "Language (in ISO 639-1 format) to pass in request headers"
267
+ },
268
+ "notifier": { // Notify specified mailing lists when new versions are recorded
269
+ "sendInBlue": { // SendInBlue API Key is defined in environment variables, see the “Environment variables” section below
270
+ "updatesListId": "SendInBlue contacts list ID of persons to notify on document updates",
271
+ "updateTemplateId": "SendInBlue email template ID used for updates notifications"
272
+ }
273
+ },
274
+ "logger": { // Logging mechanism to be notified upon error
275
+ "smtp": {
276
+ "host": "SMTP server hostname",
277
+ "username": "User for server authentication" // Password for server authentication is defined in environment variables, see the “Environment variables” section below
278
+ },
279
+ "sendMailOnError": { // Can be set to `false` if you do not want to send email on error
280
+ "to": "The address to send the email to in case of an error",
281
+ "from": "The address from which to send the email",
282
+ "sendWarnings": "Boolean. Set to true to also send email in case of warning",
283
+ }
284
+ },
285
+ "tracker": { // Tracking mechanism to create GitHub issues when document content is inaccessible
286
+ "githubIssues": {
287
+ "repository": "GitHub repository where to create issues",
288
+ "label": {
289
+ "name": "Label to attach to bot-created issues. This specific label will be created automatically in the target repository",
290
+ "color": "The hexadecimal color code for the label, without the leading #",
291
+ "description": "A short description of the label"
292
+ }
293
+ }
294
+ },
295
+ "dataset": { // Release mechanism to create dataset periodically
296
+ "title": "Title of the dataset; recommended to be the name of the instance that generated it",
297
+ "versionsRepositoryURL": "GitHub repository where the dataset will be published as a release; recommended to be the versions repository for discoverability and tagging purposes"
298
+ }
299
+ }
300
+ ```
301
+
302
+ The default configuration is merged with (and overridden by) environment-specific configuration that can be specified at startup with the `NODE_ENV` environment variable. For example, you would run `NODE_ENV=development npm start` to load the `development.json` configuration file.
303
+
304
+ If you want to change your local configuration, we suggest you create a `config/development.json` file with overridden values. Example production configuration files can be found in the `config` folder.
305
+
306
+ ##### Storage repositories
307
+
308
+ Two storage repositories are currently supported: Git and MongoDB. Each one can be used independently for versions and snapshots.
309
+
310
+ ###### Git
311
+
312
+ ```json
313
+ {
314
+
315
+ "storage": {
316
+ "git": {
317
+ "path": "Versions database directory path, relative to the root of this project",
318
+ "publish": "Boolean. Set to true to push changes to the origin of the cloned repository at the end of every run. Recommended for production only.",
319
+ "snapshotIdentiferTemplate": "Text. Template used to specify where to find the referenced snapshot ID. Must contain a %SNAPSHOT_ID placeholder that will be replaced by the snapshot ID. Only useful for versions",
320
+ "author": {
321
+ "name": "Name to which changes in tracked documents will be credited",
322
+ "email": "Email to which changes in tracked documents will be credited"
323
+ }
324
+ }
325
+ }
326
+
327
+ }
328
+ ```
329
+ ###### MongoDB
330
+
331
+ ```json
332
+ {
333
+
334
+ "storage": {
335
+ "mongo": {
336
+ "connectionURI": "URI for defining connection to the MongoDB instance. See https://docs.mongodb.com/manual/reference/connection-string/",
337
+ "database": "Database name",
338
+ "collection": "Collection name"
339
+ }
340
+ }
341
+
342
+ }
343
+ ```
344
+
345
+ #### Environment variables
346
+
347
+ Environment variables can be passed in the command-line or provided in a `.env` file at the root of the repository. See `.env.example` for an example of such a file.
348
+
349
+ - `SMTP_PASSWORD`: a password for email server authentication, in order to send email notifications.
350
+ - `SENDINBLUE_API_KEY`: a SendInBlue API key, in order to send email notifications with that service.
351
+ - `GITHUB_TOKEN`: a token with repository privileges to access the [GitHub API](https://github.com/settings/tokens).
352
+
353
+ If your infrastructure requires using an outgoing HTTP/HTTPS proxy to access the Internet, you can provide it through the `HTTP_PROXY` and `HTTPS_PROXY` environment variables.
354
+
355
+ ### Running
356
+
357
+ To get the latest versions of all documents:
358
+
359
+ ```
360
+ npm start
361
+ ```
362
+
363
+ The latest version of a document will be available in the versions path defined in your configuration, under `$versions_folder/$service_provider_name/$document_type.md`.
364
+
365
+ To update documents automatically:
366
+
367
+ ```
368
+ npm run start:scheduler
369
+ ```
370
+
371
+ To get the latest version of a specific service's terms:
372
+
373
+ ```
374
+ npm start -- --services <service_id>
375
+ ```
376
+
377
+ > The service ID is the case-sensitive name of the service declaration file without the extension. For example, for `Twitter.json`, the service ID is `Twitter`.
378
+
379
+
380
+ To get the latest version of a specific service's terms and document type:
381
+
382
+ ```
383
+ npm start -- --services <service_id> --documentTypes <document_type>
384
+ ```
385
+
386
+ To display help:
387
+
388
+ ```
389
+ npm start -- --help
390
+ ```
391
+
392
+ ## Deploying
393
+
394
+ See [Ops Readme](ops/README.md).
395
+
396
+ ## Publishing
397
+
398
+ To generate a dataset:
399
+
400
+ ```
401
+ npm run dataset:generate
402
+ ```
403
+
404
+ To release a dataset:
405
+
406
+ ```
407
+ npm run dataset:release
408
+ ```
409
+
410
+ To release a dataset weekly:
411
+
412
+ ```
413
+ npm run dataset:scheduler
414
+ ```
415
+
416
+ ## Contributing
417
+
418
+ Thanks for wanting to contribute! There are different ways to contribute to Open Terms Archive. We describe the most common below. If you want to explore other avenues for contributing, please contact us over email (contact@[our domain name]) or [Twitter](https://twitter.com/OpenTerms).
419
+
420
+ ### Adding a new service or updating an existing service
421
+
422
+ See the [CONTRIBUTING](https://github.com/OpenTermsArchive/contrib-declarations/blob/main/CONTRIBUTING.md) of repository [`OpenTermsArchive/contrib-declarations`](https://github.com/OpenTermsArchive/contrib-declarations). You will need knowledge of JSON and web DOM.
423
+
424
+ ### Core engine
425
+
426
+ To contribute to the core engine of Open Terms Archive, see the [CONTRIBUTING](CONTRIBUTING.md) file of this repository. You will need knowledge of JavaScript and NodeJS.
427
+
428
+ ### Funding and partnerships
429
+
430
+ Beyond individual contributions, we need funds and committed partners to pay for a core team to maintain and grow Open Terms Archive. If you know of opportunities, please let us know! You can find [on our website](https://opentermsarchive.org/en/about) an up-to-date list of the partners and funders that make Open Terms Archive possible.
431
+
432
+
433
+ ---
434
+
435
+ ## License
436
+
437
+ The code for this software is distributed under the European Union Public Licence (EUPL) v1.2.
438
+ Contact the author if you have any specific need or question regarding licensing.
package/Vagrantfile ADDED
@@ -0,0 +1,38 @@
1
+ # -*- mode: ruby -*-
2
+ # vi: set ft=ruby :
3
+
4
+ Vagrant.configure("2") do |config|
5
+ config.vm.hostname = "vagrant"
6
+
7
+ config.vm.box = "debian/bullseye64" # Unable to locate package mongodb-org
8
+
9
+ # In order to have the same config for both Docker and VirtualBox providers, we load the SSH public key manually
10
+ # If necessary, create the key with `ssh-keygen -f ~/.ssh/ota-vagrant -q -N ""`
11
+ # CAUTION: use of `~` in path causes problems with ssh, hence the path built from ENV['HOME'] below
12
+ config.vm.provision "file", source: File.join(ENV['HOME'], ".ssh", "ota-vagrant.pub"), destination: "/home/vagrant/.ssh/authorized_keys"
13
+
14
+ # Based on https://github.com/rofrano/vagrant-docker-provider#example-vagrantfile
15
+ config.vm.provider :docker do |docker, override|
16
+ override.vm.box = nil
17
+ docker.image = "rofrano/vagrant-provider:debian"
18
+ docker.remains_running = true
19
+ docker.has_ssh = true
20
+ docker.privileged = true
21
+ docker.volumes = ["/sys/fs/cgroup:/sys/fs/cgroup:rw"]
22
+ docker.create_args = ["--cgroupns=host"]
23
+
24
+ # Python is not installed by default in the vagrant-provider image
25
+ # and deploying results in `/bin/sh: 1: /usr/bin/python: not found`
26
+ # Use a provisioner to fix that
27
+ # This is only needed with Debian, not with Ubuntu
28
+ # The provisioner also needs to be named, so that it runs only once, see https://github.com/hashicorp/vagrant/issues/7685#issuecomment-308281283
29
+ config.vm.provision "install_python3", type: "shell", inline: $installPython3
30
+ end
31
+ end
32
+
33
+ $installPython3 = <<-SCRIPT
34
+ echo Updating apt...
35
+ sudo apt-get update --fix-missing # Needed to fix "No package matching 'chromium' is available"
36
+ echo Installing python...
37
+ sudo apt-get --assume-yes install python3 python3-pip
38
+ SCRIPT
package/ansible.cfg ADDED
@@ -0,0 +1,13 @@
1
+ [defaults]
2
+
3
+ inventory = ops/inventories/dev.yml
4
+ roles_path = ops/roles
5
+
6
+
7
+ # The following two settings make the output human-readable
8
+ # Use the YAML callback plugin.
9
+ stdout_callback = yaml
10
+ # Use the stdout_callback when running ad-hoc commands.
11
+ bin_ansible_callbacks = true
12
+
13
+ vault_password_file = vault.key
package/bin/.env.js ADDED
@@ -0,0 +1 @@
1
+ process.env.SUPPRESS_NO_CONFIG_WARNING = 'y'; // Tell node-config not to warn when no configuration files are found
@@ -0,0 +1,31 @@
1
+ #! /usr/bin/env node
2
+ // makes it easy to lint all files relative to one service ID, which would have been
3
+ // more difficult to achieve using an eslint based command directly defined in the package.json.
4
+ // It also ensures that the same version of eslint is used in the OpenTermsArchive core and declarations repositories.
5
+ import './.env.js'; // Workaround to ensure `SUPPRESS_NO_CONFIG_WARNING` is set before config is imported
6
+
7
+ import fs from 'fs';
8
+ import path from 'path';
9
+ import { fileURLToPath, pathToFileURL } from 'url';
10
+
11
+ import { program } from 'commander';
12
+ import config from 'config';
13
+
14
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
15
+
16
+ // Initialise configs to allow clients of this module to use it without requiring node-config in their own application.
17
+ // see https://github.com/lorenwest/node-config/wiki/Sub-Module-Configuration
18
+
19
+ config.util.setModuleDefaults('services', { declarationsPath: path.resolve(process.cwd(), './declarations') });
20
+ const { version } = JSON.parse(fs.readFileSync(new URL('../package.json', import.meta.url)).toString());
21
+
22
+ program
23
+ .name('ota-lint-declarations')
24
+ .description('Check format and stylistic errors in declarations and auto fix them')
25
+ .version(version)
26
+ .option('-s, --services [serviceId...]', 'service IDs of services to handle')
27
+ .option('-m, --modified', 'to only lint modified services already commited to git');
28
+
29
+ const lintDeclarations = (await import(pathToFileURL(path.resolve(__dirname, '../scripts/declarations/lint/index.js')))).default;
30
+
31
+ lintDeclarations(program.parse().opts());
package/bin/track.js ADDED
@@ -0,0 +1,26 @@
1
+ #! /usr/bin/env node
2
+ import './.env.js'; // Workaround to ensure `SUPPRESS_NO_CONFIG_WARNING` is set before config is imported
3
+
4
+ import fs from 'fs';
5
+ import path from 'path';
6
+ import { fileURLToPath, pathToFileURL } from 'url';
7
+
8
+ import config from 'config';
9
+
10
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
11
+
12
+ const defaultConfigs = JSON.parse(fs.readFileSync(path.resolve(__dirname, '../config/default.json'))); // Defaults are read from the `config/default.json` file located next to this `bin` directory
13
+
14
+ // Initialise configs to allow clients of this module to use it without requiring node-config in their own application.
15
+ // See https://github.com/lorenwest/node-config/wiki/Sub-Module-Configuration
16
+ config.util.setModuleDefaults('services', { declarationsPath: path.resolve(process.cwd(), './declarations') });
17
+ config.util.setModuleDefaults('fetcher', defaultConfigs.fetcher);
18
+ config.util.setModuleDefaults('recorder', config.util.extendDeep({}, defaultConfigs.recorder, {
19
+ versions: { storage: { git: { path: path.resolve(process.cwd(), './data/versions') } } },
20
+ snapshots: { storage: { git: { path: path.resolve(process.cwd(), './data/snapshots') } } },
21
+ }));
22
+ config.util.setModuleDefaults('logger', defaultConfigs.logger);
23
+ // Register an empty `tracker` config: we do not want any tracker when launching through this command line
24
+ config.util.setModuleDefaults('tracker', {});
25
+
26
+ import(pathToFileURL(path.resolve(__dirname, '../src/main.js'))); // Imported dynamically, hence evaluated after the module defaults above are set
@@ -0,0 +1,68 @@
1
+ #! /usr/bin/env node
2
+ import './.env.js'; // Workaround to ensure `SUPPRESS_NO_CONFIG_WARNING` is set before config is imported
3
+
4
+ import fs from 'fs';
5
+ import path from 'path';
6
+ import { fileURLToPath } from 'url';
7
+
8
+ import { program } from 'commander';
9
+ import config from 'config';
10
+ import Mocha from 'mocha';
11
+
12
+ const { version } = JSON.parse(fs.readFileSync(new URL('../package.json', import.meta.url)).toString());
13
+
14
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
15
+
16
+ const defaultConfigs = JSON.parse(fs.readFileSync(path.resolve(__dirname, '../config/default.json')));
17
+
18
+ // Initialise configs to allow clients of this module to use it without requiring node-config in their own application.
19
+ // see https://github.com/lorenwest/node-config/wiki/Sub-Module-Configuration
20
+ config.util.setModuleDefaults('services', { declarationsPath: path.resolve(process.cwd(), './declarations') });
21
+ config.util.setModuleDefaults('fetcher', defaultConfigs.fetcher);
22
+
23
+ const VALIDATE_PATH = path.resolve(__dirname, '../scripts/declarations/validate/index.mocha.js');
24
+
25
+ // Mocha catches unhandled rejection from the user code and re-emits them to the process (see https://github.com/mochajs/mocha/blob/master/lib/runner.js#L198)
26
+ process.on('unhandledRejection', reason => {
27
+ // Re-throw them so that the validation command fails in these cases (for example, if there is a syntax error when parsing JSON declaration files)
28
+ throw reason;
29
+ });
30
+
31
+ program
32
+ .name('ota-validate-declarations')
33
+ .description('Run a series of tests to check the validity of document declarations')
34
+ .version(version)
35
+ .option('-s, --services [serviceId...]', 'service IDs of services to handle')
36
+ .option('-d, --documentTypes [documentType...]', 'document types to handle')
37
+ .option('-m, --modified', 'to only lint modified services already commited to git')
38
+ .option('-so, --schema-only', 'only refilter exisiting snapshots with last declarations and engine\'s updates');
39
+
40
+ const mocha = new Mocha({
41
+ delay: true, // as the validation script performs an asynchronous load before running the tests, the execution of the tests are delayed until run() is called
42
+ failZero: true, // consider that being called with no service to validate is a failure
43
+ });
44
+
45
+ (async () => {
46
+ mocha.addFile(VALIDATE_PATH); // As `delay` has been called, this statement will not load the file directly, `loadFilesAsync` is required.
47
+ await mocha.loadFilesAsync() // Load files previously added to the Mocha cache with `addFile`.
48
+ .catch(error => {
49
+ console.error(error);
50
+ process.exit(2);
51
+ });
52
+
53
+ let hasFailedTests = false;
54
+
55
+ const generateValidationTestSuite = (await import('../scripts/declarations/validate/index.mocha.js')).default;
56
+
57
+ generateValidationTestSuite(program.parse().opts());
58
+
59
+ mocha.run()
60
+ .on('fail', () => { hasFailedTests = true; })
61
+ .on('end', () => {
62
+ if (hasFailedTests) {
63
+ process.exit(1);
64
+ }
65
+
66
+ process.exit(0);
67
+ });
68
+ })();
package/config/ci.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "services": {
3
+ "declarationsPath": "./contrib-declarations/declarations"
4
+ }
5
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "services": {
3
+ "repository": "https://github.com/OpenTermsArchive/contrib-declarations.git"
4
+ },
5
+ "recorder": {
6
+ "versions": {
7
+ "storage": {
8
+ "git": {
9
+ "snapshotIdentiferTemplate": "mongo://contrib/open-terms-archive/snapshots/%SNAPSHOT_ID",
10
+ "repository": "git@github.com:OpenTermsArchive/contrib-versions.git"
11
+ }
12
+ }
13
+ },
14
+ "snapshots": {
15
+ "storage": {
16
+ "type": "mongo"
17
+ }
18
+ }
19
+ },
20
+ "notifier": {
21
+ "sendInBlue": {
22
+ "updatesListId": 596,
23
+ "updateTemplateId": 39
24
+ }
25
+ },
26
+ "tracker": {
27
+ "githubIssues": {
28
+ "repository": "OpenTermsArchive/contrib-declarations"
29
+ }
30
+ },
31
+ "dataset": {
32
+ "title": "contrib",
33
+ "versionsRepositoryURL": "https://github.com/OpenTermsArchive/contrib-versions"
34
+ }
35
+ }