@opentermsarchive/engine 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +240 -232
  2. package/package.json +7 -1
  3. package/scripts/dataset/README.md +2 -2
  4. package/scripts/dataset/assets/README.template.js +5 -5
  5. package/scripts/dataset/export/test/fixtures/dataset/README.md +5 -5
  6. package/scripts/import/README.md +1 -1
  7. package/scripts/rewrite/README.md +2 -2
  8. package/scripts/rewrite/rewrite-versions.js +1 -1
  9. package/scripts/utils/renamer/README.md +5 -5
  10. package/scripts/utils/renamer/index.js +2 -2
  11. package/src/archivist/recorder/index.js +2 -2
  12. package/src/archivist/recorder/index.test.js +3 -3
  13. package/src/archivist/recorder/repositories/git/dataMapper.js +1 -1
  14. package/src/archivist/recorder/repositories/git/index.test.js +5 -5
  15. package/src/archivist/recorder/repositories/interface.js +2 -2
  16. package/src/archivist/recorder/repositories/mongo/index.test.js +4 -4
  17. package/src/archivist/services/index.test.js +2 -2
  18. package/src/archivist/services/service.test.js +1 -1
  19. package/src/main.js +1 -1
  20. package/.env.example +0 -3
  21. package/.eslintrc.yaml +0 -116
  22. package/.github/workflows/deploy.yml +0 -50
  23. package/.github/workflows/release.yml +0 -71
  24. package/.github/workflows/test.yml +0 -77
  25. package/CHANGELOG.md +0 -14
  26. package/CODE_OF_CONDUCT.md +0 -128
  27. package/CONTRIBUTING.md +0 -143
  28. package/MIGRATING.md +0 -42
  29. package/README.fr.md +0 -110
  30. package/Vagrantfile +0 -38
  31. package/ansible.cfg +0 -13
  32. package/decision-records/0001-service-name-and-id.md +0 -73
  33. package/decision-records/0002-service-history.md +0 -212
  34. package/decision-records/0003-snapshots-database.md +0 -123
  35. package/ops/README.md +0 -280
  36. package/ops/app.yml +0 -5
  37. package/ops/infra.yml +0 -6
  38. package/ops/inventories/dev.yml +0 -7
  39. package/ops/inventories/production.yml +0 -27
  40. package/ops/roles/infra/defaults/main.yml +0 -2
  41. package/ops/roles/infra/files/.gitconfig +0 -3
  42. package/ops/roles/infra/files/mongod.conf +0 -18
  43. package/ops/roles/infra/files/ota-bot-key.private_key +0 -26
  44. package/ops/roles/infra/tasks/main.yml +0 -78
  45. package/ops/roles/infra/tasks/mongo.yml +0 -40
  46. package/ops/roles/infra/templates/ssh_config.j2 +0 -5
  47. package/ops/roles/ota/defaults/main.yml +0 -14
  48. package/ops/roles/ota/files/.env +0 -21
  49. package/ops/roles/ota/tasks/database.yml +0 -65
  50. package/ops/roles/ota/tasks/main.yml +0 -110
  51. package/ops/site.yml +0 -6
  52. package/pm2.config.cjs +0 -20
  53. package/test/fixtures/service_A.js +0 -22
  54. package/test/fixtures/service_A_terms.md +0 -10
  55. package/test/fixtures/service_A_terms_snapshot.html +0 -14
  56. package/test/fixtures/service_B.js +0 -22
  57. package/test/fixtures/service_with_declaration_history.js +0 -65
  58. package/test/fixtures/service_with_filters_history.js +0 -155
  59. package/test/fixtures/service_with_history.js +0 -188
  60. package/test/fixtures/service_with_multipage_document.js +0 -100
  61. package/test/fixtures/service_without_history.js +0 -31
  62. package/test/fixtures/services.js +0 -19
  63. package/test/fixtures/terms.pdf +0 -0
  64. package/test/fixtures/termsFromPDF.md +0 -25
  65. package/test/fixtures/termsModified.pdf +0 -0
  66. package/test/services/service_A.json +0 -9
  67. package/test/services/service_B.json +0 -9
  68. package/test/services/service_with_declaration_history.filters.js +0 -7
  69. package/test/services/service_with_declaration_history.history.json +0 -17
  70. package/test/services/service_with_declaration_history.json +0 -13
  71. package/test/services/service_with_filters_history.filters.history.js +0 -29
  72. package/test/services/service_with_filters_history.filters.js +0 -7
  73. package/test/services/service_with_filters_history.json +0 -13
  74. package/test/services/service_with_history.filters.history.js +0 -29
  75. package/test/services/service_with_history.filters.js +0 -7
  76. package/test/services/service_with_history.history.json +0 -26
  77. package/test/services/service_with_history.json +0 -17
  78. package/test/services/service_with_multipage_document.filters.js +0 -7
  79. package/test/services/service_with_multipage_document.history.json +0 -37
  80. package/test/services/service_with_multipage_document.json +0 -28
  81. package/test/services/service_without_history.filters.js +0 -7
  82. package/test/services/service_without_history.json +0 -13
package/decision-records/0002-service-history.md DELETED
@@ -1,212 +0,0 @@
# Defining a service history system

- Date: 2020-11-23

## Context and Problem Statement

We need to be able to regenerate versions from snapshots. Since documents are expected to change over time (location or filters), we cannot rely on the latest declaration to regenerate a version from an old snapshot. We therefore need a system to keep track of declaration changes; this is what we call declarations and filters versioning.

## Solutions considered

At this time, we see three solutions, which share the following rules:

- `history` is optional
- the current valid declaration has no date and should be clearly identifiable
- the `valid_until` date is an inclusive expiration date. It should be the exact authored date of the last snapshot commit for which the declaration is still valid.

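To make these rules concrete, here is a minimal sketch of how a declaration could be selected for a given snapshot date, assuming the `history` array shape used in the options below; the helper name `declarationFor` is hypothetical, not the engine's actual API:

```
// Minimal sketch (hypothetical helper): pick the declaration that was
// valid at `snapshotDate`. `valid_until` is inclusive, so a historical
// entry applies while snapshotDate <= valid_until; the undated entry is
// the current declaration.
function declarationFor(documentDeclaration, snapshotDate) {
  const applicable = (documentDeclaration.history || [])
    .filter(entry => new Date(entry.valid_until) >= snapshotDate)
    .sort((a, b) => new Date(a.valid_until) - new Date(b.valid_until));

  // The still-valid entry that expires first covers the snapshot date;
  // if none does, the current (undated) declaration applies.
  return applicable[0] || documentDeclaration;
}
```
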
## Option 1: Add a `history` field in the service declaration

In `services/ASKfm.json`:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": ".selection",
      "filter": [ "add" ],
      "history": [
        {
          "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
          "select": "body",
          "filter": [ "add" ],
          "valid_until": "2020-08-24T14:02:39Z"
        },
        {
          "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
          "select": "body",
          "valid_until": "2020-08-23T14:02:39Z"
        }
      ]
    }
  }
}
```

Note: when no historisation is needed, the file does not need to mention history at all.

**Pros:**

- Everything is in the same file:
  - might prevent forgetting to update existing history
  - might help users know that history is a thing and encourage them to learn about it if they feel the need
  - no (pseudo-)hidden knowledge about history

**Cons:**

- Apparent complexity can discourage new contributors
- With time, the file can become huge

## Option 2: Add a `serviceId.history.json` file

In `services/ASKfm.json`:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": ".selection",
      "filter": [ "add" ]
    }
  }
}
```

In `services/ASKfm.history.json`:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": [
      {
        "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
        "select": "body",
        "filter": [ "add" ],
        "valid_until": "2020-08-24T14:02:39Z"
      },
      {
        "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
        "select": "body",
        "valid_until": "2020-08-23T14:02:39Z"
      }
    ]
  }
}
```

**Pros:**

- The service declaration stays small and simple
- The history file is kept close to the service declaration, so users might notice it

**Cons:**

- Makes the history capabilities harder to discover
- Increases the probability of forgetting to update the history file when making a change in the service declaration

## Option 2A

Same as Option 2, but the history file only contains the document declarations, to avoid service properties diverging from those in the original file.

In `services/ASKfm.json`, **called the “service declaration”**:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": ".selection",
      "filter": [ "add" ]
    }
  }
}
```

In `services/ASKfm.history.json`, **called the “service history”**:

```
{
  "Terms of Service": [
    {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": "body",
      "filter": [ "add" ],
      "valid_until": "2020-08-24T14:02:39Z"
    },
    {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": "body",
      "valid_until": "2020-08-23T14:02:39Z"
    }
  ]
}
```

## Option 3: Add a history service declaration file in a `services/history` folder

In `services/ASKfm.json`:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": {
      "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
      "select": ".selection",
      "filter": [ "add" ]
    }
  }
}
```

In `services/history/ASKfm.json`:

```
{
  "name": "ASKfm",
  "documents": {
    "Terms of Service": [
      {
        "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
        "select": "body",
        "filter": [ "add" ],
        "valid_until": "2020-08-24T14:02:39Z"
      },
      {
        "fetch": "https://ask.fm/docs/terms_of_use/?lang=en",
        "select": "body",
        "valid_until": "2020-08-23T14:02:39Z"
      }
    ]
  }
}
```

**Pros:**

- The service declaration stays small and simple
- All history updates are reserved to knowledgeable users, who might act as gatekeepers

**Cons:**

- All history updates are reserved to knowledgeable users, who might act as gatekeepers :)
- Relies on knowledgeable people to maintain the history

## Some thoughts

### Community

The choice might have implications for the community that will grow around the project.

_Option 1_ shows everything to everyone. It might frighten some contributors with its apparent complexity (once there is history in the declaration file), but it might also encourage them to learn about it if they want or feel the need to. All contributors will share the same view and knowledge of the system, which might encourage collaboration between them to learn and improve together.

_Option 2_ and _Option 3_ hide the complexity of history management in separate files, and only the most adventurous contributors will find them by themselves. Contributions to those files will probably be made by specific contributors who will be taught to manage them, thus creating two different kinds of contributors: those who stay with the basic service declaration, not knowing that more complex options exist, and those who hold the knowledge of history management, whose work might stay in the shadows or who might act as gatekeepers.

## Decision Outcome

[After consulting the community](https://github.com/ambanum/OpenTermsArchive/issues/156), Option 2A is retained, as it hides the complexity of history (compared to Option 1) while increasing its discoverability (compared to Option 3) for contributors who might become more “adventurous”.

package/decision-records/0003-snapshots-database.md DELETED
@@ -1,123 +0,0 @@
# Determining an appropriate database system to store snapshots

- Date: 2021-10-20

## Context and Problem Statement

### Context

The Versions repository has several purposes:

- Display differences between two versions, in particular when users receive a notification of change, so that they can simply see the changes.
- Explore significant changes in tracked documents.
- Offer a corpus of the latest versions of all the documents of the monitored services.
- Serve as a dataset for research.

It is therefore important that this repository constitute a quality dataset, to provide relevant information to users.

For this purpose, the following constraints are considered necessary:

- Versions must be ordered chronologically, so that navigating through the history of a document is intuitive.
- Versions should contain no noise, only significant changes.
- Each version must contain a link to the snapshot that was used to generate it.

Currently, the following problems are identified in the Versions repository:

- Noise in the versions: URL or structure changes in the tracked documents.
- Presence of refilter commits, related to URL and selector updates in service declarations or to Open Terms Archive code evolution.
- Presence of commits due to code changes: type renaming, service renaming, documentation changes in the repository.
- Presence of unordered commits, a consequence of importing the ToSBack history into the snapshots or of importing snapshots corresponding to archived documents provided by the services themselves.

The solution considered in order to provide a quality dataset therefore consists of being able to regenerate the `versions` from the `snapshots`; this is what we call rewriting history.

#### Rewriting history

To rewrite history, we go through the snapshot commits one by one after reordering them (in memory), and we create a version commit for each of them, skipping commits corresponding to noise and performing any renamings along the way.

This implies being able to version the service filters (used to generate the version from the snapshot).
See https://github.com/ambanum/OpenTermsArchive/issues/156.

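A minimal sketch of this rewriting loop, assuming hypothetical helpers (`listSnapshots`, `declarationFor`, `applyFilters`, `isNoise` and `commitVersion` are illustrative names, not the engine's actual functions):

```
// Hypothetical sketch of the history-rewriting loop described above:
// iterate over snapshots in chronological order and regenerate version
// commits, skipping noise and applying the declaration valid at each date.
async function rewriteHistory({ listSnapshots, declarationFor, applyFilters, isNoise, commitVersion }) {
  // Reorder in memory, since snapshot commits may be unordered.
  const snapshots = (await listSnapshots()).sort((a, b) => a.fetchDate - b.fetchDate);

  for (const snapshot of snapshots) {
    const declaration = declarationFor(snapshot.documentDeclaration, snapshot.fetchDate);
    const version = await applyFilters(snapshot.content, declaration);

    if (isNoise(version)) continue; // skip commits that would only add noise

    await commitVersion({ version, snapshotId: snapshot.id, date: snapshot.fetchDate });
  }
}
```
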
### Problem

Currently, `git` is used as the database for storing snapshots and versions.
One year ago, the process of rewriting history was estimated to take about 16 hours for 100,000 commits. It has also been noted that this time does not grow linearly: the more commits there are in `snapshots`, the more the average time per commit increases.

It appears that the most costly operation is accessing the contents of a commit (checkout).
It also appears that the older the commit is in the git history, the longer this operation takes.

> For example, on a history containing about 100,000 commits, accessing the contents of the oldest commit takes about 1,000 ms while accessing the most recent commit takes only 100 ms.

(At an average of roughly 550 ms per checkout, 100,000 commits indeed work out to about 15 hours, consistent with the estimate above.)

At the time of writing, the number of commits approaches one million, and iterating over these snapshots to rewrite the versions history currently takes more or less 3 months.

Also, `git` stores data in a hash tree of chronologically ordered commits, so inserting snapshots into the history implies rewriting the whole snapshots history, which takes as long as reading it.

As described previously, we need to be able to regenerate versions from snapshots (for example to [rename services](https://github.com/ambanum/OpenTermsArchive/issues/314)) and to be able to insert snapshots into the history (for example to [import databases](https://github.com/ambanum/OpenTermsArchive/pull/214)).
**This cannot take 6 months.**

Moreover, as the number of snapshots will keep growing, we need a system that allows scaling, potentially across multiple servers.

Thus, we need a database management system meeting the following requirements:

- Access time to a snapshot should be constant and independent of its authoring date.
- Insertion time of a snapshot should be constant and independent of its authoring date.
- Support concurrent access.
- Scale horizontally.

### Solutions considered

#### 1. Keep the system under git

##### Splitting into sub-repos

Since accessing the contents of a commit takes longer the older the commit is in the history considered, the idea would be to work successively on ordered subsets of this history.
This means truncating the history, browsing the remaining commits and regenerating the corresponding versions, then creating another subset of the history containing an arbitrary number of commits following the ones already browsed, and repeating the processing.

To create a history subset with git:

- Create a clone of a subset of N commits from the local snapshots repository: `git clone --depth <N> "file://local/path/snapshots" snapshots-tmp`, with `N` corresponding to the position of the first commit you want in the block, counted from the last commit in the history.
- Drop all commits more recent than the last commit you want to keep in the block: `git reset --hard <sha>`, with `sha` corresponding to the id of the last commit you want to have in the block.
- Clean up git to ensure that history navigation is efficient: `git gc`.

So we need to split the history into chronologically ordered blocks, which leads us to the next problem.

##### Splitting and reordering blocks of snapshots

Because snapshot commits are unordered, we can't simply create blocks of a fixed size from the git history (otherwise we'd process commits out of order).
It is necessary to create blocks whose commits are ordered within the block but also in relation to the other blocks: for example, all the commits of the first block processed must be older than the commits of all the other blocks.

The solution would be to create blocks in order: from the git history, we look for commits that are out of place (whose date is earlier than that of their predecessor).

Each of these commits marks the first commit of a block, which extends up to the commit that starts the next block.
We thus obtain blocks whose commits are internally ordered.

We still have to order the blocks relative to each other (note that it may be necessary to split a block in order to place another).

These chronologically ordered, non-overlapping commit blocks can then be used with the previous approach (it may be necessary to re-split them so that they have a reasonable size).

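A minimal sketch of the block-splitting step described above, assuming `commits` is an array of objects carrying a `date`:

```
// Split a list of commits into blocks that are internally ordered:
// cut a new block whenever a commit is dated earlier than its predecessor.
function splitIntoOrderedBlocks(commits) {
  const blocks = [];
  let currentBlock = [];

  for (const commit of commits) {
    const previous = currentBlock[currentBlock.length - 1];

    if (previous && commit.date < previous.date) {
      blocks.push(currentBlock); // an out-of-place commit starts a new block
      currentBlock = [];
    }
    currentBlock.push(commit);
  }
  if (currentBlock.length) blocks.push(currentBlock);

  return blocks; // the blocks themselves still need to be ordered relative to each other
}
```
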
#### 2. Move snapshots to a document-oriented database

The idea of this solution is to keep the `versions` under git, in order to continue enjoying the benefits GitHub provides for browsing and viewing diffs, but to store the snapshots in a database, since we do not really need to browse snapshots through a graphical interface nor to see the diff between two snapshots. This would allow us to access their content more efficiently.

MongoDB seems to meet the constraints:

- It natively allows horizontal scaling with [replica sets](https://docs.mongodb.com/manual/replication/) and [sharding](https://docs.mongodb.com/manual/sharding/).
- It supports concurrent access.
- It offers an [In-Memory storage engine](https://docs.mongodb.com/manual/core/inmemory/) as an option for performance.

We also ran a simple test to ensure that access time and insertion time meet the requirements. We populated a database with one million entries and accessed snapshots at random dates, and found that access times remained stable. In our test of 1,000 sequential accesses to random snapshots, the average access time was ~3.5 ms with a maximum of ~50 ms.

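A sketch of this kind of access-time test, using the official `mongodb` Node.js driver; the database, collection and field names here are assumptions, and an index on the date field is presumed:

```
import { MongoClient } from 'mongodb';

// Hypothetical access-time test: look up snapshots at 1000 random dates
// and record how long each query takes.
const client = new MongoClient('mongodb://localhost:27017');

await client.connect();
const snapshots = client.db('open-terms-archive').collection('snapshots');

const start = new Date('2016-01-01').getTime();
const span = Date.now() - start;
const timings = [];

for (let i = 0; i < 1000; i += 1) {
  const randomDate = new Date(start + Math.random() * span);
  const before = Date.now();

  // Latest snapshot at or before the random date; assumes an index on fetchDate.
  await snapshots.findOne({ fetchDate: { $lte: randomDate } }, { sort: { fetchDate: -1 } });
  timings.push(Date.now() - before);
}

console.log('average (ms):', timings.reduce((sum, time) => sum + time, 0) / timings.length);
console.log('max (ms):', Math.max(...timings));

await client.close();
```
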
Moreover, MongoDB has the following benefits:

- Easy to use: it offers a simpler query syntax than SQL and a quick learning curve, especially for JavaScript developers.
- Flexible and evolutive: it can manage data of any structure, not just tabular structures defined in advance.
- Widely used in the JavaScript ecosystem.

As a downside, joining documents in MongoDB is no easy task, and pulling data from several collections requires a number of queries, which leads to long turnaround times. This is not a problem in our case, as we do not currently envision a need for such complex queries.

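The file list above shows this split materialised as storage repositories in the engine (`repositories/git`, `repositories/mongo`, behind `repositories/interface.js`). A minimal sketch of what such a storage-agnostic interface can look like (method names here are hypothetical, not the engine's actual contract):

```
// Illustrative storage-agnostic repository interface: versions can stay
// in a git-backed implementation while snapshots move to a MongoDB one.
export default class Repository {
  async initialize() { throw new Error('Not implemented'); }
  async finalize() { throw new Error('Not implemented'); }
  async save(record) { throw new Error('Not implemented'); } // returns the saved record id
  async findById(recordId) { throw new Error('Not implemented'); }
  async findLatest(serviceId, documentType) { throw new Error('Not implemented'); }
  async *iterate() { throw new Error('Not implemented'); } // yields records chronologically
}
```
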
## Decision Outcome

As MongoDB meets the requirements, it is retained as the solution.

### Benchmark

With the MongoDB implementation, refiltering takes around 3 minutes, where it took around 1 hour 20 minutes with the git version.

package/ops/README.md DELETED
@@ -1,280 +0,0 @@
# Open Terms Archive Ops

Recipes to set up the infrastructure of Open Terms Archive and to deploy it.

## Requirements

1. Install [Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html).
2. Install [Vagrant](https://www.vagrantup.com/downloads).
3. Install [VirtualBox](https://www.virtualbox.org/wiki/Downloads) to manage virtual machines. If you prefer Docker, or have an Apple Silicon machine, install [Docker](https://docs.docker.com/get-docker/) instead.
4. Create a dedicated SSH key with no password: `ssh-keygen -f ~/.ssh/ota-vagrant -q -N ""`. This key will be automatically used by Vagrant.

> VirtualBox is not compatible with Apple Silicon (M1…) processors. If you have such a machine, you will need to use the Docker provider. Since MongoDB cannot be installed on ARM, it is skipped in the infrastructure installation process. This means you cannot test the MongoDB storage repository with Vagrant on an Apple Silicon processor.

## Usage

**You should never apply changes to production from your machine.** We use continuous deployment to apply changes. To avoid making changes on the production server by mistake, we use [Vagrant](https://www.vagrantup.com) to describe and spawn virtual machines. By default, all commands will only affect the Vagrant development virtual machine (VM).

### Launch

If you’re on an Apple Silicon processor or want to use Docker instead of VirtualBox, use `vagrant up --provider=docker`.

In all other cases, use `vagrant up` 🙂

You can then deploy the code to the running machine with all the options described below.

### Main commands

- To set up a full [(phoenix)](https://martinfowler.com/bliki/PhoenixServer.html) server:

```
ansible-playbook ops/site.yml
```

- To set up the infrastructure only:

```
ansible-playbook ops/infra.yml
```

- To set up the `Open Terms Archive` app only:

```
ansible-playbook ops/app.yml
```

### Vagrant quick reference

#### Connect to the virtual machine

```
vagrant up
vagrant ssh  # use "vagrant" as password
```

#### Start again with a clean virtual machine

```
vagrant halt     # stop machine
vagrant destroy  # remove machine
vagrant up
```

#### Troubleshooting: Remote host identification has changed

In case you get this kind of error:

```
fatal: [127.0.0.1]: UNREACHABLE! => changed=false
  msg: |-
    Failed to connect to the host via ssh: @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    @    WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!     @
    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    IT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!

  unreachable: true
```

It may be because you already have a `known_hosts` entry registered with the same IP and port. To solve this, remove the entry with `ssh-keygen -R [127.0.0.1]:2222`.

#### Troubleshooting: Connection refused

If you have the following error:

```
Failed to connect to the host via ssh: ssh: connect to host 127.0.0.1 port 2222: Connection refused
```

You may have a collision on the default port `2222` used by Vagrant to forward SSH commands.
Run the following command to know which ports are forwarded for the virtual machine:

```
vagrant port
```

It should display something like this:

```
The forwarded ports for the machine are listed below. Please note that
these values may differ from values configured in the Vagrantfile if the
provider supports automatic port collision detection and resolution.

    22 (guest) => 2200 (host)
```

Modify the Ansible SSH options in the `ops/inventories/dev.yml` file with the proper `ansible_ssh_port`:

```
all:
  children:
    vagrant:
      hosts:
        127.0.0.1:
          […]
          ansible_ssh_port: 2200
          […]
```

### Logs

You can obtain logs from the process manager over SSH:

```
ssh <user>@<instance_hostname> pm2 logs ota
```

### Tags

Some tags are available to refine what will happen; use them with `--tags`:

- `setup`: to only set up the system dependencies required by the app (cloning the repo, installing app dependencies, all config files, and so on…)
- `start`: to start the app
- `stop`: to stop the app
- `restart`: to restart the app
- `update`: to update the app (pull code, install dependencies and restart the app)
- `update-declarations`: to update services declarations (pull declarations, install dependencies and restart the app)

For example, if you have changes to the core engine to deploy but no infrastructure changes, you can update the app only by running:

```
ansible-playbook ops/app.yml --tags update --limit <instance_name>
```

## Production

### Applying changes

To test your changes to the playbook locally before opening a pull request:

- Remove all traces of previous tests to ensure that your changes do not work by coincidence: `vagrant destroy && vagrant up`.
- Start by applying your changes on the virtual machine: `ansible-playbook ops/site.yml`.
- Connect through SSH to the virtual machine and check that everything works as intended: `vagrant ssh`, `pm2 logs`…
- Open a pull request and wait for it to be reviewed and merged. The continuous deployment process will take care of applying your changes to every production instance.

### Deploying manually from your machine

**You should not be doing this.** If something terrible is happening in production, did you try just stopping the instance? Any fix should be applied through a pull request and deployed by continuous deployment, to ensure reproducibility.

Note that executing the playbook on the `production` inventory will affect **all** production servers. Unless you know exactly what you are doing, you should always execute a playbook on a specific server only: add the `--limit` option with the instance name defined in `ops/inventories/production.yml` as parameter:

```
ansible-playbook --inventory ops/inventories/production.yml ops/site.yml --limit <instance_name>
```

### Allowed keys

Setting up the production infrastructure for publishing on the shared versions repository entails decrypting a private key managed with [Ansible Vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html). It is decrypted with a password stored in the passwords database.

In case the instance you're deploying to is operated by the Core team, you should use the `OTA-bot` SSH private key instead of your personal one. You can run any of the commands with the `--private-key` option, passing it the path to the bot SSH private key. This key can be found in the passwords database.

### Command examples

- Check deployment without actually applying changes for the `dating` instance:

```
ansible-playbook --inventory ops/inventories/production.yml ops/app.yml --limit dating --check --diff
```

- Update the Open Terms Archive application only on the `dating` instance, without applying changes to the infrastructure:

```
ansible-playbook --inventory ops/inventories/production.yml ops/app.yml --limit dating --tags update
```

- Update services declarations only on the `france` instance:

```
ansible-playbook --inventory ops/inventories/production.yml ops/app.yml --limit france --tags update-declarations
```

- Stop the Open Terms Archive application only on the `france` instance:

```
ansible-playbook --inventory ops/inventories/production.yml ops/app.yml --limit france --tags stop
```

- Update the infrastructure and the Open Terms Archive application on all servers:

```
ansible-playbook --inventory ops/inventories/production.yml ops/site.yml
```

## Set up a new instance

### Provision a server

#### With [OVH Horizon](https://horizon.cloud.ovh.net/project/instances/)

Click on the `Launch Instance` button. Then fill in at least the following fields:

- `Instance name`.
- `Source`. Suggested: `Debian 11`.
- `Flavor`. Suggested: `b-7-flex`.
- `Key pair`. Suggested: your own personal SSH key, to allow you to connect to the freshly created server.

#### Recommended specs

The following setup is sufficient to track 20 services:

- 1 vCore @ 1.8 GHz
- 2 GB RAM
- 1 MBps bandwidth
- 20 GB disk space

The major factor for performance is bandwidth.

Disk space is used up linearly over time as the archive grows. The number of services, their frequency of change and the chosen storage mechanism all influence the speed at which disk space is used. You can start with 20 GB, but will have to consider expansion in the future. You should be safe for a longer time period with 100 GB.

We suggest using a dedicated attached volume for storage, independent from the main VM drive, so that you can more easily upgrade or format it.

### Define host

Add an entry to the production inventory file `ops/inventories/production.yml` for the created host, with the server address and proper variables.

The host name cannot contain dashes. Use snake_case.

### Configure instance

Create a JSON file in the `config` folder with the name of the instance.

### Create repositories

Create the `snapshot` and `version` repositories, with:

- A `main` branch.
- The `main` branch should be the default branch.
- At least one commit on this branch with some content (`README.md` and `LICENSE`).

Templates are provided to that end, for [declarations](https://github.com/OpenTermsArchive/template-declarations/), [snapshots](https://github.com/OpenTermsArchive/template-snapshots/) and [versions](https://github.com/OpenTermsArchive/template-versions/).

### Set up permissions

The @OTA-Bot GitHub user should have write access to all three (declarations, snapshots, versions) repositories, so it can publish data, create issues, and publish dataset releases.

Each instance should have a responsible entity, which we currently model as a [“team” in the @OpenTermsArchive](https://github.com/orgs/OpenTermsArchive/teams) GitHub organisation. Each team has write access to the three repositories, and @OTA-Bot should be added to that team along with the human maintainers.

## Optimise performance

### MongoDB

If you use MongoDB as storage, hosting the database on an XFS-formatted volume significantly improves performance.

The following instructions assume [OVH Horizon](https://horizon.cloud.ovh.net/project/instances/) for volume creation, but can be adapted for any cloud provider.

#### Mounting

- Create a volume with the highest speed possible.
- Attach the volume to the server that runs your Open Terms Archive instance.
- On the machine, check which device is your volume with `lsblk` (it should be the one with no partition).
- Then use `sudo fdisk /dev/sd$N` (where `$N` is the identifier of the volume) and answer `n`, `p`, `1`, `w`.
- Install the XFS utilities: `sudo apt-get install xfsprogs`.
- Format the disk to XFS: `sudo mkfs.xfs -f /dev/sd$N1`.
- Finally, create a folder (for example in `/mnt`) and mount the volume in it: `sudo mount -t auto /dev/sd$N1 /mnt/disk`.

#### Unmounting

To remove a volume:

- Unmount it with `sudo umount /mnt/disk`.
- Detach it from the Horizon console.
- Remove the volume from the Horizon console.

package/ops/app.yml DELETED
@@ -1,5 +0,0 @@
---
- name: Set up Open Terms Archive app and databases
  hosts: all
  roles:
    - ota

package/ops/infra.yml DELETED
@@ -1,6 +0,0 @@
---
- name: Set up Open Terms Archive infrastructure
  hosts: all
  become: yes
  roles:
    - infra

package/ops/inventories/dev.yml DELETED
@@ -1,7 +0,0 @@
vagrant:
  hosts:
    127.0.0.1:
      ansible_user: vagrant
      ansible_port: 2222
      ansible_python_interpreter: /usr/bin/python3
      ansible_ssh_private_key_file: ~/.ssh/ota-vagrant

package/ops/inventories/production.yml DELETED
@@ -1,27 +0,0 @@
contrib:
  hosts:
    198.244.153.104:
      ed25519_fingerprint: AAAAC3NzaC1lZDI1NTE5AAAAIITN8hTCst7+6mHNzeo465crCZwHrc/SzUL1410mb9Lv
dating:
  hosts:
    vps-99ae1d89.vps.ovh.net:
      ed25519_fingerprint: AAAAC3NzaC1lZDI1NTE5AAAAIClFdaZhaXFmxdQI+rNSOsZaSlrgPlK9UzyGvi66u88V
france:
  hosts:
    198.244.142.9:
      ed25519_fingerprint: AAAAC3NzaC1lZDI1NTE5AAAAIKH7P9SCnnSiVOhGMNvHIjWw5+3TYlmgmTK45Y9d1aCu
pga:
  hosts:
    134.102.58.70:
      ansible_user: pga
      ed25519_fingerprint: AAAAC3NzaC1lZDI1NTE5AAAAIDmKHW4LMOEIxnBHkdNzwvSrzjmfhQkx5n2lFtJdraOy
p2b_compliance:
  hosts:
    vps-463f0baf.vps.ovh.net:
      ansible_user: ota
      ed25519_fingerprint: AAAAC3NzaC1lZDI1NTE5AAAAIDOrkEl2aR2gJe0XmLy4j+0/51G/kAlkupfU4S2Qv0dJ
      config_file_name: p2b-compliance

all:
  vars:
    ansible_user: debian

package/ops/roles/infra/defaults/main.yml DELETED
@@ -1,2 +0,0 @@
instance_name: '{{ group_names[0] }}'
config_file_name: '{{ instance_name }}'

package/ops/roles/infra/files/.gitconfig DELETED
@@ -1,3 +0,0 @@
[user]
  email = bot@opentermsarchive.org
  name = Open Terms Archive Bot