@opentermsarchive/engine 0.19.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -184,7 +184,7 @@ npx ota track --services "<service_id>" ["<service_id>"...]
  ##### Track specific terms of specific services
 
  ```sh
- npx ota track --services "<service_id>" ["<service_id>"...] --termsTypes "<terms_type>" ["<terms_type>"...]
+ npx ota track --services "<service_id>" ["<service_id>"...] --terms-types "<terms_type>" ["<terms_type>"...]
  ```
 
  ##### Track documents four times a day
@@ -196,7 +196,7 @@ npx ota track --schedule
  #### `ota validate`
 
  ```sh
- npx ota validate [--services <service_id>...] [--termsTypes <terms_type>...]
+ npx ota validate [--services <service_id>...] [--terms-types <terms_type>...]
  ```
 
  Check that all declarations allow recording a snapshot and a version properly.
@@ -206,7 +206,7 @@ If one or several `<service_id>` are provided, check only those services.
  ##### Validate schema only
 
  ```sh
- npx ota validate --schema-only [--services <service_id>...] [--termsTypes <terms_type>...]
+ npx ota validate --schema-only [--services <service_id>...] [--terms-types <terms_type>...]
  ```
 
  Check that all declarations are readable by the engine.
@@ -227,6 +227,38 @@ Automatically correct formatting mistakes and ensure that all declarations are s
 
  If one or several `<service_id>` are provided, check only those services.
 
+ #### `ota dataset`
+
+ Export the versions dataset into a ZIP file and publish it to GitHub releases.
+
+ The dataset title and the URL of the versions repository are defined in the [configuration](#configuring).
+
+ To export the dataset into a local ZIP file:
+
+ ```sh
+ npx ota dataset [--file <filename>]
+ ```
+
+ To export the dataset into a ZIP file and publish it on GitHub releases:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --publish
+ ```
+
+ The `GITHUB_TOKEN` can also be defined in a [`.env` file](#environment-variables).
+
+ To export and publish the dataset, then remove the local copy once it has been uploaded:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --publish --remove-local-copy
+ ```
+
+ To schedule export, publishing and local copy removal:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --schedule --publish --remove-local-copy
+ ```
+
  ### API
 
  Once added as a dependency, the engine exposes a JavaScript API that can be called in your own code. The following modules are available.
@@ -277,10 +309,6 @@ import pageDeclaration from '@opentermsarchive/engine/page-declaration';
 
  The `PageDeclaration` format is defined [in source code](./src/archivist/services/pageDeclaration.js).
 
- ### Dataset generation
-
- See the [`dataset` script documentation](./scripts/dataset/README.md).
-
  ## Configuring
 
  ### Configuration file
package/bin/ota-dataset.js ADDED
@@ -0,0 +1,33 @@
+ #! /usr/bin/env node
+ import './env.js';
+
+ import { program } from 'commander';
+ import cron from 'croner';
+
+ import { release } from '../scripts/dataset/index.js';
+ import logger from '../src/logger/index.js';
+
+ program
+   .name('ota dataset')
+   .description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
+   .option('-f, --file <filename>', 'file name of the generated dataset')
+   .option('-p, --publish', 'publish dataset to GitHub releases on versions repository. Mandatory authentication to GitHub is provided through the `GITHUB_TOKEN` environment variable')
+   .option('-r, --remove-local-copy', 'remove local copy of dataset after publishing. Works only in combination with --publish option')
+   .option('--schedule', 'schedule automatic dataset generation');
+
+ const { schedule, publish, removeLocalCopy, file: fileName } = program.parse().opts();
+
+ const options = {
+   fileName,
+   shouldPublish: publish,
+   shouldRemoveLocalCopy: removeLocalCopy,
+ };
+
+ if (!schedule) {
+   await release(options);
+ } else {
+   logger.info('The scheduler is running…');
+   logger.info('Dataset will be published every Monday at 08:30 in the timezone of this machine');
+
+   cron('30 8 * * MON', () => release(options));
+ }
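The new command is a thin wrapper: commander's parsed flags are mapped onto an options object for the existing `release` helper. For reference, a minimal programmatic equivalent might look like the sketch below — run from inside the engine repository, since `scripts/dataset/index.js` is not a published entry point, and with an illustrative file name.

```js
// Sketch: roughly equivalent to `GITHUB_TOKEN=… npx ota dataset --publish --remove-local-copy`,
// using the release({ fileName, shouldPublish, shouldRemoveLocalCopy }) signature
// wired up in bin/ota-dataset.js above.
import { release } from './scripts/dataset/index.js';

await release({
  fileName: 'dataset.zip',     // --file dataset.zip (hypothetical name)
  shouldPublish: true,         // --publish; reads GITHUB_TOKEN from the environment
  shouldRemoveLocalCopy: true, // --remove-local-copy
});
```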
package/bin/ota-track.js CHANGED
@@ -14,7 +14,7 @@ program
  .name('ota track')
  .description('Retrieve declared documents, record snapshots, extract versions and publish the resulting records')
  .option('-s, --services [serviceId...]', 'service IDs of services to track')
- .option('-t, --termsType [termsType...]', 'terms types to track')
+ .option('-t, --terms-types [termsType...]', 'terms types to track')
  .option('-r, --refilter-only', 'refilter existing snapshots with latest declarations and engine, without recording new snapshots')
  .option('--schedule', 'schedule automatic document tracking');
 
package/bin/ota-validate.js CHANGED
@@ -21,7 +21,7 @@ program
  .name('ota validate')
  .description('Run a series of tests to check the validity of document declarations')
  .option('-s, --services [serviceId...]', 'service IDs of services to validate')
- .option('-t, --termsTypes [termsType...]', 'terms types to validate')
+ .option('-t, --terms-types [termsType...]', 'terms types to validate')
  .option('-m, --modified', 'target only services modified in the current git branch')
  .option('-o, --schema-only', 'much faster check of declarations, but does not check that the documents are actually accessible');
 
package/bin/ota.js CHANGED
@@ -13,4 +13,5 @@ program
  .command('track', 'Track the current terms of services according to provided declarations')
  .command('validate', 'Run a series of tests to check the validity of document declarations')
  .command('lint', 'Check format and stylistic errors in declarations and auto fix them')
+ .command('dataset', 'Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
  .parse(process.argv);
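Worth noting: because `.command()` is given a description as its second argument, commander treats each entry as a stand-alone executable subcommand and spawns a sibling script named `ota-<command>.js`, which is how the new `bin/ota-dataset.js` above is picked up without any further wiring.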
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@opentermsarchive/engine",
-   "version": "0.19.1",
+   "version": "0.21.0",
    "description": "Tracks and makes visible changes to the terms of online services",
    "homepage": "https://github.com/ambanum/OpenTermsArchive#readme",
    "bugs": {
@@ -30,8 +30,8 @@
      ".eslintrc.yaml"
    ],
    "scripts": {
-     "dataset:generate": "node scripts/dataset/main.js",
-     "dataset:release": "node scripts/dataset/main.js --publish --remove-local-copy",
+     "dataset:generate": "node bin/ota.js dataset",
+     "dataset:release": "node bin/ota.js dataset --publish --remove-local-copy",
      "dataset:scheduler": "npm run dataset:release -- --schedule",
      "declarations:lint": "node bin/ota.js lint",
      "declarations:validate": "node bin/ota.js validate",
package/src/archivist/filter/index.js CHANGED
@@ -84,9 +84,12 @@ export async function filterHTML({ content, pageDeclaration }) {
 
  // clean code from common changing patterns - initially for Windstream
  domFragment.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
-   if (node.href.match(/((.*?)\/email-protection#)[0-9a-fA-F]+/gim)) {
-     node.href = `${node.href.split('#')[0]}#removed`;
-   }
+   const newProtectedLink = webPageDOM.createElement('a');
+   const [href] = node.href.split('#');
+
+   newProtectedLink.href = href;
+   newProtectedLink.innerHTML = '[email protected]';
+   node.parentNode.replaceChild(newProtectedLink, node);
  });
 
  const markdownContent = transform(domFragment);
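The previous code only rewrote the changing hash to `#removed` and left the obfuscated link text to vary between crawls; the new code replaces the whole anchor with a stable one. A self-contained sketch of the behaviour, using jsdom (an assumption here — only the replacement logic itself comes from the diff above):

```js
import { JSDOM } from 'jsdom';

const { document } = new JSDOM(
  '<a href="/cdn-cgi/l/email-protection#3b4c52">[email&#160;protected]</a>',
  { url: 'https://exemple.com' },
).window;

document.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
  // Drop the hash, which Cloudflare regenerates on every page load, and
  // normalise the link text so recorded versions stop changing on each crawl.
  const newProtectedLink = document.createElement('a');
  const [href] = node.href.split('#');

  newProtectedLink.href = href;
  newProtectedLink.innerHTML = '[email protected]';
  node.parentNode.replaceChild(newProtectedLink, node);
});

console.log(document.body.innerHTML);
// → <a href="https://exemple.com/cdn-cgi/l/email-protection">[email protected]</a>
```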
package/src/archivist/filter/index.test.js CHANGED
@@ -63,6 +63,7 @@ const rawHTMLWithCommonChangingItems = `
    <p><a id="link3" href="http://absolute.url/link">link 3</a></p>
    <p><a id="link4" href="">link 4</a></p>
    <a href="/cdn-cgi/l/email-protection#3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456">[email&#160;protected]</a>
+   <p><a href="/cdn-cgi/l/email-protection#2d4e4243594c4e596d4e4459545e4e424259034858">conta<span>[email&#160;protected]</span></a></p>
  </body>
  </html>`;
 
@@ -78,7 +79,9 @@ const expectedFilteredWithCommonChangingItems = `Title
 
  link 4
 
- [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection#removed)`;
+ [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)
+
+ [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)`;
  /* eslint-enable no-irregular-whitespace */
 
  const additionalFilter = {
package/src/index.js CHANGED
@@ -6,7 +6,7 @@ import logger from './logger/index.js';
  import Notifier from './notifier/index.js';
  import Tracker from './tracker/index.js';
 
- export default async function track({ services = [], termsType: documentTypes, refilterOnly, schedule }) {
+ export default async function track({ services = [], termsTypes: documentTypes, refilterOnly, schedule }) {
    const archivist = new Archivist({ recorderConfig: config.get('recorder') });
 
    archivist.attach(logger);
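The same rename applies to the JavaScript API: the exported `track` function now expects a `termsTypes` key. A minimal caller sketch, assuming the package's main entry point maps to this module and using illustrative service and terms-type values:

```js
import track from '@opentermsarchive/engine'; // import path is an assumption

await track({
  services: ['Service A'],          // hypothetical service ID
  termsTypes: ['Terms of Service'], // was `termsType` before this release
  refilterOnly: false,
  schedule: false,
});
```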
package/scripts/dataset/README.md DELETED
@@ -1,37 +0,0 @@
- # Dataset release
-
- Export the versions dataset into a ZIP file and publish it to GitHub releases.
-
- ## Configuring
-
- You can change the configuration in the appropriate config file in the `config` folder. See the [main README](../../README.md#configuring) for documentation on using the configuration file.
-
- ## Running
-
- To export the dataset into a local ZIP file:
-
- ```sh
- node scripts/dataset/main.js [$filename]
- ```
-
- To export the dataset into a ZIP file and publish it on GitHub releases:
-
- ```sh
- node scripts/dataset/main.js --publish
- ```
-
- To export, publish the dataset and remove the local copy that was created after it has been uploaded:
-
- ```sh
- node scripts/dataset/main.js --publish --remove-local-copy
- ```
-
- To schedule export, publishing and local copy removal:
-
- ```sh
- node scripts/dataset/main.js --schedule --publish --remove-local-copy
- ```
-
- ## Adding renaming rules
-
- See the [renamer module documentation](../utils/renamer/README.md).
package/scripts/dataset/main.js DELETED
@@ -1,25 +0,0 @@
- import cron from 'croner';
-
- import logger from './logger/index.js';
-
- import { release } from './index.js';
-
- const args = process.argv.slice(2);
- const argsWithoutOptions = args.filter(arg => !arg.startsWith('--'));
- const [fileName] = argsWithoutOptions;
- const shouldSchedule = args.includes('--schedule');
-
- const options = {
-   fileName,
-   shouldPublish: args.includes('--publish'),
-   shouldRemoveLocalCopy: args.includes('--remove-local-copy'),
- };
-
- if (!shouldSchedule) {
-   release(options);
- } else {
-   logger.info('The scheduler is running…');
-   logger.info('Dataset will be published at 08:30 on every Monday');
-
-   cron('30 8 * * MON', () => release(options));
- }