@opentermsarchive/engine 0.19.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -184,7 +184,7 @@ npx ota track --services "<service_id>" ["<service_id>"...]
  ##### Track specific terms of specific services
 
  ```sh
- npx ota track --services "<service_id>" ["<service_id>"...] --termsTypes "<terms_type>" ["<terms_type>"...]
+ npx ota track --services "<service_id>" ["<service_id>"...] --terms-types "<terms_type>" ["<terms_type>"...]
  ```
 
  ##### Track documents four times a day
@@ -196,7 +196,7 @@ npx ota track --schedule
  #### `ota validate`
 
  ```sh
- npx ota validate [--services <service_id>...] [--termsTypes <terms_type>...]
+ npx ota validate [--services <service_id>...] [--terms-types <terms_type>...]
  ```
 
  Check that all declarations allow recording a snapshot and a version properly.
@@ -206,7 +206,7 @@ If one or several `<service_id>` are provided, check only those services.
  ##### Validate schema only
 
  ```sh
- npx ota validate --schema-only [--services <service_id>...] [--termsTypes <terms_type>...]
+ npx ota validate --schema-only [--services <service_id>...] [--terms-types <terms_type>...]
  ```
 
  Check that all declarations are readable by the engine.
@@ -227,6 +227,38 @@ Automatically correct formatting mistakes and ensure that all declarations are s
 
  If one or several `<service_id>` are provided, check only those services.
 
+ #### `ota dataset`
+
+ Export the versions dataset into a ZIP file and publish it to GitHub releases.
+
+ The dataset title and the URL of the versions repository are defined in the [configuration](#configuring).
+
+ To export the dataset into a local ZIP file:
+
+ ```sh
+ npx ota dataset [--file <filename>]
+ ```
+
+ To export the dataset into a ZIP file and publish it on GitHub releases:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --publish
+ ```
+
+ The `GITHUB_TOKEN` can also be defined in a [`.env` file](#environment-variables).
+
+ To export and publish the dataset, then remove the local copy once it has been uploaded:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --publish --remove-local-copy
+ ```
+
+ To schedule export, publishing and local copy removal:
+
+ ```sh
+ GITHUB_TOKEN=ghp_XXXXXXXXX npx ota dataset --schedule --publish --remove-local-copy
+ ```
+
  ### API
 
  Once added as a dependency, the engine exposes a JavaScript API that can be called in your own code. The following modules are available.
@@ -277,10 +309,6 @@ import pageDeclaration from '@opentermsarchive/engine/page-declaration';
 
  The `PageDeclaration` format is defined [in source code](./src/archivist/services/pageDeclaration.js).
 
- ### Dataset generation
-
- See the [`dataset` script documentation](./scripts/dataset/README.md).
-
  ## Configuring
 
  ### Configuration file
package/bin/ota-dataset.js ADDED
@@ -0,0 +1,33 @@
+ #! /usr/bin/env node
+ import './env.js';
+
+ import { program } from 'commander';
+ import cron from 'croner';
+
+ import { release } from '../scripts/dataset/index.js';
+ import logger from '../src/logger/index.js';
+
+ program
+   .name('ota dataset')
+   .description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
+   .option('-f, --file <filename>', 'file name of the generated dataset')
+   .option('-p, --publish', 'publish dataset to GitHub releases on versions repository. Mandatory authentication to GitHub is provided through the `GITHUB_TOKEN` environment variable')
+   .option('-r, --remove-local-copy', 'remove local copy of dataset after publishing. Works only in combination with --publish option')
+   .option('--schedule', 'schedule automatic dataset generation');
+
+ const { schedule, publish, removeLocalCopy, file: fileName } = program.parse().opts();
+
+ const options = {
+   fileName,
+   shouldPublish: publish,
+   shouldRemoveLocalCopy: removeLocalCopy,
+ };
+
+ if (!schedule) {
+   await release(options);
+ } else {
+   logger.info('The scheduler is running…');
+   logger.info('Dataset will be published every Monday at 08:30 in the timezone of this machine');
+
+   cron('30 8 * * MON', () => release(options));
+ }
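The new command is a thin wrapper: commander's parsed flags are mapped onto an options object for the existing `release` helper. For reference, a minimal programmatic equivalent might look like the sketch below — run from inside the engine repository, since `scripts/dataset/index.js` is not a published entry point, and with an illustrative file name.

```js
// Sketch: roughly equivalent to `GITHUB_TOKEN=… npx ota dataset --publish --remove-local-copy`,
// using the release({ fileName, shouldPublish, shouldRemoveLocalCopy }) signature
// wired up in bin/ota-dataset.js above.
import { release } from './scripts/dataset/index.js';

await release({
  fileName: 'dataset.zip',     // --file dataset.zip (hypothetical name)
  shouldPublish: true,         // --publish; reads GITHUB_TOKEN from the environment
  shouldRemoveLocalCopy: true, // --remove-local-copy
});
```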
package/bin/ota-track.js CHANGED
@@ -14,7 +14,7 @@ program
  .name('ota track')
  .description('Retrieve declared documents, record snapshots, extract versions and publish the resulting records')
  .option('-s, --services [serviceId...]', 'service IDs of services to track')
- .option('-t, --termsType [termsType...]', 'terms types to track')
+ .option('-t, --terms-types [termsType...]', 'terms types to track')
  .option('-r, --refilter-only', 'refilter existing snapshots with latest declarations and engine, without recording new snapshots')
  .option('--schedule', 'schedule automatic document tracking');
 
package/bin/ota-validate.js CHANGED
@@ -21,7 +21,7 @@ program
  .name('ota validate')
  .description('Run a series of tests to check the validity of document declarations')
  .option('-s, --services [serviceId...]', 'service IDs of services to validate')
- .option('-t, --termsTypes [termsType...]', 'terms types to validate')
+ .option('-t, --terms-types [termsType...]', 'terms types to validate')
  .option('-m, --modified', 'target only services modified in the current git branch')
  .option('-o, --schema-only', 'much faster check of declarations, but does not check that the documents are actually accessible');
 
package/bin/ota.js CHANGED
@@ -13,4 +13,5 @@ program
  .command('track', 'Track the current terms of services according to provided declarations')
  .command('validate', 'Run a series of tests to check the validity of document declarations')
  .command('lint', 'Check format and stylistic errors in declarations and auto fix them')
+ .command('dataset', 'Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
  .parse(process.argv);
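Worth noting: because `.command()` is given a description as its second argument, commander treats each entry as a stand-alone executable subcommand and spawns a sibling script named `ota-<command>.js`, which is how the new `bin/ota-dataset.js` above is picked up without any further wiring.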
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@opentermsarchive/engine",
-   "version": "0.19.1",
+   "version": "0.21.0",
    "description": "Tracks and makes visible changes to the terms of online services",
    "homepage": "https://github.com/ambanum/OpenTermsArchive#readme",
    "bugs": {
@@ -30,8 +30,8 @@
      ".eslintrc.yaml"
    ],
    "scripts": {
-     "dataset:generate": "node scripts/dataset/main.js",
-     "dataset:release": "node scripts/dataset/main.js --publish --remove-local-copy",
+     "dataset:generate": "node bin/ota.js dataset",
+     "dataset:release": "node bin/ota.js dataset --publish --remove-local-copy",
      "dataset:scheduler": "npm run dataset:release -- --schedule",
      "declarations:lint": "node bin/ota.js lint",
      "declarations:validate": "node bin/ota.js validate",
package/src/archivist/filter/index.js CHANGED
@@ -84,9 +84,12 @@ export async function filterHTML({ content, pageDeclaration }) {
 
  // clean code from common changing patterns - initially for Windstream
  domFragment.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
-   if (node.href.match(/((.*?)\/email-protection#)[0-9a-fA-F]+/gim)) {
-     node.href = `${node.href.split('#')[0]}#removed`;
-   }
+   const newProtectedLink = webPageDOM.createElement('a');
+   const [href] = node.href.split('#');
+
+   newProtectedLink.href = href;
+   newProtectedLink.innerHTML = '[email protected]';
+   node.parentNode.replaceChild(newProtectedLink, node);
  });
 
  const markdownContent = transform(domFragment);
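The previous code only rewrote the changing hash to `#removed` and left the obfuscated link text to vary between crawls; the new code replaces the whole anchor with a stable one. A self-contained sketch of the behaviour, using jsdom (an assumption here — only the replacement logic itself comes from the diff above):

```js
import { JSDOM } from 'jsdom';

const { document } = new JSDOM(
  '<a href="/cdn-cgi/l/email-protection#3b4c52">[email&#160;protected]</a>',
  { url: 'https://exemple.com' },
).window;

document.querySelectorAll('a[href*="/email-protection"]').forEach(node => {
  // Drop the hash, which Cloudflare regenerates on every page load, and
  // normalise the link text so recorded versions stop changing on each crawl.
  const newProtectedLink = document.createElement('a');
  const [href] = node.href.split('#');

  newProtectedLink.href = href;
  newProtectedLink.innerHTML = '[email protected]';
  node.parentNode.replaceChild(newProtectedLink, node);
});

console.log(document.body.innerHTML);
// → <a href="https://exemple.com/cdn-cgi/l/email-protection">[email protected]</a>
```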
package/src/archivist/filter/index.test.js CHANGED
@@ -63,6 +63,7 @@ const rawHTMLWithCommonChangingItems = `
    <p><a id="link3" href="http://absolute.url/link">link 3</a></p>
    <p><a id="link4" href="">link 4</a></p>
    <a href="/cdn-cgi/l/email-protection#3b4c52555f484f495e5a56154b49524d5a584215484f5a4f5e565e554f7b4c52555f484f495e5a5615585456">[email&#160;protected]</a>
+   <p><a href="/cdn-cgi/l/email-protection#2d4e4243594c4e596d4e4459545e4e424259034858">conta<span>[email&#160;protected]</span></a></p>
  </body>
  </html>`;
 
@@ -78,7 +79,9 @@ const expectedFilteredWithCommonChangingItems = `Title
 
  link 4
 
- [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection#removed)`;
+ [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)
+
+ [\\[email protected\\]](https://exemple.com/cdn-cgi/l/email-protection)`;
  /* eslint-enable no-irregular-whitespace */
 
  const additionalFilter = {
package/src/index.js CHANGED
@@ -6,7 +6,7 @@ import logger from './logger/index.js';
  import Notifier from './notifier/index.js';
  import Tracker from './tracker/index.js';
 
- export default async function track({ services = [], termsType: documentTypes, refilterOnly, schedule }) {
+ export default async function track({ services = [], termsTypes: documentTypes, refilterOnly, schedule }) {
    const archivist = new Archivist({ recorderConfig: config.get('recorder') });
 
    archivist.attach(logger);
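The same rename applies to the JavaScript API: the exported `track` function now expects a `termsTypes` key. A minimal caller sketch, assuming the package's main entry point maps to this module and using illustrative service and terms-type values:

```js
import track from '@opentermsarchive/engine'; // import path is an assumption

await track({
  services: ['Service A'],          // hypothetical service ID
  termsTypes: ['Terms of Service'], // was `termsType` before this release
  refilterOnly: false,
  schedule: false,
});
```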
package/scripts/dataset/README.md DELETED
@@ -1,37 +0,0 @@
- # Dataset release
-
- Export the versions dataset into a ZIP file and publish it to GitHub releases.
-
- ## Configuring
-
- You can change the configuration in the appropriate config file in the `config` folder. See the [main README](../../README.md#configuring) for documentation on using the configuration file.
-
- ## Running
-
- To export the dataset into a local ZIP file:
-
- ```sh
- node scripts/dataset/main.js [$filename]
- ```
-
- To export the dataset into a ZIP file and publish it on GitHub releases:
-
- ```sh
- node scripts/dataset/main.js --publish
- ```
-
- To export, publish the dataset and remove the local copy that was created after it has been uploaded:
-
- ```sh
- node scripts/dataset/main.js --publish --remove-local-copy
- ```
-
- To schedule export, publishing and local copy removal:
-
- ```sh
- node scripts/dataset/main.js --schedule --publish --remove-local-copy
- ```
-
- ## Adding renaming rules
-
- See the [renamer module documentation](../utils/renamer/README.md).
package/scripts/dataset/main.js DELETED
@@ -1,25 +0,0 @@
- import cron from 'croner';
-
- import logger from './logger/index.js';
-
- import { release } from './index.js';
-
- const args = process.argv.slice(2);
- const argsWithoutOptions = args.filter(arg => !arg.startsWith('--'));
- const [fileName] = argsWithoutOptions;
- const shouldSchedule = args.includes('--schedule');
-
- const options = {
-   fileName,
-   shouldPublish: args.includes('--publish'),
-   shouldRemoveLocalCopy: args.includes('--remove-local-copy'),
- };
-
- if (!shouldSchedule) {
-   release(options);
- } else {
-   logger.info('The scheduler is running…');
-   logger.info('Dataset will be published at 08:30 on every Monday');
-
-   cron('30 8 * * MON', () => release(options));
- }