@opentermsarchive/engine 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/.env.example +3 -0
  2. package/.eslintrc.yaml +116 -0
  3. package/.github/workflows/deploy.yml +50 -0
  4. package/.github/workflows/release.yml +64 -0
  5. package/.github/workflows/test.yml +77 -0
  6. package/CHANGELOG.md +14 -0
  7. package/CODE_OF_CONDUCT.md +128 -0
  8. package/CONTRIBUTING.md +143 -0
  9. package/LICENSE +153 -0
  10. package/MIGRATING.md +42 -0
  11. package/README.fr.md +110 -0
  12. package/README.md +438 -0
  13. package/Vagrantfile +38 -0
  14. package/ansible.cfg +13 -0
  15. package/bin/.env.js +1 -0
  16. package/bin/lint-declarations.js +31 -0
  17. package/bin/track.js +26 -0
  18. package/bin/validate-declarations.js +68 -0
  19. package/config/ci.json +5 -0
  20. package/config/contrib.json +35 -0
  21. package/config/dating.json +37 -0
  22. package/config/default.json +71 -0
  23. package/config/france.json +40 -0
  24. package/config/p2b-compliance.json +40 -0
  25. package/config/pga.json +40 -0
  26. package/config/production.json +27 -0
  27. package/config/test.json +49 -0
  28. package/config/vagrant.json +24 -0
  29. package/decision-records/0001-service-name-and-id.md +73 -0
  30. package/decision-records/0002-service-history.md +212 -0
  31. package/decision-records/0003-snapshots-database.md +123 -0
  32. package/ops/README.md +280 -0
  33. package/ops/app.yml +5 -0
  34. package/ops/infra.yml +6 -0
  35. package/ops/inventories/dev.yml +7 -0
  36. package/ops/inventories/production.yml +27 -0
  37. package/ops/roles/infra/defaults/main.yml +2 -0
  38. package/ops/roles/infra/files/.gitconfig +3 -0
  39. package/ops/roles/infra/files/mongod.conf +18 -0
  40. package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
  41. package/ops/roles/infra/tasks/main.yml +78 -0
  42. package/ops/roles/infra/tasks/mongo.yml +40 -0
  43. package/ops/roles/infra/templates/ssh_config.j2 +5 -0
  44. package/ops/roles/ota/defaults/main.yml +14 -0
  45. package/ops/roles/ota/files/.env +21 -0
  46. package/ops/roles/ota/tasks/database.yml +65 -0
  47. package/ops/roles/ota/tasks/main.yml +110 -0
  48. package/ops/site.yml +6 -0
  49. package/package.json +101 -0
  50. package/pm2.config.cjs +20 -0
  51. package/scripts/dataset/README.md +37 -0
  52. package/scripts/dataset/assets/LICENSE +540 -0
  53. package/scripts/dataset/assets/README.template.js +65 -0
  54. package/scripts/dataset/export/index.js +106 -0
  55. package/scripts/dataset/export/index.test.js +155 -0
  56. package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
  57. package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
  58. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
  59. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
  60. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
  61. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
  62. package/scripts/dataset/index.js +40 -0
  63. package/scripts/dataset/logger/index.js +17 -0
  64. package/scripts/dataset/main.js +25 -0
  65. package/scripts/dataset/publish/index.js +39 -0
  66. package/scripts/declarations/lint/index.js +36 -0
  67. package/scripts/declarations/utils/index.js +81 -0
  68. package/scripts/declarations/validate/definitions.js +63 -0
  69. package/scripts/declarations/validate/index.mocha.js +262 -0
  70. package/scripts/declarations/validate/service.history.schema.js +86 -0
  71. package/scripts/declarations/validate/service.schema.js +91 -0
  72. package/scripts/history/logger/index.js +39 -0
  73. package/scripts/history/migrate-services.js +212 -0
  74. package/scripts/history/update-to-full-hash.js +61 -0
  75. package/scripts/history/utils/index.js +23 -0
  76. package/scripts/import/README.md +59 -0
  77. package/scripts/import/config/import.json +12 -0
  78. package/scripts/import/index.js +224 -0
  79. package/scripts/import/loadCommits.js +66 -0
  80. package/scripts/import/logger/index.js +43 -0
  81. package/scripts/rewrite/README.md +131 -0
  82. package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
  83. package/scripts/rewrite/config/rewrite-versions.json +32 -0
  84. package/scripts/rewrite/initializer/files/license +428 -0
  85. package/scripts/rewrite/initializer/files/readme.md +8 -0
  86. package/scripts/rewrite/initializer/index.js +44 -0
  87. package/scripts/rewrite/rewrite-snapshots.js +108 -0
  88. package/scripts/rewrite/rewrite-versions.js +160 -0
  89. package/scripts/rewrite/utils.js +33 -0
  90. package/scripts/utils/renamer/README.md +49 -0
  91. package/scripts/utils/renamer/index.js +45 -0
  92. package/scripts/utils/renamer/rules/documentTypes.json +25 -0
  93. package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
  94. package/scripts/utils/renamer/rules/serviceNames.json +92 -0
  95. package/src/archivist/errors.js +9 -0
  96. package/src/archivist/fetcher/errors.js +6 -0
  97. package/src/archivist/fetcher/exports.js +18 -0
  98. package/src/archivist/fetcher/fullDomFetcher.js +84 -0
  99. package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
  100. package/src/archivist/fetcher/index.js +35 -0
  101. package/src/archivist/fetcher/index.test.js +239 -0
  102. package/src/archivist/filter/exports.js +3 -0
  103. package/src/archivist/filter/index.js +178 -0
  104. package/src/archivist/filter/index.test.js +561 -0
  105. package/src/archivist/index.js +276 -0
  106. package/src/archivist/index.test.js +600 -0
  107. package/src/archivist/recorder/index.js +77 -0
  108. package/src/archivist/recorder/index.test.js +463 -0
  109. package/src/archivist/recorder/record.js +35 -0
  110. package/src/archivist/recorder/record.test.js +91 -0
  111. package/src/archivist/recorder/repositories/factory.js +23 -0
  112. package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
  113. package/src/archivist/recorder/repositories/git/git.js +122 -0
  114. package/src/archivist/recorder/repositories/git/git.test.js +86 -0
  115. package/src/archivist/recorder/repositories/git/index.js +182 -0
  116. package/src/archivist/recorder/repositories/git/index.test.js +714 -0
  117. package/src/archivist/recorder/repositories/interface.js +108 -0
  118. package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
  119. package/src/archivist/recorder/repositories/mongo/index.js +121 -0
  120. package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
  121. package/src/archivist/services/documentDeclaration.js +26 -0
  122. package/src/archivist/services/documentDeclaration.test.js +85 -0
  123. package/src/archivist/services/documentTypes.json +386 -0
  124. package/src/archivist/services/index.js +255 -0
  125. package/src/archivist/services/index.test.js +327 -0
  126. package/src/archivist/services/pageDeclaration.js +51 -0
  127. package/src/archivist/services/pageDeclaration.test.js +224 -0
  128. package/src/archivist/services/service.js +60 -0
  129. package/src/archivist/services/service.test.js +164 -0
  130. package/src/exports.js +3 -0
  131. package/src/index.js +59 -0
  132. package/src/logger/README.md +1 -0
  133. package/src/logger/index.js +131 -0
  134. package/src/main.js +18 -0
  135. package/src/notifier/README.md +1 -0
  136. package/src/notifier/index.js +150 -0
  137. package/src/tracker/README.md +1 -0
  138. package/src/tracker/index.js +215 -0
  139. package/test/fixtures/service_A.js +22 -0
  140. package/test/fixtures/service_A_terms.md +10 -0
  141. package/test/fixtures/service_A_terms_snapshot.html +14 -0
  142. package/test/fixtures/service_B.js +22 -0
  143. package/test/fixtures/service_with_declaration_history.js +65 -0
  144. package/test/fixtures/service_with_filters_history.js +155 -0
  145. package/test/fixtures/service_with_history.js +188 -0
  146. package/test/fixtures/service_with_multipage_document.js +100 -0
  147. package/test/fixtures/service_without_history.js +31 -0
  148. package/test/fixtures/services.js +19 -0
  149. package/test/fixtures/terms.pdf +0 -0
  150. package/test/fixtures/termsFromPDF.md +25 -0
  151. package/test/fixtures/termsModified.pdf +0 -0
  152. package/test/services/service_A.json +9 -0
  153. package/test/services/service_B.json +9 -0
  154. package/test/services/service_with_declaration_history.filters.js +7 -0
  155. package/test/services/service_with_declaration_history.history.json +17 -0
  156. package/test/services/service_with_declaration_history.json +13 -0
  157. package/test/services/service_with_filters_history.filters.history.js +29 -0
  158. package/test/services/service_with_filters_history.filters.js +7 -0
  159. package/test/services/service_with_filters_history.json +13 -0
  160. package/test/services/service_with_history.filters.history.js +29 -0
  161. package/test/services/service_with_history.filters.js +7 -0
  162. package/test/services/service_with_history.history.json +26 -0
  163. package/test/services/service_with_history.json +17 -0
  164. package/test/services/service_with_multipage_document.filters.js +7 -0
  165. package/test/services/service_with_multipage_document.history.json +37 -0
  166. package/test/services/service_with_multipage_document.json +28 -0
  167. package/test/services/service_without_history.filters.js +7 -0
  168. package/test/services/service_without_history.json +13 -0
@@ -0,0 +1,39 @@
1
+ import fsApi from 'fs';
2
+ import path from 'path';
3
+ import url from 'url';
4
+
5
+ import config from 'config';
6
+ import dotenv from 'dotenv';
7
+ import { Octokit } from 'octokit';
8
+
9
+ import * as readme from '../assets/README.template.js';
10
+
11
+ dotenv.config();
12
+
13
/**
 * Publishes a dataset archive as a GitHub release of the versions repository.
 *
 * The target repository is derived from the `dataset.versionsRepositoryURL`
 * configuration entry: the first two non-empty path segments of that URL are
 * used as `owner` and `repo`.
 *
 * @param {object} params
 * @param {string} params.archivePath - Path of the archive to upload; its file name without extension is used as the Git tag
 * @param {*}      params.releaseDate - Passed to `readme.title` to build the release name
 * @param {*}      params.stats       - Passed to `readme.body` to build the release description
 * @returns {Promise<string>} The `html_url` of the created release
 */
export default async function publish({ archivePath, releaseDate, stats }) {
  const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN });

  // WHATWG URL parsing replaces the legacy `url.parse`; it throws on a malformed URL
  // instead of silently producing unusable owner/repo values.
  const { pathname } = new url.URL(config.get('dataset.versionsRepositoryURL'));
  const [ owner, repo ] = pathname.split('/').filter(component => component);

  const tagName = path.basename(archivePath, path.extname(archivePath)); // use archive filename as Git tag

  // Read the archive before creating the release, so that an unreadable archive
  // does not leave an empty release behind.
  const archive = fsApi.readFileSync(archivePath);

  const { data: { upload_url: uploadUrl, html_url: releaseUrl } } = await octokit.rest.repos.createRelease({
    owner,
    repo,
    tag_name: tagName,
    name: readme.title({ releaseDate }),
    body: readme.body(stats),
  });

  await octokit.rest.repos.uploadReleaseAsset({
    data: archive,
    headers: {
      'content-type': 'application/zip',
      'content-length': archive.length, // byte length of the buffer actually sent
    },
    name: path.basename(archivePath),
    url: uploadUrl,
  });

  return releaseUrl;
}
@@ -0,0 +1,36 @@
1
+ import path from 'path';
2
+ import { fileURLToPath } from 'url';
3
+
4
+ import config from 'config';
5
+ import { ESLint } from 'eslint';
6
+
7
+ import DeclarationUtils from '../utils/index.js';
8
+
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+
11
+ const declarationsPath = config.get('services.declarationsPath');
12
+ const instancePath = path.resolve(declarationsPath, '../');
13
+ const ESLINT_CONFIG_PATH = path.join(__dirname, '../../../.eslintrc.yaml');
14
+
15
/**
 * Lints declaration files and writes the automatic fixes back to disk.
 *
 * @param {object}   params
 * @param {string[]} [params.services] - IDs of the services to lint; every declaration file is linted when omitted
 * @param {boolean}  [params.modified] - When truthy, `services` is ignored and the list returned by
 *                                       `DeclarationUtils#getModifiedServices` is linted instead
 * @returns {Promise<void>}
 */
const lintDeclarations = async ({ services, modified }) => {
  console.log(`Linting declaration files in ${instancePath}`);
  let servicesToValidate = services || ['*'];

  if (modified) {
    const declarationUtils = new DeclarationUtils(instancePath);

    servicesToValidate = await declarationUtils.getModifiedServices();
  }

  // The linter configuration does not depend on the service: build it once instead of once per iteration.
  const eslint = new ESLint({ overrideConfigFile: ESLINT_CONFIG_PATH, fix: true });

  for (const service of servicesToValidate) {
    /* eslint-disable no-await-in-loop */
    // `${service}.*` matches the declaration and its sibling files (history, filters…)
    const lintResults = await eslint.lintFiles(path.join(declarationsPath, `${service}.*`));

    await ESLint.outputFixes(lintResults);
    console.log(lintResults.map(lintResult => `${path.basename(lintResult.filePath)} linted`).join('\n'));
    /* eslint-enable no-await-in-loop */
  }
};
35
+
36
+ export default lintDeclarations;
@@ -0,0 +1,81 @@
1
+ import path from 'path';
2
+
3
+ import DeepDiff from 'deep-diff';
4
+ import simpleGit from 'simple-git';
5
+
6
/**
 * Git-based helpers to find out which service declarations differ between
 * a default branch and `HEAD` in a declarations repository.
 */
export default class DeclarationUtils {
  /**
   * @param {string} instancePath - Path handed to `simple-git` as the repository working directory
   * @param {string} [defaultBranch='remotes/origin/main'] - Git ref that `HEAD` is compared against
   */
  constructor(instancePath, defaultBranch = 'remotes/origin/main') {
    this.git = simpleGit(instancePath, { maxConcurrentProcesses: 1 });
    this.defaultBranch = defaultBranch;
  }

  /**
   * Derives a service ID from a declaration file path by dropping the directory,
   * the `.history` / `.filters` markers and the file extension.
   * The regular expression is global so that files carrying both markers
   * (`<id>.filters.history.js`) resolve to `<id>` too.
   *
   * @param {string} filePath
   * @returns {string}
   */
  static filePathToServiceId = filePath => path.parse(filePath.replace(/\.history|\.filters/g, '')).name;

  /**
   * Reads and parses a JSON file as it exists at the given Git ref.
   *
   * @param {string} filePath - Path of the file, relative to the repository root
   * @param {string} ref      - Git ref to read the file from
   * @returns {Promise<object>} The parsed content, or an empty object when the file cannot be read or parsed
   */
  async getJSONFile(filePath, ref) {
    try {
      return JSON.parse(await this.git.show([`${ref}:${filePath}`]));
    } catch (e) {
      // Best effort: any failure of `git show` or of the parsing (for example a file
      // that does not exist at this ref) is treated as an empty declaration.
      return {};
    }
  }

  /**
   * Lists the files under `./declarations` that differ between the default branch and `HEAD`
   * (deleted files are excluded by `--diff-filter=d`), along with the matching service IDs.
   *
   * @returns {Promise<{ modifiedFilePaths: string[], modifiedServiceIds: string[] }>}
   */
  async getModifiedData() {
    const modifiedFilePathsAsString = (await this.git.diff([ '--diff-filter=d', '--name-only', this.defaultBranch, 'HEAD', '--', './declarations' ])).trim();

    const modifiedFilePaths = modifiedFilePathsAsString ? modifiedFilePathsAsString.split('\n') : [];

    return { modifiedFilePaths, modifiedServiceIds: Array.from(new Set(modifiedFilePaths.map(DeclarationUtils.filePathToServiceId))) };
  }

  /**
   * @returns {Promise<string[]>} De-duplicated IDs of the services that have at least one modified file
   */
  async getModifiedServices() {
    const { modifiedServiceIds } = await this.getModifiedData();

    return modifiedServiceIds;
  }

  /**
   * Finds, for each modified service, the document types affected by the modifications.
   *
   * @returns {Promise<{ services: string[], servicesDocumentTypes: Object<string, string[]> }>}
   */
  async getModifiedServiceDocumentTypes() {
    const { modifiedFilePaths, modifiedServiceIds } = await this.getModifiedData();
    const servicesDocumentTypes = {};

    // Merges document types into the entry of a service, without duplicates.
    const addDocumentTypes = (serviceId, documentTypes) => {
      servicesDocumentTypes[serviceId] = Array.from(new Set([ ...servicesDocumentTypes[serviceId] || [], ...documentTypes ]));
    };

    await Promise.all(modifiedFilePaths.map(async modifiedFilePath => {
      const serviceId = DeclarationUtils.filePathToServiceId(modifiedFilePath);

      if (!modifiedFilePath.endsWith('.json')) {
        // Here we should compare AST of both files to detect on which function
        // change has been made, and then find which document type depends on this
        // function.
        // As this is a complicated process, we will just send back all document types
        const declaration = await this.getJSONFile(`declarations/${serviceId}.json`, this.defaultBranch);

        // Record the types instead of returning them from the callback (where they were discarded),
        // and tolerate a declaration that does not exist on the default branch.
        addDocumentTypes(serviceId, Object.keys(declaration.documents || {}));

        return;
      }

      const defaultFile = await this.getJSONFile(modifiedFilePath, this.defaultBranch);
      const modifiedFile = await this.getJSONFile(modifiedFilePath, 'HEAD');

      const diff = DeepDiff.diff(defaultFile, modifiedFile);

      if (!diff) {
        // This can happen if only a lint has been applied to a document
        return;
      }

      const isHistoryFile = modifiedFilePath.includes('.history');
      const modifiedDocumentTypes = new Set();

      diff.forEach(({ path: diffPath = [] }) => {
        if (isHistoryFile) {
          // History files are keyed by document type at their root
          modifiedDocumentTypes.add(diffPath[0]);
        } else if (diffPath[0] === 'documents') {
          if (diffPath.length > 1) {
            modifiedDocumentTypes.add(diffPath[1]);
          } else {
            // The whole `documents` object changed at once (for example a newly added declaration):
            // there is no per-type path, so every declared type is affected.
            Object.keys(modifiedFile.documents || {}).forEach(documentType => modifiedDocumentTypes.add(documentType));
          }
        }
      });

      addDocumentTypes(serviceId, modifiedDocumentTypes);
    }));

    return {
      services: modifiedServiceIds,
      servicesDocumentTypes,
    };
  }
}
@@ -0,0 +1,63 @@
1
// JSON Schema fragments shared by the service declaration schemas.
// Entries are referenced through `#/definitions/<name>` pointers.
const definitions = {
  location: {
    type: 'string',
    format: 'uri',
    description: 'The URL where the document can be found',
  },
  pdfLocation: {
    type: 'string',
    // The dot before the extension is escaped so that only URLs whose path ends
    // with a literal `.pdf` (any letter case, optional query string) are accepted.
    pattern: '^https?://.+\\.[pP][dD][fF](\\?.+)?$',
    description: 'The URL where the document can be found',
  },
  executeClientScripts: {
    type: 'boolean',
    description: 'Execute client-side JavaScript loaded by the document before accessing the content, in case the DOM modifications are needed to access the content.',
  },
  // Either a single CSS selector, a single range, or an array mixing both
  selectors: {
    description: 'Selector(s) that targets element to include',
    oneOf: [
      { $ref: '#/definitions/cssSelector' },
      { $ref: '#/definitions/range' },
      {
        type: 'array',
        items: { oneOf: [{ $ref: '#/definitions/cssSelector' }, { $ref: '#/definitions/range' }] },
      },
    ],
  },
  contentSelectors: { $ref: '#/definitions/selectors' },
  noiseSelectors: { $ref: '#/definitions/selectors' },
  filters: {
    type: 'array',
    items: {
      type: 'string',
      pattern: '^.+$', // rejects empty names
      description: 'Filter function name',
    },
  },
  validUntil: {
    type: 'string',
    format: 'date-time',
  },
  cssSelector: {
    type: 'string',
    pattern: '^.+$', // rejects empty selectors
    description: 'A CSS selector',
  },
  // A range needs exactly one start/end pairing among the four combinations below
  range: {
    type: 'object',
    properties: {
      startBefore: { $ref: '#/definitions/cssSelector' },
      startAfter: { $ref: '#/definitions/cssSelector' },
      endBefore: { $ref: '#/definitions/cssSelector' },
      endAfter: { $ref: '#/definitions/cssSelector' },
    },
    oneOf: [
      { required: [ 'startBefore', 'endBefore' ] },
      { required: [ 'startBefore', 'endAfter' ] },
      { required: [ 'startAfter', 'endBefore' ] },
      { required: [ 'startAfter', 'endAfter' ] },
    ],
  },
};

export default definitions;
@@ -0,0 +1,262 @@
1
+ import fsApi from 'fs';
2
+ import path from 'path';
3
+ import { fileURLToPath } from 'url';
4
+
5
+ import Ajv from 'ajv';
6
+ import { expect } from 'chai';
7
+ import config from 'config';
8
+ import { ESLint } from 'eslint';
9
+ import jsonSourceMap from 'json-source-map';
10
+
11
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from '../../../src/archivist/fetcher/index.js';
12
+ import filter from '../../../src/archivist/filter/index.js';
13
+ import * as services from '../../../src/archivist/services/index.js';
14
+ import DeclarationUtils from '../utils/index.js';
15
+
16
+ import serviceHistorySchema from './service.history.schema.js';
17
+ import serviceSchema from './service.schema.js';
18
+
19
+ const fs = fsApi.promises;
20
+
21
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
22
+ const ROOT_PATH = path.resolve(__dirname, '../../../');
23
+ const ESLINT_CONFIG_PATH = path.join(ROOT_PATH, '.eslintrc.yaml');
24
+
25
+ const MIN_DOC_LENGTH = 100;
26
+ const SLOW_DOCUMENT_THRESHOLD = 10 * 1000; // number of milliseconds after which a document fetch is considered slow
27
+
28
+ const eslint = new ESLint({ overrideConfigFile: ESLINT_CONFIG_PATH, fix: false });
29
+ const eslintWithFix = new ESLint({ overrideConfigFile: ESLINT_CONFIG_PATH, fix: true });
30
+
31
+ const declarationsPath = path.resolve(ROOT_PATH, config.get('services.declarationsPath'));
32
+ const instancePath = path.resolve(declarationsPath, '../');
33
+
34
/**
 * Declares the Mocha suite that validates service declarations, then starts the run.
 *
 * For each service, the suite checks the declaration files (schema and lint) and,
 * unless `schemaOnly` is set, that each page of each document can be fetched,
 * filtered, is long enough and is stable across two consecutive fetches.
 *
 * @param {object}   options
 * @param {boolean}  [options.schemaOnly]    - Only check declaration files, without fetching documents
 * @param {string[]} [options.services]      - IDs of the services to validate; all loaded services when empty
 * @param {string[]} [options.documentTypes] - Document types to validate; all of them when empty
 * @param {boolean}  [options.modified]      - Validate only what `DeclarationUtils#getModifiedServiceDocumentTypes` reports
 * @returns {Promise<void>}
 */
export default async options => {
  const schemaOnly = options.schemaOnly || false;
  let servicesToValidate = options.services || [];
  const documentTypes = options.documentTypes || [];
  let servicesDocumentTypes = {};

  const serviceDeclarations = await services.loadWithHistory(servicesToValidate);

  if (!servicesToValidate.length) {
    servicesToValidate = Object.keys(serviceDeclarations);
  }

  if (options.modified) {
    const declarationUtils = new DeclarationUtils(instancePath);

    ({ services: servicesToValidate, servicesDocumentTypes } = await declarationUtils.getModifiedServiceDocumentTypes());
  }

  // Suite callbacks are synchronous: asynchronous work belongs to hooks and tests.
  describe('Service declarations validation', function () {
    this.timeout(30000);
    this.slow(SLOW_DOCUMENT_THRESHOLD);

    // Registered once for the whole suite rather than once per service,
    // so that the headless browser is launched and stopped a single time.
    before(launchHeadlessBrowser);

    after(stopHeadlessBrowser);

    servicesToValidate.forEach(serviceId => {
      const service = serviceDeclarations[serviceId];
      const filePath = path.join(declarationsPath, `${serviceId}.json`);
      const historyFilePath = path.join(declarationsPath, `${serviceId}.history.json`);

      context(serviceId, () => {
        before(async function () {
          // No loaded declaration for this ID: skip every test of this service
          if (!service) {
            console.log(' (Tests skipped as declaration has been archived)');
            this.skip();
          }
        });

        it('valid declaration schema', async () => {
          const declaration = JSON.parse(await fs.readFile(filePath));

          assertValid(serviceSchema, declaration);
        });

        it('valid declaration file format', async () => {
          await lintFile(filePath);
        });

        if (service && service.hasHistory()) {
          it('valid history declaration schema', async () => {
            const declarationHistory = JSON.parse(await fs.readFile(historyFilePath));

            assertValid(serviceHistorySchema, declarationHistory);
          });

          it('valid history declaration file format', async () => {
            await lintFile(historyFilePath);
          });
        }

        const filtersFilePath = path.join(declarationsPath, `${serviceId}.filters.js`);

        if (fsApi.existsSync(filtersFilePath)) {
          it('valid filters file format', async () => {
            await lintFile(filtersFilePath);
          });
        }

        const filtersHistoryFilePath = path.join(declarationsPath, `${serviceId}.filters.history.js`);

        if (fsApi.existsSync(filtersHistoryFilePath)) {
          it('valid filters history file format', async () => {
            await lintFile(filtersHistoryFilePath);
          });
        }

        if (!schemaOnly && service) {
          service.getDocumentTypes()
            .filter(documentType => {
              // Types reported as modified for this service take precedence over the `documentTypes` option
              if (servicesDocumentTypes[serviceId] && servicesDocumentTypes[serviceId].length > 0) {
                return servicesDocumentTypes[serviceId].includes(documentType);
              }

              if (documentTypes.length > 0) {
                return documentTypes.includes(documentType);
              }

              return true;
            })
            .forEach(type => {
              describe(type, () => {
                const documentDeclaration = service.getDocumentDeclaration(type);

                if (!documentDeclaration) {
                  // Guard placed before `.pages` is read; the previous check sat in a hook
                  // declared after that access and could never be reached.
                  // A test without callback is reported as pending by Mocha.
                  it('(Tests skipped as declaration has been archived)');

                  return;
                }

                documentDeclaration.pages.forEach(page => {
                  // Shared by the tests below, which run in declaration order
                  let content;
                  let filteredContent;
                  let mimeType;

                  context(page.location, () => {
                    it('fetchable URL', async () => {
                      const { location, executeClientScripts } = page;
                      const document = await fetch({
                        url: location,
                        executeClientScripts,
                        cssSelectors: page.cssSelectors,
                        config: config.get('fetcher'),
                      });

                      content = document.content;
                      mimeType = document.mimeType;
                    });

                    it('selector matches an element in the web page', async function checkSelector() {
                      if (!content) {
                        console.log(' [Tests skipped as URL is not fetchable]');
                        this.skip();
                      }

                      filteredContent = await filter({ content, pageDeclaration: page, mimeType });

                      expect(filteredContent).to.not.be.empty;
                    });

                    it(`filtered content has at least ${MIN_DOC_LENGTH} characters`, async function checkContentLength() {
                      if (!content) {
                        console.log(' [Tests skipped as URL is not fetchable]');
                        this.skip();
                      }

                      if (!filteredContent) {
                        console.log(' [Tests skipped as content cannot be filtered]');
                        this.skip();
                      }

                      expect(filteredContent.length).to.be.greaterThan(MIN_DOC_LENGTH);
                    });

                    it('content is consistent when fetched and filtered twice in a row', async function checkContentConsistency() {
                      this.slow(SLOW_DOCUMENT_THRESHOLD * 2); // this test fetches the page a second time

                      if (!content) {
                        console.log(' [Tests skipped as URL is not fetchable]');
                        this.skip();
                      }

                      if (!filteredContent) {
                        console.log(' [Tests skipped as content cannot be filtered]');
                        this.skip();
                      }

                      const document = await fetch({
                        url: page.location,
                        executeClientScripts: page.executeClientScripts,
                        cssSelectors: page.cssSelectors,
                        config: config.get('fetcher'),
                      });
                      const secondFilteredContent = await filter({ content: document.content, pageDeclaration: page, mimeType: document.mimeType });

                      expect(secondFilteredContent).to.equal(filteredContent);
                    });
                  });
                });
              });
            });
        }
      });
    });
  });

  // The suite is declared after asynchronous setup, so the run is started explicitly
  // (NOTE(review): presumably Mocha is launched in delayed mode, which provides `run` — confirm).
  run();
};
213
+
214
// Shared validator; `allErrors` reports every violation instead of stopping at the first one.
const validator = new Ajv({
  allErrors: true,
  jsonPointers: true,
});

/**
 * Validates `subject` against `schema` and throws an error describing every violation,
 * quoting the offending lines of the re-serialised JSON when they can be located.
 *
 * @param {object} schema  - JSON Schema to validate against
 * @param {*}      subject - Parsed JSON value to validate
 * @throws {Error} When `subject` does not conform to `schema`
 */
function assertValid(schema, subject) {
  const valid = validator.validate(schema, subject);

  if (valid) {
    return;
  }

  const errorPointers = new Set();
  let errorMessage = '';
  // Re-serialise the subject together with a map from JSON pointers to positions in the output
  const sourceMap = jsonSourceMap.stringify(subject, null, 2);
  const jsonLines = sourceMap.json.split('\n');

  validator.errors.forEach(error => {
    errorMessage += `\n\n${validator.errorsText([error])}`;
    const errorPointer = sourceMap.pointers[error.dataPath];

    if (errorPointer) {
      // `slice` excludes its end index: add one so that the line holding the end of the value
      // is quoted too (otherwise a single-line value yields no line at all).
      errorMessage += `\n> ${jsonLines
        .slice(errorPointer.value.line, errorPointer.valueEnd.line + 1)
        .join('\n> ')}`;
      errorPointers.add(errorPointer);
    } else {
      errorMessage += ' (in entire file)\n';
    }
  });

  errorMessage += `\n\n${errorPointers.size} features have errors in total`;

  throw new Error(errorMessage);
}
+
249
/**
 * Asserts that a file is properly formatted according to the ESLint configuration.
 *
 * @param {string} filePath - Path of the file to lint
 * @returns {Promise<void>} Resolves when the file has no lint error; the assertion throws otherwise
 */
async function lintFile(filePath) {
  const [report] = await eslint.lintFiles(filePath);

  if (report.errorCount) {
    // Two linters are required: with `fix` enabled, fixable errors are not counted in `errorCount`,
    // so the non-fixing linter detects the errors and the fixing one provides the expected `output`.
    const [fixedReport] = await eslintWithFix.lintFiles(filePath);

    expect(report.source).to.equal(fixedReport.output, `${path.basename(filePath)} is not properly formatted. Use the lint script to format it correctly.\n`);
  }
}
@@ -0,0 +1,86 @@
1
+ import { DOCUMENT_TYPES } from '../../../src/archivist/services/index.js';
2
+
3
+ import definitions from './definitions.js';
4
+
5
const AVAILABLE_TYPES_NAME = Object.keys(DOCUMENT_TYPES);

// One property per known document type: an array of history entries,
// each entry being exactly one of the three supported shapes.
const documentsProperties = () => Object.fromEntries(AVAILABLE_TYPES_NAME.map(type => [
  type,
  {
    type: 'array',
    items: {
      oneOf: [
        { $ref: '#/definitions/singlePageDocumentHistory' },
        { $ref: '#/definitions/multiPageDocumentHistory' },
        { $ref: '#/definitions/pdfDocumentHistory' },
      ],
    },
  },
]));

// Properties accepted both on a page and on a multi-page document
const selectionProperties = () => ({
  select: { $ref: '#/definitions/contentSelectors' },
  filter: { $ref: '#/definitions/filters' },
  remove: { $ref: '#/definitions/noiseSelectors' },
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
});

const pdfDocumentHistory = {
  type: 'object',
  additionalProperties: false,
  required: [ 'fetch', 'validUntil' ],
  properties: {
    fetch: { $ref: '#/definitions/pdfLocation' },
    validUntil: { $ref: '#/definitions/validUntil' },
  },
};

const singlePageDocumentHistory = {
  type: 'object',
  additionalProperties: false,
  required: [ 'fetch', 'select', 'validUntil' ],
  properties: {
    fetch: { $ref: '#/definitions/location' },
    ...selectionProperties(),
    validUntil: { $ref: '#/definitions/validUntil' },
  },
};

// NOTE(review): unlike the two other history shapes, `validUntil` is not required here — confirm this is intended.
const multiPageDocumentHistory = {
  type: 'object',
  additionalProperties: false,
  required: ['combine'],
  properties: {
    combine: {
      type: 'array',
      items: {
        type: 'object',
        additionalProperties: false,
        required: ['fetch'],
        properties: {
          fetch: { $ref: '#/definitions/location' },
          ...selectionProperties(),
        },
      },
    },
    ...selectionProperties(),
    validUntil: { $ref: '#/definitions/validUntil' },
  },
};

// JSON Schema of a service declaration history file: an object keyed by document type
const schema = {
  type: 'object',
  additionalProperties: false,
  title: 'Service declaration history',
  properties: documentsProperties(),
  propertyNames: { enum: AVAILABLE_TYPES_NAME },
  definitions: {
    ...definitions,
    pdfDocumentHistory,
    singlePageDocumentHistory,
    multiPageDocumentHistory,
  },
};

export default schema;
@@ -0,0 +1,91 @@
1
+ import { DOCUMENT_TYPES } from '../../../src/archivist/services/index.js';
2
+
3
+ import definitions from './definitions.js';
4
+
5
const AVAILABLE_TYPES_NAME = Object.keys(DOCUMENT_TYPES);

// One property per known document type, each accepting exactly one of the three document shapes
const documentsProperties = () => Object.fromEntries(AVAILABLE_TYPES_NAME.map(type => [
  type,
  {
    oneOf: [
      { $ref: '#/definitions/singlePageDocument' },
      { $ref: '#/definitions/multiPageDocument' },
      { $ref: '#/definitions/pdfDocument' },
    ],
  },
]));

// Properties accepted both on a page and on a multi-page document
const selectionProperties = () => ({
  select: { $ref: '#/definitions/contentSelectors' },
  filter: { $ref: '#/definitions/filters' },
  remove: { $ref: '#/definitions/noiseSelectors' },
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
});

const pdfDocument = {
  type: 'object',
  additionalProperties: false,
  required: ['fetch'],
  properties: { fetch: { $ref: '#/definitions/pdfLocation' } },
};

const page = {
  type: 'object',
  additionalProperties: false,
  required: ['fetch'],
  properties: {
    fetch: { $ref: '#/definitions/location' },
    ...selectionProperties(),
  },
};

// A single-page document is a page that additionally requires `select`
const singlePageDocument = {
  allOf: [
    { $ref: '#/definitions/page' },
    { required: [ 'fetch', 'select' ] },
  ],
};

const multiPageDocument = {
  type: 'object',
  additionalProperties: false,
  required: ['combine'],
  properties: {
    combine: {
      type: 'array',
      items: { $ref: '#/definitions/page' },
    },
    ...selectionProperties(),
  },
};

// JSON Schema of a service declaration file
const schema = {
  type: 'object',
  additionalProperties: false,
  title: 'Service declaration',
  required: [ 'name', 'documents' ],
  properties: {
    name: {
      type: 'string',
      title: 'Service public name',
      examples: ['Facebook'],
    },
    documents: {
      type: 'object',
      properties: documentsProperties(),
      propertyNames: { enum: AVAILABLE_TYPES_NAME },
    },
    importedFrom: {
      type: 'string',
      title: 'Imported from',
      examples: [
        'https://github.com/tosdr/tosback2/blob/5acac7abb5e967cfafd124a5e275f98f6ecd423e/rules/4shared.com.xml',
      ],
    },
  },
  definitions: {
    ...definitions,
    pdfDocument,
    page,
    singlePageDocument,
    multiPageDocument,
  },
};

export default schema;