@opentermsarchive/engine 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. package/.env.example +3 -0
  2. package/.eslintrc.yaml +116 -0
  3. package/.github/workflows/deploy.yml +50 -0
  4. package/.github/workflows/release.yml +64 -0
  5. package/.github/workflows/test.yml +77 -0
  6. package/CHANGELOG.md +14 -0
  7. package/CODE_OF_CONDUCT.md +128 -0
  8. package/CONTRIBUTING.md +143 -0
  9. package/LICENSE +153 -0
  10. package/MIGRATING.md +42 -0
  11. package/README.fr.md +110 -0
  12. package/README.md +438 -0
  13. package/Vagrantfile +38 -0
  14. package/ansible.cfg +13 -0
  15. package/bin/.env.js +1 -0
  16. package/bin/lint-declarations.js +31 -0
  17. package/bin/track.js +26 -0
  18. package/bin/validate-declarations.js +68 -0
  19. package/config/ci.json +5 -0
  20. package/config/contrib.json +35 -0
  21. package/config/dating.json +37 -0
  22. package/config/default.json +71 -0
  23. package/config/france.json +40 -0
  24. package/config/p2b-compliance.json +40 -0
  25. package/config/pga.json +40 -0
  26. package/config/production.json +27 -0
  27. package/config/test.json +49 -0
  28. package/config/vagrant.json +24 -0
  29. package/decision-records/0001-service-name-and-id.md +73 -0
  30. package/decision-records/0002-service-history.md +212 -0
  31. package/decision-records/0003-snapshots-database.md +123 -0
  32. package/ops/README.md +280 -0
  33. package/ops/app.yml +5 -0
  34. package/ops/infra.yml +6 -0
  35. package/ops/inventories/dev.yml +7 -0
  36. package/ops/inventories/production.yml +27 -0
  37. package/ops/roles/infra/defaults/main.yml +2 -0
  38. package/ops/roles/infra/files/.gitconfig +3 -0
  39. package/ops/roles/infra/files/mongod.conf +18 -0
  40. package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
  41. package/ops/roles/infra/tasks/main.yml +78 -0
  42. package/ops/roles/infra/tasks/mongo.yml +40 -0
  43. package/ops/roles/infra/templates/ssh_config.j2 +5 -0
  44. package/ops/roles/ota/defaults/main.yml +14 -0
  45. package/ops/roles/ota/files/.env +21 -0
  46. package/ops/roles/ota/tasks/database.yml +65 -0
  47. package/ops/roles/ota/tasks/main.yml +110 -0
  48. package/ops/site.yml +6 -0
  49. package/package.json +101 -0
  50. package/pm2.config.cjs +20 -0
  51. package/scripts/dataset/README.md +37 -0
  52. package/scripts/dataset/assets/LICENSE +540 -0
  53. package/scripts/dataset/assets/README.template.js +65 -0
  54. package/scripts/dataset/export/index.js +106 -0
  55. package/scripts/dataset/export/index.test.js +155 -0
  56. package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
  57. package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
  58. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
  59. package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
  60. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
  61. package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
  62. package/scripts/dataset/index.js +40 -0
  63. package/scripts/dataset/logger/index.js +17 -0
  64. package/scripts/dataset/main.js +25 -0
  65. package/scripts/dataset/publish/index.js +39 -0
  66. package/scripts/declarations/lint/index.js +36 -0
  67. package/scripts/declarations/utils/index.js +81 -0
  68. package/scripts/declarations/validate/definitions.js +63 -0
  69. package/scripts/declarations/validate/index.mocha.js +262 -0
  70. package/scripts/declarations/validate/service.history.schema.js +86 -0
  71. package/scripts/declarations/validate/service.schema.js +91 -0
  72. package/scripts/history/logger/index.js +39 -0
  73. package/scripts/history/migrate-services.js +212 -0
  74. package/scripts/history/update-to-full-hash.js +61 -0
  75. package/scripts/history/utils/index.js +23 -0
  76. package/scripts/import/README.md +59 -0
  77. package/scripts/import/config/import.json +12 -0
  78. package/scripts/import/index.js +224 -0
  79. package/scripts/import/loadCommits.js +66 -0
  80. package/scripts/import/logger/index.js +43 -0
  81. package/scripts/rewrite/README.md +131 -0
  82. package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
  83. package/scripts/rewrite/config/rewrite-versions.json +32 -0
  84. package/scripts/rewrite/initializer/files/license +428 -0
  85. package/scripts/rewrite/initializer/files/readme.md +8 -0
  86. package/scripts/rewrite/initializer/index.js +44 -0
  87. package/scripts/rewrite/rewrite-snapshots.js +108 -0
  88. package/scripts/rewrite/rewrite-versions.js +160 -0
  89. package/scripts/rewrite/utils.js +33 -0
  90. package/scripts/utils/renamer/README.md +49 -0
  91. package/scripts/utils/renamer/index.js +45 -0
  92. package/scripts/utils/renamer/rules/documentTypes.json +25 -0
  93. package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
  94. package/scripts/utils/renamer/rules/serviceNames.json +92 -0
  95. package/src/archivist/errors.js +9 -0
  96. package/src/archivist/fetcher/errors.js +6 -0
  97. package/src/archivist/fetcher/exports.js +18 -0
  98. package/src/archivist/fetcher/fullDomFetcher.js +84 -0
  99. package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
  100. package/src/archivist/fetcher/index.js +35 -0
  101. package/src/archivist/fetcher/index.test.js +239 -0
  102. package/src/archivist/filter/exports.js +3 -0
  103. package/src/archivist/filter/index.js +178 -0
  104. package/src/archivist/filter/index.test.js +561 -0
  105. package/src/archivist/index.js +276 -0
  106. package/src/archivist/index.test.js +600 -0
  107. package/src/archivist/recorder/index.js +77 -0
  108. package/src/archivist/recorder/index.test.js +463 -0
  109. package/src/archivist/recorder/record.js +35 -0
  110. package/src/archivist/recorder/record.test.js +91 -0
  111. package/src/archivist/recorder/repositories/factory.js +23 -0
  112. package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
  113. package/src/archivist/recorder/repositories/git/git.js +122 -0
  114. package/src/archivist/recorder/repositories/git/git.test.js +86 -0
  115. package/src/archivist/recorder/repositories/git/index.js +182 -0
  116. package/src/archivist/recorder/repositories/git/index.test.js +714 -0
  117. package/src/archivist/recorder/repositories/interface.js +108 -0
  118. package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
  119. package/src/archivist/recorder/repositories/mongo/index.js +121 -0
  120. package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
  121. package/src/archivist/services/documentDeclaration.js +26 -0
  122. package/src/archivist/services/documentDeclaration.test.js +85 -0
  123. package/src/archivist/services/documentTypes.json +386 -0
  124. package/src/archivist/services/index.js +255 -0
  125. package/src/archivist/services/index.test.js +327 -0
  126. package/src/archivist/services/pageDeclaration.js +51 -0
  127. package/src/archivist/services/pageDeclaration.test.js +224 -0
  128. package/src/archivist/services/service.js +60 -0
  129. package/src/archivist/services/service.test.js +164 -0
  130. package/src/exports.js +3 -0
  131. package/src/index.js +59 -0
  132. package/src/logger/README.md +1 -0
  133. package/src/logger/index.js +131 -0
  134. package/src/main.js +18 -0
  135. package/src/notifier/README.md +1 -0
  136. package/src/notifier/index.js +150 -0
  137. package/src/tracker/README.md +1 -0
  138. package/src/tracker/index.js +215 -0
  139. package/test/fixtures/service_A.js +22 -0
  140. package/test/fixtures/service_A_terms.md +10 -0
  141. package/test/fixtures/service_A_terms_snapshot.html +14 -0
  142. package/test/fixtures/service_B.js +22 -0
  143. package/test/fixtures/service_with_declaration_history.js +65 -0
  144. package/test/fixtures/service_with_filters_history.js +155 -0
  145. package/test/fixtures/service_with_history.js +188 -0
  146. package/test/fixtures/service_with_multipage_document.js +100 -0
  147. package/test/fixtures/service_without_history.js +31 -0
  148. package/test/fixtures/services.js +19 -0
  149. package/test/fixtures/terms.pdf +0 -0
  150. package/test/fixtures/termsFromPDF.md +25 -0
  151. package/test/fixtures/termsModified.pdf +0 -0
  152. package/test/services/service_A.json +9 -0
  153. package/test/services/service_B.json +9 -0
  154. package/test/services/service_with_declaration_history.filters.js +7 -0
  155. package/test/services/service_with_declaration_history.history.json +17 -0
  156. package/test/services/service_with_declaration_history.json +13 -0
  157. package/test/services/service_with_filters_history.filters.history.js +29 -0
  158. package/test/services/service_with_filters_history.filters.js +7 -0
  159. package/test/services/service_with_filters_history.json +13 -0
  160. package/test/services/service_with_history.filters.history.js +29 -0
  161. package/test/services/service_with_history.filters.js +7 -0
  162. package/test/services/service_with_history.history.json +26 -0
  163. package/test/services/service_with_history.json +17 -0
  164. package/test/services/service_with_multipage_document.filters.js +7 -0
  165. package/test/services/service_with_multipage_document.history.json +37 -0
  166. package/test/services/service_with_multipage_document.json +28 -0
  167. package/test/services/service_without_history.filters.js +7 -0
  168. package/test/services/service_without_history.json +13 -0
@@ -0,0 +1,84 @@
1
+ import puppeteer from 'puppeteer';
2
+ import puppeteerExtra from 'puppeteer-extra';
3
+ import stealthPlugin from 'puppeteer-extra-plugin-stealth';
4
+
5
+ import { FetchDocumentError } from './errors.js';
6
+
7
+ puppeteerExtra.use(stealthPlugin());
8
+
9
+ let browser;
10
+
11
/**
 * Load a page in the shared headless browser and return its serialized DOM
 *
 * The browser must have been started beforehand with `launchHeadlessBrowser`.
 *
 * @param {string} url - URL of the page to load
 * @param {string|Array<string>} [cssSelectors] - CSS selector, or list of CSS selectors, to wait for once the page is loaded
 * @param {Object} config - Fetcher configuration
 * @param {number} config.navigationTimeout - Value passed to `page.setDefaultNavigationTimeout` (reported in seconds after division by 1000 in the timeout error message)
 * @param {string} config.language - Value sent in the `Accept-Language` request header
 * @param {number} config.waitForElementsTimeout - Timeout passed to `page.waitForSelector` for each selector
 * @returns {Promise<Object>} Promise resolved with `{ mimeType: 'text/html', content }` where `content` is the result of `page.content()`
 * @throws {Error} When the headless browser has not been launched
 * @throws {FetchDocumentError} When the response is empty, the HTTP status is neither 2xx nor 304, the navigation times out or any other error occurs while loading the page
 */
export default async function fetch(url, cssSelectors, config) {
  // `[].concat(undefined)` yields `[undefined]`: drop missing selectors so that no undefined selector is awaited
  const selectors = [].concat(cssSelectors).filter(Boolean);

  if (!browser) {
    throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
  }

  let page;

  try {
    page = await browser.newPage();

    await page.setDefaultNavigationTimeout(config.navigationTimeout);
    await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });

    const response = await page.goto(url, { waitUntil: 'networkidle0' });

    if (!response) {
      throw new FetchDocumentError(`Response is empty when trying to fetch '${url}'`);
    }

    const statusCode = response.status();

    if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
      throw new FetchDocumentError(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
    }

    const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));

    try {
      // We expect all elements to be present on the page…
      await Promise.all(waitForSelectorsPromises);
    } catch (error) {
      // …however, if they are not, this is not considered as an error since selectors may be out of date
      // and the whole content of the page should still be returned.
      if (error.name !== 'TimeoutError') {
        throw error;
      }
    }

    return {
      mimeType: 'text/html',
      content: await page.content(),
    };
  } catch (error) {
    // Errors raised above are already of the expected type: rethrow them untouched to keep their stack trace
    if (error instanceof FetchDocumentError) {
      throw error;
    }

    if (error instanceof puppeteer.errors.TimeoutError) {
      throw new FetchDocumentError(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
    }

    throw new FetchDocumentError(error.message);
  } finally {
    // Always release the tab, whatever the outcome
    if (page) {
      await page.close();
    }
  }
}
66
+
67
/**
 * Start the headless browser shared by every full DOM fetch
 *
 * The instance is kept in the module-level `browser` variable; when one is already stored, it is returned as is and nothing is launched.
 *
 * @returns {Promise<Object>} Promise resolved with the browser instance returned by `puppeteerExtra.launch`
 */
export async function launchHeadlessBrowser() {
  if (!browser) {
    browser = await puppeteerExtra.launch({ headless: true });
  }

  return browser;
}
76
+
77
/**
 * Close the shared headless browser and forget its instance
 *
 * Does nothing when no browser is currently stored in the module-level `browser` variable.
 *
 * @returns {Promise<undefined>} Promise resolved once the browser is closed
 */
export async function stopHeadlessBrowser() {
  if (browser) {
    await browser.close();
    browser = null;
  }
}
@@ -0,0 +1,62 @@
1
+ import convertBody from '@opentermsarchive/fetch-charset-detection'; // eslint-disable-line import/no-unresolved
2
+ import AbortController from 'abort-controller';
3
+ // https://github.com/node-fetch/fetch-charset-detection/issues/247
4
+ import HttpProxyAgent from 'http-proxy-agent';
5
+ import HttpsProxyAgent from 'https-proxy-agent';
6
+ import nodeFetch, { AbortError } from 'node-fetch';
7
+
8
+ import { FetchDocumentError } from './errors.js';
9
+
10
/**
 * Fetch a resource with a plain HTTP request, without executing client scripts
 *
 * @param {string} url - URL of the resource to fetch
 * @param {Object} configuration - Fetcher configuration
 * @param {number} configuration.navigationTimeout - Delay (in milliseconds) after which the request is aborted
 * @param {string} configuration.language - Value sent in the `Accept-Language` request header
 * @returns {Promise<Object>} Promise resolved with `{ mimeType, content }`: `content` is a string when the content type starts with `text/`, a Buffer otherwise
 * @throws {FetchDocumentError} When the response status is not OK, the content type is missing, the content is empty, the request times out or any other error occurs
 */
export default async function fetch(url, configuration) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), configuration.navigationTimeout);

  const nodeFetchOptions = {
    signal: controller.signal,
    credentials: 'include',
    headers: { 'Accept-Language': configuration.language },
  };

  // Route the request through the proxy matching the URL scheme, when one is defined in the environment
  if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
    nodeFetchOptions.agent = new HttpsProxyAgent(process.env.HTTPS_PROXY);
  } else if (url.startsWith('http:') && process.env.HTTP_PROXY) {
    nodeFetchOptions.agent = new HttpProxyAgent(process.env.HTTP_PROXY);
  }

  try {
    const response = await nodeFetch(url, nodeFetchOptions);

    if (!response.ok) {
      throw new FetchDocumentError(`Received HTTP code ${response.status} when trying to fetch '${url}'`);
    }

    const mimeType = response.headers.get('content-type');
    const responseBuffer = await response.arrayBuffer();

    // `headers.get` returns `null` for a missing header: fail with an explicit message rather than a TypeError one
    if (!mimeType) {
      throw new FetchDocumentError(`Received no content type when fetching '${url}'`);
    }

    const content = mimeType.startsWith('text/')
      ? convertBody(responseBuffer, response.headers)
      : Buffer.from(responseBuffer);

    // An empty Buffer is truthy, so its length has to be checked explicitly
    if (!content || content.length === 0) {
      throw new FetchDocumentError(`Received an empty content when fetching '${url}'`);
    }

    return {
      mimeType,
      content,
    };
  } catch (error) {
    // Errors raised above are already of the expected type: rethrow them untouched to keep their stack trace
    if (error instanceof FetchDocumentError) {
      throw error;
    }

    if (error instanceof AbortError) {
      throw new FetchDocumentError(`Timed out after ${configuration.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
    }

    throw new FetchDocumentError(error.message);
  } finally {
    clearTimeout(timeout);
  }
}
@@ -0,0 +1,35 @@
1
+ import config from 'config';
2
+
3
+ import fetchFullDom from './fullDomFetcher.js';
4
+ import fetchHtmlOnly from './htmlOnlyFetcher.js';
5
+
6
+ export { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
7
+ export { FetchDocumentError } from './errors.js';
8
+
9
/**
 * Fetch a resource from the network, returning a promise which is fulfilled once the response is available
 *
 * @param {Object} params - Fetcher parameters
 * @param {string} params.url - URL of the resource to fetch
 * @param {boolean} [params.executeClientScripts] - When truthy, the resource is fetched with the full DOM fetcher (headless browser); otherwise the HTML-only fetcher is used
 * @param {string|Array} [params.cssSelectors] - CSS selector or array of CSS selectors forwarded to the full DOM fetcher. Only used when `executeClientScripts` is enabled
 * @param {Object} [params.config] - Fetcher configuration; every missing option falls back to the matching `fetcher.*` configuration entry
 * @param {number} [params.config.navigationTimeout] - Maximum time (in milliseconds) to wait before considering the fetch failed
 * @param {string} [params.config.language] - Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
 * @param {number} [params.config.waitForElementsTimeout] - Maximum time (in milliseconds) to wait for selectors to exist on page. Only used when `executeClientScripts` is enabled
 * @returns {Promise<Object>} Promise which will be resolved with an object containing the `mimeType` and the `content` of the URL as string or Buffer
 */
export default async function fetch(params) {
  const { url, executeClientScripts, cssSelectors, config: overrides = {} } = params;
  const {
    navigationTimeout = config.get('fetcher.navigationTimeout'),
    language = config.get('fetcher.language'),
    waitForElementsTimeout = config.get('fetcher.waitForElementsTimeout'),
  } = overrides;

  if (!executeClientScripts) {
    return fetchHtmlOnly(url, { navigationTimeout, language });
  }

  return fetchFullDom(url, cssSelectors, { navigationTimeout, language, waitForElementsTimeout });
}
@@ -0,0 +1,239 @@
1
+ import fs from 'fs';
2
+ import http from 'http';
3
+ import path from 'path';
4
+ import { fileURLToPath } from 'url';
5
+
6
+ import chai from 'chai';
7
+ import chaiAsPromised from 'chai-as-promised';
8
+ import iconv from 'iconv-lite';
9
+
10
+ import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './index.js';
11
+
12
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
13
+
14
+ const { expect } = chai;
15
+ const SERVER_PORT = 8976;
16
+
17
+ chai.use(chaiAsPromised);
18
+
19
+ const termsHTML = '<!DOCTYPE html><html><head><meta charset="UTF-8"><title>First provider TOS</title></head><body><h1>Terms of service</h1><p>Dapibus quis diam sagittis</p></body></html>';
20
+ const termsWithOtherCharsetHTML = '<!DOCTYPE html><html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1251"><title>TOS на първия доставчик</title></head><body><h1>Условия за ползване</h1><p>Dapibus quis diam sagittis</p></body></html>';
21
+
22
// Integration tests of the fetcher, run against a temporary local HTTP server and a few public hosts with broken certificates.
// Selectors are passed as `cssSelectors`, which is the parameter name the fetcher actually reads.
describe('Fetcher', function () {
  this.timeout(10000);

  before(launchHeadlessBrowser);

  after(stopHeadlessBrowser);

  describe('#fetch', () => {
    let temporaryServer;
    let expectedPDFContent;

    before(done => {
      temporaryServer = http.createServer((request, response) => {
        if (request.url === '/') {
          response.writeHead(200, { 'Content-Type': 'text/html' }).write(termsHTML);
        }
        if (request.url === '/other-charset') {
          response.writeHead(200, { 'Content-Type': 'text/html' }).write(iconv.encode(termsWithOtherCharsetHTML, 'windows-1251'));
        }
        if (request.url === '/404') {
          response.writeHead(404, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>404</body></html>');
        }
        if (request.url === '/terms.pdf') {
          expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));

          response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
        }

        return response.end();
      }).listen(SERVER_PORT, done); // only let the tests start once the server is actually listening
    });

    after(() => {
      if (temporaryServer) {
        temporaryServer.close();
      }
    });

    describe('Available URL', () => {
      context('when html page is available', () => {
        let content;
        let mimeType;
        const url = `http://127.0.0.1:${SERVER_PORT}`;

        context('when expected selectors are present', () => {
          before(async () => {
            ({ content, mimeType } = await fetch({ url, cssSelectors: 'body' }));
          });

          it('returns the web page content of the given URL', async () => {
            expect(content).to.equal(termsHTML);
          });

          it('returns the MIME type of the given URL', async () => {
            expect(mimeType).to.equal('text/html');
          });

          context('with client script enabled', () => {
            before(async () => {
              ({ content, mimeType } = await fetch({ url, cssSelectors: 'body', executeClientScripts: true }));
            });

            it('returns the web page content of the given URL', async () => {
              expect(content).to.equal(termsHTML);
            });

            it('returns the MIME type of the given URL', async () => {
              expect(mimeType).to.equal('text/html');
            });
          });
        });

        context('when expected selectors are not present', () => {
          const NOT_PRESENT_SELECTOR = 'h2';

          before(async () => {
            ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR }));
          });

          it('returns the web page content of the given URL', async () => {
            expect(content).to.equal(termsHTML);
          });

          it('returns the MIME type of the given URL', async () => {
            expect(mimeType).to.equal('text/html');
          });

          context('with client script enabled', () => {
            before(async () => {
              ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
            });

            it('returns the web page content of the given URL', async () => {
              expect(content).to.equal(termsHTML);
            });

            it('returns the MIME type of the given URL', async () => {
              expect(mimeType).to.equal('text/html');
            });
          });
        });
      });

      context('when html page is in different charset', () => {
        let content;
        const url = `http://127.0.0.1:${SERVER_PORT}/other-charset`;

        context('when expected selectors are present', () => {
          before(async () => {
            ({ content } = await fetch({ url, cssSelectors: 'body' }));
          });

          it('returns the web page content of the given URL', async () => {
            expect(content).to.equal(termsWithOtherCharsetHTML);
          });
        });
      });

      context('when url targets a PDF file', () => {
        let content;
        let mimeType;
        const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;

        before(async () => {
          ({ content, mimeType } = await fetch({ url: pdfUrl }));
        });

        it('returns a buffer for PDF content', async () => {
          expect(content).to.be.an.instanceOf(Buffer);
        });

        it('returns a blob with the file type', async () => {
          expect(mimeType).to.equal('application/pdf');
        });

        it('returns a blob with the file content', async () => {
          expect(content.equals(expectedPDFContent)).to.be.true;
        });
      });
    });

    describe('Error handling', () => {
      const url404 = `http://127.0.0.1:${SERVER_PORT}/404`;

      context('when web page is not available', () => {
        it('throws a FetchDocumentError error', async () => {
          await expect(fetch({ url: url404 })).to.be.rejectedWith(FetchDocumentError, /404/);
        });

        context('with client script enabled', () => {
          it('throws a FetchDocumentError error', async () => {
            await expect(fetch({ url: url404, executeClientScripts: true, cssSelectors: 'body' })).to.be.rejectedWith(FetchDocumentError, /404/);
          });
        });
      });

      context('when server is not resolved', () => {
        const notAvailableUrl = 'https://not.available.example';

        it('throws a FetchDocumentError error', async () => {
          await expect(fetch({ url: notAvailableUrl })).to.be.rejectedWith(FetchDocumentError);
        });

        context('with client script enabled', () => {
          it('throws a FetchDocumentError error', async () => {
            await expect(fetch({ url: notAvailableUrl, executeClientScripts: true })).to.be.rejectedWith(FetchDocumentError);
          });
        });
      });

      describe('when there is a certificate error', () => {
        context('when website has a self signed certificate', () => {
          const selfSignedSslUrl = 'https://self-signed.badssl.com/';

          it('throws a FetchDocumentError error', async () => {
            await expect(fetch({ url: selfSignedSslUrl })).to.be.rejectedWith(FetchDocumentError);
          });

          context('with client script enabled', () => {
            it('throws a FetchDocumentError error', async () => {
              await expect(fetch({ url: selfSignedSslUrl, executeClientScripts: true })).to.be.rejectedWith(FetchDocumentError);
            });
          });
        });

        context('when website has an expired certificate', () => {
          const expiredSslUrl = 'https://expired.badssl.com/';

          it('throws a FetchDocumentError error', async () => {
            await expect(fetch({ url: expiredSslUrl })).to.be.rejectedWith(FetchDocumentError);
          });

          context('with client script enabled', () => {
            it('throws a FetchDocumentError error', async () => {
              await expect(fetch({ url: expiredSslUrl, executeClientScripts: true })).to.be.rejectedWith(FetchDocumentError);
            });
          });
        });

        context('when website has an untrusted root certificate', () => {
          const untrustedRootSslUrl = 'https://untrusted-root.badssl.com/';

          it('throws a FetchDocumentError error', async () => {
            await expect(fetch({ url: untrustedRootSslUrl })).to.be.rejectedWith(FetchDocumentError);
          });

          context('with client script enabled', () => {
            it('throws a FetchDocumentError error', async () => {
              await expect(fetch({ url: untrustedRootSslUrl, executeClientScripts: true })).to.be.rejectedWith(FetchDocumentError);
            });
          });
        });
      });
    });
  });
});
@@ -0,0 +1,3 @@
1
// Public entry point of the filter module: exposes the default export of `./index.js`
export { default } from './index.js';
@@ -0,0 +1,178 @@
1
+ import url from 'url';
2
+
3
+ import ciceroMark from '@accordproject/markdown-cicero';
4
+ import mardownPdf from '@accordproject/markdown-pdf';
5
+ import TurndownService from '@opentermsarchive/turndown';
6
+ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
7
+ import jsdom from 'jsdom';
8
+
9
+ import { InaccessibleContentError } from '../errors.js';
10
+
11
+ const { JSDOM } = jsdom;
12
+ const turndownService = new TurndownService();
13
+
14
+ turndownService.use(turndownPluginGithubFlavouredMarkdown.gfm);
15
+
16
+ export const LINKS_TO_CONVERT_SELECTOR = 'a[href]:not([href^="#"]):not([href=""])';
17
+
18
+ const { PdfTransformer } = mardownPdf;
19
+ const { CiceroMarkTransformer } = ciceroMark;
20
+
21
+ const ciceroMarkTransformer = new CiceroMarkTransformer();
22
+
23
/**
 * Filter document content and convert it to Markdown
 *
 * PDF content is handled by `filterPDF`; any other content is handled by `filterHTML`.
 *
 * @param {Object} params - Filter parameters
 * @param {string|Buffer} params.content - Content to filter: a buffer containing PDF data in case mimetype associated is PDF or a DOM dump of an HTML page given as a string
 * @param {string} params.mimeType - MIME type of the given content. Parameters (such as `; charset=…`) and letter case are ignored when looking for a PDF
 * @param {string} params.pageDeclaration - see {@link ./src/archivist/services/pageDeclaration.js}
 * @returns {Promise<string>} Promise which is fulfilled once the content is filtered and converted in Markdown. The promise will resolve into a string containing the filtered content in Markdown format
 */
export default async function filter({ content, mimeType, pageDeclaration }) {
  // MIME types are case-insensitive and may be followed by parameters: compare only the normalized "type/subtype" part
  const mimeEssence = typeof mimeType === 'string'
    ? mimeType.split(';')[0].trim().toLowerCase()
    : mimeType;

  if (mimeEssence === 'application/pdf') {
    return filterPDF({ content });
  }

  return filterHTML({
    content,
    pageDeclaration,
  });
}
42
+
43
/**
 * Filter an HTML page and convert the selected content to Markdown
 *
 * Steps, in order: run the declared filter functions one after the other, remove noise, select content,
 * make links absolute, drop `script` and `style` nodes, neutralize email-protection links, then convert to Markdown.
 *
 * @param {Object} params - Filter parameters
 * @param {string} params.content - HTML content handed to JSDOM
 * @param {Object} params.pageDeclaration - Declaration providing `location`, and optionally `contentSelectors`, `noiseSelectors` and `filters`
 * @returns {Promise<string>} Promise resolved with the Markdown content
 * @throws {InaccessibleContentError} When a filter function fails, when the content selectors match nothing or when the resulting Markdown is empty
 */
export async function filterHTML({ content, pageDeclaration }) {
  const { location } = pageDeclaration;
  const { contentSelectors = [], noiseSelectors = [] } = pageDeclaration;
  const { filters: serviceSpecificFilters = [] } = pageDeclaration;

  const virtualConsole = new jsdom.VirtualConsole();
  const { window } = new JSDOM(content, { url: location, virtualConsole });
  const webPageDOM = window.document;

  /* eslint-disable no-await-in-loop */
  // Filters may depend on the changes made by the previous ones, so they are run in series
  for (const filterFunction of serviceSpecificFilters) {
    try {
      const context = {
        fetch: location,
        select: contentSelectors,
        remove: noiseSelectors,
        filter: serviceSpecificFilters.map(({ name }) => name),
      };

      await filterFunction(webPageDOM, context);
    } catch (error) {
      throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
    }
  }
  /* eslint-enable no-await-in-loop */

  remove(webPageDOM, noiseSelectors); // the DOM is modified in place

  const domFragment = select(webPageDOM, contentSelectors);
  const hasSelectedContent = Boolean(domFragment.children.length);

  if (!hasSelectedContent) {
    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
  }

  convertRelativeURLsToAbsolute(domFragment, location);

  for (const node of domFragment.querySelectorAll('script, style')) {
    node.remove();
  }

  // clean code from common changing patterns - initially for Windstream
  for (const node of domFragment.querySelectorAll('a[href*="/email-protection"]')) {
    const isObfuscatedEmailLink = node.href.match(/((.*?)\/email-protection#)[0-9a-fA-F]+/gim);

    if (isObfuscatedEmailLink) {
      const [hrefWithoutHash] = node.href.split('#');

      node.href = `${hrefWithoutHash}#removed`;
    }
  }

  const markdownContent = transform(domFragment);

  if (markdownContent) {
    return markdownContent;
  }

  throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
}
100
+
101
/**
 * Convert PDF content to Markdown
 *
 * @param {Object} params - Filter parameters
 * @param {Buffer} params.content - PDF data handed to `PdfTransformer.toCiceroMark`
 * @returns {Promise<string>} Promise resolved with the result of `ciceroMarkTransformer.toMarkdown`
 * @throws {InaccessibleContentError} When the caught error carries a truthy `parserError` property; any other error is rethrown untouched
 */
export async function filterPDF({ content: pdfBuffer }) {
  let markdown;

  try {
    const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);

    markdown = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
  } catch (error) {
    throw error.parserError ? new InaccessibleContentError("Can't parse PDF file") : error;
  }

  return markdown;
}
114
+
115
/**
 * Build a DOM range delimited by the nodes matching a range selector
 *
 * @param {Object} document - Document used to create the range and to look up the boundary nodes
 * @param {Object} rangeSelector - Range boundaries, as CSS selectors
 * @param {string} [rangeSelector.startBefore] - The range starts before the matching node; takes precedence over `startAfter`
 * @param {string} [rangeSelector.startAfter] - The range starts after the matching node
 * @param {string} [rangeSelector.endBefore] - The range ends before the matching node; takes precedence over `endAfter`
 * @param {string} [rangeSelector.endAfter] - The range ends after the matching node
 * @returns {Object} The range created with `document.createRange()`
 * @throws {InaccessibleContentError} When the start or the end selector matches no node
 */
function selectRange(document, rangeSelector) {
  const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;

  const range = document.createRange();
  const rangeStart = document.querySelector(startBefore || startAfter);
  const rangeEnd = document.querySelector(endBefore || endAfter);

  if (!rangeStart) {
    throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
  }

  if (!rangeEnd) {
    throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
  }

  if (startBefore) {
    range.setStartBefore(rangeStart);
  } else {
    range.setStartAfter(rangeStart);
  }

  if (endBefore) {
    range.setEndBefore(rangeEnd);
  } else {
    range.setEndAfter(rangeEnd);
  }

  return range;
}
135
+
136
/**
 * Rewrite, in place, the `href` of every link matching `LINKS_TO_CONVERT_SELECTOR` so that it is resolved against the given base URL
 *
 * @param {Object} document - Document or fragment whose links are rewritten
 * @param {string} baseURL - URL against which each `href` is resolved
 */
export function convertRelativeURLsToAbsolute(document, baseURL) {
  const links = Array.from(document.querySelectorAll(LINKS_TO_CONVERT_SELECTOR));

  for (const link of links) {
    link.href = url.resolve(baseURL, link.href);
  }
}
141
+
142
/**
 * Remove, in place, everything matched by the noise selectors
 *
 * @param {Object} webPageDOM - Document to clean
 * @param {string|Object|Array} noiseSelectors - CSS selector, range selector object, or array mixing both
 */
function remove(webPageDOM, noiseSelectors) {
  const nodesToRemove = [];
  const rangesToDelete = [];

  for (const selector of [].concat(noiseSelectors)) {
    if (typeof selector === 'object') {
      rangesToDelete.push(selectRange(webPageDOM, selector));
      continue;
    }

    for (const node of webPageDOM.querySelectorAll(selector)) {
      nodesToRemove.push(node);
    }
  }

  // Removing range selections still works even if the starting or ending node is deleted.
  // So, start by removing all nodes selected by a direct CSS selector, then delete all contents selections.
  for (const node of nodesToRemove) {
    node.remove();
  }

  for (const range of rangesToDelete) {
    range.deleteContents();
  }
}
159
+
160
/**
 * Gather copies of everything matched by the content selectors into a new document fragment
 *
 * The document itself is left untouched: matched elements and range contents are cloned.
 *
 * @param {Object} webPageDOM - Document to select content from
 * @param {string|Object|Array} contentSelectors - CSS selector, range selector object, or array mixing both
 * @returns {Object} Fragment created with `webPageDOM.createDocumentFragment()`, filled in selector order
 */
function select(webPageDOM, contentSelectors) {
  const fragment = webPageDOM.createDocumentFragment();

  for (const selector of [].concat(contentSelectors)) {
    if (typeof selector === 'object') {
      fragment.appendChild(selectRange(webPageDOM, selector).cloneContents());
      continue;
    }

    for (const element of webPageDOM.querySelectorAll(selector)) {
      fragment.appendChild(element.cloneNode(true));
    }
  }

  return fragment;
}
175
+
176
/**
 * Convert a DOM fragment with the module-level Turndown service
 *
 * @param {Object} domFragment - Fragment to convert
 * @returns {string} Result of `turndownService.turndown`
 */
function transform(domFragment) {
  const markdown = turndownService.turndown(domFragment);

  return markdown;
}