@opentermsarchive/engine 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.yaml +2 -0
- package/package.json +2 -1
- package/scripts/declarations/utils/fixtures/serviceATermsUpdated.history.json +9 -0
- package/scripts/declarations/utils/index.js +4 -0
- package/scripts/declarations/utils/index.test.js +12 -4
- package/scripts/declarations/validate/index.mocha.js +1 -1
- package/src/archivist/extract/errors.js +6 -0
- package/src/archivist/extract/index.js +32 -16
- package/src/archivist/extract/index.test.js +319 -302
- package/src/archivist/fetcher/errors.js +1 -1
- package/src/archivist/fetcher/fullDomFetcher.js +4 -6
- package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
- package/src/archivist/fetcher/index.js +9 -4
- package/src/archivist/fetcher/index.test.js +24 -13
- package/src/archivist/index.js +37 -13
- package/src/archivist/index.test.js +22 -22
- package/src/archivist/services/service.js +12 -6
- package/src/archivist/services/service.test.js +60 -39
- package/src/logger/index.js +3 -3
- package/src/reporter/index.js +4 -2
- package/src/reporter/labels.json +10 -0
package/.eslintrc.yaml
CHANGED
|
@@ -10,6 +10,7 @@ plugins:
|
|
|
10
10
|
- chai-friendly
|
|
11
11
|
- import
|
|
12
12
|
- json-format
|
|
13
|
+
- no-only-tests
|
|
13
14
|
rules:
|
|
14
15
|
arrow-parens:
|
|
15
16
|
- error
|
|
@@ -101,6 +102,7 @@ rules:
|
|
|
101
102
|
- error
|
|
102
103
|
- properties: false
|
|
103
104
|
require-await: 1
|
|
105
|
+
no-only-tests/no-only-tests: error
|
|
104
106
|
|
|
105
107
|
overrides:
|
|
106
108
|
- files:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@opentermsarchive/engine",
|
|
3
|
-
"version": "1.1.2",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Tracks and makes visible changes to the terms of online services",
|
|
5
5
|
"homepage": "https://opentermsarchive.org",
|
|
6
6
|
"bugs": {
|
|
@@ -103,6 +103,7 @@
|
|
|
103
103
|
"devDependencies": {
|
|
104
104
|
"@commitlint/cli": "^19.0.3",
|
|
105
105
|
"dir-compare": "^4.0.0",
|
|
106
|
+
"eslint-plugin-no-only-tests": "^3.1.0",
|
|
106
107
|
"keep-a-changelog": "^2.5.3",
|
|
107
108
|
"nock": "^13.2.1",
|
|
108
109
|
"node-stream-zip": "^1.15.0",
|
|
package/scripts/declarations/utils/index.js
CHANGED
|
@@ -43,6 +43,10 @@ export default class DeclarationUtils {
|
|
|
43
43
|
await Promise.all(modifiedFilePaths.map(async modifiedFilePath => {
|
|
44
44
|
const serviceId = DeclarationUtils.getServiceIdFromFilePath(modifiedFilePath);
|
|
45
45
|
|
|
46
|
+
if (modifiedFilePath.endsWith('.history.json')) {
|
|
47
|
+
return; // Assuming history modifications imply corresponding changes in the service declaration and that the analysis of which terms types of this service have changed will be done when analysing the related declaration, no further action is required here
|
|
48
|
+
}
|
|
49
|
+
|
|
46
50
|
if (modifiedFilePath.endsWith('.filters.js')) {
|
|
47
51
|
const declaration = await this.getJSONFromFile(this.defaultBranch, `declarations/${serviceId}.json`);
|
|
48
52
|
|
|
package/scripts/declarations/utils/index.test.js
CHANGED
|
@@ -15,6 +15,7 @@ const SUBJECT_PATH = path.resolve(__dirname, './test');
|
|
|
15
15
|
const FIXTURES = {
|
|
16
16
|
serviceA: { path: './fixtures/serviceA.json' },
|
|
17
17
|
serviceATermsUpdated: { path: './fixtures/serviceATermsUpdated.json' },
|
|
18
|
+
serviceATermsUpdatedHistory: { path: './fixtures/serviceATermsUpdated.history.json' },
|
|
18
19
|
serviceAMultipleTermsUpdated: { path: './fixtures/serviceAMultipleTermsUpdated.json' },
|
|
19
20
|
serviceATermsAdded: { path: './fixtures/serviceATermsAdded.json' },
|
|
20
21
|
serviceATermsRemoved: { path: './fixtures/serviceATermsRemoved.json' },
|
|
@@ -23,6 +24,7 @@ const FIXTURES = {
|
|
|
23
24
|
|
|
24
25
|
const COMMIT_PATHS = {
|
|
25
26
|
serviceA: './declarations/ServiceA.json',
|
|
27
|
+
serviceAHistory: './declarations/ServiceA.history.json',
|
|
26
28
|
serviceB: './declarations/ServiceB.json',
|
|
27
29
|
};
|
|
28
30
|
|
|
@@ -36,7 +38,7 @@ const removeLatestCommit = async () => {
|
|
|
36
38
|
await declarationUtils.git.reset('hard', ['HEAD~1']);
|
|
37
39
|
};
|
|
38
40
|
|
|
39
|
-
describe.only('DeclarationUtils', () => {
|
|
41
|
+
describe('DeclarationUtils', () => {
|
|
40
42
|
describe('#getModifiedServicesAndTermsTypes', () => {
|
|
41
43
|
before(async () => {
|
|
42
44
|
await loadFixtures();
|
|
@@ -46,8 +48,14 @@ describe.only('DeclarationUtils', () => {
|
|
|
46
48
|
after(() => fs.rm(SUBJECT_PATH, { recursive: true }));
|
|
47
49
|
|
|
48
50
|
context('when an existing declaration has been modified', () => {
|
|
49
|
-
before(() =>
|
|
50
|
-
|
|
51
|
+
before(async () => {
|
|
52
|
+
await commitChanges(COMMIT_PATHS.serviceA, FIXTURES.serviceATermsUpdated.content);
|
|
53
|
+
await commitChanges(COMMIT_PATHS.serviceAHistory, FIXTURES.serviceATermsUpdatedHistory.content);
|
|
54
|
+
});
|
|
55
|
+
after(async () => {
|
|
56
|
+
await removeLatestCommit();
|
|
57
|
+
await removeLatestCommit();
|
|
58
|
+
});
|
|
51
59
|
|
|
52
60
|
it('returns the service ID and the updated terms type', async () => {
|
|
53
61
|
expect(await declarationUtils.getModifiedServicesAndTermsTypes()).to.deep.equal({
|
|
@@ -82,7 +90,7 @@ describe.only('DeclarationUtils', () => {
|
|
|
82
90
|
});
|
|
83
91
|
|
|
84
92
|
context('when a declaration has been removed', () => {
|
|
85
|
-
before(
|
|
93
|
+
before(removeLatestCommit);
|
|
86
94
|
after(async () => {
|
|
87
95
|
await fs.mkdir(path.resolve(SUBJECT_PATH, './declarations'), { recursive: true });
|
|
88
96
|
await commitChanges(COMMIT_PATHS.serviceA, FIXTURES.serviceA.content);
|
|
package/src/archivist/extract/index.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import url from 'url';
|
|
2
|
-
|
|
3
1
|
import ciceroMark from '@accordproject/markdown-cicero';
|
|
4
2
|
import mardownPdf from '@accordproject/markdown-pdf';
|
|
5
3
|
import TurndownService from '@opentermsarchive/turndown';
|
|
@@ -7,7 +5,9 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
|
|
|
7
5
|
import jsdom from 'jsdom';
|
|
8
6
|
import mime from 'mime';
|
|
9
7
|
|
|
10
|
-
import {
|
|
8
|
+
import { ExtractDocumentError } from './errors.js';
|
|
9
|
+
|
|
10
|
+
export { ExtractDocumentError } from './errors.js';
|
|
11
11
|
|
|
12
12
|
const { JSDOM } = jsdom;
|
|
13
13
|
const turndownService = new TurndownService();
|
|
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
|
|
|
29
29
|
* @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
|
|
30
30
|
*/
|
|
31
31
|
export default async function extract(sourceDocument) {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
try {
|
|
33
|
+
if (sourceDocument.mimeType == mime.getType('pdf')) {
|
|
34
|
+
return await extractFromPDF(sourceDocument);
|
|
35
|
+
}
|
|
35
36
|
|
|
36
|
-
|
|
37
|
+
return await extractFromHTML(sourceDocument);
|
|
38
|
+
} catch (error) {
|
|
39
|
+
throw new ExtractDocumentError(error.message);
|
|
40
|
+
}
|
|
37
41
|
}
|
|
38
42
|
|
|
39
43
|
export async function extractFromHTML(sourceDocument) {
|
|
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
63
67
|
});
|
|
64
68
|
/* eslint-enable no-await-in-loop */
|
|
65
69
|
} catch (error) {
|
|
66
|
-
throw new
|
|
70
|
+
throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
|
|
67
71
|
}
|
|
68
72
|
}
|
|
69
73
|
|
|
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
72
76
|
const domFragment = select(webPageDOM, contentSelectors);
|
|
73
77
|
|
|
74
78
|
if (!domFragment.children.length) {
|
|
75
|
-
throw new
|
|
79
|
+
throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
|
|
76
80
|
}
|
|
77
81
|
|
|
78
82
|
convertRelativeURLsToAbsolute(domFragment, location);
|
|
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
92
96
|
const markdownContent = transform(domFragment);
|
|
93
97
|
|
|
94
98
|
if (!markdownContent) {
|
|
95
|
-
throw new
|
|
99
|
+
throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
|
|
96
100
|
}
|
|
97
101
|
|
|
98
102
|
return markdownContent;
|
|
99
103
|
}
|
|
100
104
|
|
|
101
|
-
export async function extractFromPDF({ content: pdfBuffer }) {
|
|
105
|
+
export async function extractFromPDF({ location, content: pdfBuffer }) {
|
|
106
|
+
let markdownContent;
|
|
107
|
+
|
|
102
108
|
try {
|
|
103
109
|
const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
|
|
104
110
|
|
|
105
|
-
|
|
111
|
+
markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
|
|
106
112
|
} catch (error) {
|
|
107
113
|
if (error.parserError) {
|
|
108
|
-
throw new
|
|
114
|
+
throw new Error("Can't parse PDF file");
|
|
109
115
|
}
|
|
110
116
|
|
|
111
117
|
throw error;
|
|
112
118
|
}
|
|
119
|
+
|
|
120
|
+
if (!markdownContent) {
|
|
121
|
+
throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
return markdownContent;
|
|
113
125
|
}
|
|
114
126
|
|
|
115
127
|
function selectRange(webPageDOM, rangeSelector) {
|
|
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
|
|
|
120
132
|
const endNode = webPageDOM.querySelector(endBefore || endAfter);
|
|
121
133
|
|
|
122
134
|
if (!startNode) {
|
|
123
|
-
throw new
|
|
135
|
+
throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
124
136
|
}
|
|
125
137
|
|
|
126
138
|
if (!endNode) {
|
|
127
|
-
throw new
|
|
139
|
+
throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
128
140
|
}
|
|
129
141
|
|
|
130
142
|
selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
|
|
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
|
|
|
135
147
|
|
|
136
148
|
export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
|
|
137
149
|
Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
|
|
138
|
-
|
|
150
|
+
try {
|
|
151
|
+
link.href = new URL(link.href, baseURL).href;
|
|
152
|
+
} catch (error) {
|
|
153
|
+
// Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
|
|
154
|
+
}
|
|
139
155
|
});
|
|
140
156
|
}
|
|
141
157
|
|