@opentermsarchive/engine 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/archivist/extract/errors.js +6 -0
- package/src/archivist/extract/index.js +32 -16
- package/src/archivist/extract/index.test.js +319 -302
- package/src/archivist/fetcher/errors.js +1 -1
- package/src/archivist/fetcher/fullDomFetcher.js +4 -6
- package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
- package/src/archivist/fetcher/index.js +9 -4
- package/src/archivist/fetcher/index.test.js +23 -12
- package/src/archivist/index.js +37 -13
- package/src/archivist/index.test.js +22 -22
- package/src/archivist/services/service.js +12 -6
- package/src/archivist/services/service.test.js +60 -39
- package/src/logger/index.js +3 -3
- package/src/reporter/index.js +4 -2
- package/src/reporter/labels.json +10 -0
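package/package.json
CHANGED
(diff content for this file is missing from this capture)
package/src/archivist/extract/index.js
CHANGED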
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import url from 'url';
|
|
2
|
-
|
|
3
1
|
import ciceroMark from '@accordproject/markdown-cicero';
|
|
4
2
|
import mardownPdf from '@accordproject/markdown-pdf';
|
|
5
3
|
import TurndownService from '@opentermsarchive/turndown';
|
|
@@ -7,7 +5,9 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
|
|
|
7
5
|
import jsdom from 'jsdom';
|
|
8
6
|
import mime from 'mime';
|
|
9
7
|
|
|
10
|
-
import {
|
|
8
|
+
import { ExtractDocumentError } from './errors.js';
|
|
9
|
+
|
|
10
|
+
export { ExtractDocumentError } from './errors.js';
|
|
11
11
|
|
|
12
12
|
const { JSDOM } = jsdom;
|
|
13
13
|
const turndownService = new TurndownService();
|
|
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
|
|
|
29
29
|
* @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
|
|
30
30
|
*/
|
|
31
31
|
export default async function extract(sourceDocument) {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
try {
|
|
33
|
+
if (sourceDocument.mimeType == mime.getType('pdf')) {
|
|
34
|
+
return await extractFromPDF(sourceDocument);
|
|
35
|
+
}
|
|
35
36
|
|
|
36
|
-
|
|
37
|
+
return await extractFromHTML(sourceDocument);
|
|
38
|
+
} catch (error) {
|
|
39
|
+
throw new ExtractDocumentError(error.message);
|
|
40
|
+
}
|
|
37
41
|
}
|
|
38
42
|
|
|
39
43
|
export async function extractFromHTML(sourceDocument) {
|
|
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
63
67
|
});
|
|
64
68
|
/* eslint-enable no-await-in-loop */
|
|
65
69
|
} catch (error) {
|
|
66
|
-
throw new
|
|
70
|
+
throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
|
|
67
71
|
}
|
|
68
72
|
}
|
|
69
73
|
|
|
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
72
76
|
const domFragment = select(webPageDOM, contentSelectors);
|
|
73
77
|
|
|
74
78
|
if (!domFragment.children.length) {
|
|
75
|
-
throw new
|
|
79
|
+
throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
|
|
76
80
|
}
|
|
77
81
|
|
|
78
82
|
convertRelativeURLsToAbsolute(domFragment, location);
|
|
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
|
|
|
92
96
|
const markdownContent = transform(domFragment);
|
|
93
97
|
|
|
94
98
|
if (!markdownContent) {
|
|
95
|
-
throw new
|
|
99
|
+
throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
|
|
96
100
|
}
|
|
97
101
|
|
|
98
102
|
return markdownContent;
|
|
99
103
|
}
|
|
100
104
|
|
|
101
|
-
export async function extractFromPDF({ content: pdfBuffer }) {
|
|
105
|
+
export async function extractFromPDF({ location, content: pdfBuffer }) {
|
|
106
|
+
let markdownContent;
|
|
107
|
+
|
|
102
108
|
try {
|
|
103
109
|
const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
|
|
104
110
|
|
|
105
|
-
|
|
111
|
+
markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
|
|
106
112
|
} catch (error) {
|
|
107
113
|
if (error.parserError) {
|
|
108
|
-
throw new
|
|
114
|
+
throw new Error("Can't parse PDF file");
|
|
109
115
|
}
|
|
110
116
|
|
|
111
117
|
throw error;
|
|
112
118
|
}
|
|
119
|
+
|
|
120
|
+
if (!markdownContent) {
|
|
121
|
+
throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
return markdownContent;
|
|
113
125
|
}
|
|
114
126
|
|
|
115
127
|
function selectRange(webPageDOM, rangeSelector) {
|
|
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
|
|
|
120
132
|
const endNode = webPageDOM.querySelector(endBefore || endAfter);
|
|
121
133
|
|
|
122
134
|
if (!startNode) {
|
|
123
|
-
throw new
|
|
135
|
+
throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
124
136
|
}
|
|
125
137
|
|
|
126
138
|
if (!endNode) {
|
|
127
|
-
throw new
|
|
139
|
+
throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
|
|
128
140
|
}
|
|
129
141
|
|
|
130
142
|
selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
|
|
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
|
|
|
135
147
|
|
|
136
148
|
export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
|
|
137
149
|
Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
|
|
138
|
-
|
|
150
|
+
try {
|
|
151
|
+
link.href = new URL(link.href, baseURL).href;
|
|
152
|
+
} catch (error) {
|
|
153
|
+
// Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
|
|
154
|
+
}
|
|
139
155
|
});
|
|
140
156
|
}
|
|
141
157
|
|