@opentermsarchive/engine 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "1.1.3",
3
+ "version": "1.2.0",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -0,0 +1,6 @@
1
+ export class ExtractDocumentError extends Error {
2
+ constructor(message) {
3
+ super(`Extract failed: ${message}`);
4
+ this.name = 'ExtractDocumentError';
5
+ }
6
+ }
@@ -1,5 +1,3 @@
1
- import url from 'url';
2
-
3
1
  import ciceroMark from '@accordproject/markdown-cicero';
4
2
  import mardownPdf from '@accordproject/markdown-pdf';
5
3
  import TurndownService from '@opentermsarchive/turndown';
@@ -7,7 +5,9 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
7
5
  import jsdom from 'jsdom';
8
6
  import mime from 'mime';
9
7
 
10
- import { InaccessibleContentError } from '../errors.js';
8
+ import { ExtractDocumentError } from './errors.js';
9
+
10
+ export { ExtractDocumentError } from './errors.js';
11
11
 
12
12
  const { JSDOM } = jsdom;
13
13
  const turndownService = new TurndownService();
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
29
29
  * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted to Markdown. The promise resolves to a string containing the extracted content in Markdown format
30
30
  */
31
31
  export default async function extract(sourceDocument) {
32
- if (sourceDocument.mimeType == mime.getType('pdf')) {
33
- return extractFromPDF(sourceDocument);
34
- }
32
+ try {
33
+ if (sourceDocument.mimeType == mime.getType('pdf')) {
34
+ return await extractFromPDF(sourceDocument);
35
+ }
35
36
 
36
- return extractFromHTML(sourceDocument);
37
+ return await extractFromHTML(sourceDocument);
38
+ } catch (error) {
39
+ throw new ExtractDocumentError(error.message);
40
+ }
37
41
  }
38
42
 
39
43
  export async function extractFromHTML(sourceDocument) {
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
63
67
  });
64
68
  /* eslint-enable no-await-in-loop */
65
69
  } catch (error) {
66
- throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
70
+ throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
67
71
  }
68
72
  }
69
73
 
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
72
76
  const domFragment = select(webPageDOM, contentSelectors);
73
77
 
74
78
  if (!domFragment.children.length) {
75
- throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
79
+ throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
76
80
  }
77
81
 
78
82
  convertRelativeURLsToAbsolute(domFragment, location);
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
92
96
  const markdownContent = transform(domFragment);
93
97
 
94
98
  if (!markdownContent) {
95
- throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
99
+ throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
96
100
  }
97
101
 
98
102
  return markdownContent;
99
103
  }
100
104
 
101
- export async function extractFromPDF({ content: pdfBuffer }) {
105
+ export async function extractFromPDF({ location, content: pdfBuffer }) {
106
+ let markdownContent;
107
+
102
108
  try {
103
109
  const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
104
110
 
105
- return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
111
+ markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
106
112
  } catch (error) {
107
113
  if (error.parserError) {
108
- throw new InaccessibleContentError("Can't parse PDF file");
114
+ throw new Error("Can't parse PDF file");
109
115
  }
110
116
 
111
117
  throw error;
112
118
  }
119
+
120
+ if (!markdownContent) {
121
+ throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
122
+ }
123
+
124
+ return markdownContent;
113
125
  }
114
126
 
115
127
  function selectRange(webPageDOM, rangeSelector) {
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
120
132
  const endNode = webPageDOM.querySelector(endBefore || endAfter);
121
133
 
122
134
  if (!startNode) {
123
- throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
135
+ throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
124
136
  }
125
137
 
126
138
  if (!endNode) {
127
- throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
139
+ throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
128
140
  }
129
141
 
130
142
  selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
135
147
 
136
148
  export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
137
149
  Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
138
- link.href = url.resolve(baseURL, link.href);
150
+ try {
151
+ link.href = new URL(link.href, baseURL).href;
152
+ } catch (error) {
153
+ // Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
154
+ }
139
155
  });
140
156
  }
141
157