npm - @opentermsarchive/engine - Versions diffs - 1.1.3 → 1.2.0 - Mend

@opentermsarchive/engine 1.1.3 → 1.2.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (16)

package/package.json +1 -1
package/src/archivist/extract/errors.js +6 -0
package/src/archivist/extract/index.js +32 -16
package/src/archivist/extract/index.test.js +319 -302
package/src/archivist/fetcher/errors.js +1 -1
package/src/archivist/fetcher/fullDomFetcher.js +4 -6
package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
package/src/archivist/fetcher/index.js +9 -4
package/src/archivist/fetcher/index.test.js +23 -12
package/src/archivist/index.js +37 -13
package/src/archivist/index.test.js +22 -22
package/src/archivist/services/service.js +12 -6
package/src/archivist/services/service.test.js +60 -39
package/src/logger/index.js +3 -3
package/src/reporter/index.js +4 -2
package/src/reporter/labels.json +10 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@opentermsarchive/engine",
-  "version": "1.1.3",
+  "version": "1.2.0",
   "description": "Tracks and makes visible changes to the terms of online services",
   "homepage": "https://opentermsarchive.org",
   "bugs": {

package/src/archivist/extract/errors.js ADDED Viewed

@@ -0,0 +1,6 @@
+export class ExtractDocumentError extends Error {
+  constructor(message) {
+    super(`Extract failed: ${message}`);
+    this.name = 'ExtractDocumentError';
+  }
+}

package/src/archivist/extract/index.js CHANGED Viewed

@@ -1,5 +1,3 @@
-import url from 'url';
 import ciceroMark from '@accordproject/markdown-cicero';
 import mardownPdf from '@accordproject/markdown-pdf';
 import TurndownService from '@opentermsarchive/turndown';
@@ -7,7 +5,9 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
 import jsdom from 'jsdom';
 import mime from 'mime';
-import { InaccessibleContentError } from '../errors.js';
+import { ExtractDocumentError } from './errors.js';
+export { ExtractDocumentError } from './errors.js';
 const { JSDOM } = jsdom;
 const turndownService = new TurndownService();
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
  * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
 */
 export default async function extract(sourceDocument) {
-  if (sourceDocument.mimeType == mime.getType('pdf')) {
-    return extractFromPDF(sourceDocument);
-  }
+  try {
+    if (sourceDocument.mimeType == mime.getType('pdf')) {
+      return await extractFromPDF(sourceDocument);
+    }
-  return extractFromHTML(sourceDocument);
+    return await extractFromHTML(sourceDocument);
+  } catch (error) {
+    throw new ExtractDocumentError(error.message);
+  }
 }
 export async function extractFromHTML(sourceDocument) {
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
       });
       /* eslint-enable no-await-in-loop */
     } catch (error) {
-      throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
+      throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
     }
   }
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
   const domFragment = select(webPageDOM, contentSelectors);
   if (!domFragment.children.length) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
   }
   convertRelativeURLsToAbsolute(domFragment, location);
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
   const markdownContent = transform(domFragment);
   if (!markdownContent) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
   }
   return markdownContent;
 }
-export async function extractFromPDF({ content: pdfBuffer }) {
+export async function extractFromPDF({ location, content: pdfBuffer }) {
+  let markdownContent;
   try {
     const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
-    return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
+    markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
   } catch (error) {
     if (error.parserError) {
-      throw new InaccessibleContentError("Can't parse PDF file");
+      throw new Error("Can't parse PDF file");
     }
     throw error;
   }
+  if (!markdownContent) {
+    throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
+  }
+  return markdownContent;
 }
 function selectRange(webPageDOM, rangeSelector) {
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
   const endNode = webPageDOM.querySelector(endBefore || endAfter);
   if (!startNode) {
-    throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
   if (!endNode) {
-    throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
   selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
 export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
   Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
-    link.href = url.resolve(baseURL, link.href);
+    try {
+      link.href = new URL(link.href, baseURL).href;
+    } catch (error) {
+      // Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
+    }
   });
 }