npm - @opentermsarchive/engine - Versions diffs - 1.1.2 → 1.2.0 - Mend

@opentermsarchive/engine 1.1.2 → 1.2.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/.eslintrc.yaml +2 -0
package/package.json +2 -1
package/scripts/declarations/utils/fixtures/serviceATermsUpdated.history.json +9 -0
package/scripts/declarations/utils/index.js +4 -0
package/scripts/declarations/utils/index.test.js +12 -4
package/scripts/declarations/validate/index.mocha.js +1 -1
package/src/archivist/extract/errors.js +6 -0
package/src/archivist/extract/index.js +32 -16
package/src/archivist/extract/index.test.js +319 -302
package/src/archivist/fetcher/errors.js +1 -1
package/src/archivist/fetcher/fullDomFetcher.js +4 -6
package/src/archivist/fetcher/htmlOnlyFetcher.js +6 -7
package/src/archivist/fetcher/index.js +9 -4
package/src/archivist/fetcher/index.test.js +24 -13
package/src/archivist/index.js +37 -13
package/src/archivist/index.test.js +22 -22
package/src/archivist/services/service.js +12 -6
package/src/archivist/services/service.test.js +60 -39
package/src/logger/index.js +3 -3
package/src/reporter/index.js +4 -2
package/src/reporter/labels.json +10 -0

package/.eslintrc.yaml CHANGED Viewed

@@ -10,6 +10,7 @@ plugins:
   - chai-friendly
   - import
   - json-format
+  - no-only-tests
 rules:
   arrow-parens:
     - error
@@ -101,6 +102,7 @@ rules:
     - error
     - properties: false
   require-await: 1
+  no-only-tests/no-only-tests: error
 overrides:
   - files:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@opentermsarchive/engine",
-  "version": "1.1.2",
+  "version": "1.2.0",
   "description": "Tracks and makes visible changes to the terms of online services",
   "homepage": "https://opentermsarchive.org",
   "bugs": {
@@ -103,6 +103,7 @@
   "devDependencies": {
     "@commitlint/cli": "^19.0.3",
     "dir-compare": "^4.0.0",
+    "eslint-plugin-no-only-tests": "^3.1.0",
     "keep-a-changelog": "^2.5.3",
     "nock": "^13.2.1",
     "node-stream-zip": "^1.15.0",

package/scripts/declarations/utils/fixtures/serviceATermsUpdated.history.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "Terms of Service": [
+    {
+      "fetch": "https://domain.example/tos",
+      "select": "body",
+      "validUntil": "2024-03-18T18:30:09Z"
+    }
+  ]
+}

package/scripts/declarations/utils/index.js CHANGED Viewed

@@ -43,6 +43,10 @@ export default class DeclarationUtils {
     await Promise.all(modifiedFilePaths.map(async modifiedFilePath => {
       const serviceId = DeclarationUtils.getServiceIdFromFilePath(modifiedFilePath);
+      if (modifiedFilePath.endsWith('.history.json')) {
+        return; // Assuming history modifications imply corresponding changes in the service declaration and that the analysis of which terms types of this service have changed will be done when analysing the related declaration, no further action is required here
+      }
       if (modifiedFilePath.endsWith('.filters.js')) {
         const declaration = await this.getJSONFromFile(this.defaultBranch, `declarations/${serviceId}.json`);

package/scripts/declarations/utils/index.test.js CHANGED Viewed

@@ -15,6 +15,7 @@ const SUBJECT_PATH = path.resolve(__dirname, './test');
 const FIXTURES = {
   serviceA: { path: './fixtures/serviceA.json' },
   serviceATermsUpdated: { path: './fixtures/serviceATermsUpdated.json' },
+  serviceATermsUpdatedHistory: { path: './fixtures/serviceATermsUpdated.history.json' },
   serviceAMultipleTermsUpdated: { path: './fixtures/serviceAMultipleTermsUpdated.json' },
   serviceATermsAdded: { path: './fixtures/serviceATermsAdded.json' },
   serviceATermsRemoved: { path: './fixtures/serviceATermsRemoved.json' },
@@ -23,6 +24,7 @@ const FIXTURES = {
 const COMMIT_PATHS = {
   serviceA: './declarations/ServiceA.json',
+  serviceAHistory: './declarations/ServiceA.history.json',
   serviceB: './declarations/ServiceB.json',
 };
@@ -36,7 +38,7 @@ const removeLatestCommit = async () => {
   await declarationUtils.git.reset('hard', ['HEAD~1']);
 };
-describe.only('DeclarationUtils', () => {
+describe('DeclarationUtils', () => {
   describe('#getModifiedServicesAndTermsTypes', () => {
     before(async () => {
       await loadFixtures();
@@ -46,8 +48,14 @@ describe.only('DeclarationUtils', () => {
     after(() => fs.rm(SUBJECT_PATH, { recursive: true }));
     context('when an existing declaration has been modified', () => {
-      before(() => commitChanges(COMMIT_PATHS.serviceA, FIXTURES.serviceATermsUpdated.content));
-      after(removeLatestCommit);
+      before(async () => {
+        await commitChanges(COMMIT_PATHS.serviceA, FIXTURES.serviceATermsUpdated.content);
+        await commitChanges(COMMIT_PATHS.serviceAHistory, FIXTURES.serviceATermsUpdatedHistory.content);
+      });
+      after(async () => {
+        await removeLatestCommit();
+        await removeLatestCommit();
+      });
       it('returns the service ID and the updated terms type', async () => {
         expect(await declarationUtils.getModifiedServicesAndTermsTypes()).to.deep.equal({
@@ -82,7 +90,7 @@ describe.only('DeclarationUtils', () => {
     });
     context('when a declaration has been removed', () => {
-      before(() => removeLatestCommit(declarationUtils.git));
+      before(removeLatestCommit);
       after(async () => {
         await fs.mkdir(path.resolve(SUBJECT_PATH, './declarations'), { recursive: true });
         await commitChanges(COMMIT_PATHS.serviceA, FIXTURES.serviceA.content);

package/scripts/declarations/validate/index.mocha.js CHANGED Viewed

@@ -41,7 +41,7 @@ export default async options => {
   }
   describe('Service declarations validation', async function () {
-    this.timeout(30000);
+    this.timeout(60000);
     this.slow(SLOW_DOCUMENT_THRESHOLD);
     servicesToValidate.forEach(serviceId => {

package/src/archivist/extract/errors.js ADDED Viewed

@@ -0,0 +1,6 @@
+export class ExtractDocumentError extends Error {
+  constructor(message) {
+    super(`Extract failed: ${message}`);
+    this.name = 'ExtractDocumentError';
+  }
+}

package/src/archivist/extract/index.js CHANGED Viewed

@@ -1,5 +1,3 @@
-import url from 'url';
 import ciceroMark from '@accordproject/markdown-cicero';
 import mardownPdf from '@accordproject/markdown-pdf';
 import TurndownService from '@opentermsarchive/turndown';
@@ -7,7 +5,9 @@ import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
 import jsdom from 'jsdom';
 import mime from 'mime';
-import { InaccessibleContentError } from '../errors.js';
+import { ExtractDocumentError } from './errors.js';
+export { ExtractDocumentError } from './errors.js';
 const { JSDOM } = jsdom;
 const turndownService = new TurndownService();
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
  * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
 */
 export default async function extract(sourceDocument) {
-  if (sourceDocument.mimeType == mime.getType('pdf')) {
-    return extractFromPDF(sourceDocument);
-  }
+  try {
+    if (sourceDocument.mimeType == mime.getType('pdf')) {
+      return await extractFromPDF(sourceDocument);
+    }
-  return extractFromHTML(sourceDocument);
+    return await extractFromHTML(sourceDocument);
+  } catch (error) {
+    throw new ExtractDocumentError(error.message);
+  }
 }
 export async function extractFromHTML(sourceDocument) {
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
       });
       /* eslint-enable no-await-in-loop */
     } catch (error) {
-      throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
+      throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
     }
   }
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
   const domFragment = select(webPageDOM, contentSelectors);
   if (!domFragment.children.length) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
   }
   convertRelativeURLsToAbsolute(domFragment, location);
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
   const markdownContent = transform(domFragment);
   if (!markdownContent) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
   }
   return markdownContent;
 }
-export async function extractFromPDF({ content: pdfBuffer }) {
+export async function extractFromPDF({ location, content: pdfBuffer }) {
+  let markdownContent;
   try {
     const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
-    return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
+    markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
   } catch (error) {
     if (error.parserError) {
-      throw new InaccessibleContentError("Can't parse PDF file");
+      throw new Error("Can't parse PDF file");
     }
     throw error;
   }
+  if (!markdownContent) {
+    throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
+  }
+  return markdownContent;
 }
 function selectRange(webPageDOM, rangeSelector) {
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
   const endNode = webPageDOM.querySelector(endBefore || endAfter);
   if (!startNode) {
-    throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
   if (!endNode) {
-    throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
   selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
 export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
   Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
-    link.href = url.resolve(baseURL, link.href);
+    try {
+      link.href = new URL(link.href, baseURL).href;
+    } catch (error) {
+      // Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
+    }
   });
 }