npm - @opentermsarchive/engine - Versions diffs - 0.26.1 → 0.27.0 - Mend

@opentermsarchive/engine 0.26.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

package/bin/ota-track.js +3 -3
package/bin/ota-validate.js +2 -2
package/bin/ota.js +1 -1
package/config/default.json +1 -1
package/package.json +3 -4
package/scripts/dataset/export/index.js +4 -4
package/scripts/dataset/export/index.test.js +11 -17
package/scripts/declarations/lint/index.mocha.js +1 -1
package/scripts/declarations/utils/index.js +12 -12
package/scripts/declarations/validate/definitions.js +1 -1
package/scripts/declarations/validate/index.mocha.js +30 -34
package/scripts/declarations/validate/service.history.schema.js +11 -11
package/scripts/declarations/validate/service.schema.js +13 -13
package/scripts/history/migrate-services.js +4 -4
package/scripts/history/update-to-full-hash.js +2 -2
package/scripts/import/index.js +14 -14
package/scripts/rewrite/rewrite-snapshots.js +3 -3
package/scripts/rewrite/rewrite-versions.js +14 -14
package/scripts/utils/renamer/README.md +3 -3
package/scripts/utils/renamer/index.js +13 -13
package/src/archivist/errors.js +1 -1
package/src/archivist/extract/exports.js +3 -0
package/src/archivist/{filter → extract}/index.js +23 -27
package/src/archivist/extract/index.test.js +516 -0
package/src/archivist/index.js +101 -140
package/src/archivist/index.test.js +178 -166
package/src/archivist/recorder/index.js +11 -55
package/src/archivist/recorder/index.test.js +310 -356
package/src/archivist/recorder/record.js +18 -7
package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
package/src/archivist/recorder/repositories/git/index.js +11 -15
package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
package/src/archivist/recorder/repositories/interface.js +8 -6
package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
package/src/archivist/recorder/repositories/mongo/index.js +8 -8
package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
package/src/archivist/recorder/snapshot.js +5 -0
package/src/archivist/recorder/snapshot.test.js +65 -0
package/src/archivist/recorder/version.js +14 -0
package/src/archivist/recorder/version.test.js +65 -0
package/src/archivist/services/index.js +60 -51
package/src/archivist/services/index.test.js +63 -83
package/src/archivist/services/service.js +26 -22
package/src/archivist/services/service.test.js +46 -68
package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
package/src/archivist/services/terms.js +26 -0
package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
package/src/exports.js +2 -2
package/src/index.js +16 -13
package/src/logger/index.js +35 -36
package/src/notifier/index.js +8 -8
package/src/tracker/index.js +6 -6
package/src/archivist/filter/exports.js +0 -3
package/src/archivist/filter/index.test.js +0 -564
package/src/archivist/recorder/record.test.js +0 -91
package/src/archivist/services/documentDeclaration.js +0 -26
package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0

package/scripts/import/index.js CHANGED Viewed

@@ -18,7 +18,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const ROOT_PATH = path.resolve(__dirname, '../../');
 const MAX_PARALLEL = 10;
 const MAX_RETRY = 5;
-const PDF_MIME_TYPE = 'application/pdf';
+const PDF_MIME_TYPE = mime.getType('pdf');
 const COUNTERS = {
   imported: 0,
   skippedNoChanges: 0,
@@ -87,10 +87,10 @@ function queueErrorHandler(error, { commit }) {
   const serviceId = path.dirname(relativeFilePath);
   const extension = path.extname(relativeFilePath);
-  const documentType = path.basename(relativeFilePath, extension);
+  const termsType = path.basename(relativeFilePath, extension);
   commitsNotImported.push(commit.hash);
-  logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: documentType, sha: commit.hash });
+  logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: termsType, sha: commit.hash });
   COUNTERS.errors++;
 }
@@ -117,9 +117,9 @@ function queueDrainHandler(totalToTreat) {
   };
 }
-async function getCommitContent({ sha, serviceId, documentType, extension }) {
+async function getCommitContent({ sha, serviceId, termsType, extension }) {
   const start = performance.now();
-  const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(documentType)}.${extension}`;
+  const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(termsType)}.${extension}`;
   const response = await nodeFetch(url);
   const end = performance.now();
@@ -141,7 +141,7 @@ async function getCommitContent({ sha, serviceId, documentType, extension }) {
     throw new TooManyRequestsError(`Cannot get commit content on Github ${url}. 429: Too Many Requests`);
   }
-  logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType, sha });
+  logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType, sha });
   return content;
 }
@@ -151,12 +151,12 @@ async function handleCommit(commit, index, total) {
   let serviceId = path.dirname(relativeFilePath);
   const extension = path.extname(relativeFilePath);
-  let documentType = path.basename(relativeFilePath, extension);
+  let termsType = path.basename(relativeFilePath, extension);
   logger.info({
     message: 'Start to handle commit',
     serviceId,
-    type: documentType,
+    type: termsType,
     sha: commit.hash,
     current: index + 1,
     total,
@@ -168,7 +168,7 @@ async function handleCommit(commit, index, total) {
     logger.info({
       message: 'Skipped commit as an entry already exists for this commit',
       serviceId,
-      type: documentType,
+      type: termsType,
       sha: commit.hash,
     });
     COUNTERS.skippedNoChanges++;
@@ -176,9 +176,9 @@ async function handleCommit(commit, index, total) {
     return;
   }
-  let content = await getCommitContent({ sha: commit.hash, serviceId, documentType, extension: extension.replace('.', '') });
+  let content = await getCommitContent({ sha: commit.hash, serviceId, termsType, extension: extension.replace('.', '') });
-  ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
+  ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
   const mimeType = mime.getType(extension);
@@ -198,7 +198,7 @@ async function handleCommit(commit, index, total) {
     await snapshotsCollection.insertOne({
       serviceId,
-      documentType,
+      termsType,
       content,
       mimeType,
       fetchDate: commit.date,
@@ -207,10 +207,10 @@ async function handleCommit(commit, index, total) {
     });
     const end = performance.now();
-    logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType });
+    logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType });
     COUNTERS.imported++;
   } catch (error) {
-    logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: documentType });
+    logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: termsType });
     commitsNotImported.push(commit.hash);
     COUNTERS.errors++;
   }

package/scripts/rewrite/rewrite-snapshots.js CHANGED Viewed

@@ -76,13 +76,13 @@ let recorder;
     const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
     let serviceId = path.dirname(relativeFilePath);
-    let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
+    let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
-    ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
+    ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
     const { id: snapshotId } = await recorder.recordSnapshot({
       serviceId,
-      documentType,
+      termsType,
       content,
       mimeType,
       fetchDate: commit.date,

package/scripts/rewrite/rewrite-versions.js CHANGED Viewed

@@ -4,7 +4,7 @@ import { fileURLToPath } from 'url';
 import config from 'config';
 import { InaccessibleContentError } from '../../src/archivist/errors.js';
-import filter from '../../src/archivist/filter/index.js';
+import extract from '../../src/archivist/extract/index.js';
 import Recorder from '../../src/archivist/recorder/index.js';
 import Git from '../../src/archivist/recorder/repositories/git/git.js';
 import GitRepository from '../../src/archivist/recorder/repositories/git/index.js';
@@ -86,41 +86,41 @@ let recorder;
     const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
     let serviceId = path.dirname(relativeFilePath);
-    let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
+    let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
-    ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
+    ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
     if (!servicesDeclarations[serviceId]) {
       console.log(`⌙ Skip unknown service "${serviceId}"`);
       continue;
     }
-    const documentDeclaration = servicesDeclarations[serviceId].getDocumentDeclaration(
-      documentType,
+    const terms = servicesDeclarations[serviceId].getTerms(
+      termsType,
       commit.date,
     );
-    if (!documentDeclaration) {
-      console.log(`⌙ Skip unknown terms type "${documentType}" for service "${serviceId}"`);
+    if (!terms) {
+      console.log(`⌙ Skip unknown terms type "${termsType}" for service "${serviceId}"`);
       continue;
     }
-    if (documentDeclaration.validUntil) {
-      console.log(`⌙ Use declaration valid until ${documentDeclaration.validUntil}`);
+    if (terms.validUntil) {
+      console.log(`⌙ Use declaration valid until ${terms.validUntil}`);
     }
     try {
-      const document = await filter({
+      const versionContent = await extract({
         content,
         mimeType,
-        documentDeclaration,
+        terms,
       });
       const { id: versionId } = await recorder.recordVersion({
         serviceId,
-        documentType,
-        content: document,
-        mimeType: MARKDOWN_MIME_TYPE, // The result of the `filter` function is always in markdown format
+        termsType,
+        content: versionContent,
+        mimeType: MARKDOWN_MIME_TYPE, // The result of the `extract` function is always in markdown format
         fetchDate: commit.date,
         snapshotId: commit.hash,
       });

package/scripts/utils/renamer/README.md CHANGED Viewed

@@ -8,7 +8,7 @@ You can use it in your other scripts like this:
 ```
 await renamer.loadRules();
-const { serviceId: renamedServiceId, documentType: renamedDocumentType } = renamer.applyRules(serviceId, documentType);
+const { serviceId: renamedServiceId, termsType: renamedDocumentType } = renamer.applyRules(serviceId, termsType);
 ```
 ## Adding renaming rules
@@ -26,7 +26,7 @@ To rename a service, add a rule in `./rules/services.json`, for example, to rena
 ### Terms type
-To rename a terms type, add a rule in `./rules/documentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
+To rename a terms type, add a rule in `./rules/termsTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
 ```json
 {
@@ -37,7 +37,7 @@ To rename a terms type, add a rule in `./rules/documentTypes.json`, for example,
 ### Terms type for a specific service
-To rename a terms type only for a specific service, add a rule in `./rules/servicesDocumentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
+To rename a terms type only for a specific service, add a rule in `./rules/termsTypesByService.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
 ```json
 {

package/scripts/utils/renamer/index.js CHANGED Viewed

@@ -10,12 +10,12 @@ let renamingRules;
 export async function loadRules() {
   renamingRules = {
     serviceNames: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/serviceNames.json'))),
-    documentTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypes.json'))),
-    documentTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypesByService.json'))),
+    termsTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypes.json'))),
+    termsTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypesByService.json'))),
   };
 }
-export function applyRules(serviceId, documentType) {
+export function applyRules(serviceId, termsType) {
   const renamedServiceId = renamingRules.serviceNames[serviceId];
   if (renamedServiceId) {
@@ -23,23 +23,23 @@ export function applyRules(serviceId, documentType) {
     serviceId = renamedServiceId;
   }
-  const renamedDocumentType = renamingRules.documentTypes[documentType];
+  const renamedTermsType = renamingRules.termsTypes[termsType];
-  if (renamedDocumentType) {
-    console.log(`⌙ Rename terms type "${documentType}" to "${renamedDocumentType}" of "${serviceId}" service`);
-    documentType = renamedDocumentType;
+  if (renamedTermsType) {
+    console.log(`⌙ Rename terms type "${termsType}" to "${renamedTermsType}" of "${serviceId}" service`);
+    termsType = renamedTermsType;
   }
-  const renamedServiceDocumentType = renamingRules.documentTypesByService[serviceId]
-    && renamingRules.documentTypesByService[serviceId][documentType];
+  const renamedServiceTermsType = renamingRules.termsTypesByService[serviceId]
+    && renamingRules.termsTypesByService[serviceId][termsType];
-  if (renamedServiceDocumentType) {
-    console.log(`⌙ Specific rename terms type "${documentType}" to "${renamedServiceDocumentType}" of "${serviceId}" service`);
-    documentType = renamedServiceDocumentType;
+  if (renamedServiceTermsType) {
+    console.log(`⌙ Specific rename terms type "${termsType}" to "${renamedServiceTermsType}" of "${serviceId}" service`);
+    termsType = renamedServiceTermsType;
   }
   return {
     serviceId,
-    documentType,
+    termsType,
   };
 }

package/src/archivist/errors.js CHANGED Viewed

@@ -3,7 +3,7 @@ export class InaccessibleContentError extends Error {
     if (Array.isArray(message)) {
       message = `\n - ${message.join('\n - ')}`;
     }
-    super(`The document cannot be accessed or its content can not be selected:${message}`);
+    super(`The documents cannot be accessed or their contents can not be selected:${message}`);
     this.name = 'InaccessibleContentError';
   }
 }

package/src/archivist/extract/exports.js ADDED Viewed

@@ -0,0 +1,3 @@
+import extract from './index.js';
+export default extract;

package/src/archivist/{filter → extract}/index.js RENAMED Viewed

@@ -5,6 +5,7 @@ import mardownPdf from '@accordproject/markdown-pdf';
 import TurndownService from '@opentermsarchive/turndown';
 import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
 import jsdom from 'jsdom';
+import mime from 'mime';
 import { InaccessibleContentError } from '../errors.js';
@@ -21,32 +22,27 @@ const { CiceroMarkTransformer } = ciceroMark;
 const ciceroMarkTransformer = new CiceroMarkTransformer();
 /**
- * Filter document content and convert it to Markdown
+ * Extract content from source document and convert it to Markdown
  *
- * @param {Object} params - Filter parameters
- * @param {string|Buffer} params.content - Content to filter: a buffer containing PDF data in case mimetype associated is PDF or a DOM dump of an HTML page given as a string
- * @param {string} params.mimeType - MIME type of the given content
- * @param {string} params.pageDeclaration - see {@link ./src/archivist/services/pageDeclaration.js}
- * @returns {Promise<string>} Promise which is fulfilled once the content is filtered and converted in Markdown. The promise will resolve into a string containing the filtered content in Markdown format
+ * @param {string} sourceDocument - Source document from which to extract content, see {@link ./src/archivist/services/sourceDocument.js}
+ * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
 */
-export default async function filter({ content, mimeType, pageDeclaration }) {
-  if (mimeType == 'application/pdf') {
-    return filterPDF({ content });
+export default async function extract(sourceDocument) {
+  if (sourceDocument.mimeType == mime.getType('pdf')) {
+    return extractFromPDF(sourceDocument);
   }
-  return filterHTML({
-    content,
-    pageDeclaration,
-  });
+  return extractFromHTML(sourceDocument);
 }
-export async function filterHTML({ content, pageDeclaration }) {
+export async function extractFromHTML(sourceDocument) {
   const {
     location,
     contentSelectors = [],
-    noiseSelectors = [],
+    insignificantContentSelectors = [],
     filters: serviceSpecificFilters = [],
-  } = pageDeclaration;
+    content,
+  } = sourceDocument;
   const jsdomInstance = new JSDOM(content, {
     url: location,
@@ -61,7 +57,7 @@ export async function filterHTML({ content, pageDeclaration }) {
       await filterFunction(webPageDOM, {
         fetch: location,
         select: contentSelectors,
-        remove: noiseSelectors,
+        remove: insignificantContentSelectors,
         filter: serviceSpecificFilters.map(filter => filter.name),
       });
       /* eslint-enable no-await-in-loop */
@@ -70,7 +66,7 @@ export async function filterHTML({ content, pageDeclaration }) {
     }
   }
-  remove(webPageDOM, noiseSelectors); // remove function works in place
+  remove(webPageDOM, insignificantContentSelectors); // remove function works in place
   const domFragment = select(webPageDOM, contentSelectors);
@@ -101,7 +97,7 @@ export async function filterHTML({ content, pageDeclaration }) {
   return markdownContent;
 }
-export async function filterPDF({ content: pdfBuffer }) {
+export async function extractFromPDF({ content: pdfBuffer }) {
   try {
     const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
@@ -115,12 +111,12 @@ export async function filterPDF({ content: pdfBuffer }) {
   }
 }
-function selectRange(document, rangeSelector) {
+function selectRange(webPageDOM, rangeSelector) {
   const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;
-  const selection = document.createRange();
-  const startNode = document.querySelector(startBefore || startAfter);
-  const endNode = document.querySelector(endBefore || endAfter);
+  const selection = webPageDOM.createRange();
+  const startNode = webPageDOM.querySelector(startBefore || startAfter);
+  const endNode = webPageDOM.querySelector(endBefore || endAfter);
   if (!startNode) {
     throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
@@ -136,18 +132,18 @@ function selectRange(document, rangeSelector) {
   return selection;
 }
-export function convertRelativeURLsToAbsolute(document, baseURL) {
-  Array.from(document.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
+export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
+  Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
     link.href = url.resolve(baseURL, link.href);
   });
 }
 // Works in place
-function remove(webPageDOM, noiseSelectors) {
+function remove(webPageDOM, insignificantContentSelectors) {
   const rangeSelections = [];
   const nodes = [];
-  [].concat(noiseSelectors).forEach(selector => {
+  [].concat(insignificantContentSelectors).forEach(selector => {
     if (typeof selector === 'object') {
       rangeSelections.push(selectRange(webPageDOM, selector));
     } else {