@opentermsarchive/engine 5.2.0 → 5.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@opentermsarchive/engine",
-  "version": "5.2.0",
+  "version": "5.3.1",
   "description": "Tracks and makes visible changes to the terms of online services",
   "homepage": "https://opentermsarchive.org",
   "bugs": {
@@ -51,7 +51,6 @@
   "dependencies": {
     "@accordproject/markdown-cicero": "^0.15.2",
     "@accordproject/markdown-pdf": "^0.15.2",
-    "@opentermsarchive/fetch-charset-detection": "^1.0.1",
     "@opentermsarchive/turndown": "^7.1.3",
     "@stylistic/eslint-plugin-js": "^1.4.1",
     "abort-controller": "^3.0.0",
@@ -78,6 +77,7 @@
     "eslint-plugin-no-only-tests": "^3.1.0",
     "express": "^4.19.2",
     "express-async-errors": "^3.1.1",
+    "fetch-charset-detection": "^1.0.1",
     "fs-extra": "^10.0.0",
     "helmet": "^6.0.1",
     "http-proxy-agent": "^5.0.0",
@@ -33,7 +33,7 @@ export default async function fetch(url, cssSelectors, config) {
     throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
   }
 
-  const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
+  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
 
   // We expect all elements to be present on the page…
   await Promise.all(waitForSelectorsPromises).catch(error => {
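The `.filter(Boolean)` added above makes the full-DOM fetcher skip empty selector entries before waiting on them; a minimal sketch of the effect (selector values are illustrative):

    const selectors = ['body', undefined, ''];

    // Only truthy selectors are awaited; before this change, falsy entries
    // were passed to page.waitForSelector as-is.
    const awaited = selectors.filter(Boolean); // ['body']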
@@ -1,18 +1,17 @@
-import convertBody from '@opentermsarchive/fetch-charset-detection'; // eslint-disable-line import/no-unresolved
 import AbortController from 'abort-controller';
-// https://github.com/node-fetch/fetch-charset-detection/issues/247
+import convertBody from 'fetch-charset-detection'; // eslint-disable-line import/no-unresolved
 import HttpProxyAgent from 'http-proxy-agent';
 import HttpsProxyAgent from 'https-proxy-agent';
 import nodeFetch, { AbortError } from 'node-fetch';
 
-export default async function fetch(url, configuration) {
+export default async function fetch(url, config) {
   const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), configuration.navigationTimeout);
+  const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
 
   const nodeFetchOptions = {
     signal: controller.signal,
     credentials: 'include',
-    headers: { 'Accept-Language': configuration.language },
+    headers: { 'Accept-Language': config.language },
   };
 
   if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
@@ -51,7 +50,7 @@ export default async function fetch(url, configuration) {
     };
   } catch (error) {
     if (error instanceof AbortError) {
-      throw new Error(`Timed out after ${configuration.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
+      throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
     }
 
     throw new Error(error.message);
@@ -7,35 +7,89 @@ import fetchHtmlOnly from './htmlOnlyFetcher.js';
 export { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
 export { FetchDocumentError } from './errors.js';
 
+export const FETCHER_TYPES = {
+  FULL_DOM: 'fullDom',
+  HTML_ONLY: 'htmlOnly',
+};
+
+const LIKELY_BOT_BLOCKING_ERRORS = [
+  'HTTP code 403',
+  'HTTP code 406',
+  'HTTP code 502',
+  'ECONNRESET',
+];
+
 /**
  * Fetch a resource from the network, returning a promise which is fulfilled once the response is available
  * @function fetch
- * @param {object} params Fetcher parameters
- * @param {string} params.url URL of the resource you want to fetch
- * @param {boolean} [params.executeClientScripts] Enable execution of client scripts. When set to `true`, this property loads the page in a headless browser to load all assets and execute client scripts before returning its content
- * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
- * @param {object} [params.config] Fetcher configuration
- * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
- * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
- * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
- * @returns {Promise<{ mimeType: string, content: string | Buffer }>} Promise containing the fetched resource's MIME type and content
+ * @param {object} params Fetcher parameters
+ * @param {string} params.url URL of the resource you want to fetch
+ * @param {boolean} [params.executeClientScripts] Enable execution of client scripts. When set to `true`, this property loads the page in a headless browser to load all assets and execute client scripts before returning its content. If undefined, the engine will automatically balance performance and tracking success rate, defaulting to not executing scripts and escalating to headless browser if needed
+ * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
+ * @param {object} [params.config] Fetcher configuration
+ * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
+ * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
+ * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
+ * @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type
+ * @throws {FetchDocumentError} When the fetch operation fails
  * @async
  */
 export default async function fetch({
-  url, executeClientScripts, cssSelectors,
+  url,
+  executeClientScripts,
+  cssSelectors,
   config: {
     navigationTimeout = config.get('@opentermsarchive/engine.fetcher.navigationTimeout'),
     language = config.get('@opentermsarchive/engine.fetcher.language'),
     waitForElementsTimeout = config.get('@opentermsarchive/engine.fetcher.waitForElementsTimeout'),
   } = {},
 }) {
+  if (!url) {
+    throw new FetchDocumentError('URL is required');
+  }
+
+  const fetcherConfig = {
+    navigationTimeout,
+    language,
+    waitForElementsTimeout,
+    executeClientScripts,
+  };
+
   try {
     if (executeClientScripts) {
-      return await fetchFullDom(url, cssSelectors, { navigationTimeout, language, waitForElementsTimeout });
+      return await fetchWithFullDom(url, cssSelectors, fetcherConfig);
     }
 
-    return await fetchHtmlOnly(url, { navigationTimeout, language });
+    return await fetchWithFallback(url, cssSelectors, fetcherConfig);
   } catch (error) {
     throw new FetchDocumentError(error.message);
   }
 }
+
+async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
+  try {
+    return await fetchWithHtmlOnly(url, fetcherConfig);
+  } catch (error) {
+    const isBotBlockingError = LIKELY_BOT_BLOCKING_ERRORS.some(code => error.message.includes(code));
+
+    if (!isBotBlockingError || fetcherConfig.executeClientScripts === false) {
+      throw error;
+    }
+
+    return fetchWithFullDom(url, cssSelectors, fetcherConfig);
+  }
+}
+
+async function fetchWithFullDom(url, cssSelectors, fetcherConfig) {
+  return {
+    ...await fetchFullDom(url, cssSelectors, fetcherConfig),
+    fetcher: FETCHER_TYPES.FULL_DOM,
+  };
+}
+
+async function fetchWithHtmlOnly(url, fetcherConfig) {
+  return {
+    ...await fetchHtmlOnly(url, fetcherConfig),
+    fetcher: FETCHER_TYPES.HTML_ONLY,
+  };
+}
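A minimal usage sketch of the updated fetcher entry point, assuming an ESM module; the URL and selector are illustrative and the behaviour follows the JSDoc and fallback logic shown above:

    import fetch, { FETCHER_TYPES } from './index.js';

    // With executeClientScripts left undefined, the HTML-only fetcher is tried
    // first and the headless browser is used only when the failure looks like
    // bot blocking (HTTP code 403, 406, 502 or ECONNRESET).
    const { content, mimeType, fetcher } = await fetch({
      url: 'https://example.com/terms',
      cssSelectors: 'body',
    });

    // fetcher is FETCHER_TYPES.HTML_ONLY ('htmlOnly') or FETCHER_TYPES.FULL_DOM ('fullDom')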
@@ -7,7 +7,7 @@ import chai from 'chai';
 import chaiAsPromised from 'chai-as-promised';
 import iconv from 'iconv-lite';
 
-import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './index.js';
+import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError, FETCHER_TYPES } from './index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
@@ -31,6 +31,8 @@ describe('Fetcher', function () {
   let expectedPDFContent;
 
   before(done => {
+    let blockCount = 0;
+
     temporaryServer = http.createServer((request, response) => {
       if (request.url === '/') {
         response.writeHead(200, { 'Content-Type': 'text/html' }).write(termsHTML);
@@ -46,9 +48,19 @@ describe('Fetcher', function () {
       }
       if (request.url == '/terms.pdf') {
         expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
-
         response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
       }
+      if (request.url === '/block-once') {
+        if (blockCount === 0) {
+          blockCount++;
+          response.writeHead(403, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>Access Denied - Bot Detected</body></html>');
+        } else {
+          response.writeHead(200, { 'Content-Type': 'text/html' }).write(termsHTML);
+        }
+      }
+      if (request.url === '/always-block') {
+        response.writeHead(403, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>Access Denied - Bot Detected</body></html>');
+      }
 
       return response.end();
     }).listen(SERVER_PORT);
@@ -66,11 +78,12 @@ describe('Fetcher', function () {
   context('when html page is available', () => {
     let content;
     let mimeType;
+    let fetcher;
     const url = `http://127.0.0.1:${SERVER_PORT}`;
 
     context('when expected selectors are present', () => {
       before(async () => {
-        ({ content, mimeType } = await fetch({ url, cssSelectors: 'body' }));
+        ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: 'body' }));
       });
 
       it('returns the web page content of the given URL', () => {
@@ -81,9 +94,13 @@ describe('Fetcher', function () {
         expect(mimeType).to.equal('text/html');
       });
 
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
+
       context('with client script enabled', () => {
         before(async () => {
-          ({ content, mimeType } = await fetch({ url, cssSelectors: 'body', executeClientScripts: true }));
+          ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: 'body', executeClientScripts: true }));
         });
 
         it('returns the web page content of the given URL', () => {
@@ -93,6 +110,10 @@ describe('Fetcher', function () {
         it('returns the MIME type of the given URL', () => {
           expect(mimeType).to.equal('text/html');
         });
+
+        it('uses full DOM fetcher when client scripts are enabled', () => {
+          expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+        });
       });
     });
 
@@ -100,7 +121,7 @@ describe('Fetcher', function () {
       const NOT_PRESENT_SELECTOR = 'h2';
 
       before(async () => {
-        ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR }));
+        ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR }));
       });
 
       it('returns the web page content of the given URL', () => {
@@ -111,9 +132,13 @@ describe('Fetcher', function () {
         expect(mimeType).to.equal('text/html');
       });
 
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
+
       context('with client script enabled', () => {
         before(async () => {
-          ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
+          ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
        });
 
         it('returns the web page content of the given URL', () => {
@@ -123,32 +148,42 @@ describe('Fetcher', function () {
         it('returns the MIME type of the given URL', () => {
           expect(mimeType).to.equal('text/html');
         });
+
+        it('uses full DOM fetcher when client scripts are enabled', () => {
+          expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+        });
       });
     });
   });
 
   context('when html page is in different charset', () => {
     let content;
+    let fetcher;
     const url = `http://127.0.0.1:${SERVER_PORT}/other-charset`;
 
     context('when expected selectors are present', () => {
       before(async () => {
-        ({ content } = await fetch({ url, cssSelectors: 'body' }));
+        ({ content, fetcher } = await fetch({ url, cssSelectors: 'body' }));
       });
 
       it('returns the web page content of the given URL', () => {
         expect(content).to.equal(termsWithOtherCharsetHTML);
       });
+
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
     });
   });
 
   context('when url targets a PDF file', () => {
     let content;
     let mimeType;
+    let fetcher;
     const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
 
     before(async () => {
-      ({ content, mimeType } = await fetch({ url: pdfUrl }));
+      ({ content, mimeType, fetcher } = await fetch({ url: pdfUrl }));
     });
 
     it('returns a buffer for PDF content', () => {
@@ -162,6 +197,10 @@ describe('Fetcher', function () {
     it('returns a blob with the file content', () => {
       expect(content.equals(expectedPDFContent)).to.be.true;
     });
+
+    it('returns the fetcher used to fetch the PDF file', () => {
+      expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+    });
   });
 
   context('when server responds with empty content', () => {
@@ -245,6 +284,20 @@ describe('Fetcher', function () {
       });
     });
   });
+
+  describe('when bot blocking is detected', () => {
+    it('falls back to full DOM fetcher when bot blocking is detected', async () => {
+      const { content, mimeType, fetcher } = await fetch({ url: `http://127.0.0.1:${SERVER_PORT}/block-once` });
+
+      expect(content).to.equal(termsHTML);
+      expect(mimeType).to.equal('text/html');
+      expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+    });
+
+    it('still throws FetchDocumentError if both fetchers fail', async () => {
+      await expect(fetch({ url: `http://127.0.0.1:${SERVER_PORT}/always-block` })).to.be.rejectedWith(FetchDocumentError);
+    });
+  });
   });
   });
 });
@@ -183,10 +183,11 @@ export default class Archivist extends events.EventEmitter {
     const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
 
     try {
-      const { mimeType, content } = await this.fetch({ url, executeClientScripts, cssSelectors });
+      const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
 
       sourceDocument.content = content;
       sourceDocument.mimeType = mimeType;
+      sourceDocument.fetcher = fetcher;
     } catch (error) {
       if (!(error instanceof FetchDocumentError)) {
         throw error;
@@ -248,6 +249,7 @@ export default class Archivist extends events.EventEmitter {
       termsType: terms.type,
       fetchDate: terms.fetchDate,
       isExtractOnly: extractOnly,
+      metadata: { 'x-engine-version': process.env.npm_package_version },
     });
 
     await this.recorder.record(record);
@@ -272,6 +274,11 @@ export default class Archivist extends events.EventEmitter {
       fetchDate: terms.fetchDate,
       content: sourceDocument.content,
       mimeType: sourceDocument.mimeType,
+      metadata: {
+        'x-engine-version': process.env.npm_package_version,
+        'x-fetcher': sourceDocument.fetcher,
+        'x-source-document-location': sourceDocument.location,
+      },
     });
 
     await this.recorder.record(record);
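Following these two hunks, every record now carries a metadata object. As an illustration only (assuming engine version 5.3.1, a snapshot fetched with the headless browser, and an example URL), the objects built above would look like:

    // Version records
    { 'x-engine-version': '5.3.1' }

    // Snapshot records
    {
      'x-engine-version': '5.3.1',
      'x-fetcher': 'fullDom',
      'x-source-document-location': 'https://example.com/terms',
    }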
@@ -24,7 +24,7 @@ const MULTIPLE_SOURCE_DOCUMENTS_PREFIX = 'This version was recorded after extrac
 export const COMMIT_MESSAGE_PREFIXES_REGEXP = new RegExp(`^(${Object.values(COMMIT_MESSAGE_PREFIXES).join('|')})`);
 
 export function toPersistence(record, snapshotIdentiferTemplate) {
-  const { serviceId, termsType, documentId, isExtractOnly, snapshotIds = [], mimeType, isFirstRecord } = record;
+  const { serviceId, termsType, documentId, isExtractOnly, snapshotIds = [], mimeType, isFirstRecord, metadata } = record;
 
   let prefix = isExtractOnly ? COMMIT_MESSAGE_PREFIXES.extractOnly : COMMIT_MESSAGE_PREFIXES.update;
 
@@ -46,11 +46,12 @@ export function toPersistence(record, snapshotIdentiferTemplate) {
     message: `${subject}\n\n${documentIdMessage || ''}\n\n${snapshotIdsMessage || ''}`,
     content: record.content,
     filePath,
+    metadata,
   };
 }
 
 export function toDomain(commit) {
-  const { hash, date, message, body, diff } = commit;
+  const { hash, date, message, body, diff, trailers = {} } = commit;
 
   const modifiedFilesInCommit = diff.files.map(({ file }) => file);
 
@@ -68,17 +69,22 @@
     serviceId: path.dirname(relativeFilePath),
     termsType,
     documentId,
-    mimeType: mime.getType(relativeFilePath),
     fetchDate: new Date(date),
     isFirstRecord: message.startsWith(COMMIT_MESSAGE_PREFIXES.startTracking) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_startTracking),
-    isExtractOnly: message.startsWith(COMMIT_MESSAGE_PREFIXES.extractOnly) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter),
-    snapshotIds: snapshotIdsMatch || [],
+    metadata: { ...trailers },
   };
 
-  if (attributes.mimeType == mime.getType('markdown')) {
+  const mimeTypeValue = mime.getType(relativeFilePath);
+
+  if (mimeTypeValue == mime.getType('markdown')) {
+    attributes.isExtractOnly = message.startsWith(COMMIT_MESSAGE_PREFIXES.extractOnly) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter);
+    attributes.snapshotIds = snapshotIdsMatch;
+
     return new Version(attributes);
   }
 
+  attributes.mimeType = mimeTypeValue;
+
   return new Snapshot(attributes);
 }
 
@@ -3,6 +3,8 @@ import path from 'path';
 
 import simpleGit from 'simple-git';
 
+import { parseTrailers, formatTrailers } from './trailers.js';
+
 process.env.LC_ALL = 'en_GB'; // Ensure git messages will be in English as some errors are handled by analysing the message content
 
 const fs = fsApi.promises;
@@ -38,7 +40,7 @@ export default class Git {
     return this.git.add(this.relativePath(filePath));
   }
 
-  async commit({ filePath, message, date = new Date() }) {
+  async commit({ filePath, message, date = new Date(), trailers = {} }) {
     const commitDate = new Date(date).toISOString();
     let summary;
 
@@ -46,7 +48,10 @@
       process.env.GIT_AUTHOR_DATE = commitDate;
       process.env.GIT_COMMITTER_DATE = commitDate;
 
-      summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
+      const trailersSection = formatTrailers(trailers);
+      const finalMessage = trailersSection ? `${message}\n\n${trailersSection}` : message;
+
+      summary = await this.git.commit(finalMessage, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
     } finally {
       process.env.GIT_AUTHOR_DATE = '';
       process.env.GIT_COMMITTER_DATE = '';
@@ -70,14 +75,23 @@
   async getCommit(options) {
     const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options
 
+    if (commit) {
+      commit.trailers = parseTrailers(commit.body);
+    }
+
     return commit;
   }
 
   async log(options = []) {
     try {
       const logSummary = await this.git.log(options);
+      const commits = logSummary.all;
+
+      commits.forEach(commit => {
+        commit.trailers = parseTrailers(commit.body);
+      });
 
-      return logSummary.all;
+      return commits;
     } catch (error) {
       if (/unknown revision or path not in the working tree|does not have any commits yet/.test(error.message)) {
         return [];
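A sketch of what the updated commit() now builds, assuming illustrative metadata and an illustrative subject line; the trailers block is appended after a blank line and read back later through parseTrailers(commit.body):

    const message = 'Record snapshot of Service Terms of Service'; // illustrative subject
    const trailers = { 'x-engine-version': '5.3.1', 'x-fetcher': 'htmlOnly' };

    const trailersSection = formatTrailers(trailers);
    const finalMessage = trailersSection ? `${message}\n\n${trailersSection}` : message;
    // finalMessage:
    //   Record snapshot of Service Terms of Service
    //
    //   X-engine-version: 5.3.1
    //   X-fetcher: htmlOnly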
@@ -41,12 +41,12 @@ export default class GitRepository extends RepositoryInterface {
       record.isFirstRecord = !await this.#isTracked(serviceId, termsType, documentId);
     }
 
-    const { message, content, filePath: relativeFilePath } = await this.#toPersistence(record);
+    const { message, content, filePath: relativeFilePath, metadata } = await this.#toPersistence(record);
 
     const filePath = path.join(this.path, relativeFilePath);
 
     await GitRepository.writeFile({ filePath, content });
-    const sha = await this.#commit({ filePath, message, date: fetchDate });
+    const sha = await this.#commit({ filePath, message, date: fetchDate, trailers: metadata });
 
     if (!sha) {
       return Object(null);
@@ -153,11 +153,11 @@ export default class GitRepository extends RepositoryInterface {
     return filePath;
   }
 
-  async #commit({ filePath, message, date }) {
+  async #commit({ filePath, message, date, trailers }) {
     try {
       await this.git.add(filePath);
 
-      return await this.git.commit({ filePath, message, date });
+      return await this.git.commit({ filePath, message, date, trailers });
     } catch (error) {
       throw new Error(`Could not commit ${filePath} with message "${message}" due to error: "${error}"`);
     }
@@ -41,6 +41,11 @@ const HTML_MIME_TYPE = mime.getType('html');
 const PDF_MIME_TYPE = mime.getType('pdf');
 const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/terms.pdf'), { encoding: 'utf8' });
 
+const METADATA = {
+  fetcher: 'test-fetcher',
+  'engine-version': '5.0.0',
+};
+
 describe('GitRepository', () => {
   let git;
   let subject;
@@ -314,6 +319,26 @@ describe('GitRepository', () => {
        expect(commit.message).to.include(TERMS_TYPE);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        ({ id, isFirstRecord } = await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        })));
+
+        ([commit] = await git.log());
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(commit.trailers).to.deep.equal(METADATA);
+      });
+    });
   });
 
   describe('#findById', () => {
@@ -328,6 +353,7 @@ describe('GitRepository', () => {
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -367,6 +393,10 @@ describe('GitRepository', () => {
      expect(record.snapshotIds).to.deep.equal([SNAPSHOT_ID]);
    });
 
+    it('returns metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -435,6 +465,28 @@ describe('GitRepository', () => {
        expect(recordFound).to.equal(null);
      });
    });
+
+    context('when metadata is provided', () => {
+      let record;
+
+      before(async () => {
+        await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        }));
+
+        record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+      });
+
+      after(() => subject.removeAll());
+
+      it('retrieves metadata', () => {
+        expect(record.metadata).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findAll', () => {
@@ -557,6 +609,7 @@ describe('GitRepository', () => {
        content: UPDATED_FILE_CONTENT,
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
+        metadata: METADATA,
      })));
 
      latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
@@ -575,6 +628,10 @@ describe('GitRepository', () => {
    it('returns the latest record content', () => {
      expect(latestRecord.content.toString('utf8')).to.equal(UPDATED_FILE_CONTENT);
    });
+
+    it('returns metadata', () => {
+      expect(latestRecord.metadata).to.deep.equal(METADATA);
+    });
  });
 });
 
@@ -901,6 +958,28 @@ describe('GitRepository', () => {
        expect(mime.getType(EXPECTED_PDF_SNAPSHOT_FILE_PATH)).to.equal(PDF_MIME_TYPE);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        ({ id, isFirstRecord } = await subject.save(new Snapshot({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          documentId: DOCUMENT_ID,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          mimeType: HTML_MIME_TYPE,
+          metadata: METADATA,
+        })));
+
+        ([commit] = await git.log());
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(commit.trailers).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findById', () => {
@@ -915,6 +994,7 @@ describe('GitRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -958,6 +1038,10 @@ describe('GitRepository', () => {
      expect(record.documentId).to.equal(DOCUMENT_ID);
    });
 
+    it('returns metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -1086,6 +1170,7 @@ describe('GitRepository', () => {
        content: UPDATED_FILE_CONTENT,
        mimeType: HTML_MIME_TYPE,
        fetchDate: FETCH_DATE,
+        metadata: METADATA,
      })));
 
      latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
@@ -1108,6 +1193,10 @@ describe('GitRepository', () => {
    it('returns the latest record mime type', () => {
      expect(latestRecord.mimeType).to.equal(HTML_MIME_TYPE);
    });
+
+    it('returns metadata', () => {
+      expect(latestRecord.metadata).to.deep.equal(METADATA);
+    });
  });
 
  context('with PDF document', () => {
@@ -1205,6 +1294,29 @@ describe('GitRepository', () => {
      expect(fetchDates).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Snapshot({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
 });
 
 context('backwards compatibility with deprecated commit messages', () => {
@@ -0,0 +1,48 @@
+export function parseTrailers(message) {
+  const trailers = {};
+
+  const sections = message.split(/\n\n+/);
+  const trailersSection = sections[sections.length - 1];
+
+  if (!trailersSection.includes(':')) {
+    return trailers;
+  }
+
+  const validTrailerKeyRegex = /^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*:$/; // Accepts either a single word or multiple words separated by dashes
+
+  for (const line of trailersSection.split('\n')) {
+    const trimmedLine = line.trim();
+
+    if (!trimmedLine) { // Skip empty lines
+      continue;
+    }
+
+    const colonIndex = trimmedLine.indexOf(':');
+
+    if (colonIndex === -1) { // Skip lines without a colon
+      continue;
+    }
+
+    const key = trimmedLine.slice(0, colonIndex + 1);
+    const value = trimmedLine.slice(colonIndex + 1).trim();
+
+    if (validTrailerKeyRegex.test(key) && value) {
+      const keyWithoutColon = key.slice(0, -1);
+
+      trailers[keyWithoutColon.toLowerCase()] = value;
+    }
+  }
+
+  return trailers;
+}
+
+export function formatTrailers(trailers) {
+  if (Object.keys(trailers).length === 0) {
+    return '';
+  }
+
+  return Object.entries(trailers)
+    .filter(([ , value ]) => value !== '')
+    .map(([ key, value ]) => `${key[0].toUpperCase() + key.slice(1).toLowerCase()}: ${value}`)
+    .join('\n');
+}
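A short usage sketch of the two helpers in this new file (the commit message text is illustrative):

    import { parseTrailers, formatTrailers } from './trailers.js';

    formatTrailers({ 'x-fetcher': 'htmlOnly', 'x-engine-version': '5.3.1' });
    // => 'X-fetcher: htmlOnly\nX-engine-version: 5.3.1'

    parseTrailers('Record new version\n\nX-fetcher: htmlOnly\nX-engine-version: 5.3.1');
    // => { 'x-fetcher': 'htmlOnly', 'x-engine-version': '5.3.1' }
    // Keys are lowercased on parsing and only the last paragraph of the message is scanned.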
@@ -0,0 +1,158 @@
+import { expect } from 'chai';
+
+import { parseTrailers, formatTrailers } from './trailers.js';
+
+describe('trailers', () => {
+  describe('#parseTrailers', () => {
+    it('returns empty object for message without trailers', () => {
+      const message = 'A simple commit message\n\nWith a body';
+
+      expect(parseTrailers(message)).to.deep.equal({});
+    });
+
+    it('returns empty object when last section has no colon', () => {
+      const message = 'A commit message\n\nWith a body\n\nNo trailers here';
+
+      expect(parseTrailers(message)).to.deep.equal({});
+    });
+
+    it('parses single word trailer key', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('parses multi-word trailer key with dashes', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('parses multiple trailers with different key formats', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      });
+    });
+
+    it('handles case-insensitive keys', () => {
+      const message = 'A commit message\n\nWith a body\n\nFETCHER: my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      });
+    });
+
+    it('handles trailers with colons in values', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my:fetcher:with:colons';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my:fetcher:with:colons' });
+    });
+
+    it('ignores malformed trailer lines', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\nInvalid line\nReviewer: john-doe';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        reviewer: 'john-doe',
+      });
+    });
+
+    it('ignores trailer keys with spaces', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature Request: my-feature\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('ignores trailer keys with spaces before colon', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher : my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('ignores trailer keys ending with dash', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature-: my-feature\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('only keeps trailers from the last section', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\n\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('ignores trailers with empty values', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher:\nFeature-request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('handles keys with numbers', () => {
+      const message = 'A commit message\n\nWith a body\n\nIssue-123: my-issue\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        'issue-123': 'my-issue',
+        fetcher: 'my-fetcher',
+      });
+    });
+
+    it('handles multiple consecutive empty lines in message', () => {
+      const message = 'A commit message\n\n\n\nWith a body\n\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+  });
+
+  describe('#formatTrailers', () => {
+    it('returns empty string when no trailers', () => {
+      expect(formatTrailers({})).to.equal('');
+    });
+
+    it('formats single word trailer key', () => {
+      expect(formatTrailers({ fetcher: 'my-fetcher' })).to.equal('Fetcher: my-fetcher');
+    });
+
+    it('formats multi-word trailer key with dashes', () => {
+      expect(formatTrailers({ 'feature-request': 'my-feature' })).to.equal('Feature-request: my-feature');
+    });
+
+    it('formats multiple trailers with different key formats', () => {
+      expect(formatTrailers({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('capitalizes trailer keys', () => {
+      expect(formatTrailers({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('handles case-insensitive keys', () => {
+      expect(formatTrailers({
+        FETCHER: 'my-fetcher',
+        'FEATURE-REQUEST': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('skips empty string values', () => {
+      expect(formatTrailers({
+        fetcher: '',
+        'feature-request': 'my-feature',
+      })).to.equal('Feature-request: my-feature');
+    });
+
+    it('handles keys with numbers', () => {
+      expect(formatTrailers({
+        'issue-123': 'my-issue',
+        fetcher: 'my-fetcher',
+      })).to.equal('Issue-123: my-issue\nFetcher: my-fetcher');
+    });
+  });
+});
@@ -17,7 +17,7 @@ export function toPersistence(record) {
 }
 
 export function toDomain(mongoDocument) {
-  const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isExtractOnly, isRefilter, isFirstRecord, snapshotIds } = mongoDocument;
+  const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isExtractOnly, isRefilter, isFirstRecord, snapshotIds, metadata } = mongoDocument;
 
   const attributes = {
     id: _id.toString(),
@@ -29,6 +29,7 @@ export function toDomain(mongoDocument) {
     isFirstRecord: Boolean(isFirstRecord),
     isExtractOnly: Boolean(isExtractOnly) || Boolean(isRefilter),
     snapshotIds: snapshotIds?.map(snapshotId => snapshotId.toString()) || [],
+    metadata,
   };
 
   if (snapshotIds) {
@@ -90,7 +90,7 @@ export default class MongoRepository extends RepositoryInterface {
   }
 
   count() {
-    return this.collection.find().count();
+    return this.collection.countDocuments();
   }
 
   async* iterate() {
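For context, cursor.count() is deprecated in the MongoDB Node.js driver; collection.countDocuments() is the supported replacement and accepts an optional filter. A brief sketch (the serviceId value is illustrative):

    // Before: this.collection.find(query).count()
    // After:
    const total = await this.collection.countDocuments();
    const perService = await this.collection.countDocuments({ serviceId: 'Service' });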
@@ -34,6 +34,11 @@ const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test
 const UPDATED_PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/termsModified.pdf'));
 const PDF_MIME_TYPE = mime.getType('pdf');
 
+const METADATA = {
+  fetcher: 'test-fetcher',
+  'engine-version': '5.0.0',
+};
+
 let collection;
 
 describe('MongoRepository', () => {
@@ -57,10 +62,10 @@ describe('MongoRepository', () => {
 
    context('when it is the first record', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -68,12 +73,13 @@ describe('MongoRepository', () => {
          content: CONTENT,
          fetchDate: FETCH_DATE,
          snapshotIds: [SNAPSHOT_ID],
+          metadata: METADATA,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -132,10 +138,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -145,10 +151,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -181,10 +187,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        }));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -194,10 +200,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
      });
 
      after(() => subject.removeAll());
@@ -223,10 +229,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })); // An extracted only version cannot be the first record
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -237,10 +243,10 @@ describe('MongoRepository', () => {
          isExtractOnly: true,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -356,6 +362,29 @@ describe('MongoRepository', () => {
        expect(mongoDocument.documentId).to.equal(DOCUMENT_ID);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        }));
+
+        (mongoDocument = await collection.findOne({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+        }));
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(mongoDocument.metadata).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findById', () => {
@@ -369,6 +398,7 @@ describe('MongoRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -408,6 +438,10 @@ describe('MongoRepository', () => {
      expect(record.snapshotIds).to.deep.equal([SNAPSHOT_ID]);
    });
 
+    it('returns the metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -504,6 +538,28 @@ describe('MongoRepository', () => {
      });
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Version({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
  });
 
  describe('#findAll', () => {
@@ -695,6 +751,28 @@ describe('MongoRepository', () => {
      expect(latestRecord).to.equal(null);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Version({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
  });
 
  describe('#iterate', () => {
@@ -770,10 +848,10 @@ describe('MongoRepository', () => {
 
    context('when it is the first record', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -784,10 +862,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -850,10 +928,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -863,10 +941,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -899,10 +977,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        }));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -912,10 +990,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE_LATER,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
      });
 
      after(() => subject.removeAll());
@@ -931,12 +1009,12 @@ describe('MongoRepository', () => {
 
    context('with PDF document', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
          content: PDF_CONTENT,
          mimeType: PDF_MIME_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -946,10 +1024,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -991,9 +1069,10 @@ describe('MongoRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
-      (record = await subject.findById(id));
+      record = await subject.findById(id);
    });
 
    after(() => subject.removeAll());
@@ -1034,6 +1113,10 @@ describe('MongoRepository', () => {
      expect(record.documentId).to.equal(DOCUMENT_ID);
    });
 
+    it('returns the metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -1272,6 +1355,29 @@ describe('MongoRepository', () => {
      expect(latestRecord).to.equal(null);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Snapshot({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
 });
 
 describe('#iterate', () => {