@opentermsarchive/engine 5.2.0 → 5.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@opentermsarchive/engine",
-  "version": "5.2.0",
+  "version": "5.3.1",
   "description": "Tracks and makes visible changes to the terms of online services",
   "homepage": "https://opentermsarchive.org",
   "bugs": {
@@ -51,7 +51,6 @@
   "dependencies": {
     "@accordproject/markdown-cicero": "^0.15.2",
     "@accordproject/markdown-pdf": "^0.15.2",
-    "@opentermsarchive/fetch-charset-detection": "^1.0.1",
     "@opentermsarchive/turndown": "^7.1.3",
     "@stylistic/eslint-plugin-js": "^1.4.1",
     "abort-controller": "^3.0.0",
@@ -78,6 +77,7 @@
     "eslint-plugin-no-only-tests": "^3.1.0",
     "express": "^4.19.2",
     "express-async-errors": "^3.1.1",
+    "fetch-charset-detection": "^1.0.1",
     "fs-extra": "^10.0.0",
     "helmet": "^6.0.1",
     "http-proxy-agent": "^5.0.0",
@@ -33,7 +33,7 @@ export default async function fetch(url, cssSelectors, config) {
     throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
   }
 
-  const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
+  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
 
   // We expect all elements to be present on the page…
   await Promise.all(waitForSelectorsPromises).catch(error => {
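The `.filter(Boolean)` added above makes the full-DOM fetcher skip empty selector entries before waiting on them; a minimal sketch of the effect (selector values are illustrative):

    const selectors = ['body', undefined, ''];

    // Only truthy selectors are awaited; before this change, falsy entries
    // were passed to page.waitForSelector as-is.
    const awaited = selectors.filter(Boolean); // ['body']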
@@ -1,18 +1,17 @@
-import convertBody from '@opentermsarchive/fetch-charset-detection'; // eslint-disable-line import/no-unresolved
 import AbortController from 'abort-controller';
-// https://github.com/node-fetch/fetch-charset-detection/issues/247
+import convertBody from 'fetch-charset-detection'; // eslint-disable-line import/no-unresolved
 import HttpProxyAgent from 'http-proxy-agent';
 import HttpsProxyAgent from 'https-proxy-agent';
 import nodeFetch, { AbortError } from 'node-fetch';
 
-export default async function fetch(url, configuration) {
+export default async function fetch(url, config) {
   const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), configuration.navigationTimeout);
+  const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
 
   const nodeFetchOptions = {
     signal: controller.signal,
     credentials: 'include',
-    headers: { 'Accept-Language': configuration.language },
+    headers: { 'Accept-Language': config.language },
   };
 
   if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
@@ -51,7 +50,7 @@ export default async function fetch(url, configuration) {
     };
   } catch (error) {
     if (error instanceof AbortError) {
-      throw new Error(`Timed out after ${configuration.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
+      throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
     }
 
     throw new Error(error.message);
@@ -7,35 +7,89 @@ import fetchHtmlOnly from './htmlOnlyFetcher.js';
 export { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
 export { FetchDocumentError } from './errors.js';
 
+export const FETCHER_TYPES = {
+  FULL_DOM: 'fullDom',
+  HTML_ONLY: 'htmlOnly',
+};
+
+const LIKELY_BOT_BLOCKING_ERRORS = [
+  'HTTP code 403',
+  'HTTP code 406',
+  'HTTP code 502',
+  'ECONNRESET',
+];
+
 /**
  * Fetch a resource from the network, returning a promise which is fulfilled once the response is available
  * @function fetch
- * @param {object} params Fetcher parameters
- * @param {string} params.url URL of the resource you want to fetch
- * @param {boolean} [params.executeClientScripts] Enable execution of client scripts. When set to `true`, this property loads the page in a headless browser to load all assets and execute client scripts before returning its content
- * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
- * @param {object} [params.config] Fetcher configuration
- * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
- * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
- * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
- * @returns {Promise<{ mimeType: string, content: string | Buffer }>} Promise containing the fetched resource's MIME type and content
+ * @param {object} params Fetcher parameters
+ * @param {string} params.url URL of the resource you want to fetch
+ * @param {boolean} [params.executeClientScripts] Enable execution of client scripts. When set to `true`, this property loads the page in a headless browser to load all assets and execute client scripts before returning its content. If undefined, the engine will automatically balance performance and tracking success rate, defaulting to not executing scripts and escalating to headless browser if needed
+ * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
+ * @param {object} [params.config] Fetcher configuration
+ * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
+ * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
+ * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
+ * @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type
+ * @throws {FetchDocumentError} When the fetch operation fails
  * @async
  */
 export default async function fetch({
-  url, executeClientScripts, cssSelectors,
+  url,
+  executeClientScripts,
+  cssSelectors,
   config: {
     navigationTimeout = config.get('@opentermsarchive/engine.fetcher.navigationTimeout'),
     language = config.get('@opentermsarchive/engine.fetcher.language'),
     waitForElementsTimeout = config.get('@opentermsarchive/engine.fetcher.waitForElementsTimeout'),
   } = {},
 }) {
+  if (!url) {
+    throw new FetchDocumentError('URL is required');
+  }
+
+  const fetcherConfig = {
+    navigationTimeout,
+    language,
+    waitForElementsTimeout,
+    executeClientScripts,
+  };
+
   try {
     if (executeClientScripts) {
-      return await fetchFullDom(url, cssSelectors, { navigationTimeout, language, waitForElementsTimeout });
+      return await fetchWithFullDom(url, cssSelectors, fetcherConfig);
     }
 
-    return await fetchHtmlOnly(url, { navigationTimeout, language });
+    return await fetchWithFallback(url, cssSelectors, fetcherConfig);
   } catch (error) {
     throw new FetchDocumentError(error.message);
   }
 }
+
+async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
+  try {
+    return await fetchWithHtmlOnly(url, fetcherConfig);
+  } catch (error) {
+    const isBotBlockingError = LIKELY_BOT_BLOCKING_ERRORS.some(code => error.message.includes(code));
+
+    if (!isBotBlockingError || fetcherConfig.executeClientScripts === false) {
+      throw error;
+    }
+
+    return fetchWithFullDom(url, cssSelectors, fetcherConfig);
+  }
+}
+
+async function fetchWithFullDom(url, cssSelectors, fetcherConfig) {
+  return {
+    ...await fetchFullDom(url, cssSelectors, fetcherConfig),
+    fetcher: FETCHER_TYPES.FULL_DOM,
+  };
+}
+
+async function fetchWithHtmlOnly(url, fetcherConfig) {
+  return {
+    ...await fetchHtmlOnly(url, fetcherConfig),
+    fetcher: FETCHER_TYPES.HTML_ONLY,
+  };
+}
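A minimal usage sketch of the updated fetcher entry point, assuming an ESM module; the URL and selector are illustrative and the behaviour follows the JSDoc and fallback logic shown above:

    import fetch, { FETCHER_TYPES } from './index.js';

    // With executeClientScripts left undefined, the HTML-only fetcher is tried
    // first and the headless browser is used only when the failure looks like
    // bot blocking (HTTP code 403, 406, 502 or ECONNRESET).
    const { content, mimeType, fetcher } = await fetch({
      url: 'https://example.com/terms',
      cssSelectors: 'body',
    });

    // fetcher is FETCHER_TYPES.HTML_ONLY ('htmlOnly') or FETCHER_TYPES.FULL_DOM ('fullDom')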
@@ -7,7 +7,7 @@ import chai from 'chai';
 import chaiAsPromised from 'chai-as-promised';
 import iconv from 'iconv-lite';
 
-import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError } from './index.js';
+import fetch, { launchHeadlessBrowser, stopHeadlessBrowser, FetchDocumentError, FETCHER_TYPES } from './index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
@@ -31,6 +31,8 @@ describe('Fetcher', function () {
   let expectedPDFContent;
 
   before(done => {
+    let blockCount = 0;
+
     temporaryServer = http.createServer((request, response) => {
       if (request.url === '/') {
         response.writeHead(200, { 'Content-Type': 'text/html' }).write(termsHTML);
@@ -46,9 +48,19 @@ describe('Fetcher', function () {
       }
       if (request.url == '/terms.pdf') {
         expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
-
         response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
       }
+      if (request.url === '/block-once') {
+        if (blockCount === 0) {
+          blockCount++;
+          response.writeHead(403, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>Access Denied - Bot Detected</body></html>');
+        } else {
+          response.writeHead(200, { 'Content-Type': 'text/html' }).write(termsHTML);
+        }
+      }
+      if (request.url === '/always-block') {
+        response.writeHead(403, { 'Content-Type': 'text/html' }).write('<!DOCTYPE html><html><body>Access Denied - Bot Detected</body></html>');
+      }
 
       return response.end();
     }).listen(SERVER_PORT);
@@ -66,11 +78,12 @@ describe('Fetcher', function () {
   context('when html page is available', () => {
     let content;
     let mimeType;
+    let fetcher;
     const url = `http://127.0.0.1:${SERVER_PORT}`;
 
     context('when expected selectors are present', () => {
       before(async () => {
-        ({ content, mimeType } = await fetch({ url, cssSelectors: 'body' }));
+        ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: 'body' }));
       });
 
       it('returns the web page content of the given URL', () => {
@@ -81,9 +94,13 @@ describe('Fetcher', function () {
         expect(mimeType).to.equal('text/html');
       });
 
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
+
       context('with client script enabled', () => {
         before(async () => {
-          ({ content, mimeType } = await fetch({ url, cssSelectors: 'body', executeClientScripts: true }));
+          ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: 'body', executeClientScripts: true }));
         });
 
         it('returns the web page content of the given URL', () => {
@@ -93,6 +110,10 @@ describe('Fetcher', function () {
         it('returns the MIME type of the given URL', () => {
           expect(mimeType).to.equal('text/html');
         });
+
+        it('uses full DOM fetcher when client scripts are enabled', () => {
+          expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+        });
       });
     });
 
@@ -100,7 +121,7 @@ describe('Fetcher', function () {
       const NOT_PRESENT_SELECTOR = 'h2';
 
       before(async () => {
-        ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR }));
+        ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR }));
       });
 
       it('returns the web page content of the given URL', () => {
@@ -111,9 +132,13 @@ describe('Fetcher', function () {
         expect(mimeType).to.equal('text/html');
       });
 
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
+
       context('with client script enabled', () => {
         before(async () => {
-          ({ content, mimeType } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
+          ({ content, mimeType, fetcher } = await fetch({ url, cssSelectors: NOT_PRESENT_SELECTOR, executeClientScripts: true }));
        });
 
         it('returns the web page content of the given URL', () => {
@@ -123,32 +148,42 @@ describe('Fetcher', function () {
         it('returns the MIME type of the given URL', () => {
           expect(mimeType).to.equal('text/html');
         });
+
+        it('uses full DOM fetcher when client scripts are enabled', () => {
+          expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+        });
       });
     });
   });
 
   context('when html page is in different charset', () => {
     let content;
+    let fetcher;
     const url = `http://127.0.0.1:${SERVER_PORT}/other-charset`;
 
     context('when expected selectors are present', () => {
       before(async () => {
-        ({ content } = await fetch({ url, cssSelectors: 'body' }));
+        ({ content, fetcher } = await fetch({ url, cssSelectors: 'body' }));
       });
 
       it('returns the web page content of the given URL', () => {
         expect(content).to.equal(termsWithOtherCharsetHTML);
       });
+
+      it('uses HTML-only fetcher by default', () => {
+        expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+      });
     });
   });
 
   context('when url targets a PDF file', () => {
     let content;
     let mimeType;
+    let fetcher;
     const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
 
     before(async () => {
-      ({ content, mimeType } = await fetch({ url: pdfUrl }));
+      ({ content, mimeType, fetcher } = await fetch({ url: pdfUrl }));
     });
 
     it('returns a buffer for PDF content', () => {
@@ -162,6 +197,10 @@ describe('Fetcher', function () {
     it('returns a blob with the file content', () => {
       expect(content.equals(expectedPDFContent)).to.be.true;
     });
+
+    it('returns the fetcher used to fetch the PDF file', () => {
+      expect(fetcher).to.equal(FETCHER_TYPES.HTML_ONLY);
+    });
   });
 
   context('when server responds with empty content', () => {
@@ -245,6 +284,20 @@ describe('Fetcher', function () {
       });
     });
   });
+
+  describe('when bot blocking is detected', () => {
+    it('falls back to full DOM fetcher when bot blocking is detected', async () => {
+      const { content, mimeType, fetcher } = await fetch({ url: `http://127.0.0.1:${SERVER_PORT}/block-once` });
+
+      expect(content).to.equal(termsHTML);
+      expect(mimeType).to.equal('text/html');
+      expect(fetcher).to.equal(FETCHER_TYPES.FULL_DOM);
+    });
+
+    it('still throws FetchDocumentError if both fetchers fail', async () => {
+      await expect(fetch({ url: `http://127.0.0.1:${SERVER_PORT}/always-block` })).to.be.rejectedWith(FetchDocumentError);
+    });
+  });
   });
   });
 });
@@ -183,10 +183,11 @@ export default class Archivist extends events.EventEmitter {
     const { location: url, executeClientScripts, cssSelectors } = sourceDocument;
 
     try {
-      const { mimeType, content } = await this.fetch({ url, executeClientScripts, cssSelectors });
+      const { mimeType, content, fetcher } = await this.fetch({ url, executeClientScripts, cssSelectors });
 
       sourceDocument.content = content;
       sourceDocument.mimeType = mimeType;
+      sourceDocument.fetcher = fetcher;
     } catch (error) {
       if (!(error instanceof FetchDocumentError)) {
         throw error;
@@ -248,6 +249,7 @@ export default class Archivist extends events.EventEmitter {
       termsType: terms.type,
       fetchDate: terms.fetchDate,
       isExtractOnly: extractOnly,
+      metadata: { 'x-engine-version': process.env.npm_package_version },
     });
 
     await this.recorder.record(record);
@@ -272,6 +274,11 @@ export default class Archivist extends events.EventEmitter {
       fetchDate: terms.fetchDate,
       content: sourceDocument.content,
       mimeType: sourceDocument.mimeType,
+      metadata: {
+        'x-engine-version': process.env.npm_package_version,
+        'x-fetcher': sourceDocument.fetcher,
+        'x-source-document-location': sourceDocument.location,
+      },
     });
 
     await this.recorder.record(record);
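Following these two hunks, every record now carries a metadata object. As an illustration only (assuming engine version 5.3.1, a snapshot fetched with the headless browser, and an example URL), the objects built above would look like:

    // Version records
    { 'x-engine-version': '5.3.1' }

    // Snapshot records
    {
      'x-engine-version': '5.3.1',
      'x-fetcher': 'fullDom',
      'x-source-document-location': 'https://example.com/terms',
    }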
@@ -24,7 +24,7 @@ const MULTIPLE_SOURCE_DOCUMENTS_PREFIX = 'This version was recorded after extrac
 export const COMMIT_MESSAGE_PREFIXES_REGEXP = new RegExp(`^(${Object.values(COMMIT_MESSAGE_PREFIXES).join('|')})`);
 
 export function toPersistence(record, snapshotIdentiferTemplate) {
-  const { serviceId, termsType, documentId, isExtractOnly, snapshotIds = [], mimeType, isFirstRecord } = record;
+  const { serviceId, termsType, documentId, isExtractOnly, snapshotIds = [], mimeType, isFirstRecord, metadata } = record;
 
   let prefix = isExtractOnly ? COMMIT_MESSAGE_PREFIXES.extractOnly : COMMIT_MESSAGE_PREFIXES.update;
 
@@ -46,11 +46,12 @@ export function toPersistence(record, snapshotIdentiferTemplate) {
     message: `${subject}\n\n${documentIdMessage || ''}\n\n${snapshotIdsMessage || ''}`,
     content: record.content,
     filePath,
+    metadata,
   };
 }
 
 export function toDomain(commit) {
-  const { hash, date, message, body, diff } = commit;
+  const { hash, date, message, body, diff, trailers = {} } = commit;
 
   const modifiedFilesInCommit = diff.files.map(({ file }) => file);
 
@@ -68,17 +69,22 @@
     serviceId: path.dirname(relativeFilePath),
     termsType,
     documentId,
-    mimeType: mime.getType(relativeFilePath),
     fetchDate: new Date(date),
     isFirstRecord: message.startsWith(COMMIT_MESSAGE_PREFIXES.startTracking) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_startTracking),
-    isExtractOnly: message.startsWith(COMMIT_MESSAGE_PREFIXES.extractOnly) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter),
-    snapshotIds: snapshotIdsMatch || [],
+    metadata: { ...trailers },
   };
 
-  if (attributes.mimeType == mime.getType('markdown')) {
+  const mimeTypeValue = mime.getType(relativeFilePath);
+
+  if (mimeTypeValue == mime.getType('markdown')) {
+    attributes.isExtractOnly = message.startsWith(COMMIT_MESSAGE_PREFIXES.extractOnly) || message.startsWith(COMMIT_MESSAGE_PREFIXES.deprecated_refilter);
+    attributes.snapshotIds = snapshotIdsMatch;
+
     return new Version(attributes);
   }
 
+  attributes.mimeType = mimeTypeValue;
+
   return new Snapshot(attributes);
 }
 
@@ -3,6 +3,8 @@ import path from 'path';
 
 import simpleGit from 'simple-git';
 
+import { parseTrailers, formatTrailers } from './trailers.js';
+
 process.env.LC_ALL = 'en_GB'; // Ensure git messages will be in English as some errors are handled by analysing the message content
 
 const fs = fsApi.promises;
@@ -38,7 +40,7 @@ export default class Git {
     return this.git.add(this.relativePath(filePath));
   }
 
-  async commit({ filePath, message, date = new Date() }) {
+  async commit({ filePath, message, date = new Date(), trailers = {} }) {
     const commitDate = new Date(date).toISOString();
     let summary;
 
@@ -46,7 +48,10 @@
       process.env.GIT_AUTHOR_DATE = commitDate;
       process.env.GIT_COMMITTER_DATE = commitDate;
 
-      summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
+      const trailersSection = formatTrailers(trailers);
+      const finalMessage = trailersSection ? `${message}\n\n${trailersSection}` : message;
+
+      summary = await this.git.commit(finalMessage, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
     } finally {
       process.env.GIT_AUTHOR_DATE = '';
       process.env.GIT_COMMITTER_DATE = '';
@@ -70,14 +75,23 @@
   async getCommit(options) {
     const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options
 
+    if (commit) {
+      commit.trailers = parseTrailers(commit.body);
+    }
+
     return commit;
   }
 
   async log(options = []) {
     try {
       const logSummary = await this.git.log(options);
+      const commits = logSummary.all;
+
+      commits.forEach(commit => {
+        commit.trailers = parseTrailers(commit.body);
+      });
 
-      return logSummary.all;
+      return commits;
     } catch (error) {
       if (/unknown revision or path not in the working tree|does not have any commits yet/.test(error.message)) {
         return [];
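A sketch of what the updated commit() now builds, assuming illustrative metadata and an illustrative subject line; the trailers block is appended after a blank line and read back later through parseTrailers(commit.body):

    const message = 'Record snapshot of Service Terms of Service'; // illustrative subject
    const trailers = { 'x-engine-version': '5.3.1', 'x-fetcher': 'htmlOnly' };

    const trailersSection = formatTrailers(trailers);
    const finalMessage = trailersSection ? `${message}\n\n${trailersSection}` : message;
    // finalMessage:
    //   Record snapshot of Service Terms of Service
    //
    //   X-engine-version: 5.3.1
    //   X-fetcher: htmlOnly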
@@ -41,12 +41,12 @@ export default class GitRepository extends RepositoryInterface {
       record.isFirstRecord = !await this.#isTracked(serviceId, termsType, documentId);
     }
 
-    const { message, content, filePath: relativeFilePath } = await this.#toPersistence(record);
+    const { message, content, filePath: relativeFilePath, metadata } = await this.#toPersistence(record);
 
     const filePath = path.join(this.path, relativeFilePath);
 
     await GitRepository.writeFile({ filePath, content });
-    const sha = await this.#commit({ filePath, message, date: fetchDate });
+    const sha = await this.#commit({ filePath, message, date: fetchDate, trailers: metadata });
 
     if (!sha) {
       return Object(null);
@@ -153,11 +153,11 @@ export default class GitRepository extends RepositoryInterface {
     return filePath;
   }
 
-  async #commit({ filePath, message, date }) {
+  async #commit({ filePath, message, date, trailers }) {
     try {
       await this.git.add(filePath);
 
-      return await this.git.commit({ filePath, message, date });
+      return await this.git.commit({ filePath, message, date, trailers });
     } catch (error) {
       throw new Error(`Could not commit ${filePath} with message "${message}" due to error: "${error}"`);
     }
@@ -41,6 +41,11 @@ const HTML_MIME_TYPE = mime.getType('html');
 const PDF_MIME_TYPE = mime.getType('pdf');
 const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/terms.pdf'), { encoding: 'utf8' });
 
+const METADATA = {
+  fetcher: 'test-fetcher',
+  'engine-version': '5.0.0',
+};
+
 describe('GitRepository', () => {
   let git;
   let subject;
@@ -314,6 +319,26 @@ describe('GitRepository', () => {
        expect(commit.message).to.include(TERMS_TYPE);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        ({ id, isFirstRecord } = await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        })));
+
+        ([commit] = await git.log());
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(commit.trailers).to.deep.equal(METADATA);
+      });
+    });
   });
 
   describe('#findById', () => {
@@ -328,6 +353,7 @@ describe('GitRepository', () => {
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -367,6 +393,10 @@ describe('GitRepository', () => {
      expect(record.snapshotIds).to.deep.equal([SNAPSHOT_ID]);
    });
 
+    it('returns metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -435,6 +465,28 @@ describe('GitRepository', () => {
        expect(recordFound).to.equal(null);
      });
    });
+
+    context('when metadata is provided', () => {
+      let record;
+
+      before(async () => {
+        await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        }));
+
+        record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+      });
+
+      after(() => subject.removeAll());
+
+      it('retrieves metadata', () => {
+        expect(record.metadata).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findAll', () => {
@@ -557,6 +609,7 @@ describe('GitRepository', () => {
        content: UPDATED_FILE_CONTENT,
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
+        metadata: METADATA,
      })));
 
      latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
@@ -575,6 +628,10 @@ describe('GitRepository', () => {
    it('returns the latest record content', () => {
      expect(latestRecord.content.toString('utf8')).to.equal(UPDATED_FILE_CONTENT);
    });
+
+    it('returns metadata', () => {
+      expect(latestRecord.metadata).to.deep.equal(METADATA);
+    });
  });
 });
 
@@ -901,6 +958,28 @@ describe('GitRepository', () => {
        expect(mime.getType(EXPECTED_PDF_SNAPSHOT_FILE_PATH)).to.equal(PDF_MIME_TYPE);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        ({ id, isFirstRecord } = await subject.save(new Snapshot({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          documentId: DOCUMENT_ID,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          mimeType: HTML_MIME_TYPE,
+          metadata: METADATA,
+        })));
+
+        ([commit] = await git.log());
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(commit.trailers).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findById', () => {
@@ -915,6 +994,7 @@ describe('GitRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -958,6 +1038,10 @@ describe('GitRepository', () => {
      expect(record.documentId).to.equal(DOCUMENT_ID);
    });
 
+    it('returns metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -1086,6 +1170,7 @@ describe('GitRepository', () => {
        content: UPDATED_FILE_CONTENT,
        mimeType: HTML_MIME_TYPE,
        fetchDate: FETCH_DATE,
+        metadata: METADATA,
      })));
 
      latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
@@ -1108,6 +1193,10 @@ describe('GitRepository', () => {
    it('returns the latest record mime type', () => {
      expect(latestRecord.mimeType).to.equal(HTML_MIME_TYPE);
    });
+
+    it('returns metadata', () => {
+      expect(latestRecord.metadata).to.deep.equal(METADATA);
+    });
  });
 
  context('with PDF document', () => {
@@ -1205,6 +1294,29 @@ describe('GitRepository', () => {
      expect(fetchDates).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Snapshot({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
 });
 
 context('backwards compatibility with deprecated commit messages', () => {
@@ -0,0 +1,48 @@
+export function parseTrailers(message) {
+  const trailers = {};
+
+  const sections = message.split(/\n\n+/);
+  const trailersSection = sections[sections.length - 1];
+
+  if (!trailersSection.includes(':')) {
+    return trailers;
+  }
+
+  const validTrailerKeyRegex = /^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*:$/; // Accepts either a single word or multiple words separated by dashes
+
+  for (const line of trailersSection.split('\n')) {
+    const trimmedLine = line.trim();
+
+    if (!trimmedLine) { // Skip empty lines
+      continue;
+    }
+
+    const colonIndex = trimmedLine.indexOf(':');
+
+    if (colonIndex === -1) { // Skip lines without a colon
+      continue;
+    }
+
+    const key = trimmedLine.slice(0, colonIndex + 1);
+    const value = trimmedLine.slice(colonIndex + 1).trim();
+
+    if (validTrailerKeyRegex.test(key) && value) {
+      const keyWithoutColon = key.slice(0, -1);
+
+      trailers[keyWithoutColon.toLowerCase()] = value;
+    }
+  }
+
+  return trailers;
+}
+
+export function formatTrailers(trailers) {
+  if (Object.keys(trailers).length === 0) {
+    return '';
+  }
+
+  return Object.entries(trailers)
+    .filter(([ , value ]) => value !== '')
+    .map(([ key, value ]) => `${key[0].toUpperCase() + key.slice(1).toLowerCase()}: ${value}`)
+    .join('\n');
+}
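A short usage sketch of the two helpers in this new file (the commit message text is illustrative):

    import { parseTrailers, formatTrailers } from './trailers.js';

    formatTrailers({ 'x-fetcher': 'htmlOnly', 'x-engine-version': '5.3.1' });
    // => 'X-fetcher: htmlOnly\nX-engine-version: 5.3.1'

    parseTrailers('Record new version\n\nX-fetcher: htmlOnly\nX-engine-version: 5.3.1');
    // => { 'x-fetcher': 'htmlOnly', 'x-engine-version': '5.3.1' }
    // Keys are lowercased on parsing and only the last paragraph of the message is scanned.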
@@ -0,0 +1,158 @@
+import { expect } from 'chai';
+
+import { parseTrailers, formatTrailers } from './trailers.js';
+
+describe('trailers', () => {
+  describe('#parseTrailers', () => {
+    it('returns empty object for message without trailers', () => {
+      const message = 'A simple commit message\n\nWith a body';
+
+      expect(parseTrailers(message)).to.deep.equal({});
+    });
+
+    it('returns empty object when last section has no colon', () => {
+      const message = 'A commit message\n\nWith a body\n\nNo trailers here';
+
+      expect(parseTrailers(message)).to.deep.equal({});
+    });
+
+    it('parses single word trailer key', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('parses multi-word trailer key with dashes', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('parses multiple trailers with different key formats', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      });
+    });
+
+    it('handles case-insensitive keys', () => {
+      const message = 'A commit message\n\nWith a body\n\nFETCHER: my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      });
+    });
+
+    it('handles trailers with colons in values', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my:fetcher:with:colons';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my:fetcher:with:colons' });
+    });
+
+    it('ignores malformed trailer lines', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\nInvalid line\nReviewer: john-doe';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        fetcher: 'my-fetcher',
+        reviewer: 'john-doe',
+      });
+    });
+
+    it('ignores trailer keys with spaces', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature Request: my-feature\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('ignores trailer keys with spaces before colon', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher : my-fetcher\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('ignores trailer keys ending with dash', () => {
+      const message = 'A commit message\n\nWith a body\n\nFeature-: my-feature\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+
+    it('only keeps trailers from the last section', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher: my-fetcher\n\nFeature-Request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('ignores trailers with empty values', () => {
+      const message = 'A commit message\n\nWith a body\n\nFetcher:\nFeature-request: my-feature';
+
+      expect(parseTrailers(message)).to.deep.equal({ 'feature-request': 'my-feature' });
+    });
+
+    it('handles keys with numbers', () => {
+      const message = 'A commit message\n\nWith a body\n\nIssue-123: my-issue\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({
+        'issue-123': 'my-issue',
+        fetcher: 'my-fetcher',
+      });
+    });
+
+    it('handles multiple consecutive empty lines in message', () => {
+      const message = 'A commit message\n\n\n\nWith a body\n\nFetcher: my-fetcher';
+
+      expect(parseTrailers(message)).to.deep.equal({ fetcher: 'my-fetcher' });
+    });
+  });
+
+  describe('#formatTrailers', () => {
+    it('returns empty string when no trailers', () => {
+      expect(formatTrailers({})).to.equal('');
+    });
+
+    it('formats single word trailer key', () => {
+      expect(formatTrailers({ fetcher: 'my-fetcher' })).to.equal('Fetcher: my-fetcher');
+    });
+
+    it('formats multi-word trailer key with dashes', () => {
+      expect(formatTrailers({ 'feature-request': 'my-feature' })).to.equal('Feature-request: my-feature');
+    });
+
+    it('formats multiple trailers with different key formats', () => {
+      expect(formatTrailers({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('capitalizes trailer keys', () => {
+      expect(formatTrailers({
+        fetcher: 'my-fetcher',
+        'feature-request': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('handles case-insensitive keys', () => {
+      expect(formatTrailers({
+        FETCHER: 'my-fetcher',
+        'FEATURE-REQUEST': 'my-feature',
+      })).to.equal('Fetcher: my-fetcher\nFeature-request: my-feature');
+    });
+
+    it('skips empty string values', () => {
+      expect(formatTrailers({
+        fetcher: '',
+        'feature-request': 'my-feature',
+      })).to.equal('Feature-request: my-feature');
+    });
+
+    it('handles keys with numbers', () => {
+      expect(formatTrailers({
+        'issue-123': 'my-issue',
+        fetcher: 'my-fetcher',
+      })).to.equal('Issue-123: my-issue\nFetcher: my-fetcher');
+    });
+  });
+});
@@ -17,7 +17,7 @@ export function toPersistence(record) {
 }
 
 export function toDomain(mongoDocument) {
-  const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isExtractOnly, isRefilter, isFirstRecord, snapshotIds } = mongoDocument;
+  const { _id, serviceId, termsType, documentId, fetchDate, mimeType, isExtractOnly, isRefilter, isFirstRecord, snapshotIds, metadata } = mongoDocument;
 
   const attributes = {
     id: _id.toString(),
@@ -29,6 +29,7 @@ export function toDomain(mongoDocument) {
     isFirstRecord: Boolean(isFirstRecord),
     isExtractOnly: Boolean(isExtractOnly) || Boolean(isRefilter),
     snapshotIds: snapshotIds?.map(snapshotId => snapshotId.toString()) || [],
+    metadata,
   };
 
   if (snapshotIds) {
@@ -90,7 +90,7 @@ export default class MongoRepository extends RepositoryInterface {
   }
 
   count() {
-    return this.collection.find().count();
+    return this.collection.countDocuments();
   }
 
   async* iterate() {
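For context, cursor.count() is deprecated in the MongoDB Node.js driver; collection.countDocuments() is the supported replacement and accepts an optional filter. A brief sketch (the serviceId value is illustrative):

    // Before: this.collection.find(query).count()
    // After:
    const total = await this.collection.countDocuments();
    const perService = await this.collection.countDocuments({ serviceId: 'Service' });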
@@ -34,6 +34,11 @@ const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test
 const UPDATED_PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/termsModified.pdf'));
 const PDF_MIME_TYPE = mime.getType('pdf');
 
+const METADATA = {
+  fetcher: 'test-fetcher',
+  'engine-version': '5.0.0',
+};
+
 let collection;
 
 describe('MongoRepository', () => {
@@ -57,10 +62,10 @@ describe('MongoRepository', () => {
 
    context('when it is the first record', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -68,12 +73,13 @@ describe('MongoRepository', () => {
          content: CONTENT,
          fetchDate: FETCH_DATE,
          snapshotIds: [SNAPSHOT_ID],
+          metadata: METADATA,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -132,10 +138,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -145,10 +151,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -181,10 +187,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        }));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -194,10 +200,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
      });
 
      after(() => subject.removeAll());
@@ -223,10 +229,10 @@ describe('MongoRepository', () => {
          snapshotIds: [SNAPSHOT_ID],
        })); // An extracted only version cannot be the first record
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Version({
          serviceId: SERVICE_PROVIDER_ID,
@@ -237,10 +243,10 @@ describe('MongoRepository', () => {
          isExtractOnly: true,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -356,6 +362,29 @@ describe('MongoRepository', () => {
        expect(mongoDocument.documentId).to.equal(DOCUMENT_ID);
      });
    });
+
+    context('when metadata is provided', () => {
+      before(async () => {
+        await subject.save(new Version({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+          content: CONTENT,
+          fetchDate: FETCH_DATE,
+          metadata: METADATA,
+        }));
+
+        (mongoDocument = await collection.findOne({
+          serviceId: SERVICE_PROVIDER_ID,
+          termsType: TERMS_TYPE,
+        }));
+      });
+
+      after(() => subject.removeAll());
+
+      it('stores metadata as commit trailers', () => {
+        expect(mongoDocument.metadata).to.deep.equal(METADATA);
+      });
+    });
  });
 
  describe('#findById', () => {
@@ -369,6 +398,7 @@ describe('MongoRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        snapshotIds: [SNAPSHOT_ID],
+        metadata: METADATA,
      })));
 
      (record = await subject.findById(id));
@@ -408,6 +438,10 @@ describe('MongoRepository', () => {
      expect(record.snapshotIds).to.deep.equal([SNAPSHOT_ID]);
    });
 
+    it('returns the metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -504,6 +538,28 @@ describe('MongoRepository', () => {
      });
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Version({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findByDate(SERVICE_PROVIDER_ID, TERMS_TYPE, FETCH_DATE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
  });
 
  describe('#findAll', () => {
@@ -695,6 +751,28 @@ describe('MongoRepository', () => {
      expect(latestRecord).to.equal(null);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Version({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
  });
 
  describe('#iterate', () => {
@@ -770,10 +848,10 @@ describe('MongoRepository', () => {
 
    context('when it is the first record', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -784,10 +862,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -850,10 +928,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -863,10 +941,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        ([mongoDocument] = await collection.find({
          serviceId: SERVICE_PROVIDER_ID,
@@ -899,10 +977,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        }));
 
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -912,10 +990,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE_LATER,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
      });
 
      after(() => subject.removeAll());
@@ -931,12 +1009,12 @@ describe('MongoRepository', () => {
 
    context('with PDF document', () => {
      before(async () => {
-        numberOfRecordsBefore = await collection.find({
+        numberOfRecordsBefore = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
          content: PDF_CONTENT,
          mimeType: PDF_MIME_TYPE,
-        }).count();
+        });
 
        (record = await subject.save(new Snapshot({
          serviceId: SERVICE_PROVIDER_ID,
@@ -946,10 +1024,10 @@ describe('MongoRepository', () => {
          fetchDate: FETCH_DATE,
        })));
 
-        numberOfRecordsAfter = await collection.find({
+        numberOfRecordsAfter = await collection.countDocuments({
          serviceId: SERVICE_PROVIDER_ID,
          termsType: TERMS_TYPE,
-        }).count();
+        });
 
        (mongoDocument = await collection.findOne({
          serviceId: SERVICE_PROVIDER_ID,
@@ -991,9 +1069,10 @@ describe('MongoRepository', () => {
        content: CONTENT,
        fetchDate: FETCH_DATE,
        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
      })));
 
-      (record = await subject.findById(id));
+      record = await subject.findById(id);
    });
 
    after(() => subject.removeAll());
@@ -1034,6 +1113,10 @@ describe('MongoRepository', () => {
      expect(record.documentId).to.equal(DOCUMENT_ID);
    });
 
+    it('returns the metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+
    context('when requested record does not exist', () => {
      it('returns null', async () => {
        expect(await subject.findById('inexistantID')).to.equal(null);
@@ -1272,6 +1355,29 @@ describe('MongoRepository', () => {
      expect(latestRecord).to.equal(null);
    });
  });
+
+  context('when metadata is provided', () => {
+    let record;
+
+    before(async () => {
+      await subject.save(new Snapshot({
+        serviceId: SERVICE_PROVIDER_ID,
+        termsType: TERMS_TYPE,
+        content: CONTENT,
+        fetchDate: FETCH_DATE,
+        mimeType: HTML_MIME_TYPE,
+        metadata: METADATA,
+      }));
+
+      record = await subject.findLatest(SERVICE_PROVIDER_ID, TERMS_TYPE);
+    });
+
+    after(() => subject.removeAll());
+
+    it('retrieves metadata', () => {
+      expect(record.metadata).to.deep.equal(METADATA);
+    });
+  });
 });
 
 describe('#iterate', () => {