@opentermsarchive/engine 0.26.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/bin/ota-track.js +3 -3
  2. package/bin/ota-validate.js +2 -2
  3. package/bin/ota.js +1 -1
  4. package/config/default.json +1 -1
  5. package/package.json +3 -4
  6. package/scripts/dataset/export/index.js +4 -4
  7. package/scripts/dataset/export/index.test.js +11 -17
  8. package/scripts/declarations/lint/index.mocha.js +1 -1
  9. package/scripts/declarations/utils/index.js +12 -12
  10. package/scripts/declarations/validate/definitions.js +1 -1
  11. package/scripts/declarations/validate/index.mocha.js +30 -34
  12. package/scripts/declarations/validate/service.history.schema.js +11 -11
  13. package/scripts/declarations/validate/service.schema.js +13 -13
  14. package/scripts/history/migrate-services.js +4 -4
  15. package/scripts/history/update-to-full-hash.js +2 -2
  16. package/scripts/import/index.js +14 -14
  17. package/scripts/rewrite/rewrite-snapshots.js +3 -3
  18. package/scripts/rewrite/rewrite-versions.js +14 -14
  19. package/scripts/utils/renamer/README.md +3 -3
  20. package/scripts/utils/renamer/index.js +13 -13
  21. package/src/archivist/errors.js +1 -1
  22. package/src/archivist/extract/exports.js +3 -0
  23. package/src/archivist/{filter → extract}/index.js +23 -27
  24. package/src/archivist/extract/index.test.js +516 -0
  25. package/src/archivist/index.js +101 -140
  26. package/src/archivist/index.test.js +178 -166
  27. package/src/archivist/recorder/index.js +11 -55
  28. package/src/archivist/recorder/index.test.js +310 -356
  29. package/src/archivist/recorder/record.js +18 -7
  30. package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
  31. package/src/archivist/recorder/repositories/git/index.js +11 -15
  32. package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
  33. package/src/archivist/recorder/repositories/interface.js +8 -6
  34. package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
  35. package/src/archivist/recorder/repositories/mongo/index.js +8 -8
  36. package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
  37. package/src/archivist/recorder/snapshot.js +5 -0
  38. package/src/archivist/recorder/snapshot.test.js +65 -0
  39. package/src/archivist/recorder/version.js +14 -0
  40. package/src/archivist/recorder/version.test.js +65 -0
  41. package/src/archivist/services/index.js +60 -51
  42. package/src/archivist/services/index.test.js +63 -83
  43. package/src/archivist/services/service.js +26 -22
  44. package/src/archivist/services/service.test.js +46 -68
  45. package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
  46. package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
  47. package/src/archivist/services/terms.js +26 -0
  48. package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
  49. package/src/exports.js +2 -2
  50. package/src/index.js +16 -13
  51. package/src/logger/index.js +35 -36
  52. package/src/notifier/index.js +8 -8
  53. package/src/tracker/index.js +6 -6
  54. package/src/archivist/filter/exports.js +0 -3
  55. package/src/archivist/filter/index.test.js +0 -564
  56. package/src/archivist/recorder/record.test.js +0 -91
  57. package/src/archivist/services/documentDeclaration.js +0 -26
  58. /package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
  59. /package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
@@ -18,7 +18,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
18
18
  const ROOT_PATH = path.resolve(__dirname, '../../');
19
19
  const MAX_PARALLEL = 10;
20
20
  const MAX_RETRY = 5;
21
- const PDF_MIME_TYPE = 'application/pdf';
21
+ const PDF_MIME_TYPE = mime.getType('pdf');
22
22
  const COUNTERS = {
23
23
  imported: 0,
24
24
  skippedNoChanges: 0,
@@ -87,10 +87,10 @@ function queueErrorHandler(error, { commit }) {
87
87
 
88
88
  const serviceId = path.dirname(relativeFilePath);
89
89
  const extension = path.extname(relativeFilePath);
90
- const documentType = path.basename(relativeFilePath, extension);
90
+ const termsType = path.basename(relativeFilePath, extension);
91
91
 
92
92
  commitsNotImported.push(commit.hash);
93
- logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: documentType, sha: commit.hash });
93
+ logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: termsType, sha: commit.hash });
94
94
  COUNTERS.errors++;
95
95
  }
96
96
 
@@ -117,9 +117,9 @@ function queueDrainHandler(totalToTreat) {
117
117
  };
118
118
  }
119
119
 
120
- async function getCommitContent({ sha, serviceId, documentType, extension }) {
120
+ async function getCommitContent({ sha, serviceId, termsType, extension }) {
121
121
  const start = performance.now();
122
- const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(documentType)}.${extension}`;
122
+ const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(termsType)}.${extension}`;
123
123
  const response = await nodeFetch(url);
124
124
  const end = performance.now();
125
125
 
@@ -141,7 +141,7 @@ async function getCommitContent({ sha, serviceId, documentType, extension }) {
141
141
  throw new TooManyRequestsError(`Cannot get commit content on Github ${url}. 429: Too Many Requests`);
142
142
  }
143
143
 
144
- logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType, sha });
144
+ logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType, sha });
145
145
 
146
146
  return content;
147
147
  }
@@ -151,12 +151,12 @@ async function handleCommit(commit, index, total) {
151
151
 
152
152
  let serviceId = path.dirname(relativeFilePath);
153
153
  const extension = path.extname(relativeFilePath);
154
- let documentType = path.basename(relativeFilePath, extension);
154
+ let termsType = path.basename(relativeFilePath, extension);
155
155
 
156
156
  logger.info({
157
157
  message: 'Start to handle commit',
158
158
  serviceId,
159
- type: documentType,
159
+ type: termsType,
160
160
  sha: commit.hash,
161
161
  current: index + 1,
162
162
  total,
@@ -168,7 +168,7 @@ async function handleCommit(commit, index, total) {
168
168
  logger.info({
169
169
  message: 'Skipped commit as an entry already exists for this commit',
170
170
  serviceId,
171
- type: documentType,
171
+ type: termsType,
172
172
  sha: commit.hash,
173
173
  });
174
174
  COUNTERS.skippedNoChanges++;
@@ -176,9 +176,9 @@ async function handleCommit(commit, index, total) {
176
176
  return;
177
177
  }
178
178
 
179
- let content = await getCommitContent({ sha: commit.hash, serviceId, documentType, extension: extension.replace('.', '') });
179
+ let content = await getCommitContent({ sha: commit.hash, serviceId, termsType, extension: extension.replace('.', '') });
180
180
 
181
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
181
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
182
182
 
183
183
  const mimeType = mime.getType(extension);
184
184
 
@@ -198,7 +198,7 @@ async function handleCommit(commit, index, total) {
198
198
 
199
199
  await snapshotsCollection.insertOne({
200
200
  serviceId,
201
- documentType,
201
+ termsType,
202
202
  content,
203
203
  mimeType,
204
204
  fetchDate: commit.date,
@@ -207,10 +207,10 @@ async function handleCommit(commit, index, total) {
207
207
  });
208
208
  const end = performance.now();
209
209
 
210
- logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType });
210
+ logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType });
211
211
  COUNTERS.imported++;
212
212
  } catch (error) {
213
- logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: documentType });
213
+ logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: termsType });
214
214
  commitsNotImported.push(commit.hash);
215
215
  COUNTERS.errors++;
216
216
  }
@@ -76,13 +76,13 @@ let recorder;
76
76
  const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
77
77
 
78
78
  let serviceId = path.dirname(relativeFilePath);
79
- let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
79
+ let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
80
80
 
81
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
81
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
82
82
 
83
83
  const { id: snapshotId } = await recorder.recordSnapshot({
84
84
  serviceId,
85
- documentType,
85
+ termsType,
86
86
  content,
87
87
  mimeType,
88
88
  fetchDate: commit.date,
@@ -4,7 +4,7 @@ import { fileURLToPath } from 'url';
4
4
  import config from 'config';
5
5
 
6
6
  import { InaccessibleContentError } from '../../src/archivist/errors.js';
7
- import filter from '../../src/archivist/filter/index.js';
7
+ import extract from '../../src/archivist/extract/index.js';
8
8
  import Recorder from '../../src/archivist/recorder/index.js';
9
9
  import Git from '../../src/archivist/recorder/repositories/git/git.js';
10
10
  import GitRepository from '../../src/archivist/recorder/repositories/git/index.js';
@@ -86,41 +86,41 @@ let recorder;
86
86
  const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
87
87
 
88
88
  let serviceId = path.dirname(relativeFilePath);
89
- let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
89
+ let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
90
90
 
91
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
91
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
92
92
 
93
93
  if (!servicesDeclarations[serviceId]) {
94
94
  console.log(`⌙ Skip unknown service "${serviceId}"`);
95
95
  continue;
96
96
  }
97
97
 
98
- const documentDeclaration = servicesDeclarations[serviceId].getDocumentDeclaration(
99
- documentType,
98
+ const terms = servicesDeclarations[serviceId].getTerms(
99
+ termsType,
100
100
  commit.date,
101
101
  );
102
102
 
103
- if (!documentDeclaration) {
104
- console.log(`⌙ Skip unknown terms type "${documentType}" for service "${serviceId}"`);
103
+ if (!terms) {
104
+ console.log(`⌙ Skip unknown terms type "${termsType}" for service "${serviceId}"`);
105
105
  continue;
106
106
  }
107
107
 
108
- if (documentDeclaration.validUntil) {
109
- console.log(`⌙ Use declaration valid until ${documentDeclaration.validUntil}`);
108
+ if (terms.validUntil) {
109
+ console.log(`⌙ Use declaration valid until ${terms.validUntil}`);
110
110
  }
111
111
 
112
112
  try {
113
- const document = await filter({
113
+ const versionContent = await extract({
114
114
  content,
115
115
  mimeType,
116
- documentDeclaration,
116
+ terms,
117
117
  });
118
118
 
119
119
  const { id: versionId } = await recorder.recordVersion({
120
120
  serviceId,
121
- documentType,
122
- content: document,
123
- mimeType: MARKDOWN_MIME_TYPE, // The result of the `filter` function is always in markdown format
121
+ termsType,
122
+ content: versionContent,
123
+ mimeType: MARKDOWN_MIME_TYPE, // The result of the `extract` function is always in markdown format
124
124
  fetchDate: commit.date,
125
125
  snapshotId: commit.hash,
126
126
  });
@@ -8,7 +8,7 @@ You can use it in your other scripts like this:
8
8
 
9
9
  ```
10
10
  await renamer.loadRules();
11
- const { serviceId: renamedServiceId, documentType: renamedDocumentType } = renamer.applyRules(serviceId, documentType);
11
+ const { serviceId: renamedServiceId, termsType: renamedTermsType } = renamer.applyRules(serviceId, termsType);
12
12
  ```
13
13
 
14
14
  ## Adding renaming rules
@@ -26,7 +26,7 @@ To rename a service, add a rule in `./rules/services.json`, for example, to rena
26
26
 
27
27
  ### Terms type
28
28
 
29
- To rename a terms type, add a rule in `./rules/documentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
29
+ To rename a terms type, add a rule in `./rules/termsTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
30
30
 
31
31
  ```json
32
32
  {
@@ -37,7 +37,7 @@ To rename a terms type, add a rule in `./rules/documentTypes.json`, for example,
37
37
 
38
38
  ### Terms type for a specific service
39
39
 
40
- To rename a terms type only for a specific service, add a rule in `./rules/servicesDocumentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
40
+ To rename a terms type only for a specific service, add a rule in `./rules/termsTypesByService.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
41
41
 
42
42
  ```json
43
43
  {
@@ -10,12 +10,12 @@ let renamingRules;
10
10
  export async function loadRules() {
11
11
  renamingRules = {
12
12
  serviceNames: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/serviceNames.json'))),
13
- documentTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypes.json'))),
14
- documentTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypesByService.json'))),
13
+ termsTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypes.json'))),
14
+ termsTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypesByService.json'))),
15
15
  };
16
16
  }
17
17
 
18
- export function applyRules(serviceId, documentType) {
18
+ export function applyRules(serviceId, termsType) {
19
19
  const renamedServiceId = renamingRules.serviceNames[serviceId];
20
20
 
21
21
  if (renamedServiceId) {
@@ -23,23 +23,23 @@ export function applyRules(serviceId, documentType) {
23
23
  serviceId = renamedServiceId;
24
24
  }
25
25
 
26
- const renamedDocumentType = renamingRules.documentTypes[documentType];
26
+ const renamedTermsType = renamingRules.termsTypes[termsType];
27
27
 
28
- if (renamedDocumentType) {
29
- console.log(`⌙ Rename terms type "${documentType}" to "${renamedDocumentType}" of "${serviceId}" service`);
30
- documentType = renamedDocumentType;
28
+ if (renamedTermsType) {
29
+ console.log(`⌙ Rename terms type "${termsType}" to "${renamedTermsType}" of "${serviceId}" service`);
30
+ termsType = renamedTermsType;
31
31
  }
32
32
 
33
- const renamedServiceDocumentType = renamingRules.documentTypesByService[serviceId]
34
- && renamingRules.documentTypesByService[serviceId][documentType];
33
+ const renamedServiceTermsType = renamingRules.termsTypesByService[serviceId]
34
+ && renamingRules.termsTypesByService[serviceId][termsType];
35
35
 
36
- if (renamedServiceDocumentType) {
37
- console.log(`⌙ Specific rename terms type "${documentType}" to "${renamedServiceDocumentType}" of "${serviceId}" service`);
38
- documentType = renamedServiceDocumentType;
36
+ if (renamedServiceTermsType) {
37
+ console.log(`⌙ Specific rename terms type "${termsType}" to "${renamedServiceTermsType}" of "${serviceId}" service`);
38
+ termsType = renamedServiceTermsType;
39
39
  }
40
40
 
41
41
  return {
42
42
  serviceId,
43
- documentType,
43
+ termsType,
44
44
  };
45
45
  }
@@ -3,7 +3,7 @@ export class InaccessibleContentError extends Error {
3
3
  if (Array.isArray(message)) {
4
4
  message = `\n - ${message.join('\n - ')}`;
5
5
  }
6
- super(`The document cannot be accessed or its content can not be selected:${message}`);
6
+ super(`The documents cannot be accessed or their contents can not be selected:${message}`);
7
7
  this.name = 'InaccessibleContentError';
8
8
  }
9
9
  }
@@ -0,0 +1,3 @@
1
+ import extract from './index.js';
2
+
3
+ export default extract;
@@ -5,6 +5,7 @@ import mardownPdf from '@accordproject/markdown-pdf';
5
5
  import TurndownService from '@opentermsarchive/turndown';
6
6
  import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
7
7
  import jsdom from 'jsdom';
8
+ import mime from 'mime';
8
9
 
9
10
  import { InaccessibleContentError } from '../errors.js';
10
11
 
@@ -21,32 +22,27 @@ const { CiceroMarkTransformer } = ciceroMark;
21
22
  const ciceroMarkTransformer = new CiceroMarkTransformer();
22
23
 
23
24
  /**
24
- * Filter document content and convert it to Markdown
25
+ * Extract content from source document and convert it to Markdown
25
26
  *
26
- * @param {Object} params - Filter parameters
27
- * @param {string|Buffer} params.content - Content to filter: a buffer containing PDF data in case mimetype associated is PDF or a DOM dump of an HTML page given as a string
28
- * @param {string} params.mimeType - MIME type of the given content
29
- * @param {string} params.pageDeclaration - see {@link ./src/archivist/services/pageDeclaration.js}
30
- * @returns {Promise<string>} Promise which is fulfilled once the content is filtered and converted in Markdown. The promise will resolve into a string containing the filtered content in Markdown format
27
+ * @param {string} sourceDocument - Source document from which to extract content, see {@link ./src/archivist/services/sourceDocument.js}
28
+ * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
31
29
  */
32
- export default async function filter({ content, mimeType, pageDeclaration }) {
33
- if (mimeType == 'application/pdf') {
34
- return filterPDF({ content });
30
+ export default async function extract(sourceDocument) {
31
+ if (sourceDocument.mimeType == mime.getType('pdf')) {
32
+ return extractFromPDF(sourceDocument);
35
33
  }
36
34
 
37
- return filterHTML({
38
- content,
39
- pageDeclaration,
40
- });
35
+ return extractFromHTML(sourceDocument);
41
36
  }
42
37
 
43
- export async function filterHTML({ content, pageDeclaration }) {
38
+ export async function extractFromHTML(sourceDocument) {
44
39
  const {
45
40
  location,
46
41
  contentSelectors = [],
47
- noiseSelectors = [],
42
+ insignificantContentSelectors = [],
48
43
  filters: serviceSpecificFilters = [],
49
- } = pageDeclaration;
44
+ content,
45
+ } = sourceDocument;
50
46
 
51
47
  const jsdomInstance = new JSDOM(content, {
52
48
  url: location,
@@ -61,7 +57,7 @@ export async function filterHTML({ content, pageDeclaration }) {
61
57
  await filterFunction(webPageDOM, {
62
58
  fetch: location,
63
59
  select: contentSelectors,
64
- remove: noiseSelectors,
60
+ remove: insignificantContentSelectors,
65
61
  filter: serviceSpecificFilters.map(filter => filter.name),
66
62
  });
67
63
  /* eslint-enable no-await-in-loop */
@@ -70,7 +66,7 @@ export async function filterHTML({ content, pageDeclaration }) {
70
66
  }
71
67
  }
72
68
 
73
- remove(webPageDOM, noiseSelectors); // remove function works in place
69
+ remove(webPageDOM, insignificantContentSelectors); // remove function works in place
74
70
 
75
71
  const domFragment = select(webPageDOM, contentSelectors);
76
72
 
@@ -101,7 +97,7 @@ export async function filterHTML({ content, pageDeclaration }) {
101
97
  return markdownContent;
102
98
  }
103
99
 
104
- export async function filterPDF({ content: pdfBuffer }) {
100
+ export async function extractFromPDF({ content: pdfBuffer }) {
105
101
  try {
106
102
  const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
107
103
 
@@ -115,12 +111,12 @@ export async function filterPDF({ content: pdfBuffer }) {
115
111
  }
116
112
  }
117
113
 
118
- function selectRange(document, rangeSelector) {
114
+ function selectRange(webPageDOM, rangeSelector) {
119
115
  const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;
120
116
 
121
- const selection = document.createRange();
122
- const startNode = document.querySelector(startBefore || startAfter);
123
- const endNode = document.querySelector(endBefore || endAfter);
117
+ const selection = webPageDOM.createRange();
118
+ const startNode = webPageDOM.querySelector(startBefore || startAfter);
119
+ const endNode = webPageDOM.querySelector(endBefore || endAfter);
124
120
 
125
121
  if (!startNode) {
126
122
  throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
@@ -136,18 +132,18 @@ function selectRange(document, rangeSelector) {
136
132
  return selection;
137
133
  }
138
134
 
139
- export function convertRelativeURLsToAbsolute(document, baseURL) {
140
- Array.from(document.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
135
+ export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
136
+ Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
141
137
  link.href = url.resolve(baseURL, link.href);
142
138
  });
143
139
  }
144
140
 
145
141
  // Works in place
146
- function remove(webPageDOM, noiseSelectors) {
142
+ function remove(webPageDOM, insignificantContentSelectors) {
147
143
  const rangeSelections = [];
148
144
  const nodes = [];
149
145
 
150
- [].concat(noiseSelectors).forEach(selector => {
146
+ [].concat(insignificantContentSelectors).forEach(selector => {
151
147
  if (typeof selector === 'object') {
152
148
  rangeSelections.push(selectRange(webPageDOM, selector));
153
149
  } else {