@opentermsarchive/engine 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +1 -469
  2. package/bin/ota-track.js +3 -3
  3. package/bin/ota-validate.js +2 -2
  4. package/bin/ota.js +1 -1
  5. package/config/default.json +1 -1
  6. package/config/test.json +2 -2
  7. package/package.json +6 -7
  8. package/scripts/dataset/export/index.js +4 -4
  9. package/scripts/dataset/export/index.test.js +11 -17
  10. package/scripts/dataset/export/test/fixtures/dataset/README.md +1 -1
  11. package/scripts/declarations/lint/index.mocha.js +1 -1
  12. package/scripts/declarations/utils/index.js +12 -12
  13. package/scripts/declarations/validate/definitions.js +1 -1
  14. package/scripts/declarations/validate/index.mocha.js +30 -34
  15. package/scripts/declarations/validate/service.history.schema.js +11 -11
  16. package/scripts/declarations/validate/service.schema.js +13 -13
  17. package/scripts/history/migrate-services.js +4 -4
  18. package/scripts/history/update-to-full-hash.js +2 -2
  19. package/scripts/import/index.js +14 -14
  20. package/scripts/rewrite/config/rewrite-snapshots.json +1 -1
  21. package/scripts/rewrite/config/rewrite-versions.json +1 -1
  22. package/scripts/rewrite/rewrite-snapshots.js +3 -3
  23. package/scripts/rewrite/rewrite-versions.js +14 -14
  24. package/scripts/utils/renamer/README.md +3 -3
  25. package/scripts/utils/renamer/index.js +13 -13
  26. package/src/archivist/errors.js +1 -1
  27. package/src/archivist/extract/exports.js +3 -0
  28. package/src/archivist/{filter → extract}/index.js +23 -27
  29. package/src/archivist/extract/index.test.js +516 -0
  30. package/src/archivist/index.js +101 -140
  31. package/src/archivist/index.test.js +178 -166
  32. package/src/archivist/recorder/index.js +11 -55
  33. package/src/archivist/recorder/index.test.js +310 -356
  34. package/src/archivist/recorder/record.js +18 -7
  35. package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
  36. package/src/archivist/recorder/repositories/git/index.js +11 -15
  37. package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
  38. package/src/archivist/recorder/repositories/interface.js +8 -6
  39. package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
  40. package/src/archivist/recorder/repositories/mongo/index.js +8 -8
  41. package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
  42. package/src/archivist/recorder/snapshot.js +5 -0
  43. package/src/archivist/recorder/snapshot.test.js +65 -0
  44. package/src/archivist/recorder/version.js +14 -0
  45. package/src/archivist/recorder/version.test.js +65 -0
  46. package/src/archivist/services/index.js +60 -51
  47. package/src/archivist/services/index.test.js +63 -83
  48. package/src/archivist/services/service.js +26 -22
  49. package/src/archivist/services/service.test.js +46 -68
  50. package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
  51. package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
  52. package/src/archivist/services/terms.js +26 -0
  53. package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
  54. package/src/exports.js +2 -2
  55. package/src/index.js +16 -13
  56. package/src/logger/index.js +35 -36
  57. package/src/notifier/index.js +8 -8
  58. package/src/tracker/index.js +6 -6
  59. package/src/archivist/filter/exports.js +0 -3
  60. package/src/archivist/filter/index.test.js +0 -564
  61. package/src/archivist/recorder/record.test.js +0 -91
  62. package/src/archivist/services/documentDeclaration.js +0 -26
  63. /package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
  64. /package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
@@ -4,7 +4,7 @@ import { fileURLToPath } from 'url';
4
4
  import config from 'config';
5
5
 
6
6
  import { InaccessibleContentError } from '../../src/archivist/errors.js';
7
- import filter from '../../src/archivist/filter/index.js';
7
+ import extract from '../../src/archivist/extract/index.js';
8
8
  import Recorder from '../../src/archivist/recorder/index.js';
9
9
  import Git from '../../src/archivist/recorder/repositories/git/git.js';
10
10
  import GitRepository from '../../src/archivist/recorder/repositories/git/index.js';
@@ -86,41 +86,41 @@ let recorder;
86
86
  const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
87
87
 
88
88
  let serviceId = path.dirname(relativeFilePath);
89
- let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
89
+ let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
90
90
 
91
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
91
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
92
92
 
93
93
  if (!servicesDeclarations[serviceId]) {
94
94
  console.log(`⌙ Skip unknown service "${serviceId}"`);
95
95
  continue;
96
96
  }
97
97
 
98
- const documentDeclaration = servicesDeclarations[serviceId].getDocumentDeclaration(
99
- documentType,
98
+ const terms = servicesDeclarations[serviceId].getTerms(
99
+ termsType,
100
100
  commit.date,
101
101
  );
102
102
 
103
- if (!documentDeclaration) {
104
- console.log(`⌙ Skip unknown terms type "${documentType}" for service "${serviceId}"`);
103
+ if (!terms) {
104
+ console.log(`⌙ Skip unknown terms type "${termsType}" for service "${serviceId}"`);
105
105
  continue;
106
106
  }
107
107
 
108
- if (documentDeclaration.validUntil) {
109
- console.log(`⌙ Use declaration valid until ${documentDeclaration.validUntil}`);
108
+ if (terms.validUntil) {
109
+ console.log(`⌙ Use declaration valid until ${terms.validUntil}`);
110
110
  }
111
111
 
112
112
  try {
113
- const document = await filter({
113
+ const versionContent = await extract({
114
114
  content,
115
115
  mimeType,
116
- documentDeclaration,
116
+ terms,
117
117
  });
118
118
 
119
119
  const { id: versionId } = await recorder.recordVersion({
120
120
  serviceId,
121
- documentType,
122
- content: document,
123
- mimeType: MARKDOWN_MIME_TYPE, // The result of the `filter` function is always in markdown format
121
+ termsType,
122
+ content: versionContent,
123
+ mimeType: MARKDOWN_MIME_TYPE, // The result of the `extract` function is always in markdown format
124
124
  fetchDate: commit.date,
125
125
  snapshotId: commit.hash,
126
126
  });
@@ -8,7 +8,7 @@ You can use it in your other scripts like this:
8
8
 
9
9
  ```
10
10
  await renamer.loadRules();
11
- const { serviceId: renamedServiceId, documentType: renamedDocumentType } = renamer.applyRules(serviceId, documentType);
11
+ const { serviceId: renamedServiceId, termsType: renamedTermsType } = renamer.applyRules(serviceId, termsType);
12
12
  ```
13
13
 
14
14
  ## Adding renaming rules
@@ -26,7 +26,7 @@ To rename a service, add a rule in `./rules/services.json`, for example, to rena
26
26
 
27
27
  ### Terms type
28
28
 
29
- To rename a terms type, add a rule in `./rules/documentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
29
+ To rename a terms type, add a rule in `./rules/termsTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy", add the following line in the file:
30
30
 
31
31
  ```json
32
32
  {
@@ -37,7 +37,7 @@ To rename a terms type, add a rule in `./rules/documentTypes.json`, for example,
37
37
 
38
38
  ### Terms type for a specific service
39
39
 
40
- To rename a terms type only for a specific service, add a rule in `./rules/servicesDocumentTypes.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
40
+ To rename a terms type only for a specific service, add a rule in `./rules/termsTypesByService.json`, for example, to rename "Program Policies" to "Acceptable Use Policy" only for Skype, add the following line in the file:
41
41
 
42
42
  ```json
43
43
  {
@@ -10,12 +10,12 @@ let renamingRules;
10
10
  export async function loadRules() {
11
11
  renamingRules = {
12
12
  serviceNames: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/serviceNames.json'))),
13
- documentTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypes.json'))),
14
- documentTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/documentTypesByService.json'))),
13
+ termsTypes: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypes.json'))),
14
+ termsTypesByService: JSON.parse(await fs.readFile(path.resolve(__dirname, './rules/termsTypesByService.json'))),
15
15
  };
16
16
  }
17
17
 
18
- export function applyRules(serviceId, documentType) {
18
+ export function applyRules(serviceId, termsType) {
19
19
  const renamedServiceId = renamingRules.serviceNames[serviceId];
20
20
 
21
21
  if (renamedServiceId) {
@@ -23,23 +23,23 @@ export function applyRules(serviceId, documentType) {
23
23
  serviceId = renamedServiceId;
24
24
  }
25
25
 
26
- const renamedDocumentType = renamingRules.documentTypes[documentType];
26
+ const renamedTermsType = renamingRules.termsTypes[termsType];
27
27
 
28
- if (renamedDocumentType) {
29
- console.log(`⌙ Rename terms type "${documentType}" to "${renamedDocumentType}" of "${serviceId}" service`);
30
- documentType = renamedDocumentType;
28
+ if (renamedTermsType) {
29
+ console.log(`⌙ Rename terms type "${termsType}" to "${renamedTermsType}" of "${serviceId}" service`);
30
+ termsType = renamedTermsType;
31
31
  }
32
32
 
33
- const renamedServiceDocumentType = renamingRules.documentTypesByService[serviceId]
34
- && renamingRules.documentTypesByService[serviceId][documentType];
33
+ const renamedServiceTermsType = renamingRules.termsTypesByService[serviceId]
34
+ && renamingRules.termsTypesByService[serviceId][termsType];
35
35
 
36
- if (renamedServiceDocumentType) {
37
- console.log(`⌙ Specific rename terms type "${documentType}" to "${renamedServiceDocumentType}" of "${serviceId}" service`);
38
- documentType = renamedServiceDocumentType;
36
+ if (renamedServiceTermsType) {
37
+ console.log(`⌙ Specific rename terms type "${termsType}" to "${renamedServiceTermsType}" of "${serviceId}" service`);
38
+ termsType = renamedServiceTermsType;
39
39
  }
40
40
 
41
41
  return {
42
42
  serviceId,
43
- documentType,
43
+ termsType,
44
44
  };
45
45
  }
@@ -3,7 +3,7 @@ export class InaccessibleContentError extends Error {
3
3
  if (Array.isArray(message)) {
4
4
  message = `\n - ${message.join('\n - ')}`;
5
5
  }
6
- super(`The document cannot be accessed or its content can not be selected:${message}`);
6
+ super(`The documents cannot be accessed or their contents can not be selected:${message}`);
7
7
  this.name = 'InaccessibleContentError';
8
8
  }
9
9
  }
@@ -0,0 +1,3 @@
1
+ import extract from './index.js';
2
+
3
+ export default extract;
@@ -5,6 +5,7 @@ import mardownPdf from '@accordproject/markdown-pdf';
5
5
  import TurndownService from '@opentermsarchive/turndown';
6
6
  import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
7
7
  import jsdom from 'jsdom';
8
+ import mime from 'mime';
8
9
 
9
10
  import { InaccessibleContentError } from '../errors.js';
10
11
 
@@ -21,32 +22,27 @@ const { CiceroMarkTransformer } = ciceroMark;
21
22
  const ciceroMarkTransformer = new CiceroMarkTransformer();
22
23
 
23
24
  /**
24
- * Filter document content and convert it to Markdown
25
+ * Extract content from source document and convert it to Markdown
25
26
  *
26
- * @param {Object} params - Filter parameters
27
- * @param {string|Buffer} params.content - Content to filter: a buffer containing PDF data in case mimetype associated is PDF or a DOM dump of an HTML page given as a string
28
- * @param {string} params.mimeType - MIME type of the given content
29
- * @param {string} params.pageDeclaration - see {@link ./src/archivist/services/pageDeclaration.js}
30
- * @returns {Promise<string>} Promise which is fulfilled once the content is filtered and converted in Markdown. The promise will resolve into a string containing the filtered content in Markdown format
27
+ * @param {Object} sourceDocument - Source document from which to extract content, see {@link ./src/archivist/services/sourceDocument.js}
28
+ * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted to Markdown. The promise will resolve to a string containing the extracted content in Markdown format
31
29
  */
32
- export default async function filter({ content, mimeType, pageDeclaration }) {
33
- if (mimeType == 'application/pdf') {
34
- return filterPDF({ content });
30
+ export default async function extract(sourceDocument) {
31
+ if (sourceDocument.mimeType == mime.getType('pdf')) {
32
+ return extractFromPDF(sourceDocument);
35
33
  }
36
34
 
37
- return filterHTML({
38
- content,
39
- pageDeclaration,
40
- });
35
+ return extractFromHTML(sourceDocument);
41
36
  }
42
37
 
43
- export async function filterHTML({ content, pageDeclaration }) {
38
+ export async function extractFromHTML(sourceDocument) {
44
39
  const {
45
40
  location,
46
41
  contentSelectors = [],
47
- noiseSelectors = [],
42
+ insignificantContentSelectors = [],
48
43
  filters: serviceSpecificFilters = [],
49
- } = pageDeclaration;
44
+ content,
45
+ } = sourceDocument;
50
46
 
51
47
  const jsdomInstance = new JSDOM(content, {
52
48
  url: location,
@@ -61,7 +57,7 @@ export async function filterHTML({ content, pageDeclaration }) {
61
57
  await filterFunction(webPageDOM, {
62
58
  fetch: location,
63
59
  select: contentSelectors,
64
- remove: noiseSelectors,
60
+ remove: insignificantContentSelectors,
65
61
  filter: serviceSpecificFilters.map(filter => filter.name),
66
62
  });
67
63
  /* eslint-enable no-await-in-loop */
@@ -70,7 +66,7 @@ export async function filterHTML({ content, pageDeclaration }) {
70
66
  }
71
67
  }
72
68
 
73
- remove(webPageDOM, noiseSelectors); // remove function works in place
69
+ remove(webPageDOM, insignificantContentSelectors); // remove function works in place
74
70
 
75
71
  const domFragment = select(webPageDOM, contentSelectors);
76
72
 
@@ -101,7 +97,7 @@ export async function filterHTML({ content, pageDeclaration }) {
101
97
  return markdownContent;
102
98
  }
103
99
 
104
- export async function filterPDF({ content: pdfBuffer }) {
100
+ export async function extractFromPDF({ content: pdfBuffer }) {
105
101
  try {
106
102
  const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
107
103
 
@@ -115,12 +111,12 @@ export async function filterPDF({ content: pdfBuffer }) {
115
111
  }
116
112
  }
117
113
 
118
- function selectRange(document, rangeSelector) {
114
+ function selectRange(webPageDOM, rangeSelector) {
119
115
  const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;
120
116
 
121
- const selection = document.createRange();
122
- const startNode = document.querySelector(startBefore || startAfter);
123
- const endNode = document.querySelector(endBefore || endAfter);
117
+ const selection = webPageDOM.createRange();
118
+ const startNode = webPageDOM.querySelector(startBefore || startAfter);
119
+ const endNode = webPageDOM.querySelector(endBefore || endAfter);
124
120
 
125
121
  if (!startNode) {
126
122
  throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
@@ -136,18 +132,18 @@ function selectRange(document, rangeSelector) {
136
132
  return selection;
137
133
  }
138
134
 
139
- export function convertRelativeURLsToAbsolute(document, baseURL) {
140
- Array.from(document.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
135
+ export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
136
+ Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
141
137
  link.href = url.resolve(baseURL, link.href);
142
138
  });
143
139
  }
144
140
 
145
141
  // Works in place
146
- function remove(webPageDOM, noiseSelectors) {
142
+ function remove(webPageDOM, insignificantContentSelectors) {
147
143
  const rangeSelections = [];
148
144
  const nodes = [];
149
145
 
150
- [].concat(noiseSelectors).forEach(selector => {
146
+ [].concat(insignificantContentSelectors).forEach(selector => {
151
147
  if (typeof selector === 'object') {
152
148
  rangeSelections.push(selectRange(webPageDOM, selector));
153
149
  } else {