@opentermsarchive/engine 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +1 -469
  2. package/bin/ota-track.js +3 -3
  3. package/bin/ota-validate.js +2 -2
  4. package/bin/ota.js +1 -1
  5. package/config/default.json +1 -1
  6. package/config/test.json +2 -2
  7. package/package.json +6 -7
  8. package/scripts/dataset/export/index.js +4 -4
  9. package/scripts/dataset/export/index.test.js +11 -17
  10. package/scripts/dataset/export/test/fixtures/dataset/README.md +1 -1
  11. package/scripts/declarations/lint/index.mocha.js +1 -1
  12. package/scripts/declarations/utils/index.js +12 -12
  13. package/scripts/declarations/validate/definitions.js +1 -1
  14. package/scripts/declarations/validate/index.mocha.js +30 -34
  15. package/scripts/declarations/validate/service.history.schema.js +11 -11
  16. package/scripts/declarations/validate/service.schema.js +13 -13
  17. package/scripts/history/migrate-services.js +4 -4
  18. package/scripts/history/update-to-full-hash.js +2 -2
  19. package/scripts/import/index.js +14 -14
  20. package/scripts/rewrite/config/rewrite-snapshots.json +1 -1
  21. package/scripts/rewrite/config/rewrite-versions.json +1 -1
  22. package/scripts/rewrite/rewrite-snapshots.js +3 -3
  23. package/scripts/rewrite/rewrite-versions.js +14 -14
  24. package/scripts/utils/renamer/README.md +3 -3
  25. package/scripts/utils/renamer/index.js +13 -13
  26. package/src/archivist/errors.js +1 -1
  27. package/src/archivist/extract/exports.js +3 -0
  28. package/src/archivist/{filter → extract}/index.js +23 -27
  29. package/src/archivist/extract/index.test.js +516 -0
  30. package/src/archivist/index.js +101 -140
  31. package/src/archivist/index.test.js +178 -166
  32. package/src/archivist/recorder/index.js +11 -55
  33. package/src/archivist/recorder/index.test.js +310 -356
  34. package/src/archivist/recorder/record.js +18 -7
  35. package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
  36. package/src/archivist/recorder/repositories/git/index.js +11 -15
  37. package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
  38. package/src/archivist/recorder/repositories/interface.js +8 -6
  39. package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
  40. package/src/archivist/recorder/repositories/mongo/index.js +8 -8
  41. package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
  42. package/src/archivist/recorder/snapshot.js +5 -0
  43. package/src/archivist/recorder/snapshot.test.js +65 -0
  44. package/src/archivist/recorder/version.js +14 -0
  45. package/src/archivist/recorder/version.test.js +65 -0
  46. package/src/archivist/services/index.js +60 -51
  47. package/src/archivist/services/index.test.js +63 -83
  48. package/src/archivist/services/service.js +26 -22
  49. package/src/archivist/services/service.test.js +46 -68
  50. package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
  51. package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
  52. package/src/archivist/services/terms.js +26 -0
  53. package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
  54. package/src/exports.js +2 -2
  55. package/src/index.js +16 -13
  56. package/src/logger/index.js +35 -36
  57. package/src/notifier/index.js +8 -8
  58. package/src/tracker/index.js +6 -6
  59. package/src/archivist/filter/exports.js +0 -3
  60. package/src/archivist/filter/index.test.js +0 -564
  61. package/src/archivist/recorder/record.test.js +0 -91
  62. package/src/archivist/services/documentDeclaration.js +0 -26
  63. package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
  64. package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
@@ -8,8 +8,8 @@ import dircompare from 'dir-compare';
8
8
  import mime from 'mime';
9
9
  import StreamZip from 'node-stream-zip';
10
10
 
11
- import Record from '../../../src/archivist/recorder/record.js';
12
11
  import GitRepository from '../../../src/archivist/recorder/repositories/git/index.js';
12
+ import Version from '../../../src/archivist/recorder/version.js';
13
13
 
14
14
  import generateArchive from './index.js';
15
15
 
@@ -20,8 +20,8 @@ const { expect } = chai;
20
20
  const FIRST_SERVICE_PROVIDER_ID = 'ServiceA';
21
21
  const SECOND_SERVICE_PROVIDER_ID = 'ServiceB';
22
22
 
23
- const FIRST_DOCUMENT_TYPE = 'Terms of Service';
24
- const SECOND_DOCUMENT_TYPE = 'Privacy Policy';
23
+ const FIRST_TERMS_TYPE = 'Terms of Service';
24
+ const SECOND_TERMS_TYPE = 'Privacy Policy';
25
25
 
26
26
  const FIRST_FETCH_DATE = '2021-01-01T11:27:00.000Z';
27
27
  const SECOND_FETCH_DATE = '2021-01-11T11:32:47.000Z';
@@ -31,8 +31,6 @@ const FOURTH_FETCH_DATE = '2022-01-01T12:12:24.000Z';
31
31
  const FIRST_CONTENT = 'First Content';
32
32
  const SECOND_CONTENT = 'Second Content';
33
33
 
34
- const MIME_TYPE = 'text/markdown';
35
-
36
34
  const SNAPSHOT_ID = '721ce4a63ad399ecbdb548a66d6d327e7bc97876';
37
35
 
38
36
  const RELEASE_DATE = '2022-01-01T18:21:00.000Z';
@@ -56,38 +54,34 @@ describe('Export', () => {
56
54
 
57
55
  await repository.initialize();
58
56
 
59
- await repository.save(new Record({
57
+ await repository.save(new Version({
60
58
  serviceId: FIRST_SERVICE_PROVIDER_ID,
61
- documentType: FIRST_DOCUMENT_TYPE,
59
+ termsType: FIRST_TERMS_TYPE,
62
60
  content: FIRST_CONTENT,
63
- mimeType: MIME_TYPE,
64
61
  fetchDate: FIRST_FETCH_DATE,
65
62
  snapshotId: SNAPSHOT_ID,
66
63
  }));
67
64
 
68
- await repository.save(new Record({
65
+ await repository.save(new Version({
69
66
  serviceId: FIRST_SERVICE_PROVIDER_ID,
70
- documentType: FIRST_DOCUMENT_TYPE,
67
+ termsType: FIRST_TERMS_TYPE,
71
68
  content: SECOND_CONTENT,
72
- mimeType: MIME_TYPE,
73
69
  fetchDate: SECOND_FETCH_DATE,
74
70
  snapshotId: SNAPSHOT_ID,
75
71
  }));
76
72
 
77
- await repository.save(new Record({
73
+ await repository.save(new Version({
78
74
  serviceId: SECOND_SERVICE_PROVIDER_ID,
79
- documentType: FIRST_DOCUMENT_TYPE,
75
+ termsType: FIRST_TERMS_TYPE,
80
76
  content: FIRST_CONTENT,
81
- mimeType: MIME_TYPE,
82
77
  fetchDate: THIRD_FETCH_DATE,
83
78
  snapshotId: SNAPSHOT_ID,
84
79
  }));
85
80
 
86
- await repository.save(new Record({
81
+ await repository.save(new Version({
87
82
  serviceId: SECOND_SERVICE_PROVIDER_ID,
88
- documentType: SECOND_DOCUMENT_TYPE,
83
+ termsType: SECOND_TERMS_TYPE,
89
84
  content: FIRST_CONTENT,
90
- mimeType: MIME_TYPE,
91
85
  fetchDate: FOURTH_FETCH_DATE,
92
86
  snapshotId: SNAPSHOT_ID,
93
87
  }));
@@ -2,7 +2,7 @@
2
2
 
3
3
  This dataset consolidates the contractual documents of 2 service providers, in all their versions that were accessible online between January 1, 2021 and January 6, 2022.
4
4
 
5
- This dataset is tailored for datascientists and other analysts. You can also explore all these versions interactively on [https://github.com/OpenTermsArchive/sandbox](https://github.com/OpenTermsArchive/sandbox).
5
+ This dataset is tailored for datascientists and other analysts. You can also explore all these versions interactively on [https://github.com/OpenTermsArchive/sandbox-versions](https://github.com/OpenTermsArchive/sandbox-versions).
6
6
 
7
7
  It has been generated with [Open Terms Archive](https://opentermsarchive.org).
8
8
 
@@ -31,7 +31,7 @@ export default async options => {
31
31
  if (options.modified) {
32
32
  const declarationUtils = new DeclarationUtils(instancePath);
33
33
 
34
- ({ services: servicesToValidate } = await declarationUtils.getModifiedServiceDocumentTypes());
34
+ ({ services: servicesToValidate } = await declarationUtils.getModifiedServiceTermsTypes());
35
35
  }
36
36
 
37
37
  const lintFile = lintAndFixFile(options.fix);
@@ -24,27 +24,27 @@ export default class DeclarationUtils {
24
24
 
25
25
  const modifiedFilePaths = modifiedFilePathsAsString ? modifiedFilePathsAsString.split('\n') : [];
26
26
 
27
- return { modifiedFilePaths, modifiedServiceIds: Array.from(new Set(modifiedFilePaths.map(DeclarationUtils.filePathToServiceId))) };
27
+ return { modifiedFilePaths, modifiedServicesIds: Array.from(new Set(modifiedFilePaths.map(DeclarationUtils.filePathToServiceId))) };
28
28
  }
29
29
 
30
30
  async getModifiedServices() {
31
- const { modifiedServiceIds } = await this.getModifiedData();
31
+ const { modifiedServicesIds } = await this.getModifiedData();
32
32
 
33
- return modifiedServiceIds;
33
+ return modifiedServicesIds;
34
34
  }
35
35
 
36
- async getModifiedServiceDocumentTypes() {
37
- const { modifiedFilePaths, modifiedServiceIds } = await this.getModifiedData();
38
- const servicesDocumentTypes = {};
36
+ async getModifiedServiceTermsTypes() {
37
+ const { modifiedFilePaths, modifiedServicesIds } = await this.getModifiedData();
38
+ const servicesTermsTypes = {};
39
39
 
40
40
  await Promise.all(modifiedFilePaths.map(async modifiedFilePath => {
41
41
  const serviceId = DeclarationUtils.filePathToServiceId(modifiedFilePath);
42
42
 
43
43
  if (!modifiedFilePath.endsWith('.json')) {
44
44
  // Here we should compare AST of both files to detect on which function
45
- // change has been made, and then find which document type depends on this
45
+ // change has been made, and then find which terms type depends on this
46
46
  // function.
47
- // As this is a complicated process, we will just send back all document types
47
+ // As this is a complicated process, we will just send back all terms types
48
48
  const declaration = await this.getJSONFile(`declarations/${serviceId}.json`, this.defaultBranch);
49
49
 
50
50
  return Object.keys(declaration.documents);
@@ -60,7 +60,7 @@ export default class DeclarationUtils {
60
60
  return;
61
61
  }
62
62
 
63
- const modifiedDocumentTypes = diff.reduce((acc, { path }) => {
63
+ const modifiedTermsTypes = diff.reduce((acc, { path }) => {
64
64
  if (modifiedFilePath.includes('.history')) {
65
65
  acc.add(path[0]);
66
66
  } else if (path[0] == 'documents') {
@@ -70,12 +70,12 @@ export default class DeclarationUtils {
70
70
  return acc;
71
71
  }, new Set());
72
72
 
73
- servicesDocumentTypes[serviceId] = Array.from(new Set([ ...servicesDocumentTypes[serviceId] || [], ...modifiedDocumentTypes ]));
73
+ servicesTermsTypes[serviceId] = Array.from(new Set([ ...servicesTermsTypes[serviceId] || [], ...modifiedTermsTypes ]));
74
74
  }));
75
75
 
76
76
  return {
77
- services: modifiedServiceIds,
78
- servicesDocumentTypes,
77
+ services: modifiedServicesIds,
78
+ servicesTermsTypes,
79
79
  };
80
80
  }
81
81
  }
@@ -25,7 +25,7 @@ const definitions = {
25
25
  ],
26
26
  },
27
27
  contentSelectors: { $ref: '#/definitions/selectors' },
28
- noiseSelectors: { $ref: '#/definitions/selectors' },
28
+ insignificantContentSelectors: { $ref: '#/definitions/selectors' },
29
29
  filters: {
30
30
  type: 'array',
31
31
  items: {
@@ -6,8 +6,8 @@ import { expect } from 'chai';
6
6
  import config from 'config';
7
7
  import jsonSourceMap from 'json-source-map';
8
8
 
9
+ import extract from '../../../src/archivist/extract/index.js';
9
10
  import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from '../../../src/archivist/fetcher/index.js';
10
- import filter from '../../../src/archivist/filter/index.js';
11
11
  import * as services from '../../../src/archivist/services/index.js';
12
12
  import DeclarationUtils from '../utils/index.js';
13
13
 
@@ -25,8 +25,8 @@ const instancePath = path.resolve(declarationsPath, '../');
25
25
  export default async options => {
26
26
  const schemaOnly = options.schemaOnly || false;
27
27
  let servicesToValidate = options.services || [];
28
- const documentTypes = options.termsTypes || [];
29
- let servicesDocumentTypes = {};
28
+ const termsTypes = options.types || [];
29
+ let servicesTermsTypes = {};
30
30
 
31
31
  const serviceDeclarations = await services.loadWithHistory(servicesToValidate);
32
32
 
@@ -37,7 +37,7 @@ export default async options => {
37
37
  if (options.modified) {
38
38
  const declarationUtils = new DeclarationUtils(instancePath);
39
39
 
40
- ({ services: servicesToValidate, servicesDocumentTypes } = await declarationUtils.getModifiedServiceDocumentTypes());
40
+ ({ services: servicesToValidate, servicesTermsTypes } = await declarationUtils.getModifiedServiceTermsTypes());
41
41
  }
42
42
 
43
43
  describe('Service declarations validation', async function () {
@@ -76,61 +76,57 @@ export default async options => {
76
76
  }
77
77
 
78
78
  if (!schemaOnly && service) {
79
- service.getDocumentTypes()
80
- .filter(documentType => {
81
- if (servicesDocumentTypes[serviceId] && servicesDocumentTypes[serviceId].length > 0) {
82
- return servicesDocumentTypes[serviceId].includes(documentType);
79
+ service.getTermsTypes()
80
+ .filter(termsType => {
81
+ if (servicesTermsTypes[serviceId] && servicesTermsTypes[serviceId].length > 0) {
82
+ return servicesTermsTypes[serviceId].includes(termsType);
83
83
  }
84
84
 
85
- if (documentTypes.length > 0) {
86
- return documentTypes.includes(documentType);
85
+ if (termsTypes.length > 0) {
86
+ return termsTypes.includes(termsType);
87
87
  }
88
88
 
89
89
  return true;
90
90
  })
91
91
  .forEach(type => {
92
92
  describe(type, () => {
93
- const documentDeclaration = service.getDocumentDeclaration(type);
93
+ const terms = service.getTerms(type);
94
94
 
95
- documentDeclaration.pages.forEach(page => {
96
- let content;
95
+ terms.sourceDocuments.forEach(sourceDocument => {
97
96
  let filteredContent;
98
- let mimeType;
99
97
 
100
- context(page.location, () => {
98
+ context(sourceDocument.location, () => {
101
99
  before(async function () {
102
- if (!documentDeclaration) {
100
+ if (!terms) {
103
101
  console.log(' (Tests skipped as declaration has been archived)');
104
102
  this.skip();
105
103
  }
106
104
  });
107
105
 
108
106
  it('fetchable URL', async () => {
109
- const { location, executeClientScripts } = page;
110
- const document = await fetch({
107
+ const { location, executeClientScripts } = sourceDocument;
108
+
109
+ ({ content: sourceDocument.content, mimeType: sourceDocument.mimeType } = await fetch({
111
110
  url: location,
112
111
  executeClientScripts,
113
- cssSelectors: page.cssSelectors,
112
+ cssSelectors: sourceDocument.cssSelectors,
114
113
  config: config.get('fetcher'),
115
- });
116
-
117
- content = document.content;
118
- mimeType = document.mimeType;
114
+ }));
119
115
  });
120
116
 
121
- it('selector matches an element in the web page', async function checkSelector() {
122
- if (!content) {
117
+ it('selector matches an element in the source document', async function checkSelector() {
118
+ if (!sourceDocument.content) {
123
119
  console.log(' [Tests skipped as URL is not fetchable]');
124
120
  this.skip();
125
121
  }
126
122
 
127
- filteredContent = await filter({ content, pageDeclaration: page, mimeType });
123
+ filteredContent = await extract(sourceDocument);
128
124
 
129
125
  expect(filteredContent).to.not.be.empty;
130
126
  });
131
127
 
132
128
  it(`filtered content has at least ${MIN_DOC_LENGTH} characters`, async function checkContentLength() {
133
- if (!content) {
129
+ if (!sourceDocument.content) {
134
130
  console.log(' [Tests skipped as URL is not fetchable]');
135
131
  this.skip();
136
132
  }
@@ -146,7 +142,7 @@ export default async options => {
146
142
  it('content is consistent when fetched and filtered twice in a row', async function checkContentConsistency() {
147
143
  this.slow(SLOW_DOCUMENT_THRESHOLD * 2);
148
144
 
149
- if (!content) {
145
+ if (!sourceDocument.content) {
150
146
  console.log(' [Tests skipped as URL is not fetchable]');
151
147
  this.skip();
152
148
  }
@@ -156,13 +152,13 @@ export default async options => {
156
152
  this.skip();
157
153
  }
158
154
 
159
- const document = await fetch({
160
- url: page.location,
161
- executeClientScripts: page.executeClientScripts,
162
- cssSelectors: page.cssSelectors,
155
+ ({ content: sourceDocument.content, mimeType: sourceDocument.mimeType } = await fetch({
156
+ url: sourceDocument.location,
157
+ executeClientScripts: sourceDocument.executeClientScripts,
158
+ cssSelectors: sourceDocument.cssSelectors,
163
159
  config: config.get('fetcher'),
164
- });
165
- const secondFilteredContent = await filter({ content: document.content, pageDeclaration: page, mimeType: document.mimeType });
160
+ }));
161
+ const secondFilteredContent = await extract(sourceDocument);
166
162
 
167
163
  expect(secondFilteredContent).to.equal(filteredContent);
168
164
  });
@@ -1,10 +1,10 @@
1
- import { DOCUMENT_TYPES } from '../../../src/archivist/services/index.js';
1
+ import TERMS_TYPES from '@opentermsarchive/terms-types';
2
2
 
3
3
  import definitions from './definitions.js';
4
4
 
5
- const AVAILABLE_TYPES_NAME = Object.keys(DOCUMENT_TYPES);
5
+ const AVAILABLE_TYPES_NAME = Object.keys(TERMS_TYPES);
6
6
 
7
- const documentsProperties = () => {
7
+ const termsProperties = () => {
8
8
  const result = {};
9
9
 
10
10
  AVAILABLE_TYPES_NAME.forEach(type => {
@@ -12,8 +12,8 @@ const documentsProperties = () => {
12
12
  type: 'array',
13
13
  items: {
14
14
  oneOf: [
15
- { $ref: '#/definitions/singlePageDocumentHistory' },
16
- { $ref: '#/definitions/multiPageDocumentHistory' },
15
+ { $ref: '#/definitions/singleSourceDocumentTermsHistory' },
16
+ { $ref: '#/definitions/multipleSourceDocumentsTermsHistory' },
17
17
  { $ref: '#/definitions/pdfDocumentHistory' },
18
18
  ],
19
19
  },
@@ -27,7 +27,7 @@ const schema = {
27
27
  type: 'object',
28
28
  additionalProperties: false,
29
29
  title: 'Service declaration history',
30
- properties: documentsProperties(),
30
+ properties: termsProperties(),
31
31
  propertyNames: { enum: AVAILABLE_TYPES_NAME },
32
32
  definitions: {
33
33
  ...definitions,
@@ -40,7 +40,7 @@ const schema = {
40
40
  validUntil: { $ref: '#/definitions/validUntil' },
41
41
  },
42
42
  },
43
- singlePageDocumentHistory: {
43
+ singleSourceDocumentTermsHistory: {
44
44
  type: 'object',
45
45
  additionalProperties: false,
46
46
  required: [ 'fetch', 'select', 'validUntil' ],
@@ -48,12 +48,12 @@ const schema = {
48
48
  fetch: { $ref: '#/definitions/location' },
49
49
  select: { $ref: '#/definitions/contentSelectors' },
50
50
  filter: { $ref: '#/definitions/filters' },
51
- remove: { $ref: '#/definitions/noiseSelectors' },
51
+ remove: { $ref: '#/definitions/insignificantContentSelectors' },
52
52
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
53
53
  validUntil: { $ref: '#/definitions/validUntil' },
54
54
  },
55
55
  },
56
- multiPageDocumentHistory: {
56
+ multipleSourceDocumentsTermsHistory: {
57
57
  type: 'object',
58
58
  additionalProperties: false,
59
59
  required: ['combine'],
@@ -68,14 +68,14 @@ const schema = {
68
68
  fetch: { $ref: '#/definitions/location' },
69
69
  select: { $ref: '#/definitions/contentSelectors' },
70
70
  filter: { $ref: '#/definitions/filters' },
71
- remove: { $ref: '#/definitions/noiseSelectors' },
71
+ remove: { $ref: '#/definitions/insignificantContentSelectors' },
72
72
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
73
73
  },
74
74
  },
75
75
  },
76
76
  select: { $ref: '#/definitions/contentSelectors' },
77
77
  filter: { $ref: '#/definitions/filters' },
78
- remove: { $ref: '#/definitions/noiseSelectors' },
78
+ remove: { $ref: '#/definitions/insignificantContentSelectors' },
79
79
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
80
80
  validUntil: { $ref: '#/definitions/validUntil' },
81
81
  },
@@ -1,17 +1,17 @@
1
- import { DOCUMENT_TYPES } from '../../../src/archivist/services/index.js';
1
+ import TERMS_TYPES from '@opentermsarchive/terms-types';
2
2
 
3
3
  import definitions from './definitions.js';
4
4
 
5
- const AVAILABLE_TYPES_NAME = Object.keys(DOCUMENT_TYPES);
5
+ const AVAILABLE_TYPES_NAME = Object.keys(TERMS_TYPES);
6
6
 
7
- const documentsProperties = () => {
7
+ const termsProperties = () => {
8
8
  const result = {};
9
9
 
10
10
  AVAILABLE_TYPES_NAME.forEach(type => {
11
11
  result[type] = {
12
12
  oneOf: [
13
- { $ref: '#/definitions/singlePageDocument' },
14
- { $ref: '#/definitions/multiPageDocument' },
13
+ { $ref: '#/definitions/singleSourceDocumentTerms' },
14
+ { $ref: '#/definitions/multipleSourceDocumentsTerms' },
15
15
  { $ref: '#/definitions/pdfDocument' },
16
16
  ],
17
17
  };
@@ -33,7 +33,7 @@ const schema = {
33
33
  },
34
34
  documents: {
35
35
  type: 'object',
36
- properties: documentsProperties(),
36
+ properties: termsProperties(),
37
37
  propertyNames: { enum: AVAILABLE_TYPES_NAME },
38
38
  },
39
39
  importedFrom: {
@@ -52,7 +52,7 @@ const schema = {
52
52
  required: ['fetch'],
53
53
  properties: { fetch: { $ref: '#/definitions/pdfLocation' } },
54
54
  },
55
- page: {
55
+ sourceDocument: {
56
56
  type: 'object',
57
57
  additionalProperties: false,
58
58
  required: ['fetch'],
@@ -60,28 +60,28 @@ const schema = {
60
60
  fetch: { $ref: '#/definitions/location' },
61
61
  select: { $ref: '#/definitions/contentSelectors' },
62
62
  filter: { $ref: '#/definitions/filters' },
63
- remove: { $ref: '#/definitions/noiseSelectors' },
63
+ remove: { $ref: '#/definitions/insignificantContentSelectors' },
64
64
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
65
65
  },
66
66
  },
67
- singlePageDocument: {
67
+ singleSourceDocumentTerms: {
68
68
  allOf: [
69
- { $ref: '#/definitions/page' },
69
+ { $ref: '#/definitions/sourceDocument' },
70
70
  { required: [ 'fetch', 'select' ] },
71
71
  ],
72
72
  },
73
- multiPageDocument: {
73
+ multipleSourceDocumentsTerms: {
74
74
  type: 'object',
75
75
  additionalProperties: false,
76
76
  required: ['combine'],
77
77
  properties: {
78
78
  combine: {
79
79
  type: 'array',
80
- items: { $ref: '#/definitions/page' },
80
+ items: { $ref: '#/definitions/sourceDocument' },
81
81
  },
82
82
  select: { $ref: '#/definitions/contentSelectors' },
83
83
  filter: { $ref: '#/definitions/filters' },
84
- remove: { $ref: '#/definitions/noiseSelectors' },
84
+ remove: { $ref: '#/definitions/insignificantContentSelectors' },
85
85
  executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
86
86
  },
87
87
  },
@@ -145,10 +145,10 @@ async function rewriteSnapshots(repository, records, idsMapping, logger) {
145
145
  idsMapping[record.id] = recordId; // Saves the mapping between the old ID and the new one.
146
146
 
147
147
  if (recordId) {
148
- logger.info({ message: `Migrated snapshot with new ID: ${recordId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length });
148
+ logger.info({ message: `Migrated snapshot with new ID: ${recordId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
149
149
  counters.migrated++;
150
150
  } else {
151
- logger.info({ message: 'Skipped snapshot', serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length });
151
+ logger.info({ message: 'Skipped snapshot', serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
152
152
  counters.skipped++;
153
153
  }
154
154
  }
@@ -169,10 +169,10 @@ async function rewriteVersions(repository, records, idsMapping, logger) {
169
169
  const { id: recordId } = await repository.save(record); // eslint-disable-line no-await-in-loop
170
170
 
171
171
  if (recordId) {
172
- logger.info({ message: `Migrated version with new ID: ${recordId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length });
172
+ logger.info({ message: `Migrated version with new ID: ${recordId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
173
173
  counters.migrated++;
174
174
  } else {
175
- logger.info({ message: 'Skipped version', serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length });
175
+ logger.info({ message: 'Skipped version', serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
176
176
  counters.skipped++;
177
177
  }
178
178
  }
@@ -47,9 +47,9 @@ const ROOT_PATH = path.resolve(__dirname, '../../');
47
47
  const { id: recordId } = await versionsTargetRepository.save(record);
48
48
 
49
49
  if (!recordId) {
50
- logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.documentType, id: record.id, current, total });
50
+ logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.termsType, id: record.id, current, total });
51
51
  } else {
52
- logger.info({ message: `Update short sha ${record.snapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current, total });
52
+ logger.info({ message: `Update short sha ${record.snapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current, total });
53
53
  }
54
54
 
55
55
  current++;
@@ -18,7 +18,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
18
18
  const ROOT_PATH = path.resolve(__dirname, '../../');
19
19
  const MAX_PARALLEL = 10;
20
20
  const MAX_RETRY = 5;
21
- const PDF_MIME_TYPE = 'application/pdf';
21
+ const PDF_MIME_TYPE = mime.getType('pdf');
22
22
  const COUNTERS = {
23
23
  imported: 0,
24
24
  skippedNoChanges: 0,
@@ -87,10 +87,10 @@ function queueErrorHandler(error, { commit }) {
87
87
 
88
88
  const serviceId = path.dirname(relativeFilePath);
89
89
  const extension = path.extname(relativeFilePath);
90
- const documentType = path.basename(relativeFilePath, extension);
90
+ const termsType = path.basename(relativeFilePath, extension);
91
91
 
92
92
  commitsNotImported.push(commit.hash);
93
- logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: documentType, sha: commit.hash });
93
+ logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: termsType, sha: commit.hash });
94
94
  COUNTERS.errors++;
95
95
  }
96
96
 
@@ -117,9 +117,9 @@ function queueDrainHandler(totalToTreat) {
117
117
  };
118
118
  }
119
119
 
120
- async function getCommitContent({ sha, serviceId, documentType, extension }) {
120
+ async function getCommitContent({ sha, serviceId, termsType, extension }) {
121
121
  const start = performance.now();
122
- const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(documentType)}.${extension}`;
122
+ const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(termsType)}.${extension}`;
123
123
  const response = await nodeFetch(url);
124
124
  const end = performance.now();
125
125
 
@@ -141,7 +141,7 @@ async function getCommitContent({ sha, serviceId, documentType, extension }) {
141
141
  throw new TooManyRequestsError(`Cannot get commit content on Github ${url}. 429: Too Many Requests`);
142
142
  }
143
143
 
144
- logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType, sha });
144
+ logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType, sha });
145
145
 
146
146
  return content;
147
147
  }
@@ -151,12 +151,12 @@ async function handleCommit(commit, index, total) {
151
151
 
152
152
  let serviceId = path.dirname(relativeFilePath);
153
153
  const extension = path.extname(relativeFilePath);
154
- let documentType = path.basename(relativeFilePath, extension);
154
+ let termsType = path.basename(relativeFilePath, extension);
155
155
 
156
156
  logger.info({
157
157
  message: 'Start to handle commit',
158
158
  serviceId,
159
- type: documentType,
159
+ type: termsType,
160
160
  sha: commit.hash,
161
161
  current: index + 1,
162
162
  total,
@@ -168,7 +168,7 @@ async function handleCommit(commit, index, total) {
168
168
  logger.info({
169
169
  message: 'Skipped commit as an entry already exists for this commit',
170
170
  serviceId,
171
- type: documentType,
171
+ type: termsType,
172
172
  sha: commit.hash,
173
173
  });
174
174
  COUNTERS.skippedNoChanges++;
@@ -176,9 +176,9 @@ async function handleCommit(commit, index, total) {
176
176
  return;
177
177
  }
178
178
 
179
- let content = await getCommitContent({ sha: commit.hash, serviceId, documentType, extension: extension.replace('.', '') });
179
+ let content = await getCommitContent({ sha: commit.hash, serviceId, termsType, extension: extension.replace('.', '') });
180
180
 
181
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
181
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
182
182
 
183
183
  const mimeType = mime.getType(extension);
184
184
 
@@ -198,7 +198,7 @@ async function handleCommit(commit, index, total) {
198
198
 
199
199
  await snapshotsCollection.insertOne({
200
200
  serviceId,
201
- documentType,
201
+ termsType,
202
202
  content,
203
203
  mimeType,
204
204
  fetchDate: commit.date,
@@ -207,10 +207,10 @@ async function handleCommit(commit, index, total) {
207
207
  });
208
208
  const end = performance.now();
209
209
 
210
- logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType });
210
+ logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType });
211
211
  COUNTERS.imported++;
212
212
  } catch (error) {
213
- logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: documentType });
213
+ logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: termsType });
214
214
  commitsNotImported.push(commit.hash);
215
215
  COUNTERS.errors++;
216
216
  }
@@ -5,7 +5,7 @@
5
5
  "git": {
6
6
  "path": "./data/versions",
7
7
  "publish": false,
8
- "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/ambanum/OpenTermsArchive-snapshots/commit/",
8
+ "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/contrib-snapshots/commit/",
9
9
  "author": {
10
10
  "name": "Open Terms Archive Bot",
11
11
  "email": "bot@opentermsarchive.org"
@@ -5,7 +5,7 @@
5
5
  "git": {
6
6
  "path": "./data/versions-rewritten",
7
7
  "publish": false,
8
- "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/ambanum/OpenTermsArchive-snapshots/commit/",
8
+ "prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/contrib-snapshots/commit/",
9
9
  "author": {
10
10
  "name": "Open Terms Archive Bot",
11
11
  "email": "bot@opentermsarchive.org"
@@ -76,13 +76,13 @@ let recorder;
76
76
  const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
77
77
 
78
78
  let serviceId = path.dirname(relativeFilePath);
79
- let documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
79
+ let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
80
80
 
81
- ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));
81
+ ({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
82
82
 
83
83
  const { id: snapshotId } = await recorder.recordSnapshot({
84
84
  serviceId,
85
- documentType,
85
+ termsType,
86
86
  content,
87
87
  mimeType,
88
88
  fetchDate: commit.date,