@opentermsarchive/engine 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -469
- package/bin/ota-track.js +3 -3
- package/bin/ota-validate.js +2 -2
- package/bin/ota.js +1 -1
- package/config/default.json +1 -1
- package/config/test.json +2 -2
- package/package.json +6 -7
- package/scripts/dataset/export/index.js +4 -4
- package/scripts/dataset/export/index.test.js +11 -17
- package/scripts/dataset/export/test/fixtures/dataset/README.md +1 -1
- package/scripts/declarations/lint/index.mocha.js +1 -1
- package/scripts/declarations/utils/index.js +12 -12
- package/scripts/declarations/validate/definitions.js +1 -1
- package/scripts/declarations/validate/index.mocha.js +30 -34
- package/scripts/declarations/validate/service.history.schema.js +11 -11
- package/scripts/declarations/validate/service.schema.js +13 -13
- package/scripts/history/migrate-services.js +4 -4
- package/scripts/history/update-to-full-hash.js +2 -2
- package/scripts/import/index.js +14 -14
- package/scripts/rewrite/config/rewrite-snapshots.json +1 -1
- package/scripts/rewrite/config/rewrite-versions.json +1 -1
- package/scripts/rewrite/rewrite-snapshots.js +3 -3
- package/scripts/rewrite/rewrite-versions.js +14 -14
- package/scripts/utils/renamer/README.md +3 -3
- package/scripts/utils/renamer/index.js +13 -13
- package/src/archivist/errors.js +1 -1
- package/src/archivist/extract/exports.js +3 -0
- package/src/archivist/{filter → extract}/index.js +23 -27
- package/src/archivist/extract/index.test.js +516 -0
- package/src/archivist/index.js +101 -140
- package/src/archivist/index.test.js +178 -166
- package/src/archivist/recorder/index.js +11 -55
- package/src/archivist/recorder/index.test.js +310 -356
- package/src/archivist/recorder/record.js +18 -7
- package/src/archivist/recorder/repositories/git/dataMapper.js +41 -31
- package/src/archivist/recorder/repositories/git/index.js +11 -15
- package/src/archivist/recorder/repositories/git/index.test.js +1058 -463
- package/src/archivist/recorder/repositories/interface.js +8 -6
- package/src/archivist/recorder/repositories/mongo/dataMapper.js +21 -14
- package/src/archivist/recorder/repositories/mongo/index.js +8 -8
- package/src/archivist/recorder/repositories/mongo/index.test.js +898 -479
- package/src/archivist/recorder/snapshot.js +5 -0
- package/src/archivist/recorder/snapshot.test.js +65 -0
- package/src/archivist/recorder/version.js +14 -0
- package/src/archivist/recorder/version.test.js +65 -0
- package/src/archivist/services/index.js +60 -51
- package/src/archivist/services/index.test.js +63 -83
- package/src/archivist/services/service.js +26 -22
- package/src/archivist/services/service.test.js +46 -68
- package/src/archivist/services/{pageDeclaration.js → sourceDocument.js} +11 -9
- package/src/archivist/services/{pageDeclaration.test.js → sourceDocument.test.js} +21 -21
- package/src/archivist/services/terms.js +26 -0
- package/src/archivist/services/{documentDeclaration.test.js → terms.test.js} +15 -15
- package/src/exports.js +2 -2
- package/src/index.js +16 -13
- package/src/logger/index.js +35 -36
- package/src/notifier/index.js +8 -8
- package/src/tracker/index.js +6 -6
- package/src/archivist/filter/exports.js +0 -3
- package/src/archivist/filter/index.test.js +0 -564
- package/src/archivist/recorder/record.test.js +0 -91
- package/src/archivist/services/documentDeclaration.js +0 -26
- /package/scripts/utils/renamer/rules/{documentTypes.json → termsTypes.json} +0 -0
- /package/scripts/utils/renamer/rules/{documentTypesByService.json → termsTypesByService.json} +0 -0
|
@@ -8,8 +8,8 @@ import dircompare from 'dir-compare';
|
|
|
8
8
|
import mime from 'mime';
|
|
9
9
|
import StreamZip from 'node-stream-zip';
|
|
10
10
|
|
|
11
|
-
import Record from '../../../src/archivist/recorder/record.js';
|
|
12
11
|
import GitRepository from '../../../src/archivist/recorder/repositories/git/index.js';
|
|
12
|
+
import Version from '../../../src/archivist/recorder/version.js';
|
|
13
13
|
|
|
14
14
|
import generateArchive from './index.js';
|
|
15
15
|
|
|
@@ -20,8 +20,8 @@ const { expect } = chai;
|
|
|
20
20
|
const FIRST_SERVICE_PROVIDER_ID = 'ServiceA';
|
|
21
21
|
const SECOND_SERVICE_PROVIDER_ID = 'ServiceB';
|
|
22
22
|
|
|
23
|
-
const
|
|
24
|
-
const
|
|
23
|
+
const FIRST_TERMS_TYPE = 'Terms of Service';
|
|
24
|
+
const SECOND_TERMS_TYPE = 'Privacy Policy';
|
|
25
25
|
|
|
26
26
|
const FIRST_FETCH_DATE = '2021-01-01T11:27:00.000Z';
|
|
27
27
|
const SECOND_FETCH_DATE = '2021-01-11T11:32:47.000Z';
|
|
@@ -31,8 +31,6 @@ const FOURTH_FETCH_DATE = '2022-01-01T12:12:24.000Z';
|
|
|
31
31
|
const FIRST_CONTENT = 'First Content';
|
|
32
32
|
const SECOND_CONTENT = 'Second Content';
|
|
33
33
|
|
|
34
|
-
const MIME_TYPE = 'text/markdown';
|
|
35
|
-
|
|
36
34
|
const SNAPSHOT_ID = '721ce4a63ad399ecbdb548a66d6d327e7bc97876';
|
|
37
35
|
|
|
38
36
|
const RELEASE_DATE = '2022-01-01T18:21:00.000Z';
|
|
@@ -56,38 +54,34 @@ describe('Export', () => {
|
|
|
56
54
|
|
|
57
55
|
await repository.initialize();
|
|
58
56
|
|
|
59
|
-
await repository.save(new
|
|
57
|
+
await repository.save(new Version({
|
|
60
58
|
serviceId: FIRST_SERVICE_PROVIDER_ID,
|
|
61
|
-
|
|
59
|
+
termsType: FIRST_TERMS_TYPE,
|
|
62
60
|
content: FIRST_CONTENT,
|
|
63
|
-
mimeType: MIME_TYPE,
|
|
64
61
|
fetchDate: FIRST_FETCH_DATE,
|
|
65
62
|
snapshotId: SNAPSHOT_ID,
|
|
66
63
|
}));
|
|
67
64
|
|
|
68
|
-
await repository.save(new
|
|
65
|
+
await repository.save(new Version({
|
|
69
66
|
serviceId: FIRST_SERVICE_PROVIDER_ID,
|
|
70
|
-
|
|
67
|
+
termsType: FIRST_TERMS_TYPE,
|
|
71
68
|
content: SECOND_CONTENT,
|
|
72
|
-
mimeType: MIME_TYPE,
|
|
73
69
|
fetchDate: SECOND_FETCH_DATE,
|
|
74
70
|
snapshotId: SNAPSHOT_ID,
|
|
75
71
|
}));
|
|
76
72
|
|
|
77
|
-
await repository.save(new
|
|
73
|
+
await repository.save(new Version({
|
|
78
74
|
serviceId: SECOND_SERVICE_PROVIDER_ID,
|
|
79
|
-
|
|
75
|
+
termsType: FIRST_TERMS_TYPE,
|
|
80
76
|
content: FIRST_CONTENT,
|
|
81
|
-
mimeType: MIME_TYPE,
|
|
82
77
|
fetchDate: THIRD_FETCH_DATE,
|
|
83
78
|
snapshotId: SNAPSHOT_ID,
|
|
84
79
|
}));
|
|
85
80
|
|
|
86
|
-
await repository.save(new
|
|
81
|
+
await repository.save(new Version({
|
|
87
82
|
serviceId: SECOND_SERVICE_PROVIDER_ID,
|
|
88
|
-
|
|
83
|
+
termsType: SECOND_TERMS_TYPE,
|
|
89
84
|
content: FIRST_CONTENT,
|
|
90
|
-
mimeType: MIME_TYPE,
|
|
91
85
|
fetchDate: FOURTH_FETCH_DATE,
|
|
92
86
|
snapshotId: SNAPSHOT_ID,
|
|
93
87
|
}));
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
This dataset consolidates the contractual documents of 2 service providers, in all their versions that were accessible online between January 1, 2021 and January 6, 2022.
|
|
4
4
|
|
|
5
|
-
This dataset is tailored for datascientists and other analysts. You can also explore all these versions interactively on [https://github.com/OpenTermsArchive/sandbox](https://github.com/OpenTermsArchive/sandbox).
|
|
5
|
+
This dataset is tailored for datascientists and other analysts. You can also explore all these versions interactively on [https://github.com/OpenTermsArchive/sandbox-versions](https://github.com/OpenTermsArchive/sandbox-versions).
|
|
6
6
|
|
|
7
7
|
It has been generated with [Open Terms Archive](https://opentermsarchive.org).
|
|
8
8
|
|
|
@@ -31,7 +31,7 @@ export default async options => {
|
|
|
31
31
|
if (options.modified) {
|
|
32
32
|
const declarationUtils = new DeclarationUtils(instancePath);
|
|
33
33
|
|
|
34
|
-
({ services: servicesToValidate } = await declarationUtils.
|
|
34
|
+
({ services: servicesToValidate } = await declarationUtils.getModifiedServiceTermsTypes());
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
const lintFile = lintAndFixFile(options.fix);
|
|
@@ -24,27 +24,27 @@ export default class DeclarationUtils {
|
|
|
24
24
|
|
|
25
25
|
const modifiedFilePaths = modifiedFilePathsAsString ? modifiedFilePathsAsString.split('\n') : [];
|
|
26
26
|
|
|
27
|
-
return { modifiedFilePaths,
|
|
27
|
+
return { modifiedFilePaths, modifiedServicesIds: Array.from(new Set(modifiedFilePaths.map(DeclarationUtils.filePathToServiceId))) };
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
async getModifiedServices() {
|
|
31
|
-
const {
|
|
31
|
+
const { modifiedServicesIds } = await this.getModifiedData();
|
|
32
32
|
|
|
33
|
-
return
|
|
33
|
+
return modifiedServicesIds;
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
async
|
|
37
|
-
const { modifiedFilePaths,
|
|
38
|
-
const
|
|
36
|
+
async getModifiedServiceTermsTypes() {
|
|
37
|
+
const { modifiedFilePaths, modifiedServicesIds } = await this.getModifiedData();
|
|
38
|
+
const servicesTermsTypes = {};
|
|
39
39
|
|
|
40
40
|
await Promise.all(modifiedFilePaths.map(async modifiedFilePath => {
|
|
41
41
|
const serviceId = DeclarationUtils.filePathToServiceId(modifiedFilePath);
|
|
42
42
|
|
|
43
43
|
if (!modifiedFilePath.endsWith('.json')) {
|
|
44
44
|
// Here we should compare AST of both files to detect on which function
|
|
45
|
-
// change has been made, and then find which
|
|
45
|
+
// change has been made, and then find which terms type depends on this
|
|
46
46
|
// function.
|
|
47
|
-
// As this is a complicated process, we will just send back all
|
|
47
|
+
// As this is a complicated process, we will just send back all terms types
|
|
48
48
|
const declaration = await this.getJSONFile(`declarations/${serviceId}.json`, this.defaultBranch);
|
|
49
49
|
|
|
50
50
|
return Object.keys(declaration.documents);
|
|
@@ -60,7 +60,7 @@ export default class DeclarationUtils {
|
|
|
60
60
|
return;
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
const
|
|
63
|
+
const modifiedTermsTypes = diff.reduce((acc, { path }) => {
|
|
64
64
|
if (modifiedFilePath.includes('.history')) {
|
|
65
65
|
acc.add(path[0]);
|
|
66
66
|
} else if (path[0] == 'documents') {
|
|
@@ -70,12 +70,12 @@ export default class DeclarationUtils {
|
|
|
70
70
|
return acc;
|
|
71
71
|
}, new Set());
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
servicesTermsTypes[serviceId] = Array.from(new Set([ ...servicesTermsTypes[serviceId] || [], ...modifiedTermsTypes ]));
|
|
74
74
|
}));
|
|
75
75
|
|
|
76
76
|
return {
|
|
77
|
-
services:
|
|
78
|
-
|
|
77
|
+
services: modifiedServicesIds,
|
|
78
|
+
servicesTermsTypes,
|
|
79
79
|
};
|
|
80
80
|
}
|
|
81
81
|
}
|
|
@@ -25,7 +25,7 @@ const definitions = {
|
|
|
25
25
|
],
|
|
26
26
|
},
|
|
27
27
|
contentSelectors: { $ref: '#/definitions/selectors' },
|
|
28
|
-
|
|
28
|
+
insignificantContentSelectors: { $ref: '#/definitions/selectors' },
|
|
29
29
|
filters: {
|
|
30
30
|
type: 'array',
|
|
31
31
|
items: {
|
|
@@ -6,8 +6,8 @@ import { expect } from 'chai';
|
|
|
6
6
|
import config from 'config';
|
|
7
7
|
import jsonSourceMap from 'json-source-map';
|
|
8
8
|
|
|
9
|
+
import extract from '../../../src/archivist/extract/index.js';
|
|
9
10
|
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from '../../../src/archivist/fetcher/index.js';
|
|
10
|
-
import filter from '../../../src/archivist/filter/index.js';
|
|
11
11
|
import * as services from '../../../src/archivist/services/index.js';
|
|
12
12
|
import DeclarationUtils from '../utils/index.js';
|
|
13
13
|
|
|
@@ -25,8 +25,8 @@ const instancePath = path.resolve(declarationsPath, '../');
|
|
|
25
25
|
export default async options => {
|
|
26
26
|
const schemaOnly = options.schemaOnly || false;
|
|
27
27
|
let servicesToValidate = options.services || [];
|
|
28
|
-
const
|
|
29
|
-
let
|
|
28
|
+
const termsTypes = options.types || [];
|
|
29
|
+
let servicesTermsTypes = {};
|
|
30
30
|
|
|
31
31
|
const serviceDeclarations = await services.loadWithHistory(servicesToValidate);
|
|
32
32
|
|
|
@@ -37,7 +37,7 @@ export default async options => {
|
|
|
37
37
|
if (options.modified) {
|
|
38
38
|
const declarationUtils = new DeclarationUtils(instancePath);
|
|
39
39
|
|
|
40
|
-
({ services: servicesToValidate,
|
|
40
|
+
({ services: servicesToValidate, servicesTermsTypes } = await declarationUtils.getModifiedServiceTermsTypes());
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
describe('Service declarations validation', async function () {
|
|
@@ -76,61 +76,57 @@ export default async options => {
|
|
|
76
76
|
}
|
|
77
77
|
|
|
78
78
|
if (!schemaOnly && service) {
|
|
79
|
-
service.
|
|
80
|
-
.filter(
|
|
81
|
-
if (
|
|
82
|
-
return
|
|
79
|
+
service.getTermsTypes()
|
|
80
|
+
.filter(termsType => {
|
|
81
|
+
if (servicesTermsTypes[serviceId] && servicesTermsTypes[serviceId].length > 0) {
|
|
82
|
+
return servicesTermsTypes[serviceId].includes(termsType);
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
-
if (
|
|
86
|
-
return
|
|
85
|
+
if (termsTypes.length > 0) {
|
|
86
|
+
return termsTypes.includes(termsType);
|
|
87
87
|
}
|
|
88
88
|
|
|
89
89
|
return true;
|
|
90
90
|
})
|
|
91
91
|
.forEach(type => {
|
|
92
92
|
describe(type, () => {
|
|
93
|
-
const
|
|
93
|
+
const terms = service.getTerms(type);
|
|
94
94
|
|
|
95
|
-
|
|
96
|
-
let content;
|
|
95
|
+
terms.sourceDocuments.forEach(sourceDocument => {
|
|
97
96
|
let filteredContent;
|
|
98
|
-
let mimeType;
|
|
99
97
|
|
|
100
|
-
context(
|
|
98
|
+
context(sourceDocument.location, () => {
|
|
101
99
|
before(async function () {
|
|
102
|
-
if (!
|
|
100
|
+
if (!terms) {
|
|
103
101
|
console.log(' (Tests skipped as declaration has been archived)');
|
|
104
102
|
this.skip();
|
|
105
103
|
}
|
|
106
104
|
});
|
|
107
105
|
|
|
108
106
|
it('fetchable URL', async () => {
|
|
109
|
-
const { location, executeClientScripts } =
|
|
110
|
-
|
|
107
|
+
const { location, executeClientScripts } = sourceDocument;
|
|
108
|
+
|
|
109
|
+
({ content: sourceDocument.content, mimeType: sourceDocument.mimeType } = await fetch({
|
|
111
110
|
url: location,
|
|
112
111
|
executeClientScripts,
|
|
113
|
-
cssSelectors:
|
|
112
|
+
cssSelectors: sourceDocument.cssSelectors,
|
|
114
113
|
config: config.get('fetcher'),
|
|
115
|
-
});
|
|
116
|
-
|
|
117
|
-
content = document.content;
|
|
118
|
-
mimeType = document.mimeType;
|
|
114
|
+
}));
|
|
119
115
|
});
|
|
120
116
|
|
|
121
|
-
it('selector matches an element in the
|
|
122
|
-
if (!content) {
|
|
117
|
+
it('selector matches an element in the source document', async function checkSelector() {
|
|
118
|
+
if (!sourceDocument.content) {
|
|
123
119
|
console.log(' [Tests skipped as URL is not fetchable]');
|
|
124
120
|
this.skip();
|
|
125
121
|
}
|
|
126
122
|
|
|
127
|
-
filteredContent = await
|
|
123
|
+
filteredContent = await extract(sourceDocument);
|
|
128
124
|
|
|
129
125
|
expect(filteredContent).to.not.be.empty;
|
|
130
126
|
});
|
|
131
127
|
|
|
132
128
|
it(`filtered content has at least ${MIN_DOC_LENGTH} characters`, async function checkContentLength() {
|
|
133
|
-
if (!content) {
|
|
129
|
+
if (!sourceDocument.content) {
|
|
134
130
|
console.log(' [Tests skipped as URL is not fetchable]');
|
|
135
131
|
this.skip();
|
|
136
132
|
}
|
|
@@ -146,7 +142,7 @@ export default async options => {
|
|
|
146
142
|
it('content is consistent when fetched and filtered twice in a row', async function checkContentConsistency() {
|
|
147
143
|
this.slow(SLOW_DOCUMENT_THRESHOLD * 2);
|
|
148
144
|
|
|
149
|
-
if (!content) {
|
|
145
|
+
if (!sourceDocument.content) {
|
|
150
146
|
console.log(' [Tests skipped as URL is not fetchable]');
|
|
151
147
|
this.skip();
|
|
152
148
|
}
|
|
@@ -156,13 +152,13 @@ export default async options => {
|
|
|
156
152
|
this.skip();
|
|
157
153
|
}
|
|
158
154
|
|
|
159
|
-
|
|
160
|
-
url:
|
|
161
|
-
executeClientScripts:
|
|
162
|
-
cssSelectors:
|
|
155
|
+
({ content: sourceDocument.content, mimeType: sourceDocument.mimeType } = await fetch({
|
|
156
|
+
url: sourceDocument.location,
|
|
157
|
+
executeClientScripts: sourceDocument.executeClientScripts,
|
|
158
|
+
cssSelectors: sourceDocument.cssSelectors,
|
|
163
159
|
config: config.get('fetcher'),
|
|
164
|
-
});
|
|
165
|
-
const secondFilteredContent = await
|
|
160
|
+
}));
|
|
161
|
+
const secondFilteredContent = await extract(sourceDocument);
|
|
166
162
|
|
|
167
163
|
expect(secondFilteredContent).to.equal(filteredContent);
|
|
168
164
|
});
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import
|
|
1
|
+
import TERMS_TYPES from '@opentermsarchive/terms-types';
|
|
2
2
|
|
|
3
3
|
import definitions from './definitions.js';
|
|
4
4
|
|
|
5
|
-
const AVAILABLE_TYPES_NAME = Object.keys(
|
|
5
|
+
const AVAILABLE_TYPES_NAME = Object.keys(TERMS_TYPES);
|
|
6
6
|
|
|
7
|
-
const
|
|
7
|
+
const termsProperties = () => {
|
|
8
8
|
const result = {};
|
|
9
9
|
|
|
10
10
|
AVAILABLE_TYPES_NAME.forEach(type => {
|
|
@@ -12,8 +12,8 @@ const documentsProperties = () => {
|
|
|
12
12
|
type: 'array',
|
|
13
13
|
items: {
|
|
14
14
|
oneOf: [
|
|
15
|
-
{ $ref: '#/definitions/
|
|
16
|
-
{ $ref: '#/definitions/
|
|
15
|
+
{ $ref: '#/definitions/singleSourceDocumentTermsHistory' },
|
|
16
|
+
{ $ref: '#/definitions/multipleSourceDocumentsTermsHistory' },
|
|
17
17
|
{ $ref: '#/definitions/pdfDocumentHistory' },
|
|
18
18
|
],
|
|
19
19
|
},
|
|
@@ -27,7 +27,7 @@ const schema = {
|
|
|
27
27
|
type: 'object',
|
|
28
28
|
additionalProperties: false,
|
|
29
29
|
title: 'Service declaration history',
|
|
30
|
-
properties:
|
|
30
|
+
properties: termsProperties(),
|
|
31
31
|
propertyNames: { enum: AVAILABLE_TYPES_NAME },
|
|
32
32
|
definitions: {
|
|
33
33
|
...definitions,
|
|
@@ -40,7 +40,7 @@ const schema = {
|
|
|
40
40
|
validUntil: { $ref: '#/definitions/validUntil' },
|
|
41
41
|
},
|
|
42
42
|
},
|
|
43
|
-
|
|
43
|
+
singleSourceDocumentTermsHistory: {
|
|
44
44
|
type: 'object',
|
|
45
45
|
additionalProperties: false,
|
|
46
46
|
required: [ 'fetch', 'select', 'validUntil' ],
|
|
@@ -48,12 +48,12 @@ const schema = {
|
|
|
48
48
|
fetch: { $ref: '#/definitions/location' },
|
|
49
49
|
select: { $ref: '#/definitions/contentSelectors' },
|
|
50
50
|
filter: { $ref: '#/definitions/filters' },
|
|
51
|
-
remove: { $ref: '#/definitions/
|
|
51
|
+
remove: { $ref: '#/definitions/insignificantContentSelectors' },
|
|
52
52
|
executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
|
|
53
53
|
validUntil: { $ref: '#/definitions/validUntil' },
|
|
54
54
|
},
|
|
55
55
|
},
|
|
56
|
-
|
|
56
|
+
multipleSourceDocumentsTermsHistory: {
|
|
57
57
|
type: 'object',
|
|
58
58
|
additionalProperties: false,
|
|
59
59
|
required: ['combine'],
|
|
@@ -68,14 +68,14 @@ const schema = {
|
|
|
68
68
|
fetch: { $ref: '#/definitions/location' },
|
|
69
69
|
select: { $ref: '#/definitions/contentSelectors' },
|
|
70
70
|
filter: { $ref: '#/definitions/filters' },
|
|
71
|
-
remove: { $ref: '#/definitions/
|
|
71
|
+
remove: { $ref: '#/definitions/insignificantContentSelectors' },
|
|
72
72
|
executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
|
|
73
73
|
},
|
|
74
74
|
},
|
|
75
75
|
},
|
|
76
76
|
select: { $ref: '#/definitions/contentSelectors' },
|
|
77
77
|
filter: { $ref: '#/definitions/filters' },
|
|
78
|
-
remove: { $ref: '#/definitions/
|
|
78
|
+
remove: { $ref: '#/definitions/insignificantContentSelectors' },
|
|
79
79
|
executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
|
|
80
80
|
validUntil: { $ref: '#/definitions/validUntil' },
|
|
81
81
|
},
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
import
|
|
1
|
+
import TERMS_TYPES from '@opentermsarchive/terms-types';
|
|
2
2
|
|
|
3
3
|
import definitions from './definitions.js';
|
|
4
4
|
|
|
5
|
-
const AVAILABLE_TYPES_NAME = Object.keys(
|
|
5
|
+
const AVAILABLE_TYPES_NAME = Object.keys(TERMS_TYPES);
|
|
6
6
|
|
|
7
|
-
const
|
|
7
|
+
const termsProperties = () => {
|
|
8
8
|
const result = {};
|
|
9
9
|
|
|
10
10
|
AVAILABLE_TYPES_NAME.forEach(type => {
|
|
11
11
|
result[type] = {
|
|
12
12
|
oneOf: [
|
|
13
|
-
{ $ref: '#/definitions/
|
|
14
|
-
{ $ref: '#/definitions/
|
|
13
|
+
{ $ref: '#/definitions/singleSourceDocumentTerms' },
|
|
14
|
+
{ $ref: '#/definitions/multipleSourceDocumentsTerms' },
|
|
15
15
|
{ $ref: '#/definitions/pdfDocument' },
|
|
16
16
|
],
|
|
17
17
|
};
|
|
@@ -33,7 +33,7 @@ const schema = {
|
|
|
33
33
|
},
|
|
34
34
|
documents: {
|
|
35
35
|
type: 'object',
|
|
36
|
-
properties:
|
|
36
|
+
properties: termsProperties(),
|
|
37
37
|
propertyNames: { enum: AVAILABLE_TYPES_NAME },
|
|
38
38
|
},
|
|
39
39
|
importedFrom: {
|
|
@@ -52,7 +52,7 @@ const schema = {
|
|
|
52
52
|
required: ['fetch'],
|
|
53
53
|
properties: { fetch: { $ref: '#/definitions/pdfLocation' } },
|
|
54
54
|
},
|
|
55
|
-
|
|
55
|
+
sourceDocument: {
|
|
56
56
|
type: 'object',
|
|
57
57
|
additionalProperties: false,
|
|
58
58
|
required: ['fetch'],
|
|
@@ -60,28 +60,28 @@ const schema = {
|
|
|
60
60
|
fetch: { $ref: '#/definitions/location' },
|
|
61
61
|
select: { $ref: '#/definitions/contentSelectors' },
|
|
62
62
|
filter: { $ref: '#/definitions/filters' },
|
|
63
|
-
remove: { $ref: '#/definitions/
|
|
63
|
+
remove: { $ref: '#/definitions/insignificantContentSelectors' },
|
|
64
64
|
executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
|
|
65
65
|
},
|
|
66
66
|
},
|
|
67
|
-
|
|
67
|
+
singleSourceDocumentTerms: {
|
|
68
68
|
allOf: [
|
|
69
|
-
{ $ref: '#/definitions/
|
|
69
|
+
{ $ref: '#/definitions/sourceDocument' },
|
|
70
70
|
{ required: [ 'fetch', 'select' ] },
|
|
71
71
|
],
|
|
72
72
|
},
|
|
73
|
-
|
|
73
|
+
multipleSourceDocumentsTerms: {
|
|
74
74
|
type: 'object',
|
|
75
75
|
additionalProperties: false,
|
|
76
76
|
required: ['combine'],
|
|
77
77
|
properties: {
|
|
78
78
|
combine: {
|
|
79
79
|
type: 'array',
|
|
80
|
-
items: { $ref: '#/definitions/
|
|
80
|
+
items: { $ref: '#/definitions/sourceDocument' },
|
|
81
81
|
},
|
|
82
82
|
select: { $ref: '#/definitions/contentSelectors' },
|
|
83
83
|
filter: { $ref: '#/definitions/filters' },
|
|
84
|
-
remove: { $ref: '#/definitions/
|
|
84
|
+
remove: { $ref: '#/definitions/insignificantContentSelectors' },
|
|
85
85
|
executeClientScripts: { $ref: '#/definitions/executeClientScripts' },
|
|
86
86
|
},
|
|
87
87
|
},
|
|
@@ -145,10 +145,10 @@ async function rewriteSnapshots(repository, records, idsMapping, logger) {
|
|
|
145
145
|
idsMapping[record.id] = recordId; // Saves the mapping between the old ID and the new one.
|
|
146
146
|
|
|
147
147
|
if (recordId) {
|
|
148
|
-
logger.info({ message: `Migrated snapshot with new ID: ${recordId}`, serviceId: record.serviceId, type: record.
|
|
148
|
+
logger.info({ message: `Migrated snapshot with new ID: ${recordId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
|
|
149
149
|
counters.migrated++;
|
|
150
150
|
} else {
|
|
151
|
-
logger.info({ message: 'Skipped snapshot', serviceId: record.serviceId, type: record.
|
|
151
|
+
logger.info({ message: 'Skipped snapshot', serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
|
|
152
152
|
counters.skipped++;
|
|
153
153
|
}
|
|
154
154
|
}
|
|
@@ -169,10 +169,10 @@ async function rewriteVersions(repository, records, idsMapping, logger) {
|
|
|
169
169
|
const { id: recordId } = await repository.save(record); // eslint-disable-line no-await-in-loop
|
|
170
170
|
|
|
171
171
|
if (recordId) {
|
|
172
|
-
logger.info({ message: `Migrated version with new ID: ${recordId}`, serviceId: record.serviceId, type: record.
|
|
172
|
+
logger.info({ message: `Migrated version with new ID: ${recordId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
|
|
173
173
|
counters.migrated++;
|
|
174
174
|
} else {
|
|
175
|
-
logger.info({ message: 'Skipped version', serviceId: record.serviceId, type: record.
|
|
175
|
+
logger.info({ message: 'Skipped version', serviceId: record.serviceId, type: record.termsType, id: record.id, current: i++, total: records.length });
|
|
176
176
|
counters.skipped++;
|
|
177
177
|
}
|
|
178
178
|
}
|
|
@@ -47,9 +47,9 @@ const ROOT_PATH = path.resolve(__dirname, '../../');
|
|
|
47
47
|
const { id: recordId } = await versionsTargetRepository.save(record);
|
|
48
48
|
|
|
49
49
|
if (!recordId) {
|
|
50
|
-
logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.
|
|
50
|
+
logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.termsType, id: record.id, current, total });
|
|
51
51
|
} else {
|
|
52
|
-
logger.info({ message: `Update short sha ${record.snapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.
|
|
52
|
+
logger.info({ message: `Update short sha ${record.snapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.termsType, id: record.id, current, total });
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
current++;
|
package/scripts/import/index.js
CHANGED
|
@@ -18,7 +18,7 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
|
18
18
|
const ROOT_PATH = path.resolve(__dirname, '../../');
|
|
19
19
|
const MAX_PARALLEL = 10;
|
|
20
20
|
const MAX_RETRY = 5;
|
|
21
|
-
const PDF_MIME_TYPE = '
|
|
21
|
+
const PDF_MIME_TYPE = mime.getType('pdf');
|
|
22
22
|
const COUNTERS = {
|
|
23
23
|
imported: 0,
|
|
24
24
|
skippedNoChanges: 0,
|
|
@@ -87,10 +87,10 @@ function queueErrorHandler(error, { commit }) {
|
|
|
87
87
|
|
|
88
88
|
const serviceId = path.dirname(relativeFilePath);
|
|
89
89
|
const extension = path.extname(relativeFilePath);
|
|
90
|
-
const
|
|
90
|
+
const termsType = path.basename(relativeFilePath, extension);
|
|
91
91
|
|
|
92
92
|
commitsNotImported.push(commit.hash);
|
|
93
|
-
logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type:
|
|
93
|
+
logger.error({ message: `${error.stack}\nCommit details: ${JSON.stringify(commit, null, 2)}`, serviceId, type: termsType, sha: commit.hash });
|
|
94
94
|
COUNTERS.errors++;
|
|
95
95
|
}
|
|
96
96
|
|
|
@@ -117,9 +117,9 @@ function queueDrainHandler(totalToTreat) {
|
|
|
117
117
|
};
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
-
async function getCommitContent({ sha, serviceId,
|
|
120
|
+
async function getCommitContent({ sha, serviceId, termsType, extension }) {
|
|
121
121
|
const start = performance.now();
|
|
122
|
-
const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(
|
|
122
|
+
const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(termsType)}.${extension}`;
|
|
123
123
|
const response = await nodeFetch(url);
|
|
124
124
|
const end = performance.now();
|
|
125
125
|
|
|
@@ -141,7 +141,7 @@ async function getCommitContent({ sha, serviceId, documentType, extension }) {
|
|
|
141
141
|
throw new TooManyRequestsError(`Cannot get commit content on Github ${url}. 429: Too Many Requests`);
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
-
logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type:
|
|
144
|
+
logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType, sha });
|
|
145
145
|
|
|
146
146
|
return content;
|
|
147
147
|
}
|
|
@@ -151,12 +151,12 @@ async function handleCommit(commit, index, total) {
|
|
|
151
151
|
|
|
152
152
|
let serviceId = path.dirname(relativeFilePath);
|
|
153
153
|
const extension = path.extname(relativeFilePath);
|
|
154
|
-
let
|
|
154
|
+
let termsType = path.basename(relativeFilePath, extension);
|
|
155
155
|
|
|
156
156
|
logger.info({
|
|
157
157
|
message: 'Start to handle commit',
|
|
158
158
|
serviceId,
|
|
159
|
-
type:
|
|
159
|
+
type: termsType,
|
|
160
160
|
sha: commit.hash,
|
|
161
161
|
current: index + 1,
|
|
162
162
|
total,
|
|
@@ -168,7 +168,7 @@ async function handleCommit(commit, index, total) {
|
|
|
168
168
|
logger.info({
|
|
169
169
|
message: 'Skipped commit as an entry already exists for this commit',
|
|
170
170
|
serviceId,
|
|
171
|
-
type:
|
|
171
|
+
type: termsType,
|
|
172
172
|
sha: commit.hash,
|
|
173
173
|
});
|
|
174
174
|
COUNTERS.skippedNoChanges++;
|
|
@@ -176,9 +176,9 @@ async function handleCommit(commit, index, total) {
|
|
|
176
176
|
return;
|
|
177
177
|
}
|
|
178
178
|
|
|
179
|
-
let content = await getCommitContent({ sha: commit.hash, serviceId,
|
|
179
|
+
let content = await getCommitContent({ sha: commit.hash, serviceId, termsType, extension: extension.replace('.', '') });
|
|
180
180
|
|
|
181
|
-
({ serviceId,
|
|
181
|
+
({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
|
|
182
182
|
|
|
183
183
|
const mimeType = mime.getType(extension);
|
|
184
184
|
|
|
@@ -198,7 +198,7 @@ async function handleCommit(commit, index, total) {
|
|
|
198
198
|
|
|
199
199
|
await snapshotsCollection.insertOne({
|
|
200
200
|
serviceId,
|
|
201
|
-
|
|
201
|
+
termsType,
|
|
202
202
|
content,
|
|
203
203
|
mimeType,
|
|
204
204
|
fetchDate: commit.date,
|
|
@@ -207,10 +207,10 @@ async function handleCommit(commit, index, total) {
|
|
|
207
207
|
});
|
|
208
208
|
const end = performance.now();
|
|
209
209
|
|
|
210
|
-
logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type:
|
|
210
|
+
logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: termsType });
|
|
211
211
|
COUNTERS.imported++;
|
|
212
212
|
} catch (error) {
|
|
213
|
-
logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type:
|
|
213
|
+
logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: termsType });
|
|
214
214
|
commitsNotImported.push(commit.hash);
|
|
215
215
|
COUNTERS.errors++;
|
|
216
216
|
}
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"git": {
|
|
6
6
|
"path": "./data/versions",
|
|
7
7
|
"publish": false,
|
|
8
|
-
"prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/
|
|
8
|
+
"prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/contrib-snapshots/commit/",
|
|
9
9
|
"author": {
|
|
10
10
|
"name": "Open Terms Archive Bot",
|
|
11
11
|
"email": "bot@opentermsarchive.org"
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"git": {
|
|
6
6
|
"path": "./data/versions-rewritten",
|
|
7
7
|
"publish": false,
|
|
8
|
-
"prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/
|
|
8
|
+
"prefixMessageToSnapshotId": "This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/contrib-snapshots/commit/",
|
|
9
9
|
"author": {
|
|
10
10
|
"name": "Open Terms Archive Bot",
|
|
11
11
|
"email": "bot@opentermsarchive.org"
|
|
@@ -76,13 +76,13 @@ let recorder;
|
|
|
76
76
|
const { content, mimeType } = await loadFile(SNAPSHOTS_SOURCE_PATH, relativeFilePath);
|
|
77
77
|
|
|
78
78
|
let serviceId = path.dirname(relativeFilePath);
|
|
79
|
-
let
|
|
79
|
+
let termsType = path.basename(relativeFilePath, path.extname(relativeFilePath));
|
|
80
80
|
|
|
81
|
-
({ serviceId,
|
|
81
|
+
({ serviceId, termsType } = renamer.applyRules(serviceId, termsType));
|
|
82
82
|
|
|
83
83
|
const { id: snapshotId } = await recorder.recordSnapshot({
|
|
84
84
|
serviceId,
|
|
85
|
-
|
|
85
|
+
termsType,
|
|
86
86
|
content,
|
|
87
87
|
mimeType,
|
|
88
88
|
fetchDate: commit.date,
|