@opentermsarchive/engine 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +3 -0
- package/.eslintrc.yaml +116 -0
- package/.github/workflows/deploy.yml +50 -0
- package/.github/workflows/release.yml +64 -0
- package/.github/workflows/test.yml +77 -0
- package/CHANGELOG.md +14 -0
- package/CODE_OF_CONDUCT.md +128 -0
- package/CONTRIBUTING.md +143 -0
- package/LICENSE +153 -0
- package/MIGRATING.md +42 -0
- package/README.fr.md +110 -0
- package/README.md +438 -0
- package/Vagrantfile +38 -0
- package/ansible.cfg +13 -0
- package/bin/.env.js +1 -0
- package/bin/lint-declarations.js +31 -0
- package/bin/track.js +26 -0
- package/bin/validate-declarations.js +68 -0
- package/config/ci.json +5 -0
- package/config/contrib.json +35 -0
- package/config/dating.json +37 -0
- package/config/default.json +71 -0
- package/config/france.json +40 -0
- package/config/p2b-compliance.json +40 -0
- package/config/pga.json +40 -0
- package/config/production.json +27 -0
- package/config/test.json +49 -0
- package/config/vagrant.json +24 -0
- package/decision-records/0001-service-name-and-id.md +73 -0
- package/decision-records/0002-service-history.md +212 -0
- package/decision-records/0003-snapshots-database.md +123 -0
- package/ops/README.md +280 -0
- package/ops/app.yml +5 -0
- package/ops/infra.yml +6 -0
- package/ops/inventories/dev.yml +7 -0
- package/ops/inventories/production.yml +27 -0
- package/ops/roles/infra/defaults/main.yml +2 -0
- package/ops/roles/infra/files/.gitconfig +3 -0
- package/ops/roles/infra/files/mongod.conf +18 -0
- package/ops/roles/infra/files/ota-bot-key.private_key +26 -0
- package/ops/roles/infra/tasks/main.yml +78 -0
- package/ops/roles/infra/tasks/mongo.yml +40 -0
- package/ops/roles/infra/templates/ssh_config.j2 +5 -0
- package/ops/roles/ota/defaults/main.yml +14 -0
- package/ops/roles/ota/files/.env +21 -0
- package/ops/roles/ota/tasks/database.yml +65 -0
- package/ops/roles/ota/tasks/main.yml +110 -0
- package/ops/site.yml +6 -0
- package/package.json +101 -0
- package/pm2.config.cjs +20 -0
- package/scripts/dataset/README.md +37 -0
- package/scripts/dataset/assets/LICENSE +540 -0
- package/scripts/dataset/assets/README.template.js +65 -0
- package/scripts/dataset/export/index.js +106 -0
- package/scripts/dataset/export/index.test.js +155 -0
- package/scripts/dataset/export/test/fixtures/dataset/LICENSE +540 -0
- package/scripts/dataset/export/test/fixtures/dataset/README.md +40 -0
- package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-01T11-27-00Z.md +1 -0
- package/scripts/dataset/export/test/fixtures/dataset/ServiceA/Terms of Service/2021-01-11T11-32-47Z.md +1 -0
- package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Privacy Policy/2022-01-01T12-12-24Z.md +1 -0
- package/scripts/dataset/export/test/fixtures/dataset/ServiceB/Terms of Service/2022-01-06T11-32-47Z.md +1 -0
- package/scripts/dataset/index.js +40 -0
- package/scripts/dataset/logger/index.js +17 -0
- package/scripts/dataset/main.js +25 -0
- package/scripts/dataset/publish/index.js +39 -0
- package/scripts/declarations/lint/index.js +36 -0
- package/scripts/declarations/utils/index.js +81 -0
- package/scripts/declarations/validate/definitions.js +63 -0
- package/scripts/declarations/validate/index.mocha.js +262 -0
- package/scripts/declarations/validate/service.history.schema.js +86 -0
- package/scripts/declarations/validate/service.schema.js +91 -0
- package/scripts/history/logger/index.js +39 -0
- package/scripts/history/migrate-services.js +212 -0
- package/scripts/history/update-to-full-hash.js +61 -0
- package/scripts/history/utils/index.js +23 -0
- package/scripts/import/README.md +59 -0
- package/scripts/import/config/import.json +12 -0
- package/scripts/import/index.js +224 -0
- package/scripts/import/loadCommits.js +66 -0
- package/scripts/import/logger/index.js +43 -0
- package/scripts/rewrite/README.md +131 -0
- package/scripts/rewrite/config/rewrite-snapshots.json +32 -0
- package/scripts/rewrite/config/rewrite-versions.json +32 -0
- package/scripts/rewrite/initializer/files/license +428 -0
- package/scripts/rewrite/initializer/files/readme.md +8 -0
- package/scripts/rewrite/initializer/index.js +44 -0
- package/scripts/rewrite/rewrite-snapshots.js +108 -0
- package/scripts/rewrite/rewrite-versions.js +160 -0
- package/scripts/rewrite/utils.js +33 -0
- package/scripts/utils/renamer/README.md +49 -0
- package/scripts/utils/renamer/index.js +45 -0
- package/scripts/utils/renamer/rules/documentTypes.json +25 -0
- package/scripts/utils/renamer/rules/documentTypesByService.json +170 -0
- package/scripts/utils/renamer/rules/serviceNames.json +92 -0
- package/src/archivist/errors.js +9 -0
- package/src/archivist/fetcher/errors.js +6 -0
- package/src/archivist/fetcher/exports.js +18 -0
- package/src/archivist/fetcher/fullDomFetcher.js +84 -0
- package/src/archivist/fetcher/htmlOnlyFetcher.js +62 -0
- package/src/archivist/fetcher/index.js +35 -0
- package/src/archivist/fetcher/index.test.js +239 -0
- package/src/archivist/filter/exports.js +3 -0
- package/src/archivist/filter/index.js +178 -0
- package/src/archivist/filter/index.test.js +561 -0
- package/src/archivist/index.js +276 -0
- package/src/archivist/index.test.js +600 -0
- package/src/archivist/recorder/index.js +77 -0
- package/src/archivist/recorder/index.test.js +463 -0
- package/src/archivist/recorder/record.js +35 -0
- package/src/archivist/recorder/record.test.js +91 -0
- package/src/archivist/recorder/repositories/factory.js +23 -0
- package/src/archivist/recorder/repositories/git/dataMapper.js +83 -0
- package/src/archivist/recorder/repositories/git/git.js +122 -0
- package/src/archivist/recorder/repositories/git/git.test.js +86 -0
- package/src/archivist/recorder/repositories/git/index.js +182 -0
- package/src/archivist/recorder/repositories/git/index.test.js +714 -0
- package/src/archivist/recorder/repositories/interface.js +108 -0
- package/src/archivist/recorder/repositories/mongo/dataMapper.js +32 -0
- package/src/archivist/recorder/repositories/mongo/index.js +121 -0
- package/src/archivist/recorder/repositories/mongo/index.test.js +721 -0
- package/src/archivist/services/documentDeclaration.js +26 -0
- package/src/archivist/services/documentDeclaration.test.js +85 -0
- package/src/archivist/services/documentTypes.json +386 -0
- package/src/archivist/services/index.js +255 -0
- package/src/archivist/services/index.test.js +327 -0
- package/src/archivist/services/pageDeclaration.js +51 -0
- package/src/archivist/services/pageDeclaration.test.js +224 -0
- package/src/archivist/services/service.js +60 -0
- package/src/archivist/services/service.test.js +164 -0
- package/src/exports.js +3 -0
- package/src/index.js +59 -0
- package/src/logger/README.md +1 -0
- package/src/logger/index.js +131 -0
- package/src/main.js +18 -0
- package/src/notifier/README.md +1 -0
- package/src/notifier/index.js +150 -0
- package/src/tracker/README.md +1 -0
- package/src/tracker/index.js +215 -0
- package/test/fixtures/service_A.js +22 -0
- package/test/fixtures/service_A_terms.md +10 -0
- package/test/fixtures/service_A_terms_snapshot.html +14 -0
- package/test/fixtures/service_B.js +22 -0
- package/test/fixtures/service_with_declaration_history.js +65 -0
- package/test/fixtures/service_with_filters_history.js +155 -0
- package/test/fixtures/service_with_history.js +188 -0
- package/test/fixtures/service_with_multipage_document.js +100 -0
- package/test/fixtures/service_without_history.js +31 -0
- package/test/fixtures/services.js +19 -0
- package/test/fixtures/terms.pdf +0 -0
- package/test/fixtures/termsFromPDF.md +25 -0
- package/test/fixtures/termsModified.pdf +0 -0
- package/test/services/service_A.json +9 -0
- package/test/services/service_B.json +9 -0
- package/test/services/service_with_declaration_history.filters.js +7 -0
- package/test/services/service_with_declaration_history.history.json +17 -0
- package/test/services/service_with_declaration_history.json +13 -0
- package/test/services/service_with_filters_history.filters.history.js +29 -0
- package/test/services/service_with_filters_history.filters.js +7 -0
- package/test/services/service_with_filters_history.json +13 -0
- package/test/services/service_with_history.filters.history.js +29 -0
- package/test/services/service_with_history.filters.js +7 -0
- package/test/services/service_with_history.history.json +26 -0
- package/test/services/service_with_history.json +17 -0
- package/test/services/service_with_multipage_document.filters.js +7 -0
- package/test/services/service_with_multipage_document.history.json +37 -0
- package/test/services/service_with_multipage_document.json +28 -0
- package/test/services/service_without_history.filters.js +7 -0
- package/test/services/service_without_history.json +13 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import winston from 'winston';
|
|
2
|
+
|
|
3
|
+
import logger from '../../../src/logger/index.js';
|
|
4
|
+
|
|
5
|
+
const { combine, timestamp, printf, colorize } = winston.format;
|
|
6
|
+
|
|
7
|
+
/**
 * Winston format used by the history scripts.
 *
 * Each line is rendered as `<timestamp> <level> <prefix><message>`, where the
 * prefix is made of fixed-width columns: progress percentage (8 characters,
 * blank when `current`/`total` are not both truthy), then — each only when
 * provided — the service ID (30), the document type (50, shortened with an
 * ellipsis when longer than 50 characters) and the record ID (42).
 */
export const format = combine(
  colorize(),
  timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
  printf(({ level, message, timestamp, serviceId, type, id, current, total }) => {
    // Progress column: always 8 characters wide, even when there is no progress to show.
    const progress = current && total
      ? `${Number((current / total) * 100).toFixed(2)}%`
      : '';
    const columns = [progress.padEnd(8)];

    if (serviceId) {
      columns.push(`${serviceId}`.padEnd(30));
    }

    if (type) {
      // Keep the type column aligned by shortening overly long document types.
      const label = type.length > 50 ? `${type.substring(0, 48)}…` : type;

      columns.push(`${label}`.padEnd(50));
    }

    if (id) {
      columns.push(`${id}`.padEnd(42));
    }

    return `${timestamp} ${level.padEnd(15)} ${columns.join('')}${message}`;
  }),
);
|
|
36
|
+
|
|
37
|
+
logger.format = format;
|
|
38
|
+
|
|
39
|
+
export default logger;
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import fsApi from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
|
|
5
|
+
import config from 'config';
|
|
6
|
+
import winston from 'winston';
|
|
7
|
+
|
|
8
|
+
import GitRepository from '../../src/archivist/recorder/repositories/git/index.js';
|
|
9
|
+
|
|
10
|
+
import { format } from './logger/index.js';
|
|
11
|
+
import { importReadmeInGit } from './utils/index.js';
|
|
12
|
+
|
|
13
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
14
|
+
const ROOT_PATH = path.resolve(__dirname, '../../');
|
|
15
|
+
const fs = fsApi.promises;
|
|
16
|
+
|
|
17
|
+
// Hard-coded settings of the migration:
// - `servicesToMigrate`: IDs of the services whose records are taken out of the `from` repositories and added to the `to` ones;
// - `from` / `to`: names of the snapshots and versions repositories (resolved under `./data` by `main`), and the
//   `prefixMessageToSnapshotId` option handed to the corresponding versions destination repository.
const CONFIG = {
  servicesToMigrate: [ 'ASICS', 'Amazon', 'Orange Money France' ],
  from: {
    snapshots: 'france-snapshots',
    versions: 'france-versions-hash-updated',
    prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-snapshots/commit/',
  },
  to: {
    snapshots: 'france-elections-snapshots',
    versions: 'france-elections-versions-hash-updated',
    prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-elections-snapshots/commit/',
  },
};

// Totals updated by `rewriteSnapshots` and `rewriteVersions`, and printed by `main` once everything has been rewritten.
const counters = {
  migrated: 0,
  skipped: 0,
};
|
|
35
|
+
|
|
36
|
+
/**
 * Entry point of the services migration.
 *
 * Reads all records of the `from` and `to` repositories, then rewrites them
 * into `-migrated` destination repositories: records of the services listed in
 * `CONFIG.servicesToMigrate` are left out of the `from` destinations and merged
 * (sorted by fetch date) into the `to` destinations. The mapping between old
 * and new snapshot IDs is written to `ids-mapping.json` next to this script.
 */
(async function main() {
  console.time('Total time');

  // Builds the `{ source, destination, logger }` entry for one repository.
  // `kind` is `snapshots` or `versions`; `name` is the repository directory name under `./data`.
  const buildEntry = (kind, name, destinationExtras = {}) => ({
    source: new GitRepository({
      ...config.get(`recorder.${kind}.storage.git`),
      path: path.resolve(ROOT_PATH, `./data/${name}`),
    }),
    destination: new GitRepository({
      ...config.get(`recorder.${kind}.storage.git`),
      path: path.resolve(ROOT_PATH, `./data/${name}-migrated`),
      ...destinationExtras,
    }),
    logger: winston.createLogger({
      transports: [
        new winston.transports.File({ filename: `${__dirname}/logs/${name}.log` }),
        new winston.transports.Console(),
      ],
      format,
    }),
  });

  const migration = {
    services: CONFIG.servicesToMigrate,
    from: {
      snapshots: buildEntry('snapshots', CONFIG.from.snapshots),
      versions: buildEntry('versions', CONFIG.from.versions, { prefixMessageToSnapshotId: CONFIG.from.prefixMessageToSnapshotId }),
    },
    to: {
      snapshots: buildEntry('snapshots', CONFIG.to.snapshots),
      versions: buildEntry('versions', CONFIG.to.versions, { prefixMessageToSnapshotId: CONFIG.to.prefixMessageToSnapshotId }),
    },
  };

  await initialize(migration);

  const belongsToMigratedService = ({ serviceId }) => migration.services.includes(serviceId);
  const staysInPlace = record => !belongsToMigratedService(record);
  const byFetchDate = (recordA, recordB) => new Date(recordA.fetchDate) - new Date(recordB.fetchDate);

  const fromSnapshotsRecords = await migration.from.snapshots.source.findAll();
  const toSnapshotsRecords = await migration.to.snapshots.source.findAll();
  const snapshotsToMigrate = fromSnapshotsRecords.filter(belongsToMigratedService);
  const fromSnapshotsRecordsToRewrite = fromSnapshotsRecords.filter(staysInPlace);
  const toSnapshotsRecordsMigrated = [ ...toSnapshotsRecords, ...snapshotsToMigrate ].sort(byFetchDate);

  const fromVersionsRecords = await migration.from.versions.source.findAll();
  const toVersionsRecords = await migration.to.versions.source.findAll();
  const versionsToMigrate = fromVersionsRecords.filter(belongsToMigratedService);
  const fromVersionsRecordsToRewrite = fromVersionsRecords.filter(staysInPlace);
  const toVersionsRecordsMigrated = [ ...toVersionsRecords, ...versionsToMigrate ].sort(byFetchDate);

  console.log('Number of snapshots in the source', fromSnapshotsRecords.length);
  console.log('Number of snapshots in the target', toSnapshotsRecords.length);
  console.log('Number of snapshots to migrate', snapshotsToMigrate.length);

  console.log('Number of versions in the source', fromVersionsRecords.length);
  console.log('Number of versions in the target', toVersionsRecords.length);
  console.log('Number of versions to migrate', versionsToMigrate.length);

  // Filled by both snapshot rewrites, then read by the version rewrites to re-link versions to their snapshot.
  const idsMapping = {};

  await Promise.all([
    rewriteSnapshots(migration.from.snapshots.destination, fromSnapshotsRecordsToRewrite, idsMapping, migration.from.snapshots.logger),
    rewriteSnapshots(migration.to.snapshots.destination, toSnapshotsRecordsMigrated, idsMapping, migration.to.snapshots.logger),
  ]);

  await fs.writeFile(path.join(__dirname, 'ids-mapping.json'), JSON.stringify(idsMapping, null, 4));

  console.log('Snapshots migrated\n');

  await Promise.all([
    rewriteVersions(migration.from.versions.destination, fromVersionsRecordsToRewrite, idsMapping, migration.from.versions.logger),
    rewriteVersions(migration.to.versions.destination, toVersionsRecordsMigrated, idsMapping, migration.to.versions.logger),
  ]);

  const treatedRecords = Object.values(counters).reduce((sum, count) => sum + count, 0);

  console.log(`Records treated: ${treatedRecords}`);
  console.log(`⌙ Migrated records: ${counters.migrated}`);
  console.log(`⌙ Skipped records: ${counters.skipped}`);
  console.timeEnd('Total time');

  await finalize(migration);
}());
|
|
138
|
+
|
|
139
|
+
/**
 * Saves the given snapshot records, one after the other, into `repository`.
 *
 * For every record, the ID returned by `repository.save` is stored in
 * `idsMapping` under the record's previous ID (also when no ID is returned),
 * a progress line is logged and the module-level `counters` are updated.
 *
 * @param {object} repository - Destination repository; must expose an async `save(record)` resolving to an object with an `id`.
 * @param {object[]} records - Snapshot records to save, in order.
 * @param {object} idsMapping - Mutated in place: old snapshot ID → new snapshot ID.
 * @param {object} logger - Logger exposing `info`.
 */
async function rewriteSnapshots(repository, records, idsMapping, logger) {
  let position = 0;

  for (const record of records) {
    const saved = await repository.save(record); // eslint-disable-line no-await-in-loop
    const newId = saved.id;

    idsMapping[record.id] = newId; // Keeps track of which new ID replaces the old one.

    position += 1;

    const wasSaved = Boolean(newId);

    logger.info({
      message: wasSaved ? `Migrated snapshot with new ID: ${newId}` : 'Skipped snapshot',
      serviceId: record.serviceId,
      type: record.documentType,
      id: record.id,
      current: position,
      total: records.length,
    });

    if (wasSaved) {
      counters.migrated++;
    } else {
      counters.skipped++;
    }
  }
}
|
|
156
|
+
|
|
157
|
+
/**
 * Saves the given version records, one after the other, into `repository`,
 * after pointing each of them to the new ID of its snapshot.
 *
 * Note that the records are modified in place: their `snapshotId` is replaced
 * by the value found in `idsMapping`.
 *
 * @param {object} repository - Destination repository; must expose an async `save(record)` resolving to an object with an `id`.
 * @param {object[]} records - Version records to save, in order.
 * @param {object} idsMapping - Old snapshot ID → new snapshot ID.
 * @param {object} logger - Logger exposing `info`.
 * @throws {Error} When `idsMapping` holds no (truthy) entry for a record's snapshot ID.
 */
async function rewriteVersions(repository, records, idsMapping, logger) {
  let position = 0;

  for (const record of records) {
    const mappedSnapshotId = idsMapping[record.snapshotId];

    if (!mappedSnapshotId) {
      throw new Error(`Snapshot ID ${record.snapshotId} not found for record ${record.id}`);
    }

    record.snapshotId = mappedSnapshotId;

    const saved = await repository.save(record); // eslint-disable-line no-await-in-loop
    const newId = saved.id;

    position += 1;

    const wasSaved = Boolean(newId);

    logger.info({
      message: wasSaved ? `Migrated version with new ID: ${newId}` : 'Skipped version',
      serviceId: record.serviceId,
      type: record.documentType,
      id: record.id,
      current: position,
      total: records.length,
    });

    if (wasSaved) {
      counters.migrated++;
    } else {
      counters.skipped++;
    }
  }
}
|
|
180
|
+
|
|
181
|
+
/**
 * Initializes every source and destination repository of the migration, then
 * imports the README of each source repository into its destination.
 *
 * @param {object} migration - Migration description with `from` and `to` entries, each holding `snapshots` and `versions` `{ source, destination }` pairs.
 * @returns {Promise<Array>} Resolves once the four README imports are done.
 */
async function initialize(migration) {
  const pairs = [
    migration.from.snapshots,
    migration.from.versions,
    migration.to.snapshots,
    migration.to.versions,
  ];

  await Promise.all(pairs.flatMap(({ source, destination }) => [ source.initialize(), destination.initialize() ]));

  return Promise.all(pairs.map(({ source, destination }) => importReadmeInGit({ from: source, to: destination })));
}
|
|
200
|
+
|
|
201
|
+
/**
 * Finalizes every source and destination repository of the migration.
 *
 * @param {object} migration - Migration description with `from` and `to` entries, each holding `snapshots` and `versions` `{ source, destination }` pairs.
 * @returns {Promise<Array>} Resolves once the eight repositories are finalized.
 */
async function finalize(migration) {
  const repositories = [
    migration.from.snapshots,
    migration.from.versions,
    migration.to.snapshots,
    migration.to.versions,
  ].flatMap(({ source, destination }) => [ source, destination ]);

  return Promise.all(repositories.map(repository => repository.finalize()));
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import path from 'path';
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
|
|
4
|
+
import config from 'config';
|
|
5
|
+
|
|
6
|
+
import GitRepository from '../../src/archivist/recorder/repositories/git/index.js';
|
|
7
|
+
|
|
8
|
+
import logger from './logger/index.js';
|
|
9
|
+
import { importReadmeInGit } from './utils/index.js';
|
|
10
|
+
|
|
11
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
const ROOT_PATH = path.resolve(__dirname, '../../');
|
|
13
|
+
|
|
14
|
+
/**
 * Entry point: copies every version record of `france-elections-versions`
 * into `france-elections-versions-hash-updated-test`, replacing the snapshot
 * ID referenced by each version with the full hash resolved from the
 * `france-elections-snapshots` repository.
 */
(async function main() {
  console.time('Total time');

  const versionsRepository = new GitRepository({
    ...config.get('recorder.versions.storage.git'),
    path: path.resolve(ROOT_PATH, './data/france-elections-versions'),
  });

  const versionsTargetRepository = new GitRepository({
    ...config.get('recorder.versions.storage.git'),
    prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-elections-snapshots/commit/',
    path: path.resolve(ROOT_PATH, './data/france-elections-versions-hash-updated-test'),
  });

  const snapshotsRepository = new GitRepository({
    ...config.get('recorder.snapshots.storage.git'),
    path: path.resolve(ROOT_PATH, './data/france-elections-snapshots'),
  });

  await versionsRepository.initialize();
  await versionsTargetRepository.initialize();
  await snapshotsRepository.initialize();

  await importReadmeInGit({ from: versionsRepository, to: versionsTargetRepository });

  const total = await versionsRepository.count();
  let current = 1;

  for await (const record of versionsRepository.iterate()) {
    // Remember the original ID before overwriting it, so that the log below reports both values.
    const shortSnapshotId = record.snapshotId;
    const fullSnapshotId = await snapshotsRepository.git.getFullHash(shortSnapshotId);

    record.snapshotId = fullSnapshotId;

    const { id: recordId } = await versionsTargetRepository.save(record);

    if (!recordId) {
      logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.documentType, id: record.id, current, total });
    } else {
      logger.info({ message: `Update short sha ${shortSnapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current, total });
    }

    current++;
  }

  // Report the duration measured since the `console.time` call at the top.
  console.timeEnd('Total time');

  await versionsRepository.finalize();
  await versionsTargetRepository.finalize();
  await snapshotsRepository.finalize();
}());
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import fsApi from 'fs';
|
|
2
|
+
|
|
3
|
+
const fs = fsApi.promises;
|
|
4
|
+
|
|
5
|
+
/**
 * Copies the `README.md` file of the source repository into the target
 * repository and commits it there, reusing the message and date of the first
 * entry returned by the source repository's `git.log` for `README.md`.
 *
 * When the log returns no entry, a warning is printed and nothing is copied.
 *
 * @param {object} repositories
 * @param {object} repositories.from - Source repository, exposing `path` and `git.log`.
 * @param {object} repositories.to - Target repository, exposing `path` and `_commit`.
 * @returns {Promise<void>}
 */
export async function importReadmeInGit({ from: sourceRepository, to: targetRepository }) {
  const readmePathIn = repository => `${repository.path}/README.md`;
  const sourceReadme = readmePathIn(sourceRepository);
  const targetReadme = readmePathIn(targetRepository);

  const [readmeCommit] = await sourceRepository.git.log(['README.md']);

  if (readmeCommit) {
    await fs.copyFile(sourceReadme, targetReadme);

    const { message, date } = readmeCommit;

    await targetRepository._commit({ filePath: targetReadme, message, date });

    return;
  }

  console.warn(`No commit found for README in ${sourceRepository.path}`);
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Import snapshots
|
|
2
|
+
|
|
3
|
+
Import snapshots history from git to MongoDB.
|
|
4
|
+
|
|
5
|
+
## How it works
|
|
6
|
+
|
|
7
|
+
The import process is split into two scripts for performance reasons (reading commits from a huge git repository takes a long time). The first script `loadCommits.js` reads commits from a source repository and inserts them as is (without file contents) into a MongoDB database.
|
|
8
|
+
The second script, `index.js`, reads the commits from the MongoDB database, retrieves the contents from GitHub, applies renaming rules if necessary, and then inserts them into another collection in the database with a format compatible with the OpenTermsArchive application.
|
|
9
|
+
## Configuring
|
|
10
|
+
|
|
11
|
+
You can change the configuration in `config/import.json`.
|
|
12
|
+
|
|
13
|
+
```json
|
|
14
|
+
{
|
|
15
|
+
"import": {
|
|
16
|
+
"sourcePath": "Path of the source repository",
|
|
17
|
+
"githubRepository": "Snapshots GitHub repository identifier. Should respect the format: <organisation_or_user_name>/<repository_name>",
|
|
18
|
+
"mongo": {
|
|
19
|
+
"connectionURI": "URI for defining connection to the MongoDB instance. See https://docs.mongodb.com/manual/reference/connection-string/",
|
|
20
|
+
"database": "Database name",
|
|
21
|
+
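"snapshotsCollection": "Name of the collection in which imported snapshots are stored",
"commitsCollection": "Name of the collection in which loaded commits are stored"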
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Adding renaming rules
|
|
28
|
+
|
|
29
|
+
See the [renamer module documentation](../utils/renamer/README.md).
|
|
30
|
+
|
|
31
|
+
## Running
|
|
32
|
+
|
|
33
|
+
**You should execute commands from the `scripts/import` directory to ensure config is properly loaded:**
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
cd scripts/import
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Before importing commits you have to load them in the database:
|
|
40
|
+
```sh
|
|
41
|
+
NODE_ENV=import node loadCommits.js
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Then import snapshots from commits:
|
|
45
|
+
```sh
|
|
46
|
+
NODE_ENV=import node index.js
|
|
47
|
+
```
|
|
48
|
+
## Important notes
|
|
49
|
+
|
|
50
|
+
- Your source repository will be read as is, so check out the proper branch or commit before running the script.
|
|
51
|
+
|
|
52
|
+
### Edge cases
|
|
53
|
+
|
|
54
|
+
The script will:
|
|
55
|
+
|
|
56
|
+
- Ignore commits which are not a document snapshot (like renaming or documentation commits).
|
|
57
|
+
- Rename document types according to declared rules. See the [renamer module documentation](../utils/renamer/README.md).
|
|
58
|
+
- Rename services according to declared rules. See the [renamer module documentation](../utils/renamer/README.md).
|
|
59
|
+
- Handle duplicates, so you can run it twice without worrying about duplicate entries in the database.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"import": {
|
|
3
|
+
"sourcePath": "../ota-snapshots",
|
|
4
|
+
"githubRepository": "ambanum/OpenTermsArchive-snapshots",
|
|
5
|
+
"mongo": {
|
|
6
|
+
"connectionURI": "mongodb://127.0.0.1:27017",
|
|
7
|
+
"database": "open-terms-archive-import-retry",
|
|
8
|
+
"snapshotsCollection": "snapshots",
|
|
9
|
+
"commitsCollection": "commits"
|
|
10
|
+
}
|
|
11
|
+
}
|
|
12
|
+
}
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import path from 'path';
|
|
2
|
+
import { performance } from 'perf_hooks';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
|
|
5
|
+
import async from 'async';
|
|
6
|
+
import config from 'config';
|
|
7
|
+
import mime from 'mime';
|
|
8
|
+
import { MongoClient } from 'mongodb';
|
|
9
|
+
import nodeFetch from 'node-fetch';
|
|
10
|
+
|
|
11
|
+
import Git from '../../src/archivist/recorder/repositories/git/git.js';
|
|
12
|
+
import * as renamer from '../utils/renamer/index.js';
|
|
13
|
+
|
|
14
|
+
import logger from './logger/index.js';
|
|
15
|
+
|
|
16
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
17
|
+
|
|
18
|
+
const ROOT_PATH = path.resolve(__dirname, '../../');
|
|
19
|
+
// Concurrency given to `async.queue` in `main`.
const MAX_PARALLEL = 10;
// Value of the `times` option given to `async.retry` for each queued commit.
const MAX_RETRY = 5;
// MIME type compared against the type derived from a file extension in `getCommitContent`.
const PDF_MIME_TYPE = 'application/pdf';
// Totals summed and printed by the queue drain handler. `errors` is incremented by `queueErrorHandler`;
// the other counters are not updated in the code visible here — presumably by `handleCommit` (to be confirmed).
const COUNTERS = {
  imported: 0,
  skippedNoChanges: 0,
  errors: 0,
};
// Hashes of the commits that ended up in `queueErrorHandler`; printed by the queue drain handler.
const commitsNotImported = [];

// Module-level state, assigned by `initialize()`.
let sourceRepository;
let snapshotsCollection;
let commitsCollection;
let client;
|
|
33
|
+
|
|
34
|
+
/**
 * Entry point of the import: pushes every document of the commits collection
 * to a queue processed by `queueWorker`, at most `MAX_PARALLEL` at a time.
 * The summary is printed by the drain handler registered on the queue.
 */
(async function main() {
  console.time('Total time');
  logger.info({ message: 'Start importing…' });

  await initialize();

  const totalToTreat = await commitsCollection.find().count();
  const queue = async.queue(queueWorker, MAX_PARALLEL);

  queue.error(queueErrorHandler);
  // NOTE(review): nothing is pushed when the collection is empty — confirm the drain handler (which closes the client) still runs in that case.
  queue.drain(queueDrainHandler(totalToTreat));

  let position = 0;

  await commitsCollection.find().forEach(commit => {
    position += 1; // 1-based position of the commit, used for progress display
    queue.push({ commit, index: position, total: totalToTreat });
  });
}());
|
|
54
|
+
|
|
55
|
+
/**
 * Prepares the module-level state used by the import: connects to MongoDB,
 * selects the snapshots and commits collections, initializes the source git
 * repository and loads the renaming rules.
 *
 * All settings are read from the `import` section of the configuration.
 */
async function initialize() {
  const setting = key => config.get(`import.${key}`);

  client = new MongoClient(setting('mongo.connectionURI'));
  await client.connect();

  const database = client.db(setting('mongo.database'));

  snapshotsCollection = database.collection(setting('mongo.snapshotsCollection'));
  commitsCollection = database.collection(setting('mongo.commitsCollection'));

  // The source path from the configuration is resolved against the root of the project.
  const sourcePath = path.resolve(ROOT_PATH, setting('sourcePath'));

  sourceRepository = new Git({ path: sourcePath });

  await sourceRepository.initialize();
  await renamer.loadRules();
}
|
|
69
|
+
|
|
70
|
+
/**
 * Queue worker: handles one commit through `handleCommit`, retrying up to
 * `MAX_RETRY` times with a delay that doubles on each attempt.
 *
 * @param {object} task
 * @param {object} task.commit - Commit to import.
 * @param {number} task.index - Position of the commit among the commits to treat.
 * @param {number} task.total - Total number of commits to treat.
 * @returns {Promise} Whatever `async.retry` returns for the attempts.
 */
async function queueWorker({ commit, index, total }) {
  const retryOptions = {
    times: MAX_RETRY,
    interval: retryCount => 1000 * 2 ** retryCount, // delay in milliseconds before the next attempt
  };

  // Adapts the promise returned by `handleCommit` to the callback expected by `async.retry`.
  const attempt = done => {
    handleCommit(commit, index, total)
      .then(() => {
        done();
      })
      .catch(error => {
        done(error);
      });
  };

  return async.retry(retryOptions, attempt);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Queue error handler: records the hash of the commit that could not be
 * imported, logs the error with the commit details and counts it.
 *
 * The service ID and document type used in the log are derived from the path
 * of the first file of the commit diff (`<service ID>/<document type>.<extension>`).
 *
 * @param {Error} error - Error raised while handling the commit.
 * @param {object} task - Queue task that failed.
 * @param {object} task.commit - Commit that could not be imported.
 */
function queueErrorHandler(error, { commit }) {
  const { hash, diff } = commit;
  const relativeFilePath = diff.files[0].file;
  const serviceId = path.dirname(relativeFilePath);
  const documentType = path.basename(relativeFilePath, path.extname(relativeFilePath));
  const commitDetails = JSON.stringify(commit, null, 2);

  commitsNotImported.push(hash);
  logger.error({
    message: `${error.stack}\nCommit details: ${commitDetails}`,
    serviceId,
    type: documentType,
    sha: hash,
  });
  COUNTERS.errors++;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Builds the handler invoked when the queue has been drained.
 *
 * The handler prints a summary of the counters, warns when the number of
 * treated entries differs from the expected total, lists the commits that
 * were not imported and closes the database connection.
 *
 * @param {number} totalToTreat - Number of entries that were expected to be treated.
 * @returns {Function} Handler taking no argument, to be registered on the queue.
 */
function queueDrainHandler(totalToTreat) {
  return () => {
    const totalTreatedCommits = Object.values(COUNTERS).reduce((acc, value) => acc + value, 0);

    console.log(`\nEntries treated: ${totalTreatedCommits} on ${totalToTreat}`);
    console.log(`⌙ Entries imported: ${COUNTERS.imported}`);
    console.log(`⌙ Entries skipped (already on the database): ${COUNTERS.skippedNoChanges}`);
    console.log(`⌙ Entries with errors: ${COUNTERS.errors}`);
    console.timeEnd('Total time'); // presumably paired with a `console.time('Total time')` at startup — confirm

    if (totalTreatedCommits !== totalToTreat) {
      console.error('\n⚠ WARNING: Total treated entries does not match the total number of entries to be treated! ⚠');
    }

    if (commitsNotImported.length) {
      console.log('Not imported commits:\n');
      console.log(commitsNotImported);
    }

    // The handler's return value is not awaited by the queue, so a failed close
    // is reported here instead of surfacing as an unhandled rejection.
    client.close().catch(error => {
      console.error(`Cannot close the database connection: ${error.stack}`);
    });
  };
}
|
|
119
|
+
|
|
120
|
+
/**
 * Downloads the content of a document as it was at a given commit, from
 * raw.githubusercontent.com.
 *
 * @param {object} params
 * @param {string} params.sha - Hash of the commit to read the file from.
 * @param {string} params.serviceId - Directory of the file in the repository.
 * @param {string} params.documentType - File name without its extension.
 * @param {string} params.extension - File extension, without the leading dot.
 * @returns {Promise<Buffer|string>} A `Buffer` for PDF files, a string otherwise.
 * @throws {TooManyRequestsError} When GitHub answers with HTTP 429 or with a "429: Too Many Requests" body.
 * @throws {Error} When GitHub answers with any other non-200 status.
 */
async function getCommitContent({ sha, serviceId, documentType, extension }) {
  const start = performance.now();
  const url = `https://raw.githubusercontent.com/${config.get('import.githubRepository')}/${sha}/${encodeURI(serviceId)}/${encodeURI(documentType)}.${extension}`;
  const response = await nodeFetch(url);
  const end = performance.now();

  const isPDF = mime.getType(extension) === PDF_MIME_TYPE;
  const content = isPDF
    ? Buffer.from(await response.arrayBuffer())
    : await response.text();

  // Rate limiting has to be detected before the generic status check: a 429
  // response is also "not 200" and would otherwise never be reported as a
  // `TooManyRequestsError`. `toString()` makes the comparison explicit for
  // both strings and PDF buffers.
  if (response.status === 429 || content.toString() === '429: Too Many Requests') {
    throw new TooManyRequestsError(`Cannot get commit content on Github ${url}. 429: Too Many Requests`);
  }

  if (response.status !== 200) {
    throw new Error(`Cannot get commit content on Github ${url}. Get HTTP Code ${response.status} and response ${content}`);
  }

  logger.info({ message: `Fetched in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType, sha });

  return content;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Imports the document version recorded by one commit into the snapshots
 * collection.
 *
 * The commit is skipped when a snapshot with the same commit hash already
 * exists. Otherwise the file content is downloaded, the renaming rules are
 * applied to the service ID and document type, and a new snapshot is
 * inserted. A failure while building or inserting the snapshot is logged
 * and counted here rather than rethrown; a failure while looking up the
 * existing record or downloading the content is propagated to the caller.
 *
 * @param {object} commit - Commit entry; `hash`, `date`, `body` and `diff.files` are read.
 * @param {number} index - 1-based position of the commit among the entries to treat.
 * @param {number} total - Total number of entries to treat.
 * @returns {Promise<void>}
 */
async function handleCommit(commit, index, total) {
  const [{ file: relativeFilePath }] = commit.diff.files;

  let serviceId = path.dirname(relativeFilePath);
  const extension = path.extname(relativeFilePath);
  let documentType = path.basename(relativeFilePath, extension);

  logger.info({
    message: 'Start to handle commit',
    serviceId,
    type: documentType,
    sha: commit.hash,
    current: index, // the producer already numbers entries from 1
    total,
  });

  const alreadyExistsRecord = await snapshotsCollection.findOne({ '_importMetadata.commitSHA': commit.hash });

  if (alreadyExistsRecord) {
    logger.info({
      message: 'Skipped commit as an entry already exists for this commit',
      serviceId,
      type: documentType,
      sha: commit.hash,
    });
    COUNTERS.skippedNoChanges++;

    return;
  }

  // The content is fetched with the original names, as they are the ones used in the source repository.
  let content = await getCommitContent({ sha: commit.hash, serviceId, documentType, extension: extension.replace('.', '') });

  ({ serviceId, documentType } = renamer.applyRules(serviceId, documentType));

  const mimeType = mime.getType(extension);

  if (mimeType === PDF_MIME_TYPE) {
    // NOTE(review): decoding binary PDF data as UTF-8 is presumably lossy — confirm this is the intended storage format.
    content = content.toString('utf-8'); // Serialize PDF
  }

  try {
    const _importMetadata = { commitSHA: commit.hash };

    if (commit.body?.includes('tosback2')) {
      _importMetadata.provider = 'TOSBack.org';

      // A commit body without the expected "Imported from … Snapshot" line
      // must not prevent the snapshot from being recorded.
      const [, sourceUrl] = commit.body.match(/Imported from (.*)\nSnapshot/) ?? [];

      if (sourceUrl !== undefined) {
        _importMetadata.url = sourceUrl;
      }
    }

    const start = performance.now();

    await snapshotsCollection.insertOne({
      serviceId,
      documentType,
      content,
      mimeType,
      fetchDate: commit.date,
      _importMetadata,
      created_at: new Date(),
    });
    const end = performance.now();

    logger.info({ message: `Recorded in ${Number(end - start).toFixed(2)} ms`, serviceId, type: documentType });
    COUNTERS.imported++;
  } catch (error) {
    logger.error({ message: `Not saved: ${commit.date} ${error.stack}`, serviceId, type: documentType });
    commitsNotImported.push(commit.hash);
    COUNTERS.errors++;
  }
}
|
|
218
|
+
|
|
219
|
+
/**
 * Error raised when GitHub answers a content request with
 * "429: Too Many Requests".
 */
export class TooManyRequestsError extends Error {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {object} [options] - Standard `Error` options, e.g. `{ cause }` to keep the underlying error.
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'TooManyRequestsError';
  }
}
|