@opentermsarchive/engine 2.7.0 → 2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opentermsarchive/engine",
3
- "version": "2.7.0",
3
+ "version": "2.7.2",
4
4
  "description": "Tracks and makes visible changes to the terms of online services",
5
5
  "homepage": "https://opentermsarchive.org",
6
6
  "bugs": {
@@ -96,7 +96,7 @@
96
96
  "puppeteer-extra": "^3.3.6",
97
97
  "puppeteer-extra-plugin-stealth": "^2.11.2",
98
98
  "sib-api-v3-sdk": "^8.2.1",
99
- "simple-git": "^3.8.0",
99
+ "simple-git": "^3.27.0",
100
100
  "swagger-jsdoc": "^6.2.8",
101
101
  "swagger-ui-express": "^5.0.0",
102
102
  "winston": "^3.9.0",
@@ -7,7 +7,7 @@ const { combine, timestamp, printf, colorize } = winston.format;
7
7
 
8
8
  logger.format = combine(
9
9
  colorize(),
10
- timestamp({ format: 'YYYY-MM-DDTHH:MM:SSZ' }),
10
+ timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }),
11
11
  printf(({ level, message, counter, hash, timestamp }) => {
12
12
  const prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : '';
13
13
 
@@ -28,6 +28,7 @@ export const EVENTS = [
28
28
  'trackingStarted',
29
29
  'trackingCompleted',
30
30
  'inaccessibleContent',
31
+ 'info',
31
32
  'error',
32
33
  'pluginError',
33
34
  ];
@@ -45,6 +46,7 @@ export default class Archivist extends events.EventEmitter {
45
46
  }
46
47
 
47
48
  async initialize() {
49
+ this.emit('info', 'Initializing engine…');
48
50
  if (this.services) {
49
51
  return;
50
52
  }
@@ -67,6 +69,8 @@ export default class Archivist extends events.EventEmitter {
67
69
  process.exit(1);
68
70
  });
69
71
 
72
+ this.emit('info', 'Initialization completed');
73
+
70
74
  return this;
71
75
  }
72
76
 
@@ -140,7 +144,13 @@ export default class Archivist extends events.EventEmitter {
140
144
  return;
141
145
  }
142
146
 
143
- return this.recordVersion(terms, extractOnly);
147
+ await this.recordVersion(terms, extractOnly);
148
+
149
+ terms.sourceDocuments.forEach(sourceDocument => {
150
+ sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
151
+ sourceDocument.mimeType = null; // …and associated MIME type
152
+ sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
153
+ });
144
154
  }
145
155
 
146
156
  async fetchSourceDocuments(terms) {
@@ -18,7 +18,10 @@ export default class Git {
18
18
  await fs.mkdir(this.path, { recursive: true });
19
19
  }
20
20
 
21
- this.git = simpleGit(this.path, { maxConcurrentProcesses: 1 });
21
+ this.git = simpleGit(this.path, {
22
+ trimmed: true,
23
+ maxConcurrentProcesses: 1,
24
+ });
22
25
 
23
26
  await this.git.init();
24
27
 
@@ -27,7 +30,8 @@ export default class Git {
27
30
  .addConfig('push.default', 'current')
28
31
  .addConfig('user.name', this.author.name)
29
32
  .addConfig('user.email', this.author.email)
30
- .addConfig('core.quotePath', false); // disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
33
+ .addConfig('core.quotePath', false) // Disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
34
+ .addConfig('core.commitGraph', true); // Enable `commit-graph` feature for efficient commit data storage, improving performance of operations like `git log`
31
35
  }
32
36
 
33
37
  add(filePath) {
@@ -42,7 +46,7 @@ export default class Git {
42
46
  process.env.GIT_AUTHOR_DATE = commitDate;
43
47
  process.env.GIT_COMMITTER_DATE = commitDate;
44
48
 
45
- summary = await this.git.commit(message, filePath);
49
+ summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
46
50
  } finally {
47
51
  process.env.GIT_AUTHOR_DATE = '';
48
52
  process.env.GIT_COMMITTER_DATE = '';
@@ -60,11 +64,11 @@ export default class Git {
60
64
  }
61
65
 
62
66
  listCommits(options = []) {
63
- return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]);
67
+ return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`)
64
68
  }
65
69
 
66
70
  async getCommit(options) {
67
- const [commit] = await this.listCommits([ '-1', ...options ]);
71
+ const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options
68
72
 
69
73
  return commit;
70
74
  }
@@ -103,8 +107,8 @@ export default class Git {
103
107
  return this.git.clean('f', '-d');
104
108
  }
105
109
 
106
- async getFullHash(shortHash) {
107
- return (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim();
110
+ getFullHash(shortHash) {
111
+ return this.git.show([ shortHash, '--pretty=%H', '-s' ]);
108
112
  }
109
113
 
110
114
  restore(path, commit) {
@@ -120,4 +124,16 @@ export default class Git {
120
124
  relativePath(absolutePath) {
121
125
  return path.relative(this.path, absolutePath); // Git needs a path relative to the .git directory, not an absolute one
122
126
  }
127
+
128
+ async listFiles(path) {
129
+ return (await this.git.raw([ 'ls-files', path ])).split('\n');
130
+ }
131
+
132
+ async writeCommitGraph() {
133
+ await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths' ]);
134
+ }
135
+
136
+ async updateCommitGraph() {
137
+ await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths', '--append' ]);
138
+ }
123
139
  }
@@ -29,6 +29,7 @@ export default class GitRepository extends RepositoryInterface {
29
29
  async initialize() {
30
30
  await this.git.initialize();
31
31
  await this.git.cleanUp(); // Drop all uncommitted changes and remove all leftover files that may be present if the process was killed aggressively
32
+ await this.git.writeCommitGraph(); // Create or replace the commit graph with a new one to ensure it's fully consistent
32
33
 
33
34
  return this;
34
35
  }
@@ -56,17 +57,22 @@ export default class GitRepository extends RepositoryInterface {
56
57
  return record;
57
58
  }
58
59
 
59
- finalize() {
60
- if (!this.needsPublication) {
61
- return;
60
+ async finalize() {
61
+ if (this.needsPublication) {
62
+ await this.git.pushChanges();
62
63
  }
63
64
 
64
- return this.git.pushChanges();
65
+ return this.git.updateCommitGraph();
65
66
  }
66
67
 
67
68
  async findLatest(serviceId, termsType, documentId) {
68
- const filePath = DataMapper.generateFilePath(serviceId, termsType, documentId);
69
- const commit = await this.git.getCommit([filePath]);
69
+ const matchingFilesPaths = await this.git.listFiles(DataMapper.generateFilePath(serviceId, termsType, documentId));
70
+
71
+ if (!matchingFilesPaths.length) {
72
+ return null;
73
+ }
74
+
75
+ const commit = await this.git.getCommit([...matchingFilesPaths]); // Returns the most recent commit that modified any of the matching files. If multiple files match the path pattern (e.g. both HTML and PDF versions exist), returns the commit that last modified any of them
70
76
 
71
77
  return this.#toDomain(commit);
72
78
  }
@@ -30,7 +30,7 @@ if (config.get('@opentermsarchive/engine.logger.sendMailOnError')) {
30
30
  const logger = winston.createLogger({
31
31
  format: combine(
32
32
  colorize(),
33
- timestamp({ format: 'YYYY-MM-DDTHH:MM:SSZ' }),
33
+ timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }),
34
34
  printf(({ level, message, timestamp }) => {
35
35
  const timestampPrefix = config.get('@opentermsarchive/engine.logger.timestampPrefix') ? `${timestamp} ` : '';
36
36
 
@@ -4,11 +4,13 @@ import config from 'config';
4
4
  import winston from 'winston';
5
5
  import 'winston-mail';
6
6
 
7
+ import { formatDuration } from './utils.js';
8
+
7
9
  const { combine, timestamp, printf, colorize } = winston.format;
8
10
 
9
11
  const alignedWithColorsAndTime = combine(
10
12
  colorize(),
11
- timestamp({ format: 'YYYY-MM-DDTHH:MM:SSZ' }),
13
+ timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }),
12
14
  printf(({ level, message, timestamp, serviceId, termsType, documentId }) => {
13
15
  const servicePrefix = serviceId && termsType
14
16
  ? `${serviceId} — ${termsType}${documentId ? `:${documentId}` : ''}`
@@ -82,6 +84,7 @@ logger.configure({
82
84
 
83
85
  let recordedSnapshotsCount;
84
86
  let recordedVersionsCount;
87
+ let trackingStartTime;
85
88
 
86
89
  logger.onFirstSnapshotRecorded = ({ serviceId, termsType, documentId, id }) => {
87
90
  logger.info({ message: `Recorded first snapshot with id ${id}`, serviceId, termsType, documentId });
@@ -119,14 +122,17 @@ logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
119
122
  }
120
123
  recordedSnapshotsCount = 0;
121
124
  recordedVersionsCount = 0;
125
+ trackingStartTime = Date.now();
122
126
  };
123
127
 
124
128
  logger.onTrackingCompleted = (numberOfServices, numberOfTerms, extractOnly) => {
129
+ const duration = formatDuration(Date.now() - trackingStartTime);
130
+
125
131
  if (extractOnly) {
126
- logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction`);
132
+ logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction in ${duration}`);
127
133
  logger.info(`Recorded ${recordedVersionsCount} new versions\n`);
128
134
  } else {
129
- logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services`);
135
+ logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);
130
136
  logger.info(`Recorded ${recordedSnapshotsCount} new snapshots and ${recordedVersionsCount} new versions\n`);
131
137
  }
132
138
  };
@@ -139,6 +145,10 @@ logger.onError = (error, terms) => {
139
145
  logger.error({ message: error.stack, serviceId: terms.service.id, termsType: terms.type });
140
146
  };
141
147
 
148
+ logger.onInfo = message => {
149
+ logger.info({ message });
150
+ };
151
+
142
152
  logger.onPluginError = (error, pluginName) => {
143
153
  logger.error({ message: `Error in "${pluginName}" plugin: ${error.stack}` });
144
154
  };
@@ -0,0 +1,22 @@
1
+ export const formatDuration = milliseconds => {
2
+ const seconds = Math.floor(milliseconds / 1000);
3
+ const hours = Math.floor(seconds / 3600);
4
+ const minutes = Math.floor((seconds % 3600) / 60);
5
+ const remainingSeconds = seconds % 60;
6
+
7
+ const parts = [];
8
+
9
+ if (hours > 0) {
10
+ parts.push(`${hours} hour${hours > 1 ? 's' : ''}`);
11
+ }
12
+
13
+ if (minutes > 0) {
14
+ parts.push(`${minutes} minute${minutes > 1 ? 's' : ''}`);
15
+ }
16
+
17
+ if (remainingSeconds > 0 || parts.length === 0) {
18
+ parts.push(`${remainingSeconds} second${remainingSeconds !== 1 ? 's' : ''}`);
19
+ }
20
+
21
+ return parts.join(' and ');
22
+ };