@opentermsarchive/engine 5.0.6 → 5.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/archivist/errors.js +4 -2
- package/src/archivist/fetcher/errors.js +12 -0
- package/src/archivist/fetcher/errors.test.js +45 -0
- package/src/archivist/fetcher/fullDomFetcher.js +1 -1
- package/src/archivist/index.js +30 -9
- package/src/archivist/index.test.js +96 -0
- package/src/logger/index.js +11 -5
package/package.json
CHANGED
package/src/archivist/errors.js
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
export class InaccessibleContentError extends Error {
|
|
2
|
-
constructor(
|
|
3
|
-
const
|
|
2
|
+
constructor(errors) {
|
|
3
|
+
const errorsArray = Array.isArray(errors) ? errors : [errors];
|
|
4
|
+
const reasons = errorsArray.map(error => (error instanceof Error ? error.message : String(error)));
|
|
4
5
|
|
|
5
6
|
super(`The documents cannot be accessed or their contents can not be selected:${`\n - ${reasons.join('\n - ')}`}`);
|
|
6
7
|
this.name = 'InaccessibleContentError';
|
|
7
8
|
this.reasons = reasons;
|
|
9
|
+
this.errors = errorsArray;
|
|
8
10
|
}
|
|
9
11
|
}
|
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
export class FetchDocumentError extends Error {
|
|
2
|
+
static LIKELY_TRANSIENT_ERRORS = [
|
|
3
|
+
'EAI_AGAIN', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
|
|
4
|
+
'ETIMEDOUT', // Connection timeout - network latency or server load issues
|
|
5
|
+
'ECONNRESET', // Connection reset - connection was forcibly closed, often due to network issues
|
|
6
|
+
'ERR_NAME_NOT_RESOLVED', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
|
|
7
|
+
'HTTP code 500', // Internal Server Error - server encountered an error while processing the request
|
|
8
|
+
'HTTP code 502', // Bad Gateway - upstream server returned invalid response, often temporary
|
|
9
|
+
'HTTP code 503', // Service Unavailable - server is temporarily overloaded or down for maintenance
|
|
10
|
+
'HTTP code 504', // Gateway Timeout - upstream server took too long to respond, might be temporary
|
|
11
|
+
];
|
|
12
|
+
|
|
2
13
|
constructor(message) {
|
|
3
14
|
super(`Fetch failed: ${message}`);
|
|
4
15
|
this.name = 'FetchDocumentError';
|
|
16
|
+
this.mayBeTransient = FetchDocumentError.LIKELY_TRANSIENT_ERRORS.some(err => message.includes(err));
|
|
5
17
|
}
|
|
6
18
|
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { expect } from 'chai';
|
|
2
|
+
|
|
3
|
+
import { FetchDocumentError } from './errors.js';
|
|
4
|
+
|
|
5
|
+
describe('FetchDocumentError', () => {
|
|
6
|
+
describe('constructor', () => {
|
|
7
|
+
it('formats the error message with "Fetch failed:" prefix', () => {
|
|
8
|
+
const error = new FetchDocumentError('test error');
|
|
9
|
+
|
|
10
|
+
expect(error.message).to.equal('Fetch failed: test error');
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
it('sets the error name correctly', () => {
|
|
14
|
+
const error = new FetchDocumentError('test error');
|
|
15
|
+
|
|
16
|
+
expect(error.name).to.equal('FetchDocumentError');
|
|
17
|
+
});
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
describe('#mayBeTransient', () => {
|
|
21
|
+
describe('transient errors', () => {
|
|
22
|
+
FetchDocumentError.LIKELY_TRANSIENT_ERRORS.forEach(errorCode => {
|
|
23
|
+
it(`returns true for ${errorCode}`, () => {
|
|
24
|
+
const error = new FetchDocumentError(errorCode);
|
|
25
|
+
|
|
26
|
+
expect(error.mayBeTransient).to.be.true;
|
|
27
|
+
});
|
|
28
|
+
});
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
describe('non-transient errors', () => {
|
|
32
|
+
[
|
|
33
|
+
'HTTP code 403',
|
|
34
|
+
'HTTP code 404',
|
|
35
|
+
'HTTP code 429',
|
|
36
|
+
].forEach(errorMessage => {
|
|
37
|
+
it(`returns false for "${errorMessage}"`, () => {
|
|
38
|
+
const error = new FetchDocumentError(errorMessage);
|
|
39
|
+
|
|
40
|
+
expect(error.mayBeTransient).to.be.false;
|
|
41
|
+
});
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
});
|
|
@@ -21,7 +21,7 @@ export default async function fetch(url, cssSelectors, config) {
|
|
|
21
21
|
await page.setDefaultNavigationTimeout(config.navigationTimeout);
|
|
22
22
|
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
|
|
23
23
|
|
|
24
|
-
response = await page.goto(url, { waitUntil: 'networkidle0' });
|
|
24
|
+
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
|
|
25
25
|
|
|
26
26
|
if (!response) {
|
|
27
27
|
throw new Error(`Response is empty when trying to fetch '${url}'`);
|
package/src/archivist/index.js
CHANGED
|
@@ -29,6 +29,7 @@ export const EVENTS = [
|
|
|
29
29
|
'trackingCompleted',
|
|
30
30
|
'inaccessibleContent',
|
|
31
31
|
'info',
|
|
32
|
+
'warn',
|
|
32
33
|
'error',
|
|
33
34
|
'pluginError',
|
|
34
35
|
];
|
|
@@ -76,15 +77,35 @@ export default class Archivist extends events.EventEmitter {
|
|
|
76
77
|
|
|
77
78
|
initQueue() {
|
|
78
79
|
this.trackingQueue = async.queue(this.trackTermsChanges.bind(this), MAX_PARALLEL_TRACKING);
|
|
79
|
-
this.trackingQueue.error((
|
|
80
|
-
|
|
81
|
-
this.emit('inaccessibleContent', error, terms);
|
|
80
|
+
this.trackingQueue.error(this.handleTrackingError.bind(this));
|
|
81
|
+
}
|
|
82
82
|
|
|
83
|
-
|
|
84
|
-
|
|
83
|
+
handleTrackingError(error, { terms, isRetry }) {
|
|
84
|
+
if (!(error instanceof InaccessibleContentError)) {
|
|
85
|
+
this.emit('error', {
|
|
86
|
+
message: error.stack,
|
|
87
|
+
serviceId: terms.service.id,
|
|
88
|
+
termsType: terms.type,
|
|
89
|
+
});
|
|
85
90
|
|
|
86
|
-
|
|
87
|
-
}
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
const isErrorLikelyTransient = error.errors.some(err => err instanceof FetchDocumentError && err.mayBeTransient);
|
|
95
|
+
|
|
96
|
+
if (isErrorLikelyTransient && !isRetry) {
|
|
97
|
+
this.emit('warn', {
|
|
98
|
+
message: `The documents cannot be accessed due to the following likely transient errors:\n- ${error.errors.map(err => err.message).join('\n- ')}\nA new attempt will be made once the current tracking is complete`,
|
|
99
|
+
serviceId: terms.service.id,
|
|
100
|
+
termsType: terms.type,
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
this.trackingQueue.push({ terms, isRetry: true });
|
|
104
|
+
|
|
105
|
+
return;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
this.emit('inaccessibleContent', error, terms);
|
|
88
109
|
}
|
|
89
110
|
|
|
90
111
|
attach(listener) {
|
|
@@ -171,7 +192,7 @@ export default class Archivist extends events.EventEmitter {
|
|
|
171
192
|
throw error;
|
|
172
193
|
}
|
|
173
194
|
|
|
174
|
-
fetchDocumentErrors.push(error
|
|
195
|
+
fetchDocumentErrors.push(error);
|
|
175
196
|
}
|
|
176
197
|
}));
|
|
177
198
|
|
|
@@ -206,7 +227,7 @@ export default class Archivist extends events.EventEmitter {
|
|
|
206
227
|
throw error;
|
|
207
228
|
}
|
|
208
229
|
|
|
209
|
-
extractDocumentErrors.push(error
|
|
230
|
+
extractDocumentErrors.push(error);
|
|
210
231
|
}
|
|
211
232
|
}));
|
|
212
233
|
|
|
@@ -8,6 +8,8 @@ import nock from 'nock';
|
|
|
8
8
|
import sinon from 'sinon';
|
|
9
9
|
import sinonChai from 'sinon-chai';
|
|
10
10
|
|
|
11
|
+
import { InaccessibleContentError } from './errors.js';
|
|
12
|
+
import { FetchDocumentError } from './fetcher/index.js';
|
|
11
13
|
import Git from './recorder/repositories/git/git.js';
|
|
12
14
|
|
|
13
15
|
import Archivist, { EVENTS } from './index.js';
|
|
@@ -245,6 +247,100 @@ describe('Archivist', function () {
|
|
|
245
247
|
});
|
|
246
248
|
});
|
|
247
249
|
|
|
250
|
+
describe('#handleTrackingError', () => {
|
|
251
|
+
let errorSpy;
|
|
252
|
+
let warnSpy;
|
|
253
|
+
let inaccessibleContentSpy;
|
|
254
|
+
let pushSpy;
|
|
255
|
+
let terms;
|
|
256
|
+
let app;
|
|
257
|
+
const retryableError = new FetchDocumentError(FetchDocumentError.LIKELY_TRANSIENT_ERRORS[0]);
|
|
258
|
+
|
|
259
|
+
before(async () => {
|
|
260
|
+
app = new Archivist({
|
|
261
|
+
recorderConfig: config.get('@opentermsarchive/engine.recorder'),
|
|
262
|
+
fetcherConfig: config.get('@opentermsarchive/engine.fetcher'),
|
|
263
|
+
});
|
|
264
|
+
await app.initialize();
|
|
265
|
+
});
|
|
266
|
+
|
|
267
|
+
beforeEach(() => {
|
|
268
|
+
errorSpy = sinon.spy();
|
|
269
|
+
warnSpy = sinon.spy();
|
|
270
|
+
inaccessibleContentSpy = sinon.spy();
|
|
271
|
+
pushSpy = sinon.spy(app.trackingQueue, 'push');
|
|
272
|
+
app.on('error', errorSpy);
|
|
273
|
+
app.on('warn', warnSpy);
|
|
274
|
+
app.on('inaccessibleContent', inaccessibleContentSpy);
|
|
275
|
+
|
|
276
|
+
terms = {
|
|
277
|
+
service: { id: 'test-service' },
|
|
278
|
+
type: 'test-type',
|
|
279
|
+
sourceDocuments: [
|
|
280
|
+
{ location: 'https://example.com/doc1' },
|
|
281
|
+
{ location: 'https://example.com/doc2' },
|
|
282
|
+
],
|
|
283
|
+
};
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
afterEach(() => {
|
|
287
|
+
errorSpy.resetHistory();
|
|
288
|
+
warnSpy.resetHistory();
|
|
289
|
+
inaccessibleContentSpy.resetHistory();
|
|
290
|
+
pushSpy.restore();
|
|
291
|
+
});
|
|
292
|
+
|
|
293
|
+
context('with an InaccessibleContentError', () => {
|
|
294
|
+
context('when error may be transient', () => {
|
|
295
|
+
beforeEach(() => {
|
|
296
|
+
const error = new InaccessibleContentError([retryableError]);
|
|
297
|
+
|
|
298
|
+
app.handleTrackingError(error, { terms });
|
|
299
|
+
});
|
|
300
|
+
|
|
301
|
+
it('does not emit an error event', () => {
|
|
302
|
+
expect(errorSpy).to.not.have.been.called;
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
it('does not emit an inaccessibleContent event', () => {
|
|
306
|
+
expect(inaccessibleContentSpy).to.not.have.been.called;
|
|
307
|
+
});
|
|
308
|
+
|
|
309
|
+
it('emits a warning', () => {
|
|
310
|
+
expect(warnSpy).to.have.been.called;
|
|
311
|
+
});
|
|
312
|
+
|
|
313
|
+
it('pushes terms to tracking queue for retry', () => {
|
|
314
|
+
expect(pushSpy).to.have.been.calledWith({ terms, isRetry: true });
|
|
315
|
+
});
|
|
316
|
+
});
|
|
317
|
+
|
|
318
|
+
context('when error comes from a retry', () => {
|
|
319
|
+
beforeEach(() => {
|
|
320
|
+
const error = new InaccessibleContentError([retryableError]);
|
|
321
|
+
|
|
322
|
+
app.handleTrackingError(error, { terms, isRetry: true });
|
|
323
|
+
});
|
|
324
|
+
|
|
325
|
+
it('does not emit an error event', () => {
|
|
326
|
+
expect(errorSpy).to.not.have.been.called;
|
|
327
|
+
});
|
|
328
|
+
|
|
329
|
+
it('does not emit a warning', () => {
|
|
330
|
+
expect(warnSpy).to.not.have.been.called;
|
|
331
|
+
});
|
|
332
|
+
|
|
333
|
+
it('emits an inaccessibleContent event with error and terms', () => {
|
|
334
|
+
expect(inaccessibleContentSpy).to.have.been.called;
|
|
335
|
+
});
|
|
336
|
+
|
|
337
|
+
it('does not push terms to tracking queue for retry', () => {
|
|
338
|
+
expect(pushSpy).to.not.have.been.called;
|
|
339
|
+
});
|
|
340
|
+
});
|
|
341
|
+
});
|
|
342
|
+
});
|
|
343
|
+
|
|
248
344
|
describe('Plugin system', () => {
|
|
249
345
|
const plugin = {};
|
|
250
346
|
|
package/src/logger/index.js
CHANGED
|
@@ -141,14 +141,20 @@ logger.onInaccessibleContent = ({ message }, terms) => {
|
|
|
141
141
|
logger.warn({ message, serviceId: terms.service.id, termsType: terms.type });
|
|
142
142
|
};
|
|
143
143
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
};
|
|
144
|
+
const createLogHandler = level => params => {
|
|
145
|
+
if (typeof params === 'string') {
|
|
146
|
+
logger[level]({ message: params });
|
|
147
|
+
} else {
|
|
148
|
+
const { serviceId, termsType, documentId, id, message } = params;
|
|
147
149
|
|
|
148
|
-
logger
|
|
149
|
-
|
|
150
|
+
logger[level]({ message, serviceId, termsType, documentId, id });
|
|
151
|
+
}
|
|
150
152
|
};
|
|
151
153
|
|
|
154
|
+
logger.onError = createLogHandler('error');
|
|
155
|
+
logger.onInfo = createLogHandler('info');
|
|
156
|
+
logger.onWarn = createLogHandler('warn');
|
|
157
|
+
|
|
152
158
|
logger.onPluginError = (error, pluginName) => {
|
|
153
159
|
logger.error({ message: `Error in "${pluginName}" plugin: ${error.stack}` });
|
|
154
160
|
};
|