@opentermsarchive/engine 5.5.0 → 5.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,18 +1,25 @@
|
|
|
1
1
|
/**
 * Error thrown when fetching a document fails.
 *
 * The failure description is matched against known substrings to set two
 * boolean hints on the instance:
 * - `mayBeTransient`: the description contains an entry of `LIKELY_TRANSIENT_ERRORS`;
 * - `mayBeBotBlocking`: the description contains an entry of `LIKELY_BOT_BLOCKING_ERRORS`.
 */
export class FetchDocumentError extends Error {
  /** Substrings of failure descriptions that are treated as possible bot blocking. */
  static LIKELY_BOT_BLOCKING_ERRORS = [
    'HTTP code 403',
    'HTTP code 406',
    'HTTP code 502',
    'ECONNRESET',
  ];

  /**
   * Substrings of failure descriptions that are treated as possibly transient.
   * Every bot-blocking entry is also part of this list.
   */
  static LIKELY_TRANSIENT_ERRORS = [
    'EAI_AGAIN', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
    'ETIMEDOUT', // Connection timeout - network latency or server load issues
    'ERR_NAME_NOT_RESOLVED', // DNS lookup temporary failure - DNS server is temporarily unavailable or overloaded
    'HTTP code 500', // Internal Server Error - server encountered an error while processing the request
    'HTTP code 503', // Service Unavailable - server is temporarily overloaded or down for maintenance
    'HTTP code 504', // Gateway Timeout - upstream server took too long to respond, might be temporary
    ...FetchDocumentError.LIKELY_BOT_BLOCKING_ERRORS,
  ];

  /**
   * @param {string} message - Description of the failure; it is prefixed with "Fetch failed: " in the error message.
   */
  constructor(message) {
    super(`Fetch failed: ${message}`);
    this.name = 'FetchDocumentError';

    // Match against the same text that was interpolated above, so that a non-string
    // `message` (e.g. `undefined`) does not raise a TypeError on `.includes`.
    const description = String(message);
    const matchesAny = patterns => patterns.some(pattern => description.includes(pattern));

    this.mayBeTransient = matchesAny(FetchDocumentError.LIKELY_TRANSIENT_ERRORS);
    this.mayBeBotBlocking = matchesAny(FetchDocumentError.LIKELY_BOT_BLOCKING_ERRORS);
  }
}
|
|
@@ -12,13 +12,6 @@ export const FETCHER_TYPES = {
|
|
|
12
12
|
HTML_ONLY: 'htmlOnly',
|
|
13
13
|
};
|
|
14
14
|
|
|
15
|
-
const LIKELY_BOT_BLOCKING_ERRORS = [
|
|
16
|
-
'HTTP code 403',
|
|
17
|
-
'HTTP code 406',
|
|
18
|
-
'HTTP code 502',
|
|
19
|
-
'ECONNRESET',
|
|
20
|
-
];
|
|
21
|
-
|
|
22
15
|
/**
|
|
23
16
|
* Fetch a resource from the network, returning a promise which is fulfilled once the response is available
|
|
24
17
|
* @function fetch
|
|
@@ -70,9 +63,7 @@ async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
|
|
|
70
63
|
try {
|
|
71
64
|
return await fetchWithHtmlOnly(url, fetcherConfig);
|
|
72
65
|
} catch (error) {
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if (!isBotBlockingError || fetcherConfig.executeClientScripts === false) {
|
|
66
|
+
if (!error.mayBeBotBlocking || fetcherConfig.executeClientScripts === false) {
|
|
76
67
|
throw error;
|
|
77
68
|
}
|
|
78
69
|
|
|
@@ -81,15 +72,23 @@ async function fetchWithFallback(url, cssSelectors, fetcherConfig) {
|
|
|
81
72
|
}
|
|
82
73
|
|
|
83
74
|
/**
 * Runs `fetchFullDom` and tags its result with the full DOM fetcher type.
 *
 * @param {*} url - Forwarded unchanged to `fetchFullDom`.
 * @param {*} cssSelectors - Forwarded unchanged to `fetchFullDom`.
 * @param {*} fetcherConfig - Forwarded unchanged to `fetchFullDom`.
 * @returns {Promise<object>} The properties resolved by `fetchFullDom`, with `fetcher` set to `FETCHER_TYPES.FULL_DOM`.
 * @throws {FetchDocumentError} Built from the `message` of whatever was thrown during the fetch.
 */
async function fetchWithFullDom(url, cssSelectors, fetcherConfig) {
  try {
    const fetched = await fetchFullDom(url, cssSelectors, fetcherConfig);

    // `fetcher` comes last so it overrides any same-named property of the fetched result.
    return { ...fetched, fetcher: FETCHER_TYPES.FULL_DOM };
  } catch (failure) {
    throw new FetchDocumentError(failure.message);
  }
}
|
|
89
84
|
|
|
90
85
|
/**
 * Runs `fetchHtmlOnly` and tags its result with the HTML-only fetcher type.
 *
 * @param {*} url - Forwarded unchanged to `fetchHtmlOnly`.
 * @param {*} fetcherConfig - Forwarded unchanged to `fetchHtmlOnly`.
 * @returns {Promise<object>} The properties resolved by `fetchHtmlOnly`, with `fetcher` set to `FETCHER_TYPES.HTML_ONLY`.
 * @throws {FetchDocumentError} Built from the `message` of whatever was thrown during the fetch.
 */
async function fetchWithHtmlOnly(url, fetcherConfig) {
  try {
    const fetched = await fetchHtmlOnly(url, fetcherConfig);

    // `fetcher` comes last so it overrides any same-named property of the fetched result.
    return { ...fetched, fetcher: FETCHER_TYPES.HTML_ONLY };
  } catch (failure) {
    throw new FetchDocumentError(failure.message);
  }
}
|