@crawlee/http 4.0.0-beta.3 → 4.0.0-beta.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/internals/file-download.d.ts +46 -33
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +114 -74
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +80 -152
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +173 -297
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +14 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +71 -0
- package/internals/utils.js.map +1 -0
- package/package.json +7 -7
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Readable } from 'node:stream';
|
|
2
2
|
import util from 'node:util';
|
|
3
|
-
import {
|
|
3
|
+
import { BasicCrawler, BLOCKED_STATUS_CODES, ContextPipeline, mergeCookies, RequestState, Router, SessionError, } from '@crawlee/basic';
|
|
4
|
+
import { ResponseWithUrl } from '@crawlee/http-client';
|
|
4
5
|
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
|
|
5
6
|
import * as cheerio from 'cheerio';
|
|
6
7
|
import contentTypeParser from 'content-type';
|
|
7
8
|
import iconv from 'iconv-lite';
|
|
8
|
-
import
|
|
9
|
-
import ow, { ObjectPredicate } from 'ow';
|
|
9
|
+
import ow from 'ow';
|
|
10
10
|
import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
|
|
11
|
-
import {
|
|
12
|
-
let TimeoutError;
|
|
11
|
+
import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
|
|
13
12
|
/**
|
|
14
13
|
* Default mime types, which HttpScraper supports.
|
|
15
14
|
*/
|
|
@@ -46,11 +45,11 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
46
45
|
*
|
|
47
46
|
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
48
47
|
*
|
|
49
|
-
* We can use the `preNavigationHooks` to adjust
|
|
48
|
+
* We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
|
|
50
49
|
*
|
|
51
50
|
* ```javascript
|
|
52
51
|
* preNavigationHooks: [
|
|
53
|
-
* (crawlingContext
|
|
52
|
+
* (crawlingContext) => {
|
|
54
53
|
* // ...
|
|
55
54
|
* },
|
|
56
55
|
* ]
|
|
@@ -93,13 +92,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
|
93
92
|
* @category Crawlers
|
|
94
93
|
*/
|
|
95
94
|
export class HttpCrawler extends BasicCrawler {
|
|
96
|
-
config;
|
|
97
|
-
/**
|
|
98
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
99
|
-
* Only available if used by the crawler.
|
|
100
|
-
*/
|
|
101
|
-
proxyConfiguration;
|
|
102
|
-
userRequestHandlerTimeoutMillis;
|
|
103
95
|
preNavigationHooks;
|
|
104
96
|
postNavigationHooks;
|
|
105
97
|
persistCookiesPerSession;
|
|
@@ -117,7 +109,6 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
117
109
|
additionalMimeTypes: ow.optional.array.ofType(ow.string),
|
|
118
110
|
suggestResponseEncoding: ow.optional.string,
|
|
119
111
|
forceResponseEncoding: ow.optional.string,
|
|
120
|
-
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
|
|
121
112
|
persistCookiesPerSession: ow.optional.boolean,
|
|
122
113
|
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
123
114
|
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
@@ -127,22 +118,17 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
127
118
|
/**
|
|
128
119
|
* All `HttpCrawlerOptions` parameters are passed via an options object.
|
|
129
120
|
*/
|
|
130
|
-
constructor(options = {}
|
|
121
|
+
constructor(options = {}) {
|
|
131
122
|
ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
|
|
132
|
-
const {
|
|
123
|
+
const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
|
|
133
124
|
// BasicCrawler
|
|
134
|
-
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
|
|
125
|
+
autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
|
|
135
126
|
super({
|
|
136
127
|
...basicCrawlerOptions,
|
|
137
|
-
requestHandler,
|
|
138
128
|
autoscaledPoolOptions,
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
}, config);
|
|
143
|
-
this.config = config;
|
|
144
|
-
// FIXME any
|
|
145
|
-
this.requestHandler = requestHandler ?? this.router;
|
|
129
|
+
contextPipelineBuilder: contextPipelineBuilder ??
|
|
130
|
+
(() => this.buildContextPipeline()),
|
|
131
|
+
});
|
|
146
132
|
// Cookies should be persisted per session only if session pool is used
|
|
147
133
|
if (!this.useSessionPool && persistCookiesPerSession) {
|
|
148
134
|
throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
|
|
@@ -153,14 +139,12 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
153
139
|
if (suggestResponseEncoding && forceResponseEncoding) {
|
|
154
140
|
this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
|
|
155
141
|
}
|
|
156
|
-
this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
157
142
|
this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
|
|
158
143
|
this.ignoreSslErrors = ignoreSslErrors;
|
|
159
144
|
this.suggestResponseEncoding = suggestResponseEncoding;
|
|
160
145
|
this.forceResponseEncoding = forceResponseEncoding;
|
|
161
146
|
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
|
|
162
147
|
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
|
|
163
|
-
this.proxyConfiguration = proxyConfiguration;
|
|
164
148
|
this.preNavigationHooks = preNavigationHooks;
|
|
165
149
|
this.postNavigationHooks = [
|
|
166
150
|
({ request, response }) => this._abortDownloadOfBody(request, response),
|
|
@@ -173,102 +157,111 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
173
157
|
this.persistCookiesPerSession = false;
|
|
174
158
|
}
|
|
175
159
|
}
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
const className = this.constructor.name;
|
|
184
|
-
const extensionOptions = extension.getCrawlerOptions();
|
|
185
|
-
for (const [key, value] of Object.entries(extensionOptions)) {
|
|
186
|
-
const isConfigurable = Object.hasOwn(this, key);
|
|
187
|
-
const originalType = typeof this[key];
|
|
188
|
-
const extensionType = typeof value; // What if we want to null something? It is really needed?
|
|
189
|
-
const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
|
|
190
|
-
const exists = this[key] != null;
|
|
191
|
-
if (!isConfigurable) {
|
|
192
|
-
// Test if the property can be configured on the crawler
|
|
193
|
-
throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
|
|
194
|
-
}
|
|
195
|
-
if (!isSameType && exists) {
|
|
196
|
-
// Assuming that extensions will only add up configuration
|
|
197
|
-
throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
|
|
198
|
-
}
|
|
199
|
-
this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
|
|
200
|
-
this[key] = value;
|
|
201
|
-
}
|
|
160
|
+
buildContextPipeline() {
|
|
161
|
+
return ContextPipeline.create()
|
|
162
|
+
.compose({
|
|
163
|
+
action: this.makeHttpRequest.bind(this),
|
|
164
|
+
})
|
|
165
|
+
.compose({ action: this.processHttpResponse.bind(this) })
|
|
166
|
+
.compose({ action: this.handleBlockedRequestByContent.bind(this) });
|
|
202
167
|
}
|
|
203
|
-
|
|
204
|
-
* Wrapper around requestHandler that opens and closes pages etc.
|
|
205
|
-
*/
|
|
206
|
-
async _runRequestHandler(crawlingContext) {
|
|
168
|
+
async makeHttpRequest(crawlingContext) {
|
|
207
169
|
const { request, session } = crawlingContext;
|
|
208
|
-
if (
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
const $ = cheerio.load(parsed.body.toString());
|
|
222
|
-
if ($(selector).get().length === 0) {
|
|
223
|
-
throw new Error(`Selector '${selector}' not found.`);
|
|
224
|
-
}
|
|
170
|
+
if (request.skipNavigation) {
|
|
171
|
+
return {
|
|
172
|
+
request: new Proxy(request, {
|
|
173
|
+
get(target, propertyName, receiver) {
|
|
174
|
+
if (propertyName === 'loadedUrl') {
|
|
175
|
+
throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
|
|
176
|
+
}
|
|
177
|
+
return Reflect.get(target, propertyName, receiver);
|
|
178
|
+
},
|
|
179
|
+
}),
|
|
180
|
+
get response() {
|
|
181
|
+
throw new Error('The `response` property is not available - `skipNavigation` was used');
|
|
182
|
+
},
|
|
225
183
|
};
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
184
|
+
}
|
|
185
|
+
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
186
|
+
request.state = RequestState.BEFORE_NAV;
|
|
187
|
+
// Execute pre navigation hooks before applying session pool cookies,
|
|
188
|
+
// as they may also set cookies in the session
|
|
189
|
+
await this._executeHooks(this.preNavigationHooks, crawlingContext);
|
|
190
|
+
tryCancel();
|
|
191
|
+
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
192
|
+
const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
|
|
193
|
+
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
194
|
+
const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
195
|
+
tryCancel();
|
|
196
|
+
request.loadedUrl = httpResponse?.url;
|
|
197
|
+
request.state = RequestState.AFTER_NAV;
|
|
198
|
+
return { request: request, response: httpResponse };
|
|
199
|
+
}
|
|
200
|
+
async processHttpResponse(crawlingContext) {
|
|
201
|
+
if (crawlingContext.request.skipNavigation) {
|
|
202
|
+
return {
|
|
203
|
+
get contentType() {
|
|
204
|
+
throw new Error('The `contentType` property is not available - `skipNavigation` was used');
|
|
205
|
+
},
|
|
206
|
+
get body() {
|
|
207
|
+
throw new Error('The `body` property is not available - `skipNavigation` was used');
|
|
208
|
+
},
|
|
209
|
+
get json() {
|
|
210
|
+
throw new Error('The `json` property is not available - `skipNavigation` was used');
|
|
211
|
+
},
|
|
212
|
+
get waitForSelector() {
|
|
213
|
+
throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
|
|
214
|
+
},
|
|
215
|
+
get parseWithCheerio() {
|
|
216
|
+
throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
|
|
217
|
+
},
|
|
232
218
|
};
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
219
|
+
}
|
|
220
|
+
await this._executeHooks(this.postNavigationHooks, crawlingContext);
|
|
221
|
+
tryCancel();
|
|
222
|
+
const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
|
|
223
|
+
tryCancel();
|
|
224
|
+
const response = parsed.response;
|
|
225
|
+
const contentType = parsed.contentType;
|
|
226
|
+
const waitForSelector = async (selector, _timeoutMs) => {
|
|
227
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
228
|
+
if ($(selector).get().length === 0) {
|
|
229
|
+
throw new Error(`Selector '${selector}' not found.`);
|
|
238
230
|
}
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
request.noRetry = true;
|
|
245
|
-
request.state = RequestState.SKIPPED;
|
|
246
|
-
return;
|
|
231
|
+
};
|
|
232
|
+
const parseWithCheerio = async (selector, timeoutMs) => {
|
|
233
|
+
const $ = cheerio.load(parsed.body.toString());
|
|
234
|
+
if (selector) {
|
|
235
|
+
await crawlingContext.waitForSelector(selector, timeoutMs);
|
|
247
236
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
},
|
|
256
|
-
});
|
|
237
|
+
return $;
|
|
238
|
+
};
|
|
239
|
+
if (this.useSessionPool) {
|
|
240
|
+
this._throwOnBlockedRequest(crawlingContext.session, response.status);
|
|
241
|
+
}
|
|
242
|
+
if (this.persistCookiesPerSession) {
|
|
243
|
+
crawlingContext.session.setCookiesFromResponse(response);
|
|
257
244
|
}
|
|
245
|
+
return {
|
|
246
|
+
get json() {
|
|
247
|
+
if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
|
|
248
|
+
return null;
|
|
249
|
+
const jsonString = parsed.body.toString(contentType.encoding);
|
|
250
|
+
return JSON.parse(jsonString);
|
|
251
|
+
},
|
|
252
|
+
waitForSelector,
|
|
253
|
+
parseWithCheerio,
|
|
254
|
+
contentType,
|
|
255
|
+
body: parsed.body,
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
async handleBlockedRequestByContent(crawlingContext) {
|
|
258
259
|
if (this.retryOnBlocked) {
|
|
259
260
|
const error = await this.isRequestBlocked(crawlingContext);
|
|
260
261
|
if (error)
|
|
261
262
|
throw new SessionError(error);
|
|
262
263
|
}
|
|
263
|
-
|
|
264
|
-
try {
|
|
265
|
-
await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
|
|
266
|
-
request.state = RequestState.DONE;
|
|
267
|
-
}
|
|
268
|
-
catch (e) {
|
|
269
|
-
request.state = RequestState.ERROR;
|
|
270
|
-
throw e;
|
|
271
|
-
}
|
|
264
|
+
return {};
|
|
272
265
|
}
|
|
273
266
|
async isRequestBlocked(crawlingContext) {
|
|
274
267
|
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
|
|
@@ -278,84 +271,40 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
278
271
|
return `Found selectors: ${foundSelectors.join(', ')}`;
|
|
279
272
|
}
|
|
280
273
|
}
|
|
274
|
+
const blockedStatusCodes =
|
|
275
|
+
// eslint-disable-next-line dot-notation
|
|
276
|
+
(this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0
|
|
277
|
+
? // eslint-disable-next-line dot-notation
|
|
278
|
+
this.sessionPool['blockedStatusCodes']
|
|
279
|
+
: BLOCKED_STATUS_CODES;
|
|
280
|
+
if (blockedStatusCodes.includes(crawlingContext.response.status)) {
|
|
281
|
+
return `Blocked by status code ${crawlingContext.response.status}`;
|
|
282
|
+
}
|
|
281
283
|
return false;
|
|
282
284
|
}
|
|
283
|
-
async _handleNavigation(crawlingContext) {
|
|
284
|
-
const gotOptions = {};
|
|
285
|
-
const { request, session } = crawlingContext;
|
|
286
|
-
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
287
|
-
request.state = RequestState.BEFORE_NAV;
|
|
288
|
-
// Execute pre navigation hooks before applying session pool cookies,
|
|
289
|
-
// as they may also set cookies in the session
|
|
290
|
-
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
|
|
291
|
-
tryCancel();
|
|
292
|
-
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
|
|
293
|
-
this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
|
|
294
|
-
const proxyUrl = crawlingContext.proxyInfo?.url;
|
|
295
|
-
crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
296
|
-
tryCancel();
|
|
297
|
-
request.state = RequestState.AFTER_NAV;
|
|
298
|
-
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
|
|
299
|
-
tryCancel();
|
|
300
|
-
}
|
|
301
285
|
/**
|
|
302
|
-
*
|
|
286
|
+
* Returns the `Cookie` header value based on the current context and
|
|
287
|
+
* any changes that occurred in the navigation hooks.
|
|
303
288
|
*/
|
|
304
|
-
_applyCookies({ session, request },
|
|
289
|
+
_applyCookies({ session, request }, preHookCookies, postHookCookies) {
|
|
305
290
|
const sessionCookie = session?.getCookieString(request.url) ?? '';
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
|
|
309
|
-
this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
|
|
310
|
-
const sourceCookies = [];
|
|
311
|
-
if (Array.isArray(lowerCaseHeader)) {
|
|
312
|
-
sourceCookies.push(...lowerCaseHeader);
|
|
313
|
-
}
|
|
314
|
-
else {
|
|
315
|
-
sourceCookies.push(lowerCaseHeader);
|
|
316
|
-
}
|
|
317
|
-
if (Array.isArray(upperCaseHeader)) {
|
|
318
|
-
sourceCookies.push(...upperCaseHeader);
|
|
319
|
-
}
|
|
320
|
-
else {
|
|
321
|
-
sourceCookies.push(upperCaseHeader);
|
|
322
|
-
}
|
|
323
|
-
alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
|
|
324
|
-
}
|
|
325
|
-
const sourceCookies = [sessionCookie, preHookCookies];
|
|
326
|
-
if (Array.isArray(alteredGotOptionsCookies)) {
|
|
327
|
-
sourceCookies.push(...alteredGotOptionsCookies);
|
|
328
|
-
}
|
|
329
|
-
else {
|
|
330
|
-
sourceCookies.push(alteredGotOptionsCookies);
|
|
331
|
-
}
|
|
332
|
-
sourceCookies.push(postHookCookies);
|
|
333
|
-
const mergedCookie = mergeCookies(request.url, sourceCookies);
|
|
334
|
-
gotOptions.headers ??= {};
|
|
335
|
-
Reflect.deleteProperty(gotOptions.headers, 'Cookie');
|
|
336
|
-
Reflect.deleteProperty(gotOptions.headers, 'cookie');
|
|
337
|
-
if (mergedCookie !== '') {
|
|
338
|
-
gotOptions.headers.Cookie = mergedCookie;
|
|
339
|
-
}
|
|
291
|
+
const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
|
|
292
|
+
return mergeCookies(request.url, sourceCookies);
|
|
340
293
|
}
|
|
341
294
|
/**
|
|
342
295
|
* Function to make the HTTP request. It performs optimizations
|
|
343
296
|
* on the request such as only downloading the request body if the
|
|
344
297
|
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
345
298
|
*/
|
|
346
|
-
async _requestFunction({ request, session, proxyUrl,
|
|
347
|
-
|
|
348
|
-
// @ts-ignore
|
|
349
|
-
({ TimeoutError } = await import('got-scraping'));
|
|
350
|
-
}
|
|
351
|
-
const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
|
|
299
|
+
async _requestFunction({ request, session, proxyUrl, cookieString, }) {
|
|
300
|
+
const opts = this._getRequestOptions(request, session, proxyUrl);
|
|
352
301
|
try {
|
|
353
|
-
return await this._requestAsBrowser(opts, session);
|
|
302
|
+
return await this._requestAsBrowser(opts, session, cookieString);
|
|
354
303
|
}
|
|
355
304
|
catch (e) {
|
|
356
|
-
if (e instanceof TimeoutError) {
|
|
305
|
+
if (e instanceof Error && e.constructor.name === 'TimeoutError') {
|
|
357
306
|
this._handleRequestTimeout(session);
|
|
358
|
-
return
|
|
307
|
+
return new Response(); // this will never happen, as _handleRequestTimeout always throws
|
|
359
308
|
}
|
|
360
309
|
if (this.isProxyError(e)) {
|
|
361
310
|
throw new SessionError(this._getMessageFromError(e));
|
|
@@ -368,18 +317,18 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
368
317
|
/**
|
|
369
318
|
* Encodes and parses response according to the provided content type
|
|
370
319
|
*/
|
|
371
|
-
async _parseResponse(request,
|
|
372
|
-
const {
|
|
373
|
-
const { type, charset } = parseContentTypeFromResponse(
|
|
374
|
-
const { response, encoding } = this._encodeResponse(request,
|
|
320
|
+
async _parseResponse(request, response) {
|
|
321
|
+
const { status } = response;
|
|
322
|
+
const { type, charset } = parseContentTypeFromResponse(response);
|
|
323
|
+
const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
|
|
375
324
|
const contentType = { type, encoding };
|
|
376
|
-
if (
|
|
377
|
-
this.stats.registerStatusCode(
|
|
325
|
+
if (status >= 400 && status <= 599) {
|
|
326
|
+
this.stats.registerStatusCode(status);
|
|
378
327
|
}
|
|
379
|
-
const excludeError = this.ignoreHttpErrorStatusCodes.has(
|
|
380
|
-
const includeError = this.additionalHttpErrorStatusCodes.has(
|
|
381
|
-
if ((
|
|
382
|
-
const body = await
|
|
328
|
+
const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
|
|
329
|
+
const includeError = this.additionalHttpErrorStatusCodes.has(status);
|
|
330
|
+
if ((status >= 500 && !excludeError) || includeError) {
|
|
331
|
+
const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
|
|
383
332
|
// Errors are often sent as JSON, so attempt to parse them,
|
|
384
333
|
// despite Accept header being set to text/html.
|
|
385
334
|
if (type === APPLICATION_JSON_MIME_TYPE) {
|
|
@@ -387,59 +336,47 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
387
336
|
let { message } = errorResponse;
|
|
388
337
|
if (!message)
|
|
389
338
|
message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
|
|
390
|
-
throw new Error(`${
|
|
339
|
+
throw new Error(`${status} - ${message}`);
|
|
391
340
|
}
|
|
392
341
|
if (includeError) {
|
|
393
|
-
throw new Error(`${
|
|
342
|
+
throw new Error(`${status} - Error status code was set by user.`);
|
|
394
343
|
}
|
|
395
344
|
// It's not a JSON, so it's probably some text. Get the first 100 chars of it.
|
|
396
|
-
throw new Error(`${
|
|
345
|
+
throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
|
|
397
346
|
}
|
|
398
347
|
else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
|
|
399
|
-
|
|
400
|
-
const parsed = await this._parseHTML(response, isXml, crawlingContext);
|
|
401
|
-
return { ...parsed, isXml, response, contentType };
|
|
348
|
+
return { response, contentType, body: await reencodedResponse.text() };
|
|
402
349
|
}
|
|
403
350
|
else {
|
|
404
|
-
const body = await
|
|
351
|
+
const body = Buffer.from(await reencodedResponse.bytes());
|
|
405
352
|
return {
|
|
406
353
|
body,
|
|
407
354
|
response,
|
|
408
355
|
contentType,
|
|
409
|
-
enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
|
|
410
356
|
};
|
|
411
357
|
}
|
|
412
358
|
}
|
|
413
|
-
async _parseHTML(response, _isXml, _crawlingContext) {
|
|
414
|
-
return {
|
|
415
|
-
body: await concatStreamToBuffer(response),
|
|
416
|
-
};
|
|
417
|
-
}
|
|
418
359
|
/**
|
|
419
360
|
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
420
361
|
*/
|
|
421
|
-
_getRequestOptions(request, session, proxyUrl
|
|
362
|
+
_getRequestOptions(request, session, proxyUrl) {
|
|
422
363
|
const requestOptions = {
|
|
423
364
|
url: request.url,
|
|
424
365
|
method: request.method,
|
|
425
366
|
proxyUrl,
|
|
426
|
-
timeout:
|
|
367
|
+
timeout: this.navigationTimeoutMillis,
|
|
368
|
+
cookieJar: this.persistCookiesPerSession ? session?.cookieJar : undefined,
|
|
427
369
|
sessionToken: session,
|
|
428
|
-
|
|
429
|
-
headers: { ...request.headers, ...gotOptions?.headers },
|
|
370
|
+
headers: request.headers,
|
|
430
371
|
https: {
|
|
431
|
-
...gotOptions?.https,
|
|
432
372
|
rejectUnauthorized: !this.ignoreSslErrors,
|
|
433
373
|
},
|
|
434
|
-
|
|
374
|
+
body: undefined,
|
|
435
375
|
};
|
|
436
376
|
// Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
|
|
437
377
|
Reflect.deleteProperty(requestOptions.headers, 'cookie');
|
|
438
|
-
// TODO this is incorrect, the check for man in the middle needs to be done
|
|
439
|
-
// on individual proxy level, not on the `proxyConfiguration` level,
|
|
440
|
-
// because users can use normal + MITM proxies in a single configuration.
|
|
441
378
|
// Disable SSL verification for MITM proxies
|
|
442
|
-
if (
|
|
379
|
+
if (session?.proxyInfo?.ignoreTlsErrors) {
|
|
443
380
|
requestOptions.https = {
|
|
444
381
|
...requestOptions.https,
|
|
445
382
|
rejectUnauthorized: false,
|
|
@@ -468,13 +405,13 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
468
405
|
if (iconv.encodingExists(encoding)) {
|
|
469
406
|
const encodeStream = iconv.encodeStream(utf8);
|
|
470
407
|
const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
408
|
+
const reencodedBody = response.body
|
|
409
|
+
? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
|
|
410
|
+
.pipe(decodeStream)
|
|
411
|
+
.pipe(encodeStream)))
|
|
412
|
+
: null;
|
|
476
413
|
return {
|
|
477
|
-
response:
|
|
414
|
+
response: new ResponseWithUrl(reencodedBody, response),
|
|
478
415
|
encoding: utf8,
|
|
479
416
|
};
|
|
480
417
|
}
|
|
@@ -503,15 +440,15 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
503
440
|
*/
|
|
504
441
|
_handleRequestTimeout(session) {
|
|
505
442
|
session?.markBad();
|
|
506
|
-
throw new Error(`request timed out after ${this.
|
|
443
|
+
throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
|
|
507
444
|
}
|
|
508
445
|
_abortDownloadOfBody(request, response) {
|
|
509
|
-
const {
|
|
446
|
+
const { status } = response;
|
|
510
447
|
const { type } = parseContentTypeFromResponse(response);
|
|
511
448
|
// eslint-disable-next-line dot-notation -- accessing private property
|
|
512
449
|
const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
|
|
513
450
|
// if we retry the request, can the Content-Type change?
|
|
514
|
-
const isTransientContentType =
|
|
451
|
+
const isTransientContentType = status >= 500 || blockedStatusCodes.includes(status);
|
|
515
452
|
if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
|
|
516
453
|
request.noRetry = true;
|
|
517
454
|
throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
|
|
@@ -521,89 +458,28 @@ export class HttpCrawler extends BasicCrawler {
|
|
|
521
458
|
/**
|
|
522
459
|
* @internal wraps public utility for mocking purposes
|
|
523
460
|
*/
|
|
524
|
-
_requestAsBrowser = async (options, session) => {
|
|
525
|
-
const
|
|
461
|
+
_requestAsBrowser = async (options, session, cookieString) => {
|
|
462
|
+
const opts = processHttpRequestOptions({
|
|
526
463
|
...options,
|
|
527
|
-
cookieJar: options.cookieJar,
|
|
464
|
+
cookieJar: options.cookieJar,
|
|
528
465
|
responseType: 'text',
|
|
529
|
-
}), (redirectResponse, updatedRequest) => {
|
|
530
|
-
if (this.persistCookiesPerSession) {
|
|
531
|
-
session.setCookiesFromResponse(redirectResponse);
|
|
532
|
-
const cookieString = session.getCookieString(updatedRequest.url.toString());
|
|
533
|
-
if (cookieString !== '') {
|
|
534
|
-
updatedRequest.headers.Cookie = cookieString;
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
466
|
});
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
* The stream object returned from got does not have the below properties.
|
|
543
|
-
* At the same time, you can't read data directly from the response stream,
|
|
544
|
-
* because they won't get emitted unless you also read from the primary
|
|
545
|
-
* got stream. To be able to work with only one stream, we move the expected props
|
|
546
|
-
* from the response stream to the got stream.
|
|
547
|
-
* @internal
|
|
548
|
-
*/
|
|
549
|
-
function addResponsePropertiesToStream(stream, response) {
|
|
550
|
-
const properties = [
|
|
551
|
-
'statusCode',
|
|
552
|
-
'statusMessage',
|
|
553
|
-
'headers',
|
|
554
|
-
'complete',
|
|
555
|
-
'httpVersion',
|
|
556
|
-
'rawHeaders',
|
|
557
|
-
'rawTrailers',
|
|
558
|
-
'trailers',
|
|
559
|
-
'url',
|
|
560
|
-
'request',
|
|
561
|
-
];
|
|
562
|
-
stream.on('end', () => {
|
|
563
|
-
// @ts-expect-error
|
|
564
|
-
if (stream.rawTrailers)
|
|
565
|
-
stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
|
|
566
|
-
// @ts-expect-error
|
|
567
|
-
if (stream.trailers)
|
|
568
|
-
stream.trailers = response.trailers;
|
|
569
|
-
// @ts-expect-error
|
|
570
|
-
stream.complete = response.complete;
|
|
571
|
-
});
|
|
572
|
-
for (const prop of properties) {
|
|
573
|
-
if (!(prop in stream)) {
|
|
574
|
-
stream[prop] = response[prop];
|
|
467
|
+
if (cookieString) {
|
|
468
|
+
opts.headers?.delete('Cookie');
|
|
469
|
+
opts.headers?.delete('cookie');
|
|
470
|
+
opts.headers?.set('Cookie', cookieString);
|
|
575
471
|
}
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
}));
|
|
588
|
-
const { url, headers } = response;
|
|
589
|
-
let parsedContentType;
|
|
590
|
-
if (headers['content-type']) {
|
|
591
|
-
try {
|
|
592
|
-
parsedContentType = contentTypeParser.parse(headers['content-type']);
|
|
593
|
-
}
|
|
594
|
-
catch {
|
|
595
|
-
// Can not parse content type from Content-Type header. Try to parse it from file extension.
|
|
596
|
-
}
|
|
597
|
-
}
|
|
598
|
-
// Parse content type from file extension as fallback
|
|
599
|
-
if (!parsedContentType) {
|
|
600
|
-
const parsedUrl = new URL(url);
|
|
601
|
-
const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
|
|
602
|
-
parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
|
|
603
|
-
}
|
|
604
|
-
return {
|
|
605
|
-
type: parsedContentType.type,
|
|
606
|
-
charset: parsedContentType.parameters.charset,
|
|
472
|
+
const response = await this.httpClient.sendRequest(new Request(opts.url, {
|
|
473
|
+
body: opts.body ? Readable.toWeb(opts.body) : undefined,
|
|
474
|
+
headers: new Headers(opts.headers),
|
|
475
|
+
method: opts.method,
|
|
476
|
+
// Node-specific option to make the request body work with streams
|
|
477
|
+
duplex: 'half',
|
|
478
|
+
}), {
|
|
479
|
+
session,
|
|
480
|
+
timeout: opts.timeout,
|
|
481
|
+
});
|
|
482
|
+
return response;
|
|
607
483
|
};
|
|
608
484
|
}
|
|
609
485
|
/**
|