@crawlee/http 4.0.0-beta.2 → 4.0.0-beta.20
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +5 -1
- package/internals/file-download.d.ts +46 -33
- package/internals/file-download.d.ts.map +1 -1
- package/internals/file-download.js +85 -73
- package/internals/file-download.js.map +1 -1
- package/internals/http-crawler.d.ts +70 -149
- package/internals/http-crawler.d.ts.map +1 -1
- package/internals/http-crawler.js +178 -287
- package/internals/http-crawler.js.map +1 -1
- package/internals/utils.d.ts +9 -0
- package/internals/utils.d.ts.map +1 -0
- package/internals/utils.js +35 -0
- package/internals/utils.js.map +1 -0
- package/package.json +6 -6
- package/tsconfig.build.tsbuildinfo +0 -1
@@ -1,14 +1,13 @@
-import {
+import { Readable } from 'node:stream';
 import util from 'node:util';
-import {
+import { BasicCrawler, BLOCKED_STATUS_CODES, Configuration, ContextPipeline, mergeCookies, processHttpRequestOptions, RequestState, ResponseWithUrl, Router, SessionError, } from '@crawlee/basic';
 import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
 import * as cheerio from 'cheerio';
 import contentTypeParser from 'content-type';
 import iconv from 'iconv-lite';
-import
-import ow, { ObjectPredicate } from 'ow';
+import ow from 'ow';
 import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
-import {
+import { parseContentTypeFromResponse } from './utils.js';
 let TimeoutError;
 /**
  * Default mime types, which HttpScraper supports.
@@ -46,11 +45,11 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
  *
  * The crawler finishes when there are no more {@link Request} objects to crawl.
  *
- * We can use the `preNavigationHooks` to adjust
+ * We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
  *
  * ```javascript
  * preNavigationHooks: [
- *     (crawlingContext
+ *     (crawlingContext) => {
  *         // ...
  *     },
  * ]
@@ -94,12 +93,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
  */
 export class HttpCrawler extends BasicCrawler {
     config;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration;
-    userRequestHandlerTimeoutMillis;
     preNavigationHooks;
     postNavigationHooks;
     persistCookiesPerSession;
@@ -117,7 +110,6 @@ export class HttpCrawler extends BasicCrawler {
         additionalMimeTypes: ow.optional.array.ofType(ow.string),
         suggestResponseEncoding: ow.optional.string,
         forceResponseEncoding: ow.optional.string,
-        proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
         persistCookiesPerSession: ow.optional.boolean,
         additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
         ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
@@ -129,20 +121,16 @@ export class HttpCrawler extends BasicCrawler {
      */
     constructor(options = {}, config = Configuration.getGlobalConfig()) {
         ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
-        const {
+        const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
         // BasicCrawler
-        autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
+        autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
         super({
             ...basicCrawlerOptions,
-            requestHandler,
             autoscaledPoolOptions,
-
-
-            requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
+            contextPipelineBuilder: contextPipelineBuilder ??
+                (() => this.buildContextPipeline()),
         }, config);
         this.config = config;
-        // FIXME any
-        this.requestHandler = requestHandler ?? this.router;
         // Cookies should be persisted per session only if session pool is used
         if (!this.useSessionPool && persistCookiesPerSession) {
             throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
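
The constructor above no longer wires `requestHandler` and the combined `requestHandlerTimeoutSecs` into `super()`; it now forwards a `contextPipelineBuilder` option that defaults to `() => this.buildContextPipeline()`. A minimal sketch of supplying a custom builder, assuming only the `ContextPipeline.create()`/`.compose()` surface visible in this diff (the `fetchedAt` property is made up for illustration, and a custom builder replaces the default pipeline entirely):

```javascript
import { ContextPipeline } from '@crawlee/basic';
import { HttpCrawler } from '@crawlee/http';

const crawler = new HttpCrawler({
    contextPipelineBuilder: () =>
        ContextPipeline.create()
            // Each composed stage receives the crawling context and returns
            // properties to be merged into it for the stages that follow.
            .compose({ action: async () => ({ fetchedAt: new Date() }) }),
});
```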
@@ -153,14 +141,12 @@ export class HttpCrawler extends BasicCrawler {
         if (suggestResponseEncoding && forceResponseEncoding) {
             this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
         }
-        this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
         this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
         this.ignoreSslErrors = ignoreSslErrors;
         this.suggestResponseEncoding = suggestResponseEncoding;
         this.forceResponseEncoding = forceResponseEncoding;
         this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
         this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
-        this.proxyConfiguration = proxyConfiguration;
         this.preNavigationHooks = preNavigationHooks;
         this.postNavigationHooks = [
             ({ request, response }) => this._abortDownloadOfBody(request, response),
@@ -173,102 +159,111 @@ export class HttpCrawler extends BasicCrawler {
             this.persistCookiesPerSession = false;
         }
     }
-
-
-
-
-
-
-
-        const className = this.constructor.name;
-        const extensionOptions = extension.getCrawlerOptions();
-        for (const [key, value] of Object.entries(extensionOptions)) {
-            const isConfigurable = Object.hasOwn(this, key);
-            const originalType = typeof this[key];
-            const extensionType = typeof value; // What if we want to null something? It is really needed?
-            const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
-            const exists = this[key] != null;
-            if (!isConfigurable) {
-                // Test if the property can be configured on the crawler
-                throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
-            }
-            if (!isSameType && exists) {
-                // Assuming that extensions will only add up configuration
-                throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
-            }
-            this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
-            this[key] = value;
-        }
+    buildContextPipeline() {
+        return ContextPipeline.create()
+            .compose({
+            action: this.makeHttpRequest.bind(this),
+        })
+            .compose({ action: this.processHttpResponse.bind(this) })
+            .compose({ action: this.handleBlockedRequestByContent.bind(this) });
     }
-
-     * Wrapper around requestHandler that opens and closes pages etc.
-     */
-    async _runRequestHandler(crawlingContext) {
+    async makeHttpRequest(crawlingContext) {
         const { request, session } = crawlingContext;
-        if (
-
-
-
-
-
-
-
-
-
-
-
-            const $ = cheerio.load(parsed.body.toString());
-            if ($(selector).get().length === 0) {
-                throw new Error(`Selector '${selector}' not found.`);
-            }
+        if (request.skipNavigation) {
+            return {
+                request: new Proxy(request, {
+                    get(target, propertyName, receiver) {
+                        if (propertyName === 'loadedUrl') {
+                            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
+                        }
+                        return Reflect.get(target, propertyName, receiver);
+                    },
+                }),
+                get response() {
+                    throw new Error('The `response` property is not available - `skipNavigation` was used');
+                },
             };
-
-
-
-
-
-
+        }
+        const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+        request.state = RequestState.BEFORE_NAV;
+        // Execute pre navigation hooks before applying session pool cookies,
+        // as they may also set cookies in the session
+        await this._executeHooks(this.preNavigationHooks, crawlingContext);
+        tryCancel();
+        const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+        const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
+        const proxyUrl = crawlingContext.proxyInfo?.url;
+        const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
+        tryCancel();
+        request.loadedUrl = httpResponse?.url;
+        request.state = RequestState.AFTER_NAV;
+        return { request: request, response: httpResponse };
+    }
+    async processHttpResponse(crawlingContext) {
+        if (crawlingContext.request.skipNavigation) {
+            return {
+                get contentType() {
+                    throw new Error('The `contentType` property is not available - `skipNavigation` was used');
+                },
+                get body() {
+                    throw new Error('The `body` property is not available - `skipNavigation` was used');
+                },
+                get json() {
+                    throw new Error('The `json` property is not available - `skipNavigation` was used');
+                },
+                get waitForSelector() {
+                    throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
+                },
+                get parseWithCheerio() {
+                    throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
+                },
             };
-
-
-
-
-
+        }
+        await this._executeHooks(this.postNavigationHooks, crawlingContext);
+        tryCancel();
+        const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
+        tryCancel();
+        const response = parsed.response;
+        const contentType = parsed.contentType;
+        const waitForSelector = async (selector, _timeoutMs) => {
+            const $ = cheerio.load(parsed.body.toString());
+            if ($(selector).get().length === 0) {
+                throw new Error(`Selector '${selector}' not found.`);
             }
-
-
-
-
-
-            request.noRetry = true;
-            request.state = RequestState.SKIPPED;
-            return;
+        };
+        const parseWithCheerio = async (selector, timeoutMs) => {
+            const $ = cheerio.load(parsed.body.toString());
+            if (selector) {
+                await crawlingContext.waitForSelector(selector, timeoutMs);
             }
-
-
-
-
-                return null;
-            const jsonString = parsed.body.toString(contentType.encoding);
-            return JSON.parse(jsonString);
-        },
-    });
+            return $;
+        };
+        if (this.useSessionPool) {
+            this._throwOnBlockedRequest(crawlingContext.session, response.status);
         }
+        if (this.persistCookiesPerSession) {
+            crawlingContext.session.setCookiesFromResponse(response);
+        }
+        return {
+            get json() {
+                if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
+                    return null;
+                const jsonString = parsed.body.toString(contentType.encoding);
+                return JSON.parse(jsonString);
+            },
+            waitForSelector,
+            parseWithCheerio,
+            contentType,
+            body: parsed.body,
+        };
+    }
+    async handleBlockedRequestByContent(crawlingContext) {
         if (this.retryOnBlocked) {
             const error = await this.isRequestBlocked(crawlingContext);
             if (error)
                 throw new SessionError(error);
         }
-
-        try {
-            await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
-            request.state = RequestState.DONE;
-        }
-        catch (e) {
-            request.state = RequestState.ERROR;
-            throw e;
-        }
+        return {};
     }
     async isRequestBlocked(crawlingContext) {
         if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
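
The hunk above is the heart of the refactor: `buildContextPipeline()` splits navigation into three composable stages (`makeHttpRequest`, `processHttpResponse`, `handleBlockedRequestByContent`), and `skipNavigation` requests short-circuit the first two with guarded placeholders that throw on access. A sketch of extending the default pipeline from a subclass, assuming `.compose()` returns a new pipeline as the chaining above suggests (the `timings` property is hypothetical):

```javascript
import { HttpCrawler } from '@crawlee/http';

class InstrumentedHttpCrawler extends HttpCrawler {
    buildContextPipeline() {
        // Append a fourth stage after the built-in three; its return value
        // is merged into the crawling context like any other stage result.
        return super.buildContextPipeline().compose({
            action: async ({ request }) => ({
                timings: { loadedAt: Date.now(), loadedUrl: request.loadedUrl },
            }),
        });
    }
}
```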
@@ -278,84 +273,44 @@ export class HttpCrawler extends BasicCrawler {
                 return `Found selectors: ${foundSelectors.join(', ')}`;
             }
         }
+        const blockedStatusCodes =
+        // eslint-disable-next-line dot-notation
+        (this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0
+            ? // eslint-disable-next-line dot-notation
+                this.sessionPool['blockedStatusCodes']
+            : BLOCKED_STATUS_CODES;
+        if (blockedStatusCodes.includes(crawlingContext.response.status)) {
+            return `Blocked by status code ${crawlingContext.response.status}`;
+        }
         return false;
     }
-    async _handleNavigation(crawlingContext) {
-        const gotOptions = {};
-        const { request, session } = crawlingContext;
-        const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-        request.state = RequestState.BEFORE_NAV;
-        // Execute pre navigation hooks before applying session pool cookies,
-        // as they may also set cookies in the session
-        await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
-        tryCancel();
-        const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-        this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
-        const proxyUrl = crawlingContext.proxyInfo?.url;
-        crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
-        tryCancel();
-        request.state = RequestState.AFTER_NAV;
-        await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
-        tryCancel();
-    }
     /**
-     *
+     * Returns the `Cookie` header value based on the current context and
+     * any changes that occurred in the navigation hooks.
      */
-    _applyCookies({ session, request },
+    _applyCookies({ session, request }, preHookCookies, postHookCookies) {
         const sessionCookie = session?.getCookieString(request.url) ?? '';
-
-
-        const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
-            this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
-            const sourceCookies = [];
-            if (Array.isArray(lowerCaseHeader)) {
-                sourceCookies.push(...lowerCaseHeader);
-            }
-            else {
-                sourceCookies.push(lowerCaseHeader);
-            }
-            if (Array.isArray(upperCaseHeader)) {
-                sourceCookies.push(...upperCaseHeader);
-            }
-            else {
-                sourceCookies.push(upperCaseHeader);
-            }
-            alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
-        }
-        const sourceCookies = [sessionCookie, preHookCookies];
-        if (Array.isArray(alteredGotOptionsCookies)) {
-            sourceCookies.push(...alteredGotOptionsCookies);
-        }
-        else {
-            sourceCookies.push(alteredGotOptionsCookies);
-        }
-        sourceCookies.push(postHookCookies);
-        const mergedCookie = mergeCookies(request.url, sourceCookies);
-        gotOptions.headers ??= {};
-        Reflect.deleteProperty(gotOptions.headers, 'Cookie');
-        Reflect.deleteProperty(gotOptions.headers, 'cookie');
-        if (mergedCookie !== '') {
-            gotOptions.headers.Cookie = mergedCookie;
-        }
+        const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
+        return mergeCookies(request.url, sourceCookies);
     }
     /**
      * Function to make the HTTP request. It performs optimizations
      * on the request such as only downloading the request body if the
      * received content type matches text/html, application/xml, application/xhtml+xml.
      */
-    async _requestFunction({ request, session, proxyUrl,
+    async _requestFunction({ request, session, proxyUrl, cookieString, }) {
         if (!TimeoutError) {
             // @ts-ignore
             ({ TimeoutError } = await import('got-scraping'));
         }
-        const opts = this._getRequestOptions(request, session, proxyUrl
+        const opts = this._getRequestOptions(request, session, proxyUrl);
         try {
-            return await this._requestAsBrowser(opts, session);
+            return await this._requestAsBrowser(opts, session, cookieString);
         }
         catch (e) {
            if (e instanceof TimeoutError) {
                 this._handleRequestTimeout(session);
-                return
+                return new Response(); // this will never happen, as _handleRequestTimeout always throws
             }
             if (this.isProxyError(e)) {
                 throw new SessionError(this._getMessageFromError(e));
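
`_applyCookies()` is now a pure function: it merges the session cookie with the `Cookie` headers captured before and after the pre-navigation hooks and returns the result, instead of mutating `gotOptions.headers`. A rough illustration of the precedence implied by the source order above (later entries are assumed to win for same-named cookies; the expected output is not verified against the library):

```javascript
import { mergeCookies } from '@crawlee/basic';

const cookieString = mergeCookies('https://example.com', [
    'a=1; b=2', // session cookie jar
    'b=3',      // Cookie header set by a preNavigationHook
    'c=4',      // Cookie header observed after the hooks ran
]);
// Expected: 'a=1; b=3; c=4'
```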
@@ -368,18 +323,18 @@ export class HttpCrawler extends BasicCrawler {
     /**
      * Encodes and parses response according to the provided content type
      */
-    async _parseResponse(request,
-        const {
-        const { type, charset } = parseContentTypeFromResponse(
-        const { response, encoding } = this._encodeResponse(request,
+    async _parseResponse(request, response) {
+        const { status } = response;
+        const { type, charset } = parseContentTypeFromResponse(response);
+        const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
         const contentType = { type, encoding };
-        if (
-            this.stats.registerStatusCode(
+        if (status >= 400 && status <= 599) {
+            this.stats.registerStatusCode(status);
         }
-        const excludeError = this.ignoreHttpErrorStatusCodes.has(
-        const includeError = this.additionalHttpErrorStatusCodes.has(
-        if ((
-            const body = await
+        const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
+        const includeError = this.additionalHttpErrorStatusCodes.has(status);
+        if ((status >= 500 && !excludeError) || includeError) {
+            const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
             // Errors are often sent as JSON, so attempt to parse them,
             // despite Accept header being set to text/html.
             if (type === APPLICATION_JSON_MIME_TYPE) {
@@ -387,59 +342,47 @@ export class HttpCrawler extends BasicCrawler {
                 let { message } = errorResponse;
                 if (!message)
                     message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
-                throw new Error(`${
+                throw new Error(`${status} - ${message}`);
             }
             if (includeError) {
-                throw new Error(`${
+                throw new Error(`${status} - Error status code was set by user.`);
             }
             // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
-            throw new Error(`${
+            throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
         }
         else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
-
-            const parsed = await this._parseHTML(response, isXml, crawlingContext);
-            return { ...parsed, isXml, response, contentType };
+            return { response, contentType, body: await response.text() };
         }
         else {
-            const body = await
+            const body = Buffer.from(await response.bytes());
             return {
                 body,
                 response,
                 contentType,
-                enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
             };
         }
     }
-    async _parseHTML(response, _isXml, _crawlingContext) {
-        return {
-            body: await concatStreamToBuffer(response),
-        };
-    }
     /**
      * Combines the provided `requestOptions` with mandatory (non-overridable) values.
      */
-    _getRequestOptions(request, session, proxyUrl
+    _getRequestOptions(request, session, proxyUrl) {
         const requestOptions = {
             url: request.url,
             method: request.method,
             proxyUrl,
-            timeout:
+            timeout: this.navigationTimeoutMillis,
+            cookieJar: this.persistCookiesPerSession ? session?.cookieJar : undefined,
             sessionToken: session,
-
-            headers: { ...request.headers, ...gotOptions?.headers },
+            headers: request.headers,
             https: {
-                ...gotOptions?.https,
                 rejectUnauthorized: !this.ignoreSslErrors,
             },
-
+            body: undefined,
         };
         // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
         Reflect.deleteProperty(requestOptions.headers, 'cookie');
-        // TODO this is incorrect, the check for man in the middle needs to be done
-        // on individual proxy level, not on the `proxyConfiguration` level,
-        // because users can use normal + MITM proxies in a single configuration.
         // Disable SSL verification for MITM proxies
-        if (
+        if (session?.proxyInfo?.ignoreTlsErrors) {
             requestOptions.https = {
                 ...requestOptions.https,
                 rejectUnauthorized: false,
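
`_parseResponse()` now reads everything off the response `status`: 4xx/5xx codes are registered in stats, 5xx codes throw unless listed in `ignoreHttpErrorStatusCodes`, and any code in `additionalHttpErrorStatusCodes` throws regardless. Expressed as options (both appear in `optionsShape` above):

```javascript
import { HttpCrawler } from '@crawlee/http';

const crawler = new HttpCrawler({
    // 403 now fails the request with "403 - Error status code was set by user."
    additionalHttpErrorStatusCodes: [403],
    // 503 responses reach the request handler instead of throwing
    ignoreHttpErrorStatusCodes: [503],
});
```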
@@ -468,13 +411,13 @@ export class HttpCrawler extends BasicCrawler {
         if (iconv.encodingExists(encoding)) {
             const encodeStream = iconv.encodeStream(utf8);
             const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
-
-
-
-
-
+            const reencodedBody = response.body
+                ? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
+                    .pipe(decodeStream)
+                    .pipe(encodeStream)))
+                : null;
             return {
-                response:
+                response: new ResponseWithUrl(reencodedBody, response),
                 encoding: utf8,
             };
         }
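
`_encodeResponse()` re-encodes non-UTF-8 bodies by round-tripping the WHATWG stream through Node streams and iconv, then wraps the result in a `ResponseWithUrl`. A self-contained sketch of the same conversion pattern (simplified; the diff additionally wraps the piped stream in `Readable.from()`):

```javascript
import { Readable } from 'node:stream';
import iconv from 'iconv-lite';

// Decode a web ReadableStream from `sourceEncoding` and re-encode it as
// UTF-8, returning a web ReadableStream again.
function reencodeWebStream(webBody, sourceEncoding) {
    const encodeStream = iconv.encodeStream('utf8');
    const decodeStream = iconv
        .decodeStream(sourceEncoding)
        .on('error', (err) => encodeStream.emit('error', err));
    return Readable.toWeb(Readable.fromWeb(webBody).pipe(decodeStream).pipe(encodeStream));
}
```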
@@ -503,15 +446,15 @@ export class HttpCrawler extends BasicCrawler {
      */
     _handleRequestTimeout(session) {
         session?.markBad();
-        throw new Error(`request timed out after ${this.
+        throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
     }
     _abortDownloadOfBody(request, response) {
-        const {
+        const { status } = response;
         const { type } = parseContentTypeFromResponse(response);
         // eslint-disable-next-line dot-notation -- accessing private property
         const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
         // if we retry the request, can the Content-Type change?
-        const isTransientContentType =
+        const isTransientContentType = status >= 500 || blockedStatusCodes.includes(status);
         if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
             request.noRetry = true;
             throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
@@ -521,89 +464,37 @@ export class HttpCrawler extends BasicCrawler {
     /**
      * @internal wraps public utility for mocking purposes
      */
-    _requestAsBrowser = async (options, session) => {
-        const
+    _requestAsBrowser = async (options, session, cookieString) => {
+        const opts = processHttpRequestOptions({
             ...options,
-            cookieJar: options.cookieJar,
+            cookieJar: options.cookieJar,
             responseType: 'text',
-        }), (redirectResponse, updatedRequest) => {
-            if (this.persistCookiesPerSession) {
-                session.setCookiesFromResponse(redirectResponse);
-                const cookieString = session.getCookieString(updatedRequest.url.toString());
-                if (cookieString !== '') {
-                    updatedRequest.headers.Cookie = cookieString;
-                }
-            }
         });
-
-
-
-
- * The stream object returned from got does not have the below properties.
- * At the same time, you can't read data directly from the response stream,
- * because they won't get emitted unless you also read from the primary
- * got stream. To be able to work with only one stream, we move the expected props
- * from the response stream to the got stream.
- * @internal
- */
-function addResponsePropertiesToStream(stream, response) {
-    const properties = [
-        'statusCode',
-        'statusMessage',
-        'headers',
-        'complete',
-        'httpVersion',
-        'rawHeaders',
-        'rawTrailers',
-        'trailers',
-        'url',
-        'request',
-    ];
-    stream.on('end', () => {
-        // @ts-expect-error
-        if (stream.rawTrailers)
-            stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
-        // @ts-expect-error
-        if (stream.trailers)
-            stream.trailers = response.trailers;
-        // @ts-expect-error
-        stream.complete = response.complete;
-    });
-    for (const prop of properties) {
-        if (!(prop in stream)) {
-            stream[prop] = response[prop];
+        if (cookieString) {
+            opts.headers?.delete('Cookie');
+            opts.headers?.delete('cookie');
+            opts.headers?.set('Cookie', cookieString);
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-}
-    // Parse content type from file extension as fallback
-    if (!parsedContentType) {
-        const parsedUrl = new URL(url);
-        const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
-        parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
-    }
-    return {
-        type: parsedContentType.type,
-        charset: parsedContentType.parameters.charset,
+        const response = await this.httpClient.stream(new Request(opts.url, {
+            body: opts.body ? Readable.toWeb(opts.body) : undefined,
+            headers: new Headers(opts.headers),
+            method: opts.method,
+            // Node-specific option to make the request body work with streams
+            duplex: 'half',
+        }), {
+            session,
+            timeout: opts.timeout,
+            onRedirect: (redirectResponse, updatedRequest) => {
+                if (this.persistCookiesPerSession) {
+                    session.setCookiesFromResponse(redirectResponse);
+                    const cookieStringRedirected = session.getCookieString(updatedRequest.url.toString());
+                    if (cookieStringRedirected !== '') {
+                        updatedRequest.headers.set('Cookie', cookieStringRedirected);
+                    }
+                }
+            },
+        });
+        return response;
     };
 }
 /**