@crawlee/http 4.0.0-beta.2 → 4.0.0-beta.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,13 @@
- import { extname } from 'node:path';
+ import { Readable } from 'node:stream';
  import util from 'node:util';
- import { BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, BasicCrawler, Configuration, CrawlerExtension, mergeCookies, processHttpRequestOptions, RequestState, Router, SessionError, validators, } from '@crawlee/basic';
+ import { BasicCrawler, BLOCKED_STATUS_CODES, Configuration, ContextPipeline, mergeCookies, processHttpRequestOptions, RequestState, ResponseWithUrl, Router, SessionError, } from '@crawlee/basic';
  import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
  import * as cheerio from 'cheerio';
  import contentTypeParser from 'content-type';
  import iconv from 'iconv-lite';
- import mime from 'mime-types';
- import ow, { ObjectPredicate } from 'ow';
+ import ow from 'ow';
  import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
- import { concatStreamToBuffer, readStreamToString } from '@apify/utilities';
+ import { parseContentTypeFromResponse } from './utils.js';
  let TimeoutError;
  /**
   * Default mime types, which HttpScraper supports.
@@ -46,11 +45,11 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
   *
   * The crawler finishes when there are no more {@link Request} objects to crawl.
   *
-  * We can use the `preNavigationHooks` to adjust `gotOptions`:
+  * We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
   *
   * ```javascript
   * preNavigationHooks: [
-  *     (crawlingContext, gotOptions) => {
+  *     (crawlingContext) => {
   *         // ...
   *     },
   * ]
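The hook no longer receives a second `gotOptions` argument; request options are now derived from the crawling context itself. A minimal sketch of the new shape (not part of the diff; the header value is an arbitrary example), relying on `_getRequestOptions` reading `request.headers`, as shown in a later hunk:

```javascript
import { HttpCrawler } from '@crawlee/http';

// Minimal sketch: with `gotOptions` gone, a pre-navigation hook mutates
// the request on the crawling context instead.
const crawler = new HttpCrawler({
    preNavigationHooks: [
        async ({ request }) => {
            // Picked up later by _getRequestOptions via `request.headers`.
            request.headers = { ...request.headers, 'accept-language': 'en-US' };
        },
    ],
    async requestHandler({ request, body }) {
        console.log(`${request.loadedUrl}: ${body.length} bytes`);
    },
});
```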
@@ -94,12 +93,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
   */
  export class HttpCrawler extends BasicCrawler {
      config;
-     /**
-      * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-      * Only available if used by the crawler.
-      */
-     proxyConfiguration;
-     userRequestHandlerTimeoutMillis;
      preNavigationHooks;
      postNavigationHooks;
      persistCookiesPerSession;
@@ -117,7 +110,6 @@ export class HttpCrawler extends BasicCrawler {
          additionalMimeTypes: ow.optional.array.ofType(ow.string),
          suggestResponseEncoding: ow.optional.string,
          forceResponseEncoding: ow.optional.string,
-         proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
          persistCookiesPerSession: ow.optional.boolean,
          additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
          ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
@@ -129,20 +121,16 @@ export class HttpCrawler extends BasicCrawler {
       */
      constructor(options = {}, config = Configuration.getGlobalConfig()) {
          ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
-         const { requestHandler, requestHandlerTimeoutSecs = 60, navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
+         const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
          // BasicCrawler
-         autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
+         autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
          super({
              ...basicCrawlerOptions,
-             requestHandler,
              autoscaledPoolOptions,
-             // We need to add some time for internal functions to finish,
-             // but not too much so that we would stall the crawler.
-             requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
+             contextPipelineBuilder: contextPipelineBuilder ??
+                 (() => this.buildContextPipeline()),
          }, config);
          this.config = config;
-         // FIXME any
-         this.requestHandler = requestHandler ?? this.router;
          // Cookies should be persisted per session only if session pool is used
          if (!this.useSessionPool && persistCookiesPerSession) {
              throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
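The removed timeout arithmetic is replaced by a `contextPipelineBuilder` option that defaults to `buildContextPipeline()` (defined in a later hunk). A hypothetical subclass sketch, assuming `compose` merges each step's return value into the crawling context the way the pipeline steps in this diff do:

```javascript
// Hypothetical subclass: reuse the default HTTP pipeline and append a
// step whose result is merged into the crawling context. The `fetchedAt`
// field is invented for illustration.
class TimestampingCrawler extends HttpCrawler {
    buildContextPipeline() {
        return super.buildContextPipeline().compose({
            action: async () => ({ fetchedAt: new Date() }),
        });
    }
}
```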
@@ -153,14 +141,12 @@ export class HttpCrawler extends BasicCrawler {
          if (suggestResponseEncoding && forceResponseEncoding) {
              this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
          }
-         this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
          this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
          this.ignoreSslErrors = ignoreSslErrors;
          this.suggestResponseEncoding = suggestResponseEncoding;
          this.forceResponseEncoding = forceResponseEncoding;
          this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
          this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
-         this.proxyConfiguration = proxyConfiguration;
          this.preNavigationHooks = preNavigationHooks;
          this.postNavigationHooks = [
              ({ request, response }) => this._abortDownloadOfBody(request, response),
@@ -173,102 +159,111 @@ export class HttpCrawler extends BasicCrawler {
              this.persistCookiesPerSession = false;
          }
      }
-     /**
-      * **EXPERIMENTAL**
-      * Function for attaching CrawlerExtensions such as the Unblockers.
-      * @param extension Crawler extension that overrides the crawler configuration.
-      */
-     use(extension) {
-         ow(extension, ow.object.instanceOf(CrawlerExtension));
-         const className = this.constructor.name;
-         const extensionOptions = extension.getCrawlerOptions();
-         for (const [key, value] of Object.entries(extensionOptions)) {
-             const isConfigurable = Object.hasOwn(this, key);
-             const originalType = typeof this[key];
-             const extensionType = typeof value; // What if we want to null something? It is really needed?
-             const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
-             const exists = this[key] != null;
-             if (!isConfigurable) {
-                 // Test if the property can be configured on the crawler
-                 throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
-             }
-             if (!isSameType && exists) {
-                 // Assuming that extensions will only add up configuration
-                 throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
-             }
-             this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
-             this[key] = value;
-         }
+     buildContextPipeline() {
+         return ContextPipeline.create()
+             .compose({
+                 action: this.makeHttpRequest.bind(this),
+             })
+             .compose({ action: this.processHttpResponse.bind(this) })
+             .compose({ action: this.handleBlockedRequestByContent.bind(this) });
      }
-     /**
-      * Wrapper around requestHandler that opens and closes pages etc.
-      */
-     async _runRequestHandler(crawlingContext) {
+     async makeHttpRequest(crawlingContext) {
          const { request, session } = crawlingContext;
-         if (this.proxyConfiguration) {
-             const sessionId = session ? session.id : undefined;
-             crawlingContext.proxyInfo = await this.proxyConfiguration.newProxyInfo(sessionId, { request });
-         }
-         if (!request.skipNavigation) {
-             await this._handleNavigation(crawlingContext);
-             tryCancel();
-             const parsed = await this._parseResponse(request, crawlingContext.response, crawlingContext);
-             const response = parsed.response;
-             const contentType = parsed.contentType;
-             tryCancel();
-             // `??=` because descendant classes may already set optimized version
-             crawlingContext.waitForSelector ??= async (selector, _timeoutMs) => {
-                 const $ = cheerio.load(parsed.body.toString());
-                 if ($(selector).get().length === 0) {
-                     throw new Error(`Selector '${selector}' not found.`);
-                 }
+         if (request.skipNavigation) {
+             return {
+                 request: new Proxy(request, {
+                     get(target, propertyName, receiver) {
+                         if (propertyName === 'loadedUrl') {
+                             throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
+                         }
+                         return Reflect.get(target, propertyName, receiver);
+                     },
+                 }),
+                 get response() {
+                     throw new Error('The `response` property is not available - `skipNavigation` was used');
+                 },
              };
-             crawlingContext.parseWithCheerio ??= async (selector, timeoutMs) => {
-                 const $ = cheerio.load(parsed.body.toString());
-                 if (selector) {
-                     await crawlingContext.waitForSelector(selector, timeoutMs);
-                 }
-                 return $;
+         }
+         const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+         request.state = RequestState.BEFORE_NAV;
+         // Execute pre navigation hooks before applying session pool cookies,
+         // as they may also set cookies in the session
+         await this._executeHooks(this.preNavigationHooks, crawlingContext);
+         tryCancel();
+         const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+         const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
+         const proxyUrl = crawlingContext.proxyInfo?.url;
+         const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
+         tryCancel();
+         request.loadedUrl = httpResponse?.url;
+         request.state = RequestState.AFTER_NAV;
+         return { request: request, response: httpResponse };
+     }
+     async processHttpResponse(crawlingContext) {
+         if (crawlingContext.request.skipNavigation) {
+             return {
+                 get contentType() {
+                     throw new Error('The `contentType` property is not available - `skipNavigation` was used');
+                 },
+                 get body() {
+                     throw new Error('The `body` property is not available - `skipNavigation` was used');
+                 },
+                 get json() {
+                     throw new Error('The `json` property is not available - `skipNavigation` was used');
+                 },
+                 get waitForSelector() {
+                     throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
+                 },
+                 get parseWithCheerio() {
+                     throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
+                 },
              };
-         if (this.useSessionPool) {
-             this._throwOnBlockedRequest(crawlingContext.session, response.statusCode);
-         }
-         if (this.persistCookiesPerSession) {
-             crawlingContext.session.setCookiesFromResponse(response);
+         }
+         await this._executeHooks(this.postNavigationHooks, crawlingContext);
+         tryCancel();
+         const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
+         tryCancel();
+         const response = parsed.response;
+         const contentType = parsed.contentType;
+         const waitForSelector = async (selector, _timeoutMs) => {
+             const $ = cheerio.load(parsed.body.toString());
+             if ($(selector).get().length === 0) {
+                 throw new Error(`Selector '${selector}' not found.`);
              }
-         request.loadedUrl = response.url;
-         if (!this.requestMatchesEnqueueStrategy(request)) {
-             this.log.debug(
-             // eslint-disable-next-line dot-notation
-             `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
-             request.noRetry = true;
-             request.state = RequestState.SKIPPED;
-             return;
+         };
+         const parseWithCheerio = async (selector, timeoutMs) => {
+             const $ = cheerio.load(parsed.body.toString());
+             if (selector) {
+                 await crawlingContext.waitForSelector(selector, timeoutMs);
              }
-         Object.assign(crawlingContext, parsed);
-         Object.defineProperty(crawlingContext, 'json', {
-             get() {
-                 if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
-                     return null;
-                 const jsonString = parsed.body.toString(contentType.encoding);
-                 return JSON.parse(jsonString);
-             },
-         });
+             return $;
+         };
+         if (this.useSessionPool) {
+             this._throwOnBlockedRequest(crawlingContext.session, response.status);
          }
+         if (this.persistCookiesPerSession) {
+             crawlingContext.session.setCookiesFromResponse(response);
+         }
+         return {
+             get json() {
+                 if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
+                     return null;
+                 const jsonString = parsed.body.toString(contentType.encoding);
+                 return JSON.parse(jsonString);
+             },
+             waitForSelector,
+             parseWithCheerio,
+             contentType,
+             body: parsed.body,
+         };
+     }
+     async handleBlockedRequestByContent(crawlingContext) {
          if (this.retryOnBlocked) {
              const error = await this.isRequestBlocked(crawlingContext);
              if (error)
                  throw new SessionError(error);
          }
-         request.state = RequestState.REQUEST_HANDLER;
-         try {
-             await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
-             request.state = RequestState.DONE;
-         }
-         catch (e) {
-             request.state = RequestState.ERROR;
-             throw e;
-         }
+         return {};
      }
      async isRequestBlocked(crawlingContext) {
          if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
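The `skipNavigation` guards above are plain JavaScript: a `Proxy` `get` trap for `request` and throwing getters for everything derived from the response. The same technique in isolation (standalone sketch, not from the package):

```javascript
// A skipped request behind the same Proxy guard: ordinary properties pass
// through, while `loadedUrl` throws instead of silently being undefined.
const request = { url: 'https://example.com', skipNavigation: true };
const guarded = new Proxy(request, {
    get(target, propertyName, receiver) {
        if (propertyName === 'loadedUrl') {
            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
        }
        return Reflect.get(target, propertyName, receiver);
    },
});
console.log(guarded.url); // 'https://example.com'
// guarded.loadedUrl -> throws
```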
@@ -278,84 +273,44 @@ export class HttpCrawler extends BasicCrawler {
                  return `Found selectors: ${foundSelectors.join(', ')}`;
              }
          }
+         const blockedStatusCodes =
+             // eslint-disable-next-line dot-notation
+             (this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0
+                 ? // eslint-disable-next-line dot-notation
+                   this.sessionPool['blockedStatusCodes']
+                 : BLOCKED_STATUS_CODES;
+         if (blockedStatusCodes.includes(crawlingContext.response.status)) {
+             return `Blocked by status code ${crawlingContext.response.status}`;
+         }
          return false;
      }
-     async _handleNavigation(crawlingContext) {
-         const gotOptions = {};
-         const { request, session } = crawlingContext;
-         const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-         request.state = RequestState.BEFORE_NAV;
-         // Execute pre navigation hooks before applying session pool cookies,
-         // as they may also set cookies in the session
-         await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
-         tryCancel();
-         const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-         this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
-         const proxyUrl = crawlingContext.proxyInfo?.url;
-         crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
-         tryCancel();
-         request.state = RequestState.AFTER_NAV;
-         await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
-         tryCancel();
-     }
      /**
-      * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
+      * Returns the `Cookie` header value based on the current context and
+      * any changes that occurred in the navigation hooks.
       */
-     _applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
+     _applyCookies({ session, request }, preHookCookies, postHookCookies) {
          const sessionCookie = session?.getCookieString(request.url) ?? '';
-         let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
-         if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
-             const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
-             this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
-             const sourceCookies = [];
-             if (Array.isArray(lowerCaseHeader)) {
-                 sourceCookies.push(...lowerCaseHeader);
-             }
-             else {
-                 sourceCookies.push(lowerCaseHeader);
-             }
-             if (Array.isArray(upperCaseHeader)) {
-                 sourceCookies.push(...upperCaseHeader);
-             }
-             else {
-                 sourceCookies.push(upperCaseHeader);
-             }
-             alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
-         }
-         const sourceCookies = [sessionCookie, preHookCookies];
-         if (Array.isArray(alteredGotOptionsCookies)) {
-             sourceCookies.push(...alteredGotOptionsCookies);
-         }
-         else {
-             sourceCookies.push(alteredGotOptionsCookies);
-         }
-         sourceCookies.push(postHookCookies);
-         const mergedCookie = mergeCookies(request.url, sourceCookies);
-         gotOptions.headers ??= {};
-         Reflect.deleteProperty(gotOptions.headers, 'Cookie');
-         Reflect.deleteProperty(gotOptions.headers, 'cookie');
-         if (mergedCookie !== '') {
-             gotOptions.headers.Cookie = mergedCookie;
-         }
+         const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
+         return mergeCookies(request.url, sourceCookies);
      }
      /**
       * Function to make the HTTP request. It performs optimizations
       * on the request such as only downloading the request body if the
       * received content type matches text/html, application/xml, application/xhtml+xml.
       */
-     async _requestFunction({ request, session, proxyUrl, gotOptions, }) {
+     async _requestFunction({ request, session, proxyUrl, cookieString, }) {
          if (!TimeoutError) {
              // @ts-ignore
              ({ TimeoutError } = await import('got-scraping'));
          }
-         const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
+         const opts = this._getRequestOptions(request, session, proxyUrl);
          try {
-             return await this._requestAsBrowser(opts, session);
+             return await this._requestAsBrowser(opts, session, cookieString);
          }
          catch (e) {
              if (e instanceof TimeoutError) {
                  this._handleRequestTimeout(session);
-                 return undefined;
+                 return new Response(); // this will never happen, as _handleRequestTimeout always throws
              }
              if (this.isProxyError(e)) {
                  throw new SessionError(this._getMessageFromError(e));
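With `gotOptions` cookies out of the picture, `_applyCookies` reduces to one `mergeCookies` call over the session cookie plus the pre- and post-hook snapshots. A sketch with invented values, assuming later sources override earlier ones on a name collision:

```javascript
import { mergeCookies } from '@crawlee/basic';

// The three sources merged above, with hypothetical values.
const sessionCookie = 'sid=abc; theme=light';
const preHookCookies = 'theme=dark';
const postHookCookies = 'consent=1';
const cookieString = mergeCookies('https://example.com', [
    sessionCookie,
    preHookCookies,
    postHookCookies,
]);
// Expected result under that assumption: 'sid=abc; theme=dark; consent=1'
```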
@@ -368,18 +323,18 @@ export class HttpCrawler extends BasicCrawler {
      /**
       * Encodes and parses response according to the provided content type
       */
-     async _parseResponse(request, responseStream, crawlingContext) {
-         const { statusCode } = responseStream;
-         const { type, charset } = parseContentTypeFromResponse(responseStream);
-         const { response, encoding } = this._encodeResponse(request, responseStream, charset);
+     async _parseResponse(request, response) {
+         const { status } = response;
+         const { type, charset } = parseContentTypeFromResponse(response);
+         const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
          const contentType = { type, encoding };
-         if (statusCode >= 400 && statusCode <= 599) {
-             this.stats.registerStatusCode(statusCode);
+         if (status >= 400 && status <= 599) {
+             this.stats.registerStatusCode(status);
          }
-         const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode);
-         const includeError = this.additionalHttpErrorStatusCodes.has(statusCode);
-         if ((statusCode >= 500 && !excludeError) || includeError) {
-             const body = await readStreamToString(response, encoding);
+         const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
+         const includeError = this.additionalHttpErrorStatusCodes.has(status);
+         if ((status >= 500 && !excludeError) || includeError) {
+             const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
              // Errors are often sent as JSON, so attempt to parse them,
              // despite Accept header being set to text/html.
              if (type === APPLICATION_JSON_MIME_TYPE) {
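The TODO above is worth spelling out: per the Fetch specification, `Response.prototype.text()` always decodes the body as UTF-8, so any other charset must be re-encoded beforehand; that is exactly what the rewritten `_encodeResponse` (two hunks down) does. A small standalone demonstration with an invented ISO-8859-2 payload:

```javascript
import iconv from 'iconv-lite';

// `text()` assumes UTF-8, so ISO-8859-2 bytes come out garbled...
const latin2 = iconv.encode('Žluťoučký kůň', 'ISO-8859-2');
const garbled = await new Response(latin2).text();
// ...while decoding with the right charset recovers the original string.
const correct = iconv.decode(latin2, 'ISO-8859-2'); // 'Žluťoučký kůň'
```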
@@ -387,59 +342,47 @@ export class HttpCrawler extends BasicCrawler {
                  let { message } = errorResponse;
                  if (!message)
                      message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
-                 throw new Error(`${statusCode} - ${message}`);
+                 throw new Error(`${status} - ${message}`);
              }
              if (includeError) {
-                 throw new Error(`${statusCode} - Error status code was set by user.`);
+                 throw new Error(`${status} - Error status code was set by user.`);
              }
              // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
-             throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
+             throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
          }
          else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
-             const isXml = type.includes('xml');
-             const parsed = await this._parseHTML(response, isXml, crawlingContext);
-             return { ...parsed, isXml, response, contentType };
+             return { response, contentType, body: await response.text() };
          }
          else {
-             const body = await concatStreamToBuffer(response);
+             const body = Buffer.from(await response.bytes());
              return {
                  body,
                  response,
                  contentType,
-                 enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
              };
          }
      }
-     async _parseHTML(response, _isXml, _crawlingContext) {
-         return {
-             body: await concatStreamToBuffer(response),
-         };
-     }
      /**
       * Combines the provided `requestOptions` with mandatory (non-overridable) values.
       */
-     _getRequestOptions(request, session, proxyUrl, gotOptions) {
+     _getRequestOptions(request, session, proxyUrl) {
          const requestOptions = {
              url: request.url,
              method: request.method,
              proxyUrl,
-             timeout: { request: this.navigationTimeoutMillis },
+             timeout: this.navigationTimeoutMillis,
+             cookieJar: this.persistCookiesPerSession ? session?.cookieJar : undefined,
              sessionToken: session,
-             ...gotOptions,
-             headers: { ...request.headers, ...gotOptions?.headers },
+             headers: request.headers,
              https: {
-                 ...gotOptions?.https,
                  rejectUnauthorized: !this.ignoreSslErrors,
              },
-             isStream: true,
+             body: undefined,
          };
          // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
          Reflect.deleteProperty(requestOptions.headers, 'cookie');
-         // TODO this is incorrect, the check for man in the middle needs to be done
-         // on individual proxy level, not on the `proxyConfiguration` level,
-         // because users can use normal + MITM proxies in a single configuration.
          // Disable SSL verification for MITM proxies
-         if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
+         if (session?.proxyInfo?.ignoreTlsErrors) {
              requestOptions.https = {
                  ...requestOptions.https,
                  rejectUnauthorized: false,
@@ -468,13 +411,13 @@ export class HttpCrawler extends BasicCrawler {
          if (iconv.encodingExists(encoding)) {
              const encodeStream = iconv.encodeStream(utf8);
              const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
-             response.on('error', (err) => decodeStream.emit('error', err));
-             const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
-             encodedResponse.statusCode = response.statusCode;
-             encodedResponse.headers = response.headers;
-             encodedResponse.url = response.url;
+             const reencodedBody = response.body
+                 ? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
+                     .pipe(decodeStream)
+                     .pipe(encodeStream)))
+                 : null;
              return {
-                 response: encodedResponse,
+                 response: new ResponseWithUrl(reencodedBody, response),
                  encoding: utf8,
              };
          }
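The rewritten `_encodeResponse` bridges the web-standard body stream through Node's iconv transform streams and back. The same round trip in isolation (standalone sketch; the windows-1250 payload is invented):

```javascript
import { Readable } from 'node:stream';
import iconv from 'iconv-lite';

// web ReadableStream -> Node stream -> decode/encode -> web ReadableStream
const bytes = iconv.encode('Žluťoučký kůň', 'win1250');
const webBody = Readable.toWeb(Readable.from([bytes]));
const decodeStream = iconv.decodeStream('win1250');
const encodeStream = iconv.encodeStream('utf8');
const reencoded = Readable.toWeb(
    Readable.from(Readable.fromWeb(webBody).pipe(decodeStream).pipe(encodeStream)),
);
// `reencoded` now yields UTF-8 bytes, like `reencodedBody` above.
```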
@@ -503,15 +446,15 @@ export class HttpCrawler extends BasicCrawler {
       */
      _handleRequestTimeout(session) {
          session?.markBad();
-         throw new Error(`request timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`);
+         throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
      }
      _abortDownloadOfBody(request, response) {
-         const { statusCode } = response;
+         const { status } = response;
          const { type } = parseContentTypeFromResponse(response);
          // eslint-disable-next-line dot-notation -- accessing private property
          const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
          // if we retry the request, can the Content-Type change?
-         const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
+         const isTransientContentType = status >= 500 || blockedStatusCodes.includes(status);
          if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
              request.noRetry = true;
              throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
@@ -521,89 +464,37 @@ export class HttpCrawler extends BasicCrawler {
      /**
       * @internal wraps public utility for mocking purposes
       */
-     _requestAsBrowser = async (options, session) => {
-         const response = await this.httpClient.stream(processHttpRequestOptions({
+     _requestAsBrowser = async (options, session, cookieString) => {
+         const opts = processHttpRequestOptions({
              ...options,
-             cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
+             cookieJar: options.cookieJar,
              responseType: 'text',
-         }), (redirectResponse, updatedRequest) => {
-             if (this.persistCookiesPerSession) {
-                 session.setCookiesFromResponse(redirectResponse);
-                 const cookieString = session.getCookieString(updatedRequest.url.toString());
-                 if (cookieString !== '') {
-                     updatedRequest.headers.Cookie = cookieString;
-                 }
-             }
          });
-         return addResponsePropertiesToStream(response.stream, response);
-     };
- }
- /**
-  * The stream object returned from got does not have the below properties.
-  * At the same time, you can't read data directly from the response stream,
-  * because they won't get emitted unless you also read from the primary
-  * got stream. To be able to work with only one stream, we move the expected props
-  * from the response stream to the got stream.
-  * @internal
-  */
- function addResponsePropertiesToStream(stream, response) {
-     const properties = [
-         'statusCode',
-         'statusMessage',
-         'headers',
-         'complete',
-         'httpVersion',
-         'rawHeaders',
-         'rawTrailers',
-         'trailers',
-         'url',
-         'request',
-     ];
-     stream.on('end', () => {
-         // @ts-expect-error
-         if (stream.rawTrailers)
-             stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
-         // @ts-expect-error
-         if (stream.trailers)
-             stream.trailers = response.trailers;
-         // @ts-expect-error
-         stream.complete = response.complete;
-     });
-     for (const prop of properties) {
-         if (!(prop in stream)) {
-             stream[prop] = response[prop];
+         if (cookieString) {
+             opts.headers?.delete('Cookie');
+             opts.headers?.delete('cookie');
+             opts.headers?.set('Cookie', cookieString);
          }
-     }
-     return stream;
- }
- /**
-  * Gets parsed content type from response object
-  * @param response HTTP response object
-  */
- function parseContentTypeFromResponse(response) {
-     ow(response, ow.object.partialShape({
-         url: ow.string.url,
-         headers: new ObjectPredicate(),
-     }));
-     const { url, headers } = response;
-     let parsedContentType;
-     if (headers['content-type']) {
-         try {
-             parsedContentType = contentTypeParser.parse(headers['content-type']);
-         }
-         catch {
-             // Can not parse content type from Content-Type header. Try to parse it from file extension.
-         }
-     }
-     // Parse content type from file extension as fallback
-     if (!parsedContentType) {
-         const parsedUrl = new URL(url);
-         const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
-         parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
-     }
-     return {
-         type: parsedContentType.type,
-         charset: parsedContentType.parameters.charset,
+         const response = await this.httpClient.stream(new Request(opts.url, {
+             body: opts.body ? Readable.toWeb(opts.body) : undefined,
+             headers: new Headers(opts.headers),
+             method: opts.method,
+             // Node-specific option to make the request body work with streams
+             duplex: 'half',
+         }), {
+             session,
+             timeout: opts.timeout,
+             onRedirect: (redirectResponse, updatedRequest) => {
+                 if (this.persistCookiesPerSession) {
+                     session.setCookiesFromResponse(redirectResponse);
+                     const cookieStringRedirected = session.getCookieString(updatedRequest.url.toString());
+                     if (cookieStringRedirected !== '') {
+                         updatedRequest.headers.set('Cookie', cookieStringRedirected);
+                     }
+                 }
+             },
+         });
+         return response;
      };
  }
  /**
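One detail in the rewritten `_requestAsBrowser` deserves a call-out: Node's fetch-style `Request` rejects a streamed body unless `duplex: 'half'` is set, which is why the option appears above. A minimal standalone sketch (URL and payload are placeholders):

```javascript
import { Readable } from 'node:stream';

// A streaming request body requires `duplex: 'half'` in Node.
const body = Readable.toWeb(Readable.from(['chunk-1', 'chunk-2']));
const request = new Request('https://example.com/upload', {
    method: 'POST',
    headers: new Headers({ 'content-type': 'text/plain' }),
    body,
    duplex: 'half',
});
```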