@crawlee/http 4.0.0-beta.5 → 4.0.0-beta.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,14 @@
1
- import { extname } from 'node:path';
1
+ import { Readable } from 'node:stream';
2
2
  import util from 'node:util';
3
- import { BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, BasicCrawler, Configuration, CrawlerExtension, mergeCookies, processHttpRequestOptions, RequestState, Router, SessionError, validators, } from '@crawlee/basic';
3
+ import { BasicCrawler, ContextPipeline, NavigationSkippedError, RequestState, Router, SessionError, } from '@crawlee/basic';
4
+ import { ResponseWithUrl } from '@crawlee/http-client';
4
5
  import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
5
6
  import * as cheerio from 'cheerio';
6
7
  import contentTypeParser from 'content-type';
7
8
  import iconv from 'iconv-lite';
8
- import mime from 'mime-types';
9
- import ow, { ObjectPredicate } from 'ow';
9
+ import ow from 'ow';
10
10
  import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
11
- import { concatStreamToBuffer, readStreamToString } from '@apify/utilities';
12
- let TimeoutError;
11
+ import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
13
12
  /**
14
13
  * Default mime types, which HttpScraper supports.
15
14
  */
@@ -46,18 +45,18 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
46
45
  *
47
46
  * The crawler finishes when there are no more {@link Request} objects to crawl.
48
47
  *
49
- * We can use the `preNavigationHooks` to adjust `gotOptions`:
48
+ * We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
50
49
  *
51
50
  * ```javascript
52
51
  * preNavigationHooks: [
53
- * (crawlingContext, gotOptions) => {
52
+ * (crawlingContext) => {
54
53
  * // ...
55
54
  * },
56
55
  * ]
57
56
  * ```
58
57
  *
59
- * By default, this crawler only processes web pages with the `text/html`
60
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
58
+ * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
59
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
61
60
  * and skips pages with other content types. If you want the crawler to process other content types,
62
61
  * use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
63
62
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
@@ -93,13 +92,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
93
92
  * @category Crawlers
94
93
  */
95
94
  export class HttpCrawler extends BasicCrawler {
96
- config;
97
- /**
98
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
99
- * Only available if used by the crawler.
100
- */
101
- proxyConfiguration;
102
- userRequestHandlerTimeoutMillis;
103
95
  preNavigationHooks;
104
96
  postNavigationHooks;
105
97
  persistCookiesPerSession;
@@ -107,8 +99,6 @@ export class HttpCrawler extends BasicCrawler {
107
99
  ignoreSslErrors;
108
100
  suggestResponseEncoding;
109
101
  forceResponseEncoding;
110
- additionalHttpErrorStatusCodes;
111
- ignoreHttpErrorStatusCodes;
112
102
  supportedMimeTypes;
113
103
  static optionsShape = {
114
104
  ...BasicCrawler.optionsShape,
@@ -117,157 +107,139 @@ export class HttpCrawler extends BasicCrawler {
117
107
  additionalMimeTypes: ow.optional.array.ofType(ow.string),
118
108
  suggestResponseEncoding: ow.optional.string,
119
109
  forceResponseEncoding: ow.optional.string,
120
- proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
121
110
  persistCookiesPerSession: ow.optional.boolean,
122
- additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
123
- ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
124
111
  preNavigationHooks: ow.optional.array,
125
112
  postNavigationHooks: ow.optional.array,
126
113
  };
127
114
  /**
128
115
  * All `HttpCrawlerOptions` parameters are passed via an options object.
129
116
  */
130
- constructor(options = {}, config = Configuration.getGlobalConfig()) {
117
+ constructor(options = {}) {
131
118
  ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
132
- const { requestHandler, requestHandlerTimeoutSecs = 60, navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
119
+ const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession = true, preNavigationHooks = [], postNavigationHooks = [],
133
120
  // BasicCrawler
134
- autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
121
+ autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
135
122
  super({
136
123
  ...basicCrawlerOptions,
137
- requestHandler,
138
124
  autoscaledPoolOptions,
139
- // We need to add some time for internal functions to finish,
140
- // but not too much so that we would stall the crawler.
141
- requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
142
- }, config);
143
- this.config = config;
144
- this.requestHandler = requestHandler ?? this.router;
145
- // Cookies should be persisted per session only if session pool is used
146
- if (!this.useSessionPool && persistCookiesPerSession) {
147
- throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
148
- }
125
+ contextPipelineBuilder: contextPipelineBuilder ??
126
+ (() => this.buildContextPipeline()),
127
+ });
149
128
  this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
150
129
  if (additionalMimeTypes.length)
151
130
  this._extendSupportedMimeTypes(additionalMimeTypes);
152
131
  if (suggestResponseEncoding && forceResponseEncoding) {
153
132
  this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
154
133
  }
155
- this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
156
134
  this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
157
135
  this.ignoreSslErrors = ignoreSslErrors;
158
136
  this.suggestResponseEncoding = suggestResponseEncoding;
159
137
  this.forceResponseEncoding = forceResponseEncoding;
160
- this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
161
- this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
162
- this.proxyConfiguration = proxyConfiguration;
163
138
  this.preNavigationHooks = preNavigationHooks;
164
139
  this.postNavigationHooks = [
165
140
  ({ request, response }) => this._abortDownloadOfBody(request, response),
166
141
  ...postNavigationHooks,
167
142
  ];
168
- if (this.useSessionPool) {
169
- this.persistCookiesPerSession = persistCookiesPerSession ?? true;
170
- }
171
- else {
172
- this.persistCookiesPerSession = false;
173
- }
143
+ this.persistCookiesPerSession = persistCookiesPerSession;
174
144
  }
175
- /**
176
- * **EXPERIMENTAL**
177
- * Function for attaching CrawlerExtensions such as the Unblockers.
178
- * @param extension Crawler extension that overrides the crawler configuration.
179
- */
180
- use(extension) {
181
- ow(extension, ow.object.instanceOf(CrawlerExtension));
182
- const className = this.constructor.name;
183
- const extensionOptions = extension.getCrawlerOptions();
184
- for (const [key, value] of Object.entries(extensionOptions)) {
185
- const isConfigurable = Object.hasOwn(this, key);
186
- const originalType = typeof this[key];
187
- const extensionType = typeof value; // What if we want to null something? It is really needed?
188
- const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
189
- const exists = this[key] != null;
190
- if (!isConfigurable) {
191
- // Test if the property can be configured on the crawler
192
- throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
193
- }
194
- if (!isSameType && exists) {
195
- // Assuming that extensions will only add up configuration
196
- throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
197
- }
198
- this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
199
- this[key] = value;
200
- }
145
+ buildContextPipeline() {
146
+ return ContextPipeline.create()
147
+ .compose({
148
+ action: this.makeHttpRequest.bind(this),
149
+ })
150
+ .compose({ action: this.processHttpResponse.bind(this) })
151
+ .compose({ action: this.handleBlockedRequestByContent.bind(this) });
201
152
  }
202
- /**
203
- * Wrapper around requestHandler that opens and closes pages etc.
204
- */
205
- async _runRequestHandler(crawlingContext) {
153
+ async makeHttpRequest(crawlingContext) {
206
154
  const { request, session } = crawlingContext;
207
- if (this.proxyConfiguration) {
208
- const sessionId = session ? session.id : undefined;
209
- crawlingContext.proxyInfo = await this.proxyConfiguration.newProxyInfo(sessionId, { request });
210
- }
211
- if (!request.skipNavigation) {
212
- await this._handleNavigation(crawlingContext);
213
- tryCancel();
214
- const parsed = await this._parseResponse(request, crawlingContext.response, crawlingContext);
215
- const response = parsed.response;
216
- const contentType = parsed.contentType;
217
- tryCancel();
218
- // `??=` because descendant classes may already set optimized version
219
- crawlingContext.waitForSelector ??= async (selector, _timeoutMs) => {
220
- const $ = cheerio.load(parsed.body.toString());
221
- if ($(selector).get().length === 0) {
222
- throw new Error(`Selector '${selector}' not found.`);
223
- }
155
+ if (request.skipNavigation) {
156
+ return {
157
+ request: new Proxy(request, {
158
+ get(target, propertyName, receiver) {
159
+ if (propertyName === 'loadedUrl') {
160
+ throw new NavigationSkippedError('The `request.loadedUrl` property is not available - `skipNavigation` was used');
161
+ }
162
+ return Reflect.get(target, propertyName, receiver);
163
+ },
164
+ }),
165
+ get response() {
166
+ throw new NavigationSkippedError('The `response` property is not available - `skipNavigation` was used');
167
+ },
224
168
  };
225
- crawlingContext.parseWithCheerio ??= async (selector, timeoutMs) => {
226
- const $ = cheerio.load(parsed.body.toString());
227
- if (selector) {
228
- await crawlingContext.waitForSelector(selector, timeoutMs);
229
- }
230
- return $;
169
+ }
170
+ request.state = RequestState.BEFORE_NAV;
171
+ await this._executeHooks(this.preNavigationHooks, crawlingContext);
172
+ tryCancel();
173
+ const proxyUrl = crawlingContext.proxyInfo?.url;
174
+ const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
175
+ tryCancel();
176
+ request.loadedUrl = httpResponse?.url;
177
+ request.state = RequestState.AFTER_NAV;
178
+ return { request: request, response: httpResponse };
179
+ }
180
+ async processHttpResponse(crawlingContext) {
181
+ if (crawlingContext.request.skipNavigation) {
182
+ return {
183
+ get contentType() {
184
+ throw new NavigationSkippedError('The `contentType` property is not available - `skipNavigation` was used');
185
+ },
186
+ get body() {
187
+ throw new NavigationSkippedError('The `body` property is not available - `skipNavigation` was used');
188
+ },
189
+ get json() {
190
+ throw new NavigationSkippedError('The `json` property is not available - `skipNavigation` was used');
191
+ },
192
+ get waitForSelector() {
193
+ throw new NavigationSkippedError('The `waitForSelector` method is not available - `skipNavigation` was used');
194
+ },
195
+ get parseWithCheerio() {
196
+ throw new NavigationSkippedError('The `parseWithCheerio` method is not available - `skipNavigation` was used');
197
+ },
231
198
  };
232
- if (this.useSessionPool) {
233
- this._throwOnBlockedRequest(crawlingContext.session, response.statusCode);
234
- }
235
- if (this.persistCookiesPerSession) {
236
- crawlingContext.session.setCookiesFromResponse(response);
199
+ }
200
+ await this._executeHooks(this.postNavigationHooks, crawlingContext);
201
+ tryCancel();
202
+ const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
203
+ tryCancel();
204
+ const response = parsed.response;
205
+ const contentType = parsed.contentType;
206
+ const waitForSelector = async (selector, _timeoutMs) => {
207
+ const $ = cheerio.load(parsed.body.toString());
208
+ if ($(selector).get().length === 0) {
209
+ throw new Error(`Selector '${selector}' not found.`);
237
210
  }
238
- request.loadedUrl = response.url;
239
- if (!this.requestMatchesEnqueueStrategy(request)) {
240
- this.log.debug(
241
- // eslint-disable-next-line dot-notation
242
- `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
243
- request.noRetry = true;
244
- request.state = RequestState.SKIPPED;
245
- return;
211
+ };
212
+ const parseWithCheerio = async (selector, timeoutMs) => {
213
+ const $ = cheerio.load(parsed.body.toString());
214
+ if (selector) {
215
+ await crawlingContext.waitForSelector(selector, timeoutMs);
246
216
  }
247
- Object.assign(crawlingContext, parsed);
248
- Object.defineProperty(crawlingContext, 'json', {
249
- get() {
250
- if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
251
- return null;
252
- const jsonString = parsed.body.toString(contentType.encoding);
253
- return JSON.parse(jsonString);
254
- },
255
- });
217
+ return $;
218
+ };
219
+ this._throwOnBlockedRequest(response.status);
220
+ if (this.persistCookiesPerSession) {
221
+ crawlingContext.session.setCookiesFromResponse(response);
256
222
  }
223
+ return {
224
+ get json() {
225
+ if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
226
+ return null;
227
+ const jsonString = parsed.body.toString(contentType.encoding);
228
+ return JSON.parse(jsonString);
229
+ },
230
+ waitForSelector,
231
+ parseWithCheerio,
232
+ contentType,
233
+ body: parsed.body,
234
+ };
235
+ }
236
+ async handleBlockedRequestByContent(crawlingContext) {
257
237
  if (this.retryOnBlocked) {
258
238
  const error = await this.isRequestBlocked(crawlingContext);
259
239
  if (error)
260
240
  throw new SessionError(error);
261
241
  }
262
- request.state = RequestState.REQUEST_HANDLER;
263
- try {
264
- await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
265
- request.state = RequestState.DONE;
266
- }
267
- catch (e) {
268
- request.state = RequestState.ERROR;
269
- throw e;
270
- }
242
+ return {};
271
243
  }
272
244
  async isRequestBlocked(crawlingContext) {
273
245
  if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
@@ -277,84 +249,25 @@ export class HttpCrawler extends BasicCrawler {
277
249
  return `Found selectors: ${foundSelectors.join(', ')}`;
278
250
  }
279
251
  }
280
- return false;
281
- }
282
- async _handleNavigation(crawlingContext) {
283
- const gotOptions = {};
284
- const { request, session } = crawlingContext;
285
- const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
286
- request.state = RequestState.BEFORE_NAV;
287
- // Execute pre navigation hooks before applying session pool cookies,
288
- // as they may also set cookies in the session
289
- await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
290
- tryCancel();
291
- const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
292
- this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
293
- const proxyUrl = crawlingContext.proxyInfo?.url;
294
- crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
295
- tryCancel();
296
- request.state = RequestState.AFTER_NAV;
297
- await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
298
- tryCancel();
299
- }
300
- /**
301
- * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
302
- */
303
- _applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
304
- const sessionCookie = session?.getCookieString(request.url) ?? '';
305
- let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
306
- if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
307
- const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
308
- this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
309
- const sourceCookies = [];
310
- if (Array.isArray(lowerCaseHeader)) {
311
- sourceCookies.push(...lowerCaseHeader);
312
- }
313
- else {
314
- sourceCookies.push(lowerCaseHeader);
315
- }
316
- if (Array.isArray(upperCaseHeader)) {
317
- sourceCookies.push(...upperCaseHeader);
318
- }
319
- else {
320
- sourceCookies.push(upperCaseHeader);
321
- }
322
- alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
323
- }
324
- const sourceCookies = [sessionCookie, preHookCookies];
325
- if (Array.isArray(alteredGotOptionsCookies)) {
326
- sourceCookies.push(...alteredGotOptionsCookies);
327
- }
328
- else {
329
- sourceCookies.push(alteredGotOptionsCookies);
330
- }
331
- sourceCookies.push(postHookCookies);
332
- const mergedCookie = mergeCookies(request.url, sourceCookies);
333
- gotOptions.headers ??= {};
334
- Reflect.deleteProperty(gotOptions.headers, 'Cookie');
335
- Reflect.deleteProperty(gotOptions.headers, 'cookie');
336
- if (mergedCookie !== '') {
337
- gotOptions.headers.Cookie = mergedCookie;
252
+ if (this.blockedStatusCodes.has(crawlingContext.response.status)) {
253
+ return `Blocked by status code ${crawlingContext.response.status}`;
338
254
  }
255
+ return false;
339
256
  }
340
257
  /**
341
258
  * Function to make the HTTP request. It performs optimizations
342
259
  * on the request such as only downloading the request body if the
343
260
  * received content type matches text/html, application/xml, application/xhtml+xml.
344
261
  */
345
- async _requestFunction({ request, session, proxyUrl, gotOptions, }) {
346
- if (!TimeoutError) {
347
- // @ts-ignore
348
- ({ TimeoutError } = await import('got-scraping'));
349
- }
350
- const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
262
+ async _requestFunction({ request, session, proxyUrl }) {
263
+ const opts = this._getRequestOptions(request, session, proxyUrl);
351
264
  try {
352
265
  return await this._requestAsBrowser(opts, session);
353
266
  }
354
267
  catch (e) {
355
- if (e instanceof TimeoutError) {
268
+ if (e instanceof Error && e.constructor.name === 'TimeoutError') {
356
269
  this._handleRequestTimeout(session);
357
- return undefined;
270
+ return new Response(); // this will never happen, as _handleRequestTimeout always throws
358
271
  }
359
272
  if (this.isProxyError(e)) {
360
273
  throw new SessionError(this._getMessageFromError(e));
@@ -367,18 +280,16 @@ export class HttpCrawler extends BasicCrawler {
367
280
  /**
368
281
  * Encodes and parses response according to the provided content type
369
282
  */
370
- async _parseResponse(request, responseStream, crawlingContext) {
371
- const { statusCode } = responseStream;
372
- const { type, charset } = parseContentTypeFromResponse(responseStream);
373
- const { response, encoding } = this._encodeResponse(request, responseStream, charset);
283
+ async _parseResponse(request, response) {
284
+ const { status } = response;
285
+ const { type, charset } = parseContentTypeFromResponse(response);
286
+ const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
374
287
  const contentType = { type, encoding };
375
- if (statusCode >= 400 && statusCode <= 599) {
376
- this.stats.registerStatusCode(statusCode);
288
+ if (status >= 400 && status <= 599) {
289
+ this.stats.registerStatusCode(status);
377
290
  }
378
- const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode);
379
- const includeError = this.additionalHttpErrorStatusCodes.has(statusCode);
380
- if ((statusCode >= 500 && !excludeError) || includeError) {
381
- const body = await readStreamToString(response, encoding);
291
+ if (this.isErrorStatusCode(status)) {
292
+ const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
382
293
  // Errors are often sent as JSON, so attempt to parse them,
383
294
  // despite Accept header being set to text/html.
384
295
  if (type === APPLICATION_JSON_MIME_TYPE) {
@@ -386,59 +297,49 @@ export class HttpCrawler extends BasicCrawler {
386
297
  let { message } = errorResponse;
387
298
  if (!message)
388
299
  message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
389
- throw new Error(`${statusCode} - ${message}`);
300
+ throw new Error(`${status} - ${message}`);
390
301
  }
391
- if (includeError) {
392
- throw new Error(`${statusCode} - Error status code was set by user.`);
302
+ if (this.additionalHttpErrorStatusCodes.has(status)) {
303
+ throw new Error(`${status} - Error status code was set by user.`);
393
304
  }
394
305
  // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
395
- throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
306
+ throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
396
307
  }
397
308
  else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
398
- const isXml = type.includes('xml');
399
- const parsed = await this._parseHTML(response, isXml, crawlingContext);
400
- return { ...parsed, isXml, response, contentType };
309
+ return { response, contentType, body: await reencodedResponse.text() };
401
310
  }
402
311
  else {
403
- const body = await concatStreamToBuffer(response);
312
+ const body = Buffer.from(await reencodedResponse.bytes());
404
313
  return {
405
314
  body,
406
315
  response,
407
316
  contentType,
408
- enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
409
317
  };
410
318
  }
411
319
  }
412
- async _parseHTML(response, _isXml, _crawlingContext) {
413
- return {
414
- body: await concatStreamToBuffer(response),
415
- };
416
- }
417
320
  /**
418
321
  * Combines the provided `requestOptions` with mandatory (non-overridable) values.
419
322
  */
420
- _getRequestOptions(request, session, proxyUrl, gotOptions) {
323
+ _getRequestOptions(request, session, proxyUrl) {
421
324
  const requestOptions = {
422
325
  url: request.url,
423
326
  method: request.method,
424
327
  proxyUrl,
425
- timeout: { request: this.navigationTimeoutMillis },
328
+ timeout: this.navigationTimeoutMillis,
329
+ cookieJar: this.persistCookiesPerSession ? session.cookieJar : undefined,
426
330
  sessionToken: session,
427
- ...gotOptions,
428
- headers: { ...request.headers, ...gotOptions?.headers },
331
+ headers: request.headers,
429
332
  https: {
430
- ...gotOptions?.https,
431
333
  rejectUnauthorized: !this.ignoreSslErrors,
432
334
  },
433
- isStream: true,
335
+ body: undefined,
434
336
  };
435
- // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
436
- Reflect.deleteProperty(requestOptions.headers, 'cookie');
437
- // TODO this is incorrect, the check for man in the middle needs to be done
438
- // on individual proxy level, not on the `proxyConfiguration` level,
439
- // because users can use normal + MITM proxies in a single configuration.
337
+ if (requestOptions.headers?.cookie || requestOptions.headers?.Cookie) {
338
+ requestOptions.headers.Cookie = this._getCookieHeaderFromRequest(request);
339
+ delete requestOptions.headers.cookie;
340
+ }
440
341
  // Disable SSL verification for MITM proxies
441
- if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
342
+ if (session.proxyInfo?.ignoreTlsErrors) {
442
343
  requestOptions.https = {
443
344
  ...requestOptions.https,
444
345
  rejectUnauthorized: false,
@@ -466,14 +367,16 @@ export class HttpCrawler extends BasicCrawler {
466
367
  // Try to re-encode a variety of unsupported encodings to utf-8
467
368
  if (iconv.encodingExists(encoding)) {
468
369
  const encodeStream = iconv.encodeStream(utf8);
469
- const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
470
- response.on('error', (err) => decodeStream.emit('error', err));
471
- const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
472
- encodedResponse.statusCode = response.statusCode;
473
- encodedResponse.headers = response.headers;
474
- encodedResponse.url = response.url;
370
+ const decodeStream = iconv
371
+ .decodeStream(encoding)
372
+ .on('error', (err) => encodeStream.emit('error', err));
373
+ const reencodedBody = response.body
374
+ ? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
375
+ .pipe(decodeStream)
376
+ .pipe(encodeStream)))
377
+ : null;
475
378
  return {
476
- response: encodedResponse,
379
+ response: new ResponseWithUrl(reencodedBody, response),
477
380
  encoding: utf8,
478
381
  };
479
382
  }
@@ -501,16 +404,13 @@ export class HttpCrawler extends BasicCrawler {
501
404
  * Handles timeout request
502
405
  */
503
406
  _handleRequestTimeout(session) {
504
- session?.markBad();
505
- throw new Error(`request timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`);
407
+ session.markBad();
408
+ throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
506
409
  }
507
410
  _abortDownloadOfBody(request, response) {
508
- const { statusCode } = response;
411
+ const { status } = response;
509
412
  const { type } = parseContentTypeFromResponse(response);
510
- // eslint-disable-next-line dot-notation -- accessing private property
511
- const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
512
- // if we retry the request, can the Content-Type change?
513
- const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
413
+ const isTransientContentType = status >= 500 || this.blockedStatusCodes.has(status);
514
414
  if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
515
415
  request.noRetry = true;
516
416
  throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
@@ -521,88 +421,22 @@ export class HttpCrawler extends BasicCrawler {
521
421
  * @internal wraps public utility for mocking purposes
522
422
  */
523
423
  _requestAsBrowser = async (options, session) => {
524
- const response = await this.httpClient.stream(processHttpRequestOptions({
424
+ const opts = processHttpRequestOptions({
525
425
  ...options,
526
- cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
426
+ cookieJar: options.cookieJar,
527
427
  responseType: 'text',
528
- }), (redirectResponse, updatedRequest) => {
529
- if (this.persistCookiesPerSession) {
530
- session.setCookiesFromResponse(redirectResponse);
531
- const cookieString = session.getCookieString(updatedRequest.url.toString());
532
- if (cookieString !== '') {
533
- updatedRequest.headers.Cookie = cookieString;
534
- }
535
- }
536
428
  });
537
- return addResponsePropertiesToStream(response.stream, response);
538
- };
539
- }
540
- /**
541
- * The stream object returned from got does not have the below properties.
542
- * At the same time, you can't read data directly from the response stream,
543
- * because they won't get emitted unless you also read from the primary
544
- * got stream. To be able to work with only one stream, we move the expected props
545
- * from the response stream to the got stream.
546
- * @internal
547
- */
548
- function addResponsePropertiesToStream(stream, response) {
549
- const properties = [
550
- 'statusCode',
551
- 'statusMessage',
552
- 'headers',
553
- 'complete',
554
- 'httpVersion',
555
- 'rawHeaders',
556
- 'rawTrailers',
557
- 'trailers',
558
- 'url',
559
- 'request',
560
- ];
561
- stream.on('end', () => {
562
- // @ts-expect-error
563
- if (stream.rawTrailers)
564
- stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
565
- // @ts-expect-error
566
- if (stream.trailers)
567
- stream.trailers = response.trailers;
568
- // @ts-expect-error
569
- stream.complete = response.complete;
570
- });
571
- for (const prop of properties) {
572
- if (!(prop in stream)) {
573
- stream[prop] = response[prop];
574
- }
575
- }
576
- return stream;
577
- }
578
- /**
579
- * Gets parsed content type from response object
580
- * @param response HTTP response object
581
- */
582
- function parseContentTypeFromResponse(response) {
583
- ow(response, ow.object.partialShape({
584
- url: ow.string.url,
585
- headers: new ObjectPredicate(),
586
- }));
587
- const { url, headers } = response;
588
- let parsedContentType;
589
- if (headers['content-type']) {
590
- try {
591
- parsedContentType = contentTypeParser.parse(headers['content-type']);
592
- }
593
- catch {
594
- // Can not parse content type from Content-Type header. Try to parse it from file extension.
595
- }
596
- }
597
- // Parse content type from file extension as fallback
598
- if (!parsedContentType) {
599
- const parsedUrl = new URL(url);
600
- const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
601
- parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
602
- }
603
- return {
604
- type: parsedContentType.type,
605
- charset: parsedContentType.parameters.charset,
429
+ const response = await this.httpClient.sendRequest(new Request(opts.url, {
430
+ body: opts.body ? Readable.toWeb(opts.body) : undefined,
431
+ headers: new Headers(opts.headers),
432
+ method: opts.method,
433
+ // Node-specific option to make the request body work with streams
434
+ duplex: 'half',
435
+ }), {
436
+ session,
437
+ timeoutMillis: opts.timeout,
438
+ });
439
+ return response;
606
440
  };
607
441
  }
608
442
  /**