@crawlee/http 4.0.0-beta.4 → 4.0.0-beta.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,14 @@
1
- import { extname } from 'node:path';
1
+ import { Readable } from 'node:stream';
2
2
  import util from 'node:util';
3
- import { BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, BasicCrawler, Configuration, CrawlerExtension, mergeCookies, processHttpRequestOptions, RequestState, Router, SessionError, validators, } from '@crawlee/basic';
3
+ import { BasicCrawler, ContextPipeline, mergeCookies, RequestState, Router, SessionError } from '@crawlee/basic';
4
+ import { ResponseWithUrl } from '@crawlee/http-client';
4
5
  import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
5
6
  import * as cheerio from 'cheerio';
6
7
  import contentTypeParser from 'content-type';
7
8
  import iconv from 'iconv-lite';
8
- import mime from 'mime-types';
9
- import ow, { ObjectPredicate } from 'ow';
9
+ import ow from 'ow';
10
10
  import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
11
- import { concatStreamToBuffer, readStreamToString } from '@apify/utilities';
12
- let TimeoutError;
11
+ import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
13
12
  /**
14
13
  * Default mime types, which HttpScraper supports.
15
14
  */
@@ -46,18 +45,18 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
46
45
  *
47
46
  * The crawler finishes when there are no more {@link Request} objects to crawl.
48
47
  *
49
- * We can use the `preNavigationHooks` to adjust `gotOptions`:
48
+ * We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
50
49
  *
51
50
  * ```javascript
52
51
  * preNavigationHooks: [
53
- * (crawlingContext, gotOptions) => {
52
+ * (crawlingContext) => {
54
53
  * // ...
55
54
  * },
56
55
  * ]
57
56
  * ```
58
57
  *
59
- * By default, this crawler only processes web pages with the `text/html`
60
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
58
+ * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
59
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
61
60
  * and skips pages with other content types. If you want the crawler to process other content types,
62
61
  * use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
63
62
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
@@ -93,13 +92,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
93
92
  * @category Crawlers
94
93
  */
95
94
  export class HttpCrawler extends BasicCrawler {
96
- config;
97
- /**
98
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
99
- * Only available if used by the crawler.
100
- */
101
- proxyConfiguration;
102
- userRequestHandlerTimeoutMillis;
103
95
  preNavigationHooks;
104
96
  postNavigationHooks;
105
97
  persistCookiesPerSession;
@@ -107,8 +99,6 @@ export class HttpCrawler extends BasicCrawler {
107
99
  ignoreSslErrors;
108
100
  suggestResponseEncoding;
109
101
  forceResponseEncoding;
110
- additionalHttpErrorStatusCodes;
111
- ignoreHttpErrorStatusCodes;
112
102
  supportedMimeTypes;
113
103
  static optionsShape = {
114
104
  ...BasicCrawler.optionsShape,
@@ -117,158 +107,144 @@ export class HttpCrawler extends BasicCrawler {
117
107
  additionalMimeTypes: ow.optional.array.ofType(ow.string),
118
108
  suggestResponseEncoding: ow.optional.string,
119
109
  forceResponseEncoding: ow.optional.string,
120
- proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
121
110
  persistCookiesPerSession: ow.optional.boolean,
122
- additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
123
- ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
124
111
  preNavigationHooks: ow.optional.array,
125
112
  postNavigationHooks: ow.optional.array,
126
113
  };
127
114
  /**
128
115
  * All `HttpCrawlerOptions` parameters are passed via an options object.
129
116
  */
130
- constructor(options = {}, config = Configuration.getGlobalConfig()) {
117
+ constructor(options = {}) {
131
118
  ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
132
- const { requestHandler, requestHandlerTimeoutSecs = 60, navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
119
+ const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession = true, preNavigationHooks = [], postNavigationHooks = [],
133
120
  // BasicCrawler
134
- autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
121
+ autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
135
122
  super({
136
123
  ...basicCrawlerOptions,
137
- requestHandler,
138
124
  autoscaledPoolOptions,
139
- // We need to add some time for internal functions to finish,
140
- // but not too much so that we would stall the crawler.
141
- requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
142
- }, config);
143
- this.config = config;
144
- // FIXME any
145
- this.requestHandler = requestHandler ?? this.router;
146
- // Cookies should be persisted per session only if session pool is used
147
- if (!this.useSessionPool && persistCookiesPerSession) {
148
- throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
149
- }
125
+ contextPipelineBuilder: contextPipelineBuilder ??
126
+ (() => this.buildContextPipeline()),
127
+ });
150
128
  this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
151
129
  if (additionalMimeTypes.length)
152
130
  this._extendSupportedMimeTypes(additionalMimeTypes);
153
131
  if (suggestResponseEncoding && forceResponseEncoding) {
154
132
  this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
155
133
  }
156
- this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
157
134
  this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
158
135
  this.ignoreSslErrors = ignoreSslErrors;
159
136
  this.suggestResponseEncoding = suggestResponseEncoding;
160
137
  this.forceResponseEncoding = forceResponseEncoding;
161
- this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
162
- this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
163
- this.proxyConfiguration = proxyConfiguration;
164
138
  this.preNavigationHooks = preNavigationHooks;
165
139
  this.postNavigationHooks = [
166
140
  ({ request, response }) => this._abortDownloadOfBody(request, response),
167
141
  ...postNavigationHooks,
168
142
  ];
169
- if (this.useSessionPool) {
170
- this.persistCookiesPerSession = persistCookiesPerSession ?? true;
171
- }
172
- else {
173
- this.persistCookiesPerSession = false;
174
- }
143
+ this.persistCookiesPerSession = persistCookiesPerSession;
175
144
  }
176
- /**
177
- * **EXPERIMENTAL**
178
- * Function for attaching CrawlerExtensions such as the Unblockers.
179
- * @param extension Crawler extension that overrides the crawler configuration.
180
- */
181
- use(extension) {
182
- ow(extension, ow.object.instanceOf(CrawlerExtension));
183
- const className = this.constructor.name;
184
- const extensionOptions = extension.getCrawlerOptions();
185
- for (const [key, value] of Object.entries(extensionOptions)) {
186
- const isConfigurable = Object.hasOwn(this, key);
187
- const originalType = typeof this[key];
188
- const extensionType = typeof value; // What if we want to null something? It is really needed?
189
- const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
190
- const exists = this[key] != null;
191
- if (!isConfigurable) {
192
- // Test if the property can be configured on the crawler
193
- throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
194
- }
195
- if (!isSameType && exists) {
196
- // Assuming that extensions will only add up configuration
197
- throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
198
- }
199
- this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
200
- this[key] = value;
201
- }
145
+ buildContextPipeline() {
146
+ return ContextPipeline.create()
147
+ .compose({
148
+ action: this.makeHttpRequest.bind(this),
149
+ })
150
+ .compose({ action: this.processHttpResponse.bind(this) })
151
+ .compose({ action: this.handleBlockedRequestByContent.bind(this) });
202
152
  }
203
- /**
204
- * Wrapper around requestHandler that opens and closes pages etc.
205
- */
206
- async _runRequestHandler(crawlingContext) {
153
+ async makeHttpRequest(crawlingContext) {
207
154
  const { request, session } = crawlingContext;
208
- if (this.proxyConfiguration) {
209
- const sessionId = session ? session.id : undefined;
210
- crawlingContext.proxyInfo = await this.proxyConfiguration.newProxyInfo(sessionId, { request });
211
- }
212
- if (!request.skipNavigation) {
213
- await this._handleNavigation(crawlingContext);
214
- tryCancel();
215
- const parsed = await this._parseResponse(request, crawlingContext.response, crawlingContext);
216
- const response = parsed.response;
217
- const contentType = parsed.contentType;
218
- tryCancel();
219
- // `??=` because descendant classes may already set optimized version
220
- crawlingContext.waitForSelector ??= async (selector, _timeoutMs) => {
221
- const $ = cheerio.load(parsed.body.toString());
222
- if ($(selector).get().length === 0) {
223
- throw new Error(`Selector '${selector}' not found.`);
224
- }
155
+ if (request.skipNavigation) {
156
+ return {
157
+ request: new Proxy(request, {
158
+ get(target, propertyName, receiver) {
159
+ if (propertyName === 'loadedUrl') {
160
+ throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
161
+ }
162
+ return Reflect.get(target, propertyName, receiver);
163
+ },
164
+ }),
165
+ get response() {
166
+ throw new Error('The `response` property is not available - `skipNavigation` was used');
167
+ },
225
168
  };
226
- crawlingContext.parseWithCheerio ??= async (selector, timeoutMs) => {
227
- const $ = cheerio.load(parsed.body.toString());
228
- if (selector) {
229
- await crawlingContext.waitForSelector(selector, timeoutMs);
230
- }
231
- return $;
169
+ }
170
+ const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
171
+ request.state = RequestState.BEFORE_NAV;
172
+ // Execute pre navigation hooks before applying session pool cookies,
173
+ // as they may also set cookies in the session
174
+ await this._executeHooks(this.preNavigationHooks, crawlingContext);
175
+ tryCancel();
176
+ const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
177
+ const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
178
+ const proxyUrl = crawlingContext.proxyInfo?.url;
179
+ const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
180
+ tryCancel();
181
+ request.loadedUrl = httpResponse?.url;
182
+ request.state = RequestState.AFTER_NAV;
183
+ return { request: request, response: httpResponse };
184
+ }
185
+ async processHttpResponse(crawlingContext) {
186
+ if (crawlingContext.request.skipNavigation) {
187
+ return {
188
+ get contentType() {
189
+ throw new Error('The `contentType` property is not available - `skipNavigation` was used');
190
+ },
191
+ get body() {
192
+ throw new Error('The `body` property is not available - `skipNavigation` was used');
193
+ },
194
+ get json() {
195
+ throw new Error('The `json` property is not available - `skipNavigation` was used');
196
+ },
197
+ get waitForSelector() {
198
+ throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
199
+ },
200
+ get parseWithCheerio() {
201
+ throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
202
+ },
232
203
  };
233
- if (this.useSessionPool) {
234
- this._throwOnBlockedRequest(crawlingContext.session, response.statusCode);
235
- }
236
- if (this.persistCookiesPerSession) {
237
- crawlingContext.session.setCookiesFromResponse(response);
204
+ }
205
+ await this._executeHooks(this.postNavigationHooks, crawlingContext);
206
+ tryCancel();
207
+ const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
208
+ tryCancel();
209
+ const response = parsed.response;
210
+ const contentType = parsed.contentType;
211
+ const waitForSelector = async (selector, _timeoutMs) => {
212
+ const $ = cheerio.load(parsed.body.toString());
213
+ if ($(selector).get().length === 0) {
214
+ throw new Error(`Selector '${selector}' not found.`);
238
215
  }
239
- request.loadedUrl = response.url;
240
- if (!this.requestMatchesEnqueueStrategy(request)) {
241
- this.log.debug(
242
- // eslint-disable-next-line dot-notation
243
- `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
244
- request.noRetry = true;
245
- request.state = RequestState.SKIPPED;
246
- return;
216
+ };
217
+ const parseWithCheerio = async (selector, timeoutMs) => {
218
+ const $ = cheerio.load(parsed.body.toString());
219
+ if (selector) {
220
+ await crawlingContext.waitForSelector(selector, timeoutMs);
247
221
  }
248
- Object.assign(crawlingContext, parsed);
249
- Object.defineProperty(crawlingContext, 'json', {
250
- get() {
251
- if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
252
- return null;
253
- const jsonString = parsed.body.toString(contentType.encoding);
254
- return JSON.parse(jsonString);
255
- },
256
- });
222
+ return $;
223
+ };
224
+ this._throwOnBlockedRequest(response.status);
225
+ if (this.persistCookiesPerSession) {
226
+ crawlingContext.session.setCookiesFromResponse(response);
257
227
  }
228
+ return {
229
+ get json() {
230
+ if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
231
+ return null;
232
+ const jsonString = parsed.body.toString(contentType.encoding);
233
+ return JSON.parse(jsonString);
234
+ },
235
+ waitForSelector,
236
+ parseWithCheerio,
237
+ contentType,
238
+ body: parsed.body,
239
+ };
240
+ }
241
+ async handleBlockedRequestByContent(crawlingContext) {
258
242
  if (this.retryOnBlocked) {
259
243
  const error = await this.isRequestBlocked(crawlingContext);
260
244
  if (error)
261
245
  throw new SessionError(error);
262
246
  }
263
- request.state = RequestState.REQUEST_HANDLER;
264
- try {
265
- await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
266
- request.state = RequestState.DONE;
267
- }
268
- catch (e) {
269
- request.state = RequestState.ERROR;
270
- throw e;
271
- }
247
+ return {};
272
248
  }
273
249
  async isRequestBlocked(crawlingContext) {
274
250
  if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
@@ -278,84 +254,34 @@ export class HttpCrawler extends BasicCrawler {
278
254
  return `Found selectors: ${foundSelectors.join(', ')}`;
279
255
  }
280
256
  }
257
+ if (this.blockedStatusCodes.has(crawlingContext.response.status)) {
258
+ return `Blocked by status code ${crawlingContext.response.status}`;
259
+ }
281
260
  return false;
282
261
  }
283
- async _handleNavigation(crawlingContext) {
284
- const gotOptions = {};
285
- const { request, session } = crawlingContext;
286
- const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
287
- request.state = RequestState.BEFORE_NAV;
288
- // Execute pre navigation hooks before applying session pool cookies,
289
- // as they may also set cookies in the session
290
- await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
291
- tryCancel();
292
- const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
293
- this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
294
- const proxyUrl = crawlingContext.proxyInfo?.url;
295
- crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
296
- tryCancel();
297
- request.state = RequestState.AFTER_NAV;
298
- await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
299
- tryCancel();
300
- }
301
262
  /**
302
- * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
263
+ * Returns the `Cookie` header value based on the current context and
264
+ * any changes that occurred in the navigation hooks.
303
265
  */
304
- _applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
305
- const sessionCookie = session?.getCookieString(request.url) ?? '';
306
- let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
307
- if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
308
- const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
309
- this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
310
- const sourceCookies = [];
311
- if (Array.isArray(lowerCaseHeader)) {
312
- sourceCookies.push(...lowerCaseHeader);
313
- }
314
- else {
315
- sourceCookies.push(lowerCaseHeader);
316
- }
317
- if (Array.isArray(upperCaseHeader)) {
318
- sourceCookies.push(...upperCaseHeader);
319
- }
320
- else {
321
- sourceCookies.push(upperCaseHeader);
322
- }
323
- alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
324
- }
325
- const sourceCookies = [sessionCookie, preHookCookies];
326
- if (Array.isArray(alteredGotOptionsCookies)) {
327
- sourceCookies.push(...alteredGotOptionsCookies);
328
- }
329
- else {
330
- sourceCookies.push(alteredGotOptionsCookies);
331
- }
332
- sourceCookies.push(postHookCookies);
333
- const mergedCookie = mergeCookies(request.url, sourceCookies);
334
- gotOptions.headers ??= {};
335
- Reflect.deleteProperty(gotOptions.headers, 'Cookie');
336
- Reflect.deleteProperty(gotOptions.headers, 'cookie');
337
- if (mergedCookie !== '') {
338
- gotOptions.headers.Cookie = mergedCookie;
339
- }
266
+ _applyCookies({ session, request }, preHookCookies, postHookCookies) {
267
+ const sessionCookie = session.getCookieString(request.url);
268
+ const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
269
+ return mergeCookies(request.url, sourceCookies);
340
270
  }
341
271
  /**
342
272
  * Function to make the HTTP request. It performs optimizations
343
273
  * on the request such as only downloading the request body if the
344
274
  * received content type matches text/html, application/xml, application/xhtml+xml.
345
275
  */
346
- async _requestFunction({ request, session, proxyUrl, gotOptions, }) {
347
- if (!TimeoutError) {
348
- // @ts-ignore
349
- ({ TimeoutError } = await import('got-scraping'));
350
- }
351
- const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
276
+ async _requestFunction({ request, session, proxyUrl, cookieString, }) {
277
+ const opts = this._getRequestOptions(request, session, proxyUrl);
352
278
  try {
353
- return await this._requestAsBrowser(opts, session);
279
+ return await this._requestAsBrowser(opts, session, cookieString);
354
280
  }
355
281
  catch (e) {
356
- if (e instanceof TimeoutError) {
282
+ if (e instanceof Error && e.constructor.name === 'TimeoutError') {
357
283
  this._handleRequestTimeout(session);
358
- return undefined;
284
+ return new Response(); // this will never happen, as _handleRequestTimeout always throws
359
285
  }
360
286
  if (this.isProxyError(e)) {
361
287
  throw new SessionError(this._getMessageFromError(e));
@@ -368,18 +294,16 @@ export class HttpCrawler extends BasicCrawler {
368
294
  /**
369
295
  * Encodes and parses response according to the provided content type
370
296
  */
371
- async _parseResponse(request, responseStream, crawlingContext) {
372
- const { statusCode } = responseStream;
373
- const { type, charset } = parseContentTypeFromResponse(responseStream);
374
- const { response, encoding } = this._encodeResponse(request, responseStream, charset);
297
+ async _parseResponse(request, response) {
298
+ const { status } = response;
299
+ const { type, charset } = parseContentTypeFromResponse(response);
300
+ const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
375
301
  const contentType = { type, encoding };
376
- if (statusCode >= 400 && statusCode <= 599) {
377
- this.stats.registerStatusCode(statusCode);
302
+ if (status >= 400 && status <= 599) {
303
+ this.stats.registerStatusCode(status);
378
304
  }
379
- const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode);
380
- const includeError = this.additionalHttpErrorStatusCodes.has(statusCode);
381
- if ((statusCode >= 500 && !excludeError) || includeError) {
382
- const body = await readStreamToString(response, encoding);
305
+ if (this.isErrorStatusCode(status)) {
306
+ const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
383
307
  // Errors are often sent as JSON, so attempt to parse them,
384
308
  // despite Accept header being set to text/html.
385
309
  if (type === APPLICATION_JSON_MIME_TYPE) {
@@ -387,59 +311,47 @@ export class HttpCrawler extends BasicCrawler {
387
311
  let { message } = errorResponse;
388
312
  if (!message)
389
313
  message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
390
- throw new Error(`${statusCode} - ${message}`);
314
+ throw new Error(`${status} - ${message}`);
391
315
  }
392
- if (includeError) {
393
- throw new Error(`${statusCode} - Error status code was set by user.`);
316
+ if (this.additionalHttpErrorStatusCodes.has(status)) {
317
+ throw new Error(`${status} - Error status code was set by user.`);
394
318
  }
395
319
  // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
396
- throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
320
+ throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
397
321
  }
398
322
  else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
399
- const isXml = type.includes('xml');
400
- const parsed = await this._parseHTML(response, isXml, crawlingContext);
401
- return { ...parsed, isXml, response, contentType };
323
+ return { response, contentType, body: await reencodedResponse.text() };
402
324
  }
403
325
  else {
404
- const body = await concatStreamToBuffer(response);
326
+ const body = Buffer.from(await reencodedResponse.bytes());
405
327
  return {
406
328
  body,
407
329
  response,
408
330
  contentType,
409
- enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
410
331
  };
411
332
  }
412
333
  }
413
- async _parseHTML(response, _isXml, _crawlingContext) {
414
- return {
415
- body: await concatStreamToBuffer(response),
416
- };
417
- }
418
334
  /**
419
335
  * Combines the provided `requestOptions` with mandatory (non-overridable) values.
420
336
  */
421
- _getRequestOptions(request, session, proxyUrl, gotOptions) {
337
+ _getRequestOptions(request, session, proxyUrl) {
422
338
  const requestOptions = {
423
339
  url: request.url,
424
340
  method: request.method,
425
341
  proxyUrl,
426
- timeout: { request: this.navigationTimeoutMillis },
342
+ timeout: this.navigationTimeoutMillis,
343
+ cookieJar: this.persistCookiesPerSession ? session.cookieJar : undefined,
427
344
  sessionToken: session,
428
- ...gotOptions,
429
- headers: { ...request.headers, ...gotOptions?.headers },
345
+ headers: request.headers,
430
346
  https: {
431
- ...gotOptions?.https,
432
347
  rejectUnauthorized: !this.ignoreSslErrors,
433
348
  },
434
- isStream: true,
349
+ body: undefined,
435
350
  };
436
351
  // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
437
352
  Reflect.deleteProperty(requestOptions.headers, 'cookie');
438
- // TODO this is incorrect, the check for man in the middle needs to be done
439
- // on individual proxy level, not on the `proxyConfiguration` level,
440
- // because users can use normal + MITM proxies in a single configuration.
441
353
  // Disable SSL verification for MITM proxies
442
- if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
354
+ if (session.proxyInfo?.ignoreTlsErrors) {
443
355
  requestOptions.https = {
444
356
  ...requestOptions.https,
445
357
  rejectUnauthorized: false,
@@ -468,13 +380,13 @@ export class HttpCrawler extends BasicCrawler {
468
380
  if (iconv.encodingExists(encoding)) {
469
381
  const encodeStream = iconv.encodeStream(utf8);
470
382
  const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
471
- response.on('error', (err) => decodeStream.emit('error', err));
472
- const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
473
- encodedResponse.statusCode = response.statusCode;
474
- encodedResponse.headers = response.headers;
475
- encodedResponse.url = response.url;
383
+ const reencodedBody = response.body
384
+ ? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
385
+ .pipe(decodeStream)
386
+ .pipe(encodeStream)))
387
+ : null;
476
388
  return {
477
- response: encodedResponse,
389
+ response: new ResponseWithUrl(reencodedBody, response),
478
390
  encoding: utf8,
479
391
  };
480
392
  }
@@ -502,16 +414,13 @@ export class HttpCrawler extends BasicCrawler {
502
414
  * Handles timeout request
503
415
  */
504
416
  _handleRequestTimeout(session) {
505
- session?.markBad();
506
- throw new Error(`request timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`);
417
+ session.markBad();
418
+ throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
507
419
  }
508
420
  _abortDownloadOfBody(request, response) {
509
- const { statusCode } = response;
421
+ const { status } = response;
510
422
  const { type } = parseContentTypeFromResponse(response);
511
- // eslint-disable-next-line dot-notation -- accessing private property
512
- const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
513
- // if we retry the request, can the Content-Type change?
514
- const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
423
+ const isTransientContentType = status >= 500 || this.blockedStatusCodes.has(status);
515
424
  if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
516
425
  request.noRetry = true;
517
426
  throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
@@ -521,89 +430,28 @@ export class HttpCrawler extends BasicCrawler {
521
430
  /**
522
431
  * @internal wraps public utility for mocking purposes
523
432
  */
524
- _requestAsBrowser = async (options, session) => {
525
- const response = await this.httpClient.stream(processHttpRequestOptions({
433
+ _requestAsBrowser = async (options, session, cookieString) => {
434
+ const opts = processHttpRequestOptions({
526
435
  ...options,
527
- cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
436
+ cookieJar: options.cookieJar,
528
437
  responseType: 'text',
529
- }), (redirectResponse, updatedRequest) => {
530
- if (this.persistCookiesPerSession) {
531
- session.setCookiesFromResponse(redirectResponse);
532
- const cookieString = session.getCookieString(updatedRequest.url.toString());
533
- if (cookieString !== '') {
534
- updatedRequest.headers.Cookie = cookieString;
535
- }
536
- }
537
438
  });
538
- return addResponsePropertiesToStream(response.stream, response);
539
- };
540
- }
541
- /**
542
- * The stream object returned from got does not have the below properties.
543
- * At the same time, you can't read data directly from the response stream,
544
- * because they won't get emitted unless you also read from the primary
545
- * got stream. To be able to work with only one stream, we move the expected props
546
- * from the response stream to the got stream.
547
- * @internal
548
- */
549
- function addResponsePropertiesToStream(stream, response) {
550
- const properties = [
551
- 'statusCode',
552
- 'statusMessage',
553
- 'headers',
554
- 'complete',
555
- 'httpVersion',
556
- 'rawHeaders',
557
- 'rawTrailers',
558
- 'trailers',
559
- 'url',
560
- 'request',
561
- ];
562
- stream.on('end', () => {
563
- // @ts-expect-error
564
- if (stream.rawTrailers)
565
- stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
566
- // @ts-expect-error
567
- if (stream.trailers)
568
- stream.trailers = response.trailers;
569
- // @ts-expect-error
570
- stream.complete = response.complete;
571
- });
572
- for (const prop of properties) {
573
- if (!(prop in stream)) {
574
- stream[prop] = response[prop];
439
+ if (cookieString) {
440
+ opts.headers?.delete('Cookie');
441
+ opts.headers?.delete('cookie');
442
+ opts.headers?.set('Cookie', cookieString);
575
443
  }
576
- }
577
- return stream;
578
- }
579
- /**
580
- * Gets parsed content type from response object
581
- * @param response HTTP response object
582
- */
583
- function parseContentTypeFromResponse(response) {
584
- ow(response, ow.object.partialShape({
585
- url: ow.string.url,
586
- headers: new ObjectPredicate(),
587
- }));
588
- const { url, headers } = response;
589
- let parsedContentType;
590
- if (headers['content-type']) {
591
- try {
592
- parsedContentType = contentTypeParser.parse(headers['content-type']);
593
- }
594
- catch {
595
- // Can not parse content type from Content-Type header. Try to parse it from file extension.
596
- }
597
- }
598
- // Parse content type from file extension as fallback
599
- if (!parsedContentType) {
600
- const parsedUrl = new URL(url);
601
- const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
602
- parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
603
- }
604
- return {
605
- type: parsedContentType.type,
606
- charset: parsedContentType.parameters.charset,
444
+ const response = await this.httpClient.sendRequest(new Request(opts.url, {
445
+ body: opts.body ? Readable.toWeb(opts.body) : undefined,
446
+ headers: new Headers(opts.headers),
447
+ method: opts.method,
448
+ // Node-specific option to make the request body work with streams
449
+ duplex: 'half',
450
+ }), {
451
+ session,
452
+ timeoutMillis: opts.timeout,
453
+ });
454
+ return response;
607
455
  };
608
456
  }
609
457
  /**