npm - @crawlee/http - Versions diffs - 4.0.0-beta.4 → 4.0.0-beta.40 - Mend

@crawlee/http 4.0.0-beta.4 → 4.0.0-beta.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +9 -5
package/internals/file-download.d.ts +58 -32
package/internals/file-download.d.ts.map +1 -1
package/internals/file-download.js +116 -73
package/internals/file-download.js.map +1 -1
package/internals/http-crawler.d.ts +92 -175
package/internals/http-crawler.d.ts.map +1 -1
package/internals/http-crawler.js +169 -321
package/internals/http-crawler.js.map +1 -1
package/internals/utils.d.ts +14 -0
package/internals/utils.d.ts.map +1 -0
package/internals/utils.js +71 -0
package/internals/utils.js.map +1 -0
package/package.json +7 -7
package/tsconfig.build.tsbuildinfo +0 -1

package/internals/http-crawler.js (changed)

@@ -1,15 +1,14 @@
-import { extname } from 'node:path';
+import { Readable } from 'node:stream';
 import util from 'node:util';
-import { BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, BasicCrawler, Configuration, CrawlerExtension, mergeCookies, processHttpRequestOptions, RequestState, Router, SessionError, validators, } from '@crawlee/basic';
+import { BasicCrawler, ContextPipeline, mergeCookies, RequestState, Router, SessionError } from '@crawlee/basic';
+import { ResponseWithUrl } from '@crawlee/http-client';
 import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
 import * as cheerio from 'cheerio';
 import contentTypeParser from 'content-type';
 import iconv from 'iconv-lite';
-import mime from 'mime-types';
-import ow, { ObjectPredicate } from 'ow';
+import ow from 'ow';
 import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
-import { concatStreamToBuffer, readStreamToString } from '@apify/utilities';
-let TimeoutError;
+import { parseContentTypeFromResponse, processHttpRequestOptions } from './utils.js';
 /**
  * Default mime types, which HttpScraper supports.
  */
@@ -46,18 +45,18 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
  *
  * The crawler finishes when there are no more {@link Request} objects to crawl.
  *
- * We can use the `preNavigationHooks` to adjust `gotOptions`:
+ * We can use the `preNavigationHooks` to adjust the crawling context before the request is made:
  *
  * ```javascript
  * preNavigationHooks: [
- *     (crawlingContext, gotOptions) => {
+ *     (crawlingContext) => {
  *         // ...
  *     },
  * ]
  * ```
  *
- * By default, this crawler only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
+ * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
  * and skips pages with other content types. If you want the crawler to process other content types,
  * use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
@@ -93,13 +92,6 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
  * @category Crawlers
  */
 export class HttpCrawler extends BasicCrawler {
-    config;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration;
-    userRequestHandlerTimeoutMillis;
     preNavigationHooks;
     postNavigationHooks;
     persistCookiesPerSession;
@@ -107,8 +99,6 @@ export class HttpCrawler extends BasicCrawler {
     ignoreSslErrors;
     suggestResponseEncoding;
     forceResponseEncoding;
-    additionalHttpErrorStatusCodes;
-    ignoreHttpErrorStatusCodes;
     supportedMimeTypes;
     static optionsShape = {
         ...BasicCrawler.optionsShape,
@@ -117,158 +107,144 @@ export class HttpCrawler extends BasicCrawler {
         additionalMimeTypes: ow.optional.array.ofType(ow.string),
         suggestResponseEncoding: ow.optional.string,
         forceResponseEncoding: ow.optional.string,
-        proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
         persistCookiesPerSession: ow.optional.boolean,
-        additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
-        ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
         preNavigationHooks: ow.optional.array,
         postNavigationHooks: ow.optional.array,
     };
     /**
      * All `HttpCrawlerOptions` parameters are passed via an options object.
      */
-    constructor(options = {}, config = Configuration.getGlobalConfig()) {
+    constructor(options = {}) {
         ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape));
-        const { requestHandler, requestHandlerTimeoutSecs = 60, navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [], additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
+        const { navigationTimeoutSecs = 30, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, persistCookiesPerSession = true, preNavigationHooks = [], postNavigationHooks = [],
         // BasicCrawler
-        autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
+        autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, contextPipelineBuilder, ...basicCrawlerOptions } = options;
         super({
             ...basicCrawlerOptions,
-            requestHandler,
             autoscaledPoolOptions,
-            // We need to add some time for internal functions to finish,
-            // but not too much so that we would stall the crawler.
-            requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
-        }, config);
-        this.config = config;
-        // FIXME any
-        this.requestHandler = requestHandler ?? this.router;
-        // Cookies should be persisted per session only if session pool is used
-        if (!this.useSessionPool && persistCookiesPerSession) {
-            throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
-        }
+            contextPipelineBuilder: contextPipelineBuilder ??
+                (() => this.buildContextPipeline()),
+        });
         this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
         if (additionalMimeTypes.length)
             this._extendSupportedMimeTypes(additionalMimeTypes);
         if (suggestResponseEncoding && forceResponseEncoding) {
             this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
         }
-        this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
         this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
         this.ignoreSslErrors = ignoreSslErrors;
         this.suggestResponseEncoding = suggestResponseEncoding;
         this.forceResponseEncoding = forceResponseEncoding;
-        this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
-        this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
-        this.proxyConfiguration = proxyConfiguration;
         this.preNavigationHooks = preNavigationHooks;
         this.postNavigationHooks = [
             ({ request, response }) => this._abortDownloadOfBody(request, response),
             ...postNavigationHooks,
         ];
-        if (this.useSessionPool) {
-            this.persistCookiesPerSession = persistCookiesPerSession ?? true;
-        }
-        else {
-            this.persistCookiesPerSession = false;
-        }
+        this.persistCookiesPerSession = persistCookiesPerSession;
     }
-    /**
-     * **EXPERIMENTAL**
-     * Function for attaching CrawlerExtensions such as the Unblockers.
-     * @param extension Crawler extension that overrides the crawler configuration.
-     */
-    use(extension) {
-        ow(extension, ow.object.instanceOf(CrawlerExtension));
-        const className = this.constructor.name;
-        const extensionOptions = extension.getCrawlerOptions();
-        for (const [key, value] of Object.entries(extensionOptions)) {
-            const isConfigurable = Object.hasOwn(this, key);
-            const originalType = typeof this[key];
-            const extensionType = typeof value; // What if we want to null something? It is really needed?
-            const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
-            const exists = this[key] != null;
-            if (!isConfigurable) {
-                // Test if the property can be configured on the crawler
-                throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
-            }
-            if (!isSameType && exists) {
-                // Assuming that extensions will only add up configuration
-                throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
-            }
-            this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
-            this[key] = value;
-        }
+    buildContextPipeline() {
+        return ContextPipeline.create()
+            .compose({
+            action: this.makeHttpRequest.bind(this),
+        })
+            .compose({ action: this.processHttpResponse.bind(this) })
+            .compose({ action: this.handleBlockedRequestByContent.bind(this) });
     }
-    /**
-     * Wrapper around requestHandler that opens and closes pages etc.
-     */
-    async _runRequestHandler(crawlingContext) {
+    async makeHttpRequest(crawlingContext) {
         const { request, session } = crawlingContext;
-        if (this.proxyConfiguration) {
-            const sessionId = session ? session.id : undefined;
-            crawlingContext.proxyInfo = await this.proxyConfiguration.newProxyInfo(sessionId, { request });
-        }
-        if (!request.skipNavigation) {
-            await this._handleNavigation(crawlingContext);
-            tryCancel();
-            const parsed = await this._parseResponse(request, crawlingContext.response, crawlingContext);
-            const response = parsed.response;
-            const contentType = parsed.contentType;
-            tryCancel();
-            // `??=` because descendant classes may already set optimized version
-            crawlingContext.waitForSelector ??= async (selector, _timeoutMs) => {
-                const $ = cheerio.load(parsed.body.toString());
-                if ($(selector).get().length === 0) {
-                    throw new Error(`Selector '${selector}' not found.`);
-                }
+        if (request.skipNavigation) {
+            return {
+                request: new Proxy(request, {
+                    get(target, propertyName, receiver) {
+                        if (propertyName === 'loadedUrl') {
+                            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
+                        }
+                        return Reflect.get(target, propertyName, receiver);
+                    },
+                }),
+                get response() {
+                    throw new Error('The `response` property is not available - `skipNavigation` was used');
+                },
             };
-            crawlingContext.parseWithCheerio ??= async (selector, timeoutMs) => {
-                const $ = cheerio.load(parsed.body.toString());
-                if (selector) {
-                    await crawlingContext.waitForSelector(selector, timeoutMs);
-                }
-                return $;
+        }
+        const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+        request.state = RequestState.BEFORE_NAV;
+        // Execute pre navigation hooks before applying session pool cookies,
+        // as they may also set cookies in the session
+        await this._executeHooks(this.preNavigationHooks, crawlingContext);
+        tryCancel();
+        const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
+        const cookieString = this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
+        const proxyUrl = crawlingContext.proxyInfo?.url;
+        const httpResponse = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, cookieString }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
+        tryCancel();
+        request.loadedUrl = httpResponse?.url;
+        request.state = RequestState.AFTER_NAV;
+        return { request: request, response: httpResponse };
+    }
+    async processHttpResponse(crawlingContext) {
+        if (crawlingContext.request.skipNavigation) {
+            return {
+                get contentType() {
+                    throw new Error('The `contentType` property is not available - `skipNavigation` was used');
+                },
+                get body() {
+                    throw new Error('The `body` property is not available - `skipNavigation` was used');
+                },
+                get json() {
+                    throw new Error('The `json` property is not available - `skipNavigation` was used');
+                },
+                get waitForSelector() {
+                    throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
+                },
+                get parseWithCheerio() {
+                    throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
+                },
             };
-            if (this.useSessionPool) {
-                this._throwOnBlockedRequest(crawlingContext.session, response.statusCode);
-            }
-            if (this.persistCookiesPerSession) {
-                crawlingContext.session.setCookiesFromResponse(response);
+        }
+        await this._executeHooks(this.postNavigationHooks, crawlingContext);
+        tryCancel();
+        const parsed = await this._parseResponse(crawlingContext.request, crawlingContext.response);
+        tryCancel();
+        const response = parsed.response;
+        const contentType = parsed.contentType;
+        const waitForSelector = async (selector, _timeoutMs) => {
+            const $ = cheerio.load(parsed.body.toString());
+            if ($(selector).get().length === 0) {
+                throw new Error(`Selector '${selector}' not found.`);
             }
-            request.loadedUrl = response.url;
-            if (!this.requestMatchesEnqueueStrategy(request)) {
-                this.log.debug(
-                // eslint-disable-next-line dot-notation
-                `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
-                request.noRetry = true;
-                request.state = RequestState.SKIPPED;
-                return;
+        };
+        const parseWithCheerio = async (selector, timeoutMs) => {
+            const $ = cheerio.load(parsed.body.toString());
+            if (selector) {
+                await crawlingContext.waitForSelector(selector, timeoutMs);
             }
-            Object.assign(crawlingContext, parsed);
-            Object.defineProperty(crawlingContext, 'json', {
-                get() {
-                    if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
-                        return null;
-                    const jsonString = parsed.body.toString(contentType.encoding);
-                    return JSON.parse(jsonString);
-                },
-            });
+            return $;
+        };
+        this._throwOnBlockedRequest(response.status);
+        if (this.persistCookiesPerSession) {
+            crawlingContext.session.setCookiesFromResponse(response);
         }
+        return {
+            get json() {
+                if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
+                    return null;
+                const jsonString = parsed.body.toString(contentType.encoding);
+                return JSON.parse(jsonString);
+            },
+            waitForSelector,
+            parseWithCheerio,
+            contentType,
+            body: parsed.body,
+        };
+    }
+    async handleBlockedRequestByContent(crawlingContext) {
         if (this.retryOnBlocked) {
             const error = await this.isRequestBlocked(crawlingContext);
             if (error)
                 throw new SessionError(error);
         }
-        request.state = RequestState.REQUEST_HANDLER;
-        try {
-            await addTimeoutToPromise(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
-            request.state = RequestState.DONE;
-        }
-        catch (e) {
-            request.state = RequestState.ERROR;
-            throw e;
-        }
+        return {};
     }
     async isRequestBlocked(crawlingContext) {
         if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
@@ -278,84 +254,34 @@ export class HttpCrawler extends BasicCrawler {
                 return `Found selectors: ${foundSelectors.join(', ')}`;
             }
         }
+        if (this.blockedStatusCodes.has(crawlingContext.response.status)) {
+            return `Blocked by status code ${crawlingContext.response.status}`;
+        }
         return false;
     }
-    async _handleNavigation(crawlingContext) {
-        const gotOptions = {};
-        const { request, session } = crawlingContext;
-        const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-        request.state = RequestState.BEFORE_NAV;
-        // Execute pre navigation hooks before applying session pool cookies,
-        // as they may also set cookies in the session
-        await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
-        tryCancel();
-        const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
-        this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
-        const proxyUrl = crawlingContext.proxyInfo?.url;
-        crawlingContext.response = await addTimeoutToPromise(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
-        tryCancel();
-        request.state = RequestState.AFTER_NAV;
-        await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
-        tryCancel();
-    }
     /**
-     * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
+     * Returns the `Cookie` header value based on the current context and
+     * any changes that occurred in the navigation hooks.
      */
-    _applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
-        const sessionCookie = session?.getCookieString(request.url) ?? '';
-        let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
-        if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
-            const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
-            this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
-            const sourceCookies = [];
-            if (Array.isArray(lowerCaseHeader)) {
-                sourceCookies.push(...lowerCaseHeader);
-            }
-            else {
-                sourceCookies.push(lowerCaseHeader);
-            }
-            if (Array.isArray(upperCaseHeader)) {
-                sourceCookies.push(...upperCaseHeader);
-            }
-            else {
-                sourceCookies.push(upperCaseHeader);
-            }
-            alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
-        }
-        const sourceCookies = [sessionCookie, preHookCookies];
-        if (Array.isArray(alteredGotOptionsCookies)) {
-            sourceCookies.push(...alteredGotOptionsCookies);
-        }
-        else {
-            sourceCookies.push(alteredGotOptionsCookies);
-        }
-        sourceCookies.push(postHookCookies);
-        const mergedCookie = mergeCookies(request.url, sourceCookies);
-        gotOptions.headers ??= {};
-        Reflect.deleteProperty(gotOptions.headers, 'Cookie');
-        Reflect.deleteProperty(gotOptions.headers, 'cookie');
-        if (mergedCookie !== '') {
-            gotOptions.headers.Cookie = mergedCookie;
-        }
+    _applyCookies({ session, request }, preHookCookies, postHookCookies) {
+        const sessionCookie = session.getCookieString(request.url);
+        const sourceCookies = [sessionCookie, preHookCookies, postHookCookies];
+        return mergeCookies(request.url, sourceCookies);
     }
     /**
      * Function to make the HTTP request. It performs optimizations
      * on the request such as only downloading the request body if the
      * received content type matches text/html, application/xml, application/xhtml+xml.
      */
-    async _requestFunction({ request, session, proxyUrl, gotOptions, }) {
-        if (!TimeoutError) {
-            // @ts-ignore
-            ({ TimeoutError } = await import('got-scraping'));
-        }
-        const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
+    async _requestFunction({ request, session, proxyUrl, cookieString, }) {
+        const opts = this._getRequestOptions(request, session, proxyUrl);
         try {
-            return await this._requestAsBrowser(opts, session);
+            return await this._requestAsBrowser(opts, session, cookieString);
         }
         catch (e) {
-            if (e instanceof TimeoutError) {
+            if (e instanceof Error && e.constructor.name === 'TimeoutError') {
                 this._handleRequestTimeout(session);
-                return undefined;
+                return new Response(); // this will never happen, as _handleRequestTimeout always throws
             }
             if (this.isProxyError(e)) {
                 throw new SessionError(this._getMessageFromError(e));
@@ -368,18 +294,16 @@ export class HttpCrawler extends BasicCrawler {
     /**
      * Encodes and parses response according to the provided content type
      */
-    async _parseResponse(request, responseStream, crawlingContext) {
-        const { statusCode } = responseStream;
-        const { type, charset } = parseContentTypeFromResponse(responseStream);
-        const { response, encoding } = this._encodeResponse(request, responseStream, charset);
+    async _parseResponse(request, response) {
+        const { status } = response;
+        const { type, charset } = parseContentTypeFromResponse(response);
+        const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset);
         const contentType = { type, encoding };
-        if (statusCode >= 400 && statusCode <= 599) {
-            this.stats.registerStatusCode(statusCode);
+        if (status >= 400 && status <= 599) {
+            this.stats.registerStatusCode(status);
         }
-        const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode);
-        const includeError = this.additionalHttpErrorStatusCodes.has(statusCode);
-        if ((statusCode >= 500 && !excludeError) || includeError) {
-            const body = await readStreamToString(response, encoding);
+        if (this.isErrorStatusCode(status)) {
+            const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text)
             // Errors are often sent as JSON, so attempt to parse them,
             // despite Accept header being set to text/html.
             if (type === APPLICATION_JSON_MIME_TYPE) {
@@ -387,59 +311,47 @@ export class HttpCrawler extends BasicCrawler {
                 let { message } = errorResponse;
                 if (!message)
                     message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
-                throw new Error(`${statusCode} - ${message}`);
+                throw new Error(`${status} - ${message}`);
             }
-            if (includeError) {
-                throw new Error(`${statusCode} - Error status code was set by user.`);
+            if (this.additionalHttpErrorStatusCodes.has(status)) {
+                throw new Error(`${status} - Error status code was set by user.`);
             }
             // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
-            throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
+            throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`);
         }
         else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
-            const isXml = type.includes('xml');
-            const parsed = await this._parseHTML(response, isXml, crawlingContext);
-            return { ...parsed, isXml, response, contentType };
+            return { response, contentType, body: await reencodedResponse.text() };
         }
         else {
-            const body = await concatStreamToBuffer(response);
+            const body = Buffer.from(await reencodedResponse.bytes());
             return {
                 body,
                 response,
                 contentType,
-                enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
             };
         }
     }
-    async _parseHTML(response, _isXml, _crawlingContext) {
-        return {
-            body: await concatStreamToBuffer(response),
-        };
-    }
     /**
      * Combines the provided `requestOptions` with mandatory (non-overridable) values.
      */
-    _getRequestOptions(request, session, proxyUrl, gotOptions) {
+    _getRequestOptions(request, session, proxyUrl) {
         const requestOptions = {
             url: request.url,
             method: request.method,
             proxyUrl,
-            timeout: { request: this.navigationTimeoutMillis },
+            timeout: this.navigationTimeoutMillis,
+            cookieJar: this.persistCookiesPerSession ? session.cookieJar : undefined,
             sessionToken: session,
-            ...gotOptions,
-            headers: { ...request.headers, ...gotOptions?.headers },
+            headers: request.headers,
             https: {
-                ...gotOptions?.https,
                 rejectUnauthorized: !this.ignoreSslErrors,
             },
-            isStream: true,
+            body: undefined,
         };
         // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
         Reflect.deleteProperty(requestOptions.headers, 'cookie');
-        // TODO this is incorrect, the check for man in the middle needs to be done
-        //   on individual proxy level, not on the `proxyConfiguration` level,
-        //   because users can use normal + MITM proxies in a single configuration.
         // Disable SSL verification for MITM proxies
-        if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
+        if (session.proxyInfo?.ignoreTlsErrors) {
             requestOptions.https = {
                 ...requestOptions.https,
                 rejectUnauthorized: false,
@@ -468,13 +380,13 @@ export class HttpCrawler extends BasicCrawler {
         if (iconv.encodingExists(encoding)) {
             const encodeStream = iconv.encodeStream(utf8);
             const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
-            response.on('error', (err) => decodeStream.emit('error', err));
-            const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
-            encodedResponse.statusCode = response.statusCode;
-            encodedResponse.headers = response.headers;
-            encodedResponse.url = response.url;
+            const reencodedBody = response.body
+                ? Readable.toWeb(Readable.from(Readable.fromWeb(response.body)
+                    .pipe(decodeStream)
+                    .pipe(encodeStream)))
+                : null;
             return {
-                response: encodedResponse,
+                response: new ResponseWithUrl(reencodedBody, response),
                 encoding: utf8,
             };
         }
@@ -502,16 +414,13 @@ export class HttpCrawler extends BasicCrawler {
      * Handles timeout request
      */
     _handleRequestTimeout(session) {
-        session?.markBad();
-        throw new Error(`request timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`);
+        session.markBad();
+        throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
     }
     _abortDownloadOfBody(request, response) {
-        const { statusCode } = response;
+        const { status } = response;
         const { type } = parseContentTypeFromResponse(response);
-        // eslint-disable-next-line dot-notation -- accessing private property
-        const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
-        // if we retry the request, can the Content-Type change?
-        const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
+        const isTransientContentType = status >= 500 || this.blockedStatusCodes.has(status);
         if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
             request.noRetry = true;
             throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
@@ -521,89 +430,28 @@ export class HttpCrawler extends BasicCrawler {
     /**
      * @internal wraps public utility for mocking purposes
      */
-    _requestAsBrowser = async (options, session) => {
-        const response = await this.httpClient.stream(processHttpRequestOptions({
+    _requestAsBrowser = async (options, session, cookieString) => {
+        const opts = processHttpRequestOptions({
             ...options,
-            cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
+            cookieJar: options.cookieJar,
             responseType: 'text',
-        }), (redirectResponse, updatedRequest) => {
-            if (this.persistCookiesPerSession) {
-                session.setCookiesFromResponse(redirectResponse);
-                const cookieString = session.getCookieString(updatedRequest.url.toString());
-                if (cookieString !== '') {
-                    updatedRequest.headers.Cookie = cookieString;
-                }
-            }
         });
-        return addResponsePropertiesToStream(response.stream, response);
-    };
-}
-/**
- * The stream object returned from got does not have the below properties.
- * At the same time, you can't read data directly from the response stream,
- * because they won't get emitted unless you also read from the primary
- * got stream. To be able to work with only one stream, we move the expected props
- * from the response stream to the got stream.
- * @internal
- */
-function addResponsePropertiesToStream(stream, response) {
-    const properties = [
-        'statusCode',
-        'statusMessage',
-        'headers',
-        'complete',
-        'httpVersion',
-        'rawHeaders',
-        'rawTrailers',
-        'trailers',
-        'url',
-        'request',
-    ];
-    stream.on('end', () => {
-        // @ts-expect-error
-        if (stream.rawTrailers)
-            stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
-        // @ts-expect-error
-        if (stream.trailers)
-            stream.trailers = response.trailers;
-        // @ts-expect-error
-        stream.complete = response.complete;
-    });
-    for (const prop of properties) {
-        if (!(prop in stream)) {
-            stream[prop] = response[prop];
+        if (cookieString) {
+            opts.headers?.delete('Cookie');
+            opts.headers?.delete('cookie');
+            opts.headers?.set('Cookie', cookieString);
         }
-    }
-    return stream;
-}
-/**
- * Gets parsed content type from response object
- * @param response HTTP response object
- */
-function parseContentTypeFromResponse(response) {
-    ow(response, ow.object.partialShape({
-        url: ow.string.url,
-        headers: new ObjectPredicate(),
-    }));
-    const { url, headers } = response;
-    let parsedContentType;
-    if (headers['content-type']) {
-        try {
-            parsedContentType = contentTypeParser.parse(headers['content-type']);
-        }
-        catch {
-            // Can not parse content type from Content-Type header. Try to parse it from file extension.
-        }
-    }
-    // Parse content type from file extension as fallback
-    if (!parsedContentType) {
-        const parsedUrl = new URL(url);
-        const contentTypeFromExtname = mime.contentType(extname(parsedUrl.pathname)) || 'application/octet-stream; charset=utf-8'; // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
-        parsedContentType = contentTypeParser.parse(contentTypeFromExtname);
-    }
-    return {
-        type: parsedContentType.type,
-        charset: parsedContentType.parameters.charset,
+        const response = await this.httpClient.sendRequest(new Request(opts.url, {
+            body: opts.body ? Readable.toWeb(opts.body) : undefined,
+            headers: new Headers(opts.headers),
+            method: opts.method,
+            // Node-specific option to make the request body work with streams
+            duplex: 'half',
+        }), {
+            session,
+            timeoutMillis: opts.timeout,
+        });
+        return response;
     };
 }
 /**