@crawlee/browser 4.0.0-beta.2 → 4.0.0-beta.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,6 +9,10 @@
     <small>A web scraping and browser automation library</small>
 </h1>
 
+<p align=center>
+    <a href="https://trendshift.io/repositories/5179" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5179" alt="apify%2Fcrawlee | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
+
 <p align=center>
     <a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/v/@crawlee/core.svg" alt="NPM latest version" data-canonical-src="https://img.shields.io/npm/v/@crawlee/core/next.svg" style="max-width: 100%;"></a>
     <a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/dm/@crawlee/core.svg" alt="Downloads" data-canonical-src="https://img.shields.io/npm/dm/@crawlee/core.svg" style="max-width: 100%;"></a>
@@ -24,7 +28,7 @@ Crawlee is available as the [`crawlee`](https://www.npmjs.com/package/crawlee) N
 
 > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev)** 👈
 
-> Crawlee for Python is open for early adopters. 🐍 [👉 Checkout the source code 👈](https://github.com/apify/crawlee-python).
+> Do you prefer 🐍 Python instead of JavaScript? [👉 Checkout Crawlee for Python 👈](https://github.com/apify/crawlee-python).
 
 ## Installation
 
package/internals/browser-crawler.d.ts CHANGED
@@ -1,19 +1,38 @@
-import type { Awaitable, BasicCrawlerOptions, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedContext, ProxyConfiguration, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
-import { BasicCrawler, Configuration } from '@crawlee/basic';
+import type { Awaitable, BasicCrawlerOptions, BasicCrawlingContext, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedRequest, Request, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
+import { BasicCrawler, Configuration, ContextPipeline } from '@crawlee/basic';
 import type { BrowserController, BrowserPlugin, BrowserPoolHooks, BrowserPoolOptions, CommonPage, InferBrowserPluginArray, LaunchContext } from '@crawlee/browser-pool';
 import { BrowserPool } from '@crawlee/browser-pool';
+import type { BatchAddRequestsResult } from '@crawlee/types';
 import type { RobotsTxtFile } from '@crawlee/utils';
 import type { ReadonlyDeep } from 'type-fest';
 import type { BrowserLaunchContext } from './browser-launcher.js';
-export interface BrowserCrawlingContext<Crawler = unknown, Page extends CommonPage = CommonPage, Response = Dictionary, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<Crawler, UserData> {
+interface BaseResponse {
+    status(): number;
+}
+export interface BrowserCrawlingContext<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
+    /**
+     * An instance of the {@link BrowserController} that manages the browser instance and provides access to its API.
+     */
     browserController: ProvidedController;
+    /**
+     * The browser page object where the web page is loaded and rendered.
+     */
     page: Page;
-    response?: Response;
+    /**
+     * The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
+     */
+    request: LoadedRequest<Request<UserData>>;
+    /**
+     * The HTTP response object returned by the browser's navigation.
+     */
+    response: Response;
+    /**
+     * Helper function for extracting URLs from the current page and adding them to the request queue.
+     */
+    enqueueLinks: (options?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
 }
-export type BrowserRequestHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = RequestHandler<Context>;
-export type BrowserErrorHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = ErrorHandler<Context>;
 export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable<void>;
-export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext = BrowserCrawlingContext, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
+export interface BrowserCrawlerOptions<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions<Context, ExtendedContext>, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
     launchContext?: BrowserLaunchContext<any, any>;
     /**
      * Function that is called to process each request.
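The reworked context above is the heart of this beta: `response` and `request` (typed as `LoadedRequest`, so `loadedUrl` is guaranteed) are no longer optional, and `enqueueLinks` is declared on the context itself. As a rough sketch of what that means for handler code — assuming a concrete subclass such as `PlaywrightCrawler` from `@crawlee/playwright` wires this context up the way the 3.x releases did, which this diff does not itself show:

```ts
// Sketch only: the handler shape follows Crawlee 3.x conventions; the context
// typing is what the declaration above guarantees.
import { PlaywrightCrawler } from '@crawlee/playwright';

const crawler = new PlaywrightCrawler({
    async requestHandler({ request, response, page, enqueueLinks, log }) {
        // `response` is no longer `Response | undefined`, so no guard is
        // needed; `BaseResponse` only promises `status()`.
        log.info(`${request.loadedUrl} answered with HTTP ${response.status()}`);

        // `enqueueLinks` is now part of the context type and resolves to a
        // BatchAddRequestsResult from @crawlee/types.
        const { processedRequests } = await enqueueLinks({ selector: 'a' });
        log.info(`Enqueued ${processedRequests.length} links from "${await page.title()}"`);
    },
});

await crawler.run(['https://crawlee.dev']);
```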
@@ -42,7 +61,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * The exceptions are logged to the request using the
      * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
-    requestHandler?: BrowserRequestHandler<LoadedContext<Context>>;
+    requestHandler?: RequestHandler<ExtendedContext>;
     /**
      * User-provided function that allows modifying the request object before it gets retried by the crawler.
      * It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
@@ -53,7 +72,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    errorHandler?: BrowserErrorHandler<Context>;
+    errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * A function to handle requests that failed more than `option.maxRequestRetries` times.
     *
@@ -63,17 +82,12 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    failedRequestHandler?: BrowserErrorHandler<Context>;
+    failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * Custom options passed to the underlying {@link BrowserPool} constructor.
      * We can tweak those to fine-tune browser management.
      */
     browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
-    /**
-     * If set, the crawler will be configured for all connections to use
-     * the Proxy URLs provided and rotated according to the configuration.
-     */
-    proxyConfiguration?: ProxyConfiguration;
     /**
      * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
      * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
@@ -178,21 +192,16 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  *
  * @category Crawlers
  */
-export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
+export declare abstract class BrowserCrawler<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
     readonly config: Configuration;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration?: ProxyConfiguration;
     /**
      * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
      */
     browserPool: BrowserPool<InternalBrowserPoolOptions>;
     launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
-    protected userProvidedRequestHandler: BrowserRequestHandler<Context>;
+    protected readonly ignoreShadowRoots: boolean;
+    protected readonly ignoreIframes: boolean;
     protected navigationTimeoutMillis: number;
-    protected requestHandlerTimeoutInnerMillis: number;
     protected preNavigationHooks: BrowserHook<Context>[];
     protected postNavigationHooks: BrowserHook<Context>[];
     protected persistCookiesPerSession: boolean;
@@ -218,9 +227,9 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
        // @ts-ignore optional peer dependency or compatibility with es2022
        proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
-       ignoreShadowRoots: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+       contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
-       ignoreIframes: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+       extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
@@ -241,6 +250,8 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
        maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
+       // @ts-ignore optional peer dependency or compatibility with es2022
+       maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
@@ -250,7 +261,7 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
        // @ts-ignore optional peer dependency or compatibility with es2022
        retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
-       respectRobotsTxtFile: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+       respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
@@ -273,30 +284,27 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
    /**
     * All `BrowserCrawler` parameters are passed via an options object.
     */
-   protected constructor(options?: BrowserCrawlerOptions<Context>, config?: Configuration);
-   protected _cleanupContext(crawlingContext: Context): Promise<void>;
+   protected constructor(options: BrowserCrawlerOptions<Page, Response, ProvidedController, Context, ContextExtension, ExtendedContext> & {
+       contextPipelineBuilder: () => ContextPipeline<CrawlingContext, Context>;
+   }, config?: Configuration);
+   protected buildContextPipeline(): ContextPipeline<CrawlingContext, BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>>;
    private containsSelectors;
-   protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
-   /**
-    * Wrapper around requestHandler that opens and closes pages etc.
-    */
-   protected _runRequestHandler(crawlingContext: Context): Promise<void>;
-   protected _enhanceCrawlingContextWithPageInfo(crawlingContext: Context, page: CommonPage, createNewSession?: boolean): void;
-   protected _handleNavigation(crawlingContext: Context): Promise<void>;
-   protected _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string): Promise<void>;
+   protected isRequestBlocked(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>): Promise<string | false>;
+   private preparePage;
+   private performNavigation;
+   private handleBlockedRequestByContent;
+   private restoreRequestState;
+   protected _applyCookies({ session, request, page, browserController }: BrowserCrawlingContext, preHooksCookies: string, postHooksCookies: string): Promise<void>;
    /**
     * Marks session bad in case of navigation timeout.
     */
-   protected _handleNavigationTimeout(crawlingContext: Context, error: Error): Promise<void>;
+   protected _handleNavigationTimeout(crawlingContext: BrowserCrawlingContext, error: Error): Promise<void>;
    /**
     * Transforms proxy-related errors to `SessionError`.
     */
    protected _throwIfProxyError(error: Error): void;
-   protected abstract _navigationHandler(crawlingContext: Context, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;
-   /**
-    * Should be overridden in case of different automation library that does not support this response API.
-    */
-   protected _responseHandler(crawlingContext: Context): Promise<void>;
+   protected abstract _navigationHandler(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;
+   private processResponse;
    protected _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void>;
    protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void;
    /**
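Taken together, the constructor now demands a `contextPipelineBuilder` on top of the public options, `buildContextPipeline()` is the hook that produces the browser-level context, and the request handler is typed against the `ExtendedContext` the pipeline produces. A distilled, self-contained model of that contract follows; all types here are simplified stand-ins, not the real `@crawlee/basic` ones:

```ts
// Distilled model of the new constructor contract; the real generics are far
// richer (see the declaration above), but the data flow is the same.
interface MiniCrawlingContext { url: string }
interface MiniBrowserContext extends MiniCrawlingContext {
    page: { title(): Promise<string> };
}

type MiniPipeline<In, Out> = (input: In) => Promise<Out>;

class MiniCrawler<Out extends MiniCrawlingContext> {
    constructor(
        private readonly options: {
            // Counterpart of `contextPipelineBuilder` in the declaration above.
            contextPipelineBuilder: () => MiniPipeline<MiniCrawlingContext, Out>;
            // Counterpart of `requestHandler?: RequestHandler<ExtendedContext>`.
            requestHandler: (context: Out) => Promise<void>;
        },
    ) {}

    async processOne(url: string): Promise<void> {
        // Lift the plain context through the pipeline, then hand it over.
        const context = await this.options.contextPipelineBuilder()({ url });
        await this.options.requestHandler(context);
    }
}

// "Subclass" wiring: the builder lifts the base context into a browser one,
// so the handler sees `page` with full type safety.
const crawler = new MiniCrawler<MiniBrowserContext>({
    contextPipelineBuilder: () => async (context) => ({
        ...context,
        page: { title: async () => `page for ${context.url}` },
    }),
    requestHandler: async ({ url, page }) => {
        console.log(url, await page.title());
    },
});

await crawler.processOne('https://crawlee.dev');
```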
@@ -316,8 +324,15 @@ interface EnqueueLinksInternalOptions {
    finalRequestUrl?: string;
 }
 /** @internal */
-// @ts-ignore optional peer dependency or compatibility with es2022
-export declare function browserCrawlerEnqueueLinks({ options, page, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
+interface BoundEnqueueLinksInternalOptions {
+    enqueueLinks: BasicCrawlingContext['enqueueLinks'];
+    options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
+    originalRequestUrl: string;
+    finalRequestUrl?: string;
+    page: CommonPage;
+}
+/** @internal */
+export declare function browserCrawlerEnqueueLinks(options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions): Promise<unknown>;
 /**
  * Extracts URLs from a given page.
  * @ignore
package/internals/browser-crawler.d.ts.map CHANGED
@@ -1 +1 @@
(regenerated source map for browser-crawler.d.ts; the minified one-line `mappings` payloads are omitted here)
package/internals/browser-crawler.js CHANGED
@@ -1,8 +1,8 @@
-import { BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, BasicCrawler, BLOCKED_STATUS_CODES as DEFAULT_BLOCKED_STATUS_CODES, Configuration, cookieStringToToughCookie, enqueueLinks, EVENT_SESSION_RETIRED, handleRequestTimeout, RequestState, resolveBaseUrlForEnqueueLinksFiltering, SessionError, tryAbsoluteURL, validators, } from '@crawlee/basic';
+import { BasicCrawler, BLOCKED_STATUS_CODES as DEFAULT_BLOCKED_STATUS_CODES, Configuration, ContextPipeline, cookieStringToToughCookie, enqueueLinks, EVENT_SESSION_RETIRED, handleRequestTimeout, RequestState, resolveBaseUrlForEnqueueLinksFiltering, SessionError, tryAbsoluteURL, validators, } from '@crawlee/basic';
 import { BrowserPool } from '@crawlee/browser-pool';
 import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
 import ow from 'ow';
-import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
+import { tryCancel } from '@apify/timeout';
 /**
  * Provides a simple framework for parallel crawling of web pages
  * using headless browsers with [Puppeteer](https://github.com/puppeteer/puppeteer)
@@ -44,19 +44,14 @@ import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
  */
 export class BrowserCrawler extends BasicCrawler {
     config;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration;
     /**
      * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
      */
     browserPool;
     launchContext;
-    userProvidedRequestHandler;
+    ignoreShadowRoots;
+    ignoreIframes;
     navigationTimeoutMillis;
-    requestHandlerTimeoutInnerMillis;
     preNavigationHooks;
     postNavigationHooks;
     persistCookiesPerSession;
@@ -72,34 +67,33 @@ export class BrowserCrawler extends BasicCrawler {
         persistCookiesPerSession: ow.optional.boolean,
         useSessionPool: ow.optional.boolean,
         proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
-        ignoreShadowRoots: ow.optional.boolean,
-        ignoreIframes: ow.optional.boolean,
     };
     /**
      * All `BrowserCrawler` parameters are passed via an options object.
      */
-    constructor(options = {}, config = Configuration.getGlobalConfig()) {
+    constructor(options, config = Configuration.getGlobalConfig()) {
         ow(options, 'BrowserCrawlerOptions', ow.object.exactShape(BrowserCrawler.optionsShape));
-        const { navigationTimeoutSecs = 60, requestHandlerTimeoutSecs = 60, persistCookiesPerSession, proxyConfiguration, launchContext = {}, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], requestHandler, failedRequestHandler, headless, ignoreShadowRoots, ignoreIframes, ...basicCrawlerOptions } = options;
+        const { navigationTimeoutSecs = 60, persistCookiesPerSession, launchContext = {}, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], headless, ignoreIframes = false, ignoreShadowRoots = false, contextPipelineBuilder, extendContext, proxyConfiguration, ...basicCrawlerOptions } = options;
         super({
             ...basicCrawlerOptions,
-            requestHandler: async (...args) => this._runRequestHandler(...args),
-            requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
+            contextPipelineBuilder: () => contextPipelineBuilder()
+                .compose({ action: this.performNavigation.bind(this) })
+                .compose({ action: this.handleBlockedRequestByContent.bind(this) })
+                .compose({ action: this.restoreRequestState.bind(this) }),
+            extendContext: extendContext,
         }, config);
         this.config = config;
-        // FIXME any
-        this.userProvidedRequestHandler = requestHandler ?? this.router;
-        this.failedRequestHandler = failedRequestHandler; // FIXME is this even needed?
         // Cookies should be persisted per session only if session pool is used
         if (!this.useSessionPool && persistCookiesPerSession) {
             throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
         }
         this.launchContext = launchContext;
         this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
-        this.requestHandlerTimeoutInnerMillis = requestHandlerTimeoutSecs * 1000;
         this.proxyConfiguration = proxyConfiguration;
         this.preNavigationHooks = preNavigationHooks;
         this.postNavigationHooks = postNavigationHooks;
+        this.ignoreIframes = ignoreIframes;
+        this.ignoreShadowRoots = ignoreShadowRoots;
         if (headless != null) {
             this.launchContext.launchOptions ??= {};
             this.launchContext.launchOptions.headless = headless;
@@ -122,12 +116,17 @@ export class BrowserCrawler extends BasicCrawler {
             postLaunchHooks: [this._maybeAddSessionRetiredListener.bind(this), ...postLaunchHooks],
         });
     }
-    async _cleanupContext(crawlingContext) {
-        const { page } = crawlingContext;
-        // Page creation may be aborted
-        if (page) {
-            await page.close().catch((error) => this.log.debug('Error while closing page', { error }));
-        }
+    buildContextPipeline() {
+        return ContextPipeline.create().compose({
+            action: this.preparePage.bind(this),
+            cleanup: async (context) => {
+                context.registerDeferredCleanup(async () => {
+                    await context.page
+                        .close()
+                        .catch((error) => this.log.debug('Error while closing page', { error }));
+                });
+            },
+        });
     }
     async containsSelectors(page, selectors) {
         const foundSelectors = (await Promise.all(selectors.map((selector) => page.$(selector))))
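`buildContextPipeline()` above replaces the old `_cleanupContext()` override: closing the page is now a deferred cleanup registered on the very pipeline step that created it. The `ContextPipeline` implementation lives in `@crawlee/basic` and is not part of this diff; the following is only a simplified model of the compose-with-cleanup mechanism, written to illustrate the control flow:

```ts
// Simplified model (NOT the @crawlee/basic implementation): each composed
// step's `action` returns extra properties that are merged into the context,
// and its optional `cleanup` runs after the final consumer finishes (or
// throws), in reverse composition order.
type Step<C, Ext> = {
    action: (context: C) => Promise<Ext>;
    cleanup?: (context: C & Ext) => Promise<void>;
};

class MiniContextPipeline<In, Out> {
    private constructor(private readonly steps: Step<any, any>[]) {}

    static create<C>(): MiniContextPipeline<C, C> {
        return new MiniContextPipeline([]);
    }

    compose<Ext>(step: Step<Out, Ext>): MiniContextPipeline<In, Out & Ext> {
        return new MiniContextPipeline([...this.steps, step]);
    }

    async run(initial: In, consumer: (context: Out) => Promise<void>): Promise<void> {
        const cleanups: Array<() => Promise<void>> = [];
        let context: any = initial;
        try {
            for (const step of this.steps) {
                // Merge each step's extension into the accumulated context.
                context = { ...context, ...(await step.action(context)) };
                if (step.cleanup) cleanups.push(() => step.cleanup!(context));
            }
            await consumer(context);
        } finally {
            for (const cleanup of cleanups.reverse()) await cleanup();
        }
    }
}
```

The real pipeline additionally exposes `registerDeferredCleanup()` on the context itself, which is what `buildContextPipeline()` uses above to defer the page close.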
@@ -161,23 +160,17 @@ export class BrowserCrawler extends BasicCrawler {
             return `Received blocked status code: ${blockedStatusCode}`;
         return false;
     }
-    /**
-     * Wrapper around requestHandler that opens and closes pages etc.
-     */
-    async _runRequestHandler(crawlingContext) {
+    async preparePage(crawlingContext) {
         const newPageOptions = {
             id: crawlingContext.id,
         };
         const useIncognitoPages = this.launchContext?.useIncognitoPages;
-        if (this.proxyConfiguration) {
-            const { session } = crawlingContext;
-            const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, {
-                request: crawlingContext.request,
-            });
+        if (crawlingContext.session?.proxyInfo) {
+            const proxyInfo = crawlingContext.session.proxyInfo;
             crawlingContext.proxyInfo = proxyInfo;
             newPageOptions.proxyUrl = proxyInfo?.url;
             newPageOptions.proxyTier = proxyInfo?.proxyTier;
-            if (this.proxyConfiguration.isManInTheMiddle) {
+            if (proxyInfo?.ignoreTlsErrors) {
                 /**
                  * @see https://playwright.dev/docs/api/class-browser/#browser-new-context
                  * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
@@ -190,76 +183,49 @@ export class BrowserCrawler extends BasicCrawler {
         }
         const page = (await this.browserPool.newPage(newPageOptions));
         tryCancel();
-        this._enhanceCrawlingContextWithPageInfo(crawlingContext, page, useIncognitoPages);
-        // DO NOT MOVE THIS LINE ABOVE!
-        // `enhanceCrawlingContextWithPageInfo` gives us a valid session.
-        // For example, `sessionPoolOptions.sessionOptions.maxUsageCount` can be `1`.
-        // So we must not save the session prior to making sure it was used only once, otherwise we would use it twice.
-        const { request, session } = crawlingContext;
-        if (!request.skipNavigation) {
-            await this._handleNavigation(crawlingContext);
-            tryCancel();
-            await this._responseHandler(crawlingContext);
-            tryCancel();
-            // save cookies
-            // TODO: Should we save the cookies also after/only the handle page?
-            if (this.persistCookiesPerSession) {
-                const cookies = await crawlingContext.browserController.getCookies(page);
-                tryCancel();
-                session?.setCookies(cookies, request.loadedUrl);
-            }
-        }
-        if (!this.requestMatchesEnqueueStrategy(request)) {
-            this.log.debug(
-            // eslint-disable-next-line dot-notation
-            `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
-            request.noRetry = true;
-            request.state = RequestState.SKIPPED;
-            return;
-        }
-        if (this.retryOnBlocked) {
-            const error = await this.isRequestBlocked(crawlingContext);
-            if (error)
-                throw new SessionError(error);
-        }
-        request.state = RequestState.REQUEST_HANDLER;
-        try {
-            await addTimeoutToPromise(async () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), this.requestHandlerTimeoutInnerMillis, `requestHandler timed out after ${this.requestHandlerTimeoutInnerMillis / 1000} seconds.`);
-            request.state = RequestState.DONE;
-        }
-        catch (e) {
-            request.state = RequestState.ERROR;
-            throw e;
-        }
-        tryCancel();
-    }
-    _enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession) {
-        crawlingContext.page = page;
-        // This switch is because the crawlingContexts are created on per request basis.
-        // However, we need to add the proxy info and session from browser, which is created based on the browser-pool configuration.
-        // We would not have to do this switch if the proxy and configuration worked as in CheerioCrawler,
-        // which configures proxy and session for every new request
         const browserControllerInstance = this.browserPool.getBrowserControllerByPage(page);
-        crawlingContext.browserController = browserControllerInstance;
-        if (!createNewSession) {
-            crawlingContext.session = browserControllerInstance.launchContext.session;
-        }
-        if (!crawlingContext.proxyInfo) {
-            crawlingContext.proxyInfo = browserControllerInstance.launchContext.proxyInfo;
-        }
-        crawlingContext.enqueueLinks = async (enqueueOptions) => {
-            return browserCrawlerEnqueueLinks({
-                options: enqueueOptions,
-                page,
-                requestQueue: await this.getRequestQueue(),
-                robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
-                onSkippedRequest: this.onSkippedRequest,
-                originalRequestUrl: crawlingContext.request.url,
-                finalRequestUrl: crawlingContext.request.loadedUrl,
-            });
+        const contextEnqueueLinks = crawlingContext.enqueueLinks;
+        const session = useIncognitoPages
+            ? crawlingContext.session
+            : browserControllerInstance.launchContext.session;
+        return {
+            page,
+            get response() {
+                throw new Error("The `response` property is not available. This might mean that you're trying to access it before navigation or that navigation resulted in `null` (this should only happen with `about:` URLs)");
+            },
+            browserController: browserControllerInstance,
+            session,
+            proxyInfo: session?.proxyInfo,
+            enqueueLinks: async (enqueueOptions = {}) => {
+                return (await browserCrawlerEnqueueLinks({
+                    options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) },
+                    page,
+                    requestQueue: await this.getRequestQueue(),
+                    robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+                    onSkippedRequest: this.handleSkippedRequest,
+                    originalRequestUrl: crawlingContext.request.url,
+                    finalRequestUrl: crawlingContext.request.loadedUrl,
+                    enqueueLinks: contextEnqueueLinks,
+                })); // TODO make this type safe
+            },
         };
     }
-    async _handleNavigation(crawlingContext) {
+    async performNavigation(crawlingContext) {
+        if (crawlingContext.request.skipNavigation) {
+            return {
+                request: new Proxy(crawlingContext.request, {
+                    get(target, propertyName, receiver) {
+                        if (propertyName === 'loadedUrl') {
+                            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
+                        }
+                        return Reflect.get(target, propertyName, receiver);
+                    },
+                }),
+                get response() {
+                    throw new Error('The `response` property is not available - `skipNavigation` was used');
+                },
+            };
+        }
         const gotoOptions = { timeout: this.navigationTimeoutMillis };
         const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
         crawlingContext.request.state = RequestState.BEFORE_NAV;
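The `skipNavigation` branch added above wraps the request in a `Proxy` so that reading `loadedUrl` fails loudly instead of silently yielding `undefined`. The same guard technique in isolation (plain TypeScript, no crawlee imports; the interface is a stand-in):

```ts
// Standalone demo of the Proxy guard used in `performNavigation` above:
// accessing one specific property throws a descriptive error instead of
// returning undefined.
interface RequestLike {
    url: string;
    loadedUrl?: string;
}

function guardLoadedUrl<T extends RequestLike>(request: T): T {
    return new Proxy(request, {
        get(target, propertyName, receiver) {
            if (propertyName === 'loadedUrl') {
                throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
            }
            return Reflect.get(target, propertyName, receiver);
        },
    });
}

const guarded = guardLoadedUrl({ url: 'https://example.com' });
console.log(guarded.url); // fine
// guarded.loadedUrl      // would throw the error above
```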
@@ -267,8 +233,9 @@ export class BrowserCrawler extends BasicCrawler {
         tryCancel();
         const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
         await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
+        let response;
         try {
-            crawlingContext.response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
+            response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
         }
         catch (error) {
             await this._handleNavigationTimeout(crawlingContext, error);
@@ -279,6 +246,36 @@ export class BrowserCrawler extends BasicCrawler {
         tryCancel();
         crawlingContext.request.state = RequestState.AFTER_NAV;
         await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);
+        await this.processResponse(response, crawlingContext);
+        tryCancel();
+        // save cookies
+        // TODO: Should we save the cookies also after/only the handle page?
+        if (this.persistCookiesPerSession) {
+            const cookies = await crawlingContext.browserController.getCookies(crawlingContext.page);
+            tryCancel();
+            crawlingContext.session?.setCookies(cookies, crawlingContext.request.loadedUrl);
+        }
+        if (response !== undefined) {
+            return {
+                request: crawlingContext.request,
+                response,
+            };
+        }
+        return {
+            request: crawlingContext.request,
+        };
+    }
+    async handleBlockedRequestByContent(crawlingContext) {
+        if (this.retryOnBlocked) {
+            const error = await this.isRequestBlocked(crawlingContext);
+            if (error)
+                throw new SessionError(error);
+        }
+        return {};
+    }
+    async restoreRequestState(crawlingContext) {
+        crawlingContext.request.state = RequestState.REQUEST_HANDLER;
+        return {};
     }
     async _applyCookies({ session, request, page, browserController }, preHooksCookies, postHooksCookies) {
         const sessionCookie = session?.getCookies(request.url) ?? [];
@@ -306,11 +303,8 @@ export class BrowserCrawler extends BasicCrawler {
             throw new SessionError(this._getMessageFromError(error));
         }
     }
-    /**
-     * Should be overridden in case of different automation library that does not support this response API.
-     */
-    async _responseHandler(crawlingContext) {
-        const { response, session, request, page } = crawlingContext;
+    async processResponse(response, crawlingContext) {
+        const { session, request, page } = crawlingContext;
         if (typeof response === 'object' && typeof response.status === 'function') {
             const status = response.status();
             this.stats.registerStatusCode(status);
@@ -328,16 +322,18 @@ export class BrowserCrawler extends BasicCrawler {
     async _extendLaunchContext(_pageId, launchContext) {
         const launchContextExtends = {};
         if (this.sessionPool) {
-            launchContextExtends.session = await this.sessionPool.getSession();
-        }
-        if (this.proxyConfiguration && !launchContext.proxyUrl) {
-            const proxyInfo = await this.proxyConfiguration.newProxyInfo(launchContextExtends.session?.id, {
-                proxyTier: launchContext.proxyTier ?? undefined,
+            launchContextExtends.session = await this.sessionPool.newSession({
+                proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+                // cannot pass a request here, since session is created on browser launch
+                }),
             });
+        }
+        if (!launchContext.proxyUrl && launchContextExtends.session?.proxyInfo) {
+            const proxyInfo = launchContextExtends.session.proxyInfo;
             launchContext.proxyUrl = proxyInfo?.url;
             launchContextExtends.proxyInfo = proxyInfo;
             // Disable SSL verification for MITM proxies
-            if (this.proxyConfiguration.isManInTheMiddle) {
+            if (proxyInfo?.ignoreTlsErrors) {
                 /**
                  * @see https://playwright.dev/docs/api/class-browser/#browser-new-context
                  * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
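The hunk above moves proxy selection from crawl time to session creation: sessions are now created via `sessionPool.newSession({ proxyInfo })`, and downstream code (here and in `preparePage` earlier) reads `session.proxyInfo` instead of consulting `this.proxyConfiguration` again, with `proxyInfo.ignoreTlsErrors` replacing `proxyConfiguration.isManInTheMiddle`. A distilled model of that flow, with simplified stand-in types rather than the real crawlee ones:

```ts
// Distilled model of the proxy-per-session flow visible in this diff; the
// real ProxyInfo/Session types are richer and live in the crawlee packages.
interface MiniProxyInfo { url: string; ignoreTlsErrors?: boolean }
interface MiniSession { id: string; proxyInfo?: MiniProxyInfo }

async function newSessionWithProxy(
    newProxyInfo?: () => Promise<MiniProxyInfo | undefined>,
): Promise<MiniSession> {
    return {
        id: Math.random().toString(36).slice(2),
        // Resolved once, at session creation (i.e. on browser launch) -
        // no request is available at this point.
        proxyInfo: await newProxyInfo?.(),
    };
}

function launchArgsFor(session: MiniSession) {
    return {
        proxyUrl: session.proxyInfo?.url,
        // Disable TLS verification only for MITM proxies.
        ignoreHTTPSErrors: session.proxyInfo?.ignoreTlsErrors ?? false,
    };
}

const session = await newSessionWithProxy(async () => ({ url: 'http://proxy.local:8000' }));
console.log(launchArgsFor(session));
```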
@@ -372,21 +368,33 @@ export class BrowserCrawler extends BasicCrawler {
     }
 }
 /** @internal */
-export async function browserCrawlerEnqueueLinks({ options, page, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }) {
+function containsEnqueueLinks(options) {
+    return !!options.enqueueLinks;
+}
+/** @internal */
+export async function browserCrawlerEnqueueLinks(options) {
+    const { options: enqueueLinksOptions, finalRequestUrl, originalRequestUrl, page } = options;
     const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
-        enqueueStrategy: options?.strategy,
+        enqueueStrategy: enqueueLinksOptions?.strategy,
         finalRequestUrl,
         originalRequestUrl,
-        userProvidedBaseUrl: options?.baseUrl,
+        userProvidedBaseUrl: enqueueLinksOptions?.baseUrl,
     });
-    const urls = await extractUrlsFromPage(page, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
+    const urls = await extractUrlsFromPage(page, enqueueLinksOptions?.selector ?? 'a', enqueueLinksOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
+    if (containsEnqueueLinks(options)) {
+        return options.enqueueLinks({
+            urls,
+            baseUrl,
+            ...enqueueLinksOptions,
+        });
+    }
     return enqueueLinks({
-        requestQueue,
-        robotsTxtFile,
-        onSkippedRequest,
+        requestQueue: options.requestQueue,
+        robotsTxtFile: options.robotsTxtFile,
+        onSkippedRequest: options.onSkippedRequest,
         urls,
         baseUrl,
-        ...options,
+        ...enqueueLinksOptions,
     });
 }
 /**
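`browserCrawlerEnqueueLinks` now accepts a union of two option bags and branches on the presence of `enqueueLinks` via the `containsEnqueueLinks` guard: the bound variant delegates queueing to the context's own `enqueueLinks`, while the original variant still calls the shared `enqueueLinks()` with the request queue. The dispatch pattern in isolation, with simplified stand-in types:

```ts
// Self-contained version of the union-dispatch pattern used by
// browserCrawlerEnqueueLinks above; the types are simplified stand-ins.
interface QueueBackedOptions { requestQueue: string[]; urls: string[] }
interface BoundOptions { enqueueLinks: (urls: string[]) => Promise<void>; urls: string[] }

// Counterpart of `containsEnqueueLinks`: a type guard over the union.
function isBound(options: QueueBackedOptions | BoundOptions): options is BoundOptions {
    return 'enqueueLinks' in options && !!options.enqueueLinks;
}

async function dispatchEnqueue(options: QueueBackedOptions | BoundOptions): Promise<void> {
    if (isBound(options)) {
        // Bound variant: the caller-supplied function owns queueing (and, in
        // the real code, limits, robots.txt checks, and skip callbacks).
        await options.enqueueLinks(options.urls);
        return;
    }
    // Queue-backed variant: push to the queue directly.
    options.requestQueue.push(...options.urls);
}

await dispatchEnqueue({ requestQueue: [], urls: ['https://example.com/a'] });
```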