@crawlee/browser 4.0.0-beta.2 → 4.0.0-beta.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
@@ -9,6 +9,10 @@
 <small>A web scraping and browser automation library</small>
 </h1>
 
+<p align=center>
+<a href="https://trendshift.io/repositories/5179" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5179" alt="apify%2Fcrawlee | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
+
 <p align=center>
 <a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/v/@crawlee/core.svg" alt="NPM latest version" data-canonical-src="https://img.shields.io/npm/v/@crawlee/core/next.svg" style="max-width: 100%;"></a>
 <a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/dm/@crawlee/core.svg" alt="Downloads" data-canonical-src="https://img.shields.io/npm/dm/@crawlee/core.svg" style="max-width: 100%;"></a>
@@ -24,7 +28,7 @@ Crawlee is available as the [`crawlee`](https://www.npmjs.com/package/crawlee) N
 
 > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev)** 👈
 
->
+> Do you prefer 🐍 Python instead of JavaScript? [👉 Checkout Crawlee for Python 👈](https://github.com/apify/crawlee-python).
 
 ## Installation
 
package/dist/internals/browser-crawler.d.ts
CHANGED
@@ -1,19 +1,38 @@
-import type { Awaitable, BasicCrawlerOptions, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler,
-import { BasicCrawler, Configuration } from '@crawlee/basic';
+import type { Awaitable, BasicCrawlerOptions, BasicCrawlingContext, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedRequest, Request, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
+import { BasicCrawler, Configuration, ContextPipeline } from '@crawlee/basic';
 import type { BrowserController, BrowserPlugin, BrowserPoolHooks, BrowserPoolOptions, CommonPage, InferBrowserPluginArray, LaunchContext } from '@crawlee/browser-pool';
 import { BrowserPool } from '@crawlee/browser-pool';
+import type { BatchAddRequestsResult } from '@crawlee/types';
 import type { RobotsTxtFile } from '@crawlee/utils';
 import type { ReadonlyDeep } from 'type-fest';
 import type { BrowserLaunchContext } from './browser-launcher.js';
-
+interface BaseResponse {
+    status(): number;
+}
+export interface BrowserCrawlingContext<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
+    /**
+     * An instance of the {@link BrowserController} that manages the browser instance and provides access to its API.
+     */
     browserController: ProvidedController;
+    /**
+     * The browser page object where the web page is loaded and rendered.
+     */
     page: Page;
-
+    /**
+     * The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
+     */
+    request: LoadedRequest<Request<UserData>>;
+    /**
+     * The HTTP response object returned by the browser's navigation.
+     */
+    response: Response;
+    /**
+     * Helper function for extracting URLs from the current page and adding them to the request queue.
+     */
+    enqueueLinks: (options?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
 }
-export type BrowserRequestHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = RequestHandler<Context>;
-export type BrowserErrorHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = ErrorHandler<Context>;
 export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable<void>;
-export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext = BrowserCrawlingContext, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
+export interface BrowserCrawlerOptions<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions<Context, ExtendedContext>, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
     launchContext?: BrowserLaunchContext<any, any>;
     /**
      * Function that is called to process each request.
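
The new `BrowserCrawlingContext` above is the object request handlers receive. For illustration, a minimal sketch of how its documented members are consumed; `PlaywrightCrawler` (a concrete `BrowserCrawler` subclass from a sibling package) and the exact beta handler signature are assumptions, not part of this diff:

import { PlaywrightCrawler } from '@crawlee/playwright';

const crawler = new PlaywrightCrawler({
    async requestHandler({ page, request, response, enqueueLinks }) {
        // `request` is typed as LoadedRequest, so `loadedUrl` is guaranteed to be set.
        console.log(`Loaded ${request.loadedUrl} -> HTTP ${response.status()}`);
        console.log(await page.title());
        // Extracts URLs from the current page and adds them to the request queue.
        await enqueueLinks();
    },
});

await crawler.run(['https://crawlee.dev']);
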
@@ -42,7 +61,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * The exceptions are logged to the request using the
      * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
-    requestHandler?:
+    requestHandler?: RequestHandler<ExtendedContext>;
     /**
      * User-provided function that allows modifying the request object before it gets retried by the crawler.
      * It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
@@ -53,7 +72,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    errorHandler?:
+    errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * A function to handle requests that failed more than `option.maxRequestRetries` times.
      *
@@ -63,17 +82,12 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    failedRequestHandler?:
+    failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * Custom options passed to the underlying {@link BrowserPool} constructor.
      * We can tweak those to fine-tune browser management.
      */
     browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
-    /**
-     * If set, the crawler will be configured for all connections to use
-     * the Proxy URLs provided and rotated according to the configuration.
-     */
-    proxyConfiguration?: ProxyConfiguration;
     /**
      * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
      * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
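
For reference, the handler and hook options documented in this hunk line up in user code roughly as follows (a hedged sketch; `PlaywrightCrawler` is assumed as the concrete subclass, since this file only declares the abstract base):

import { PlaywrightCrawler } from '@crawlee/playwright';

const crawler = new PlaywrightCrawler({
    // Runs before each retry of a failed request.
    errorHandler: async ({ request }, error) => {
        request.userData.lastErrorMessage = error.message;
    },
    // Runs once a request has failed more than maxRequestRetries times.
    failedRequestHandler: async ({ request }, error) => {
        console.error(`Gave up on ${request.url}: ${error.message}`);
    },
    // Evaluated sequentially before navigation; receives (crawlingContext, gotoOptions).
    preNavigationHooks: [
        async (_crawlingContext, gotoOptions) => {
            if (gotoOptions) gotoOptions.timeout = 30_000;
        },
    ],
    async requestHandler({ request }) {
        console.log(`Processing ${request.loadedUrl}`);
    },
});
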
@@ -178,21 +192,16 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  *
  * @category Crawlers
  */
-export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
+export declare abstract class BrowserCrawler<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
     readonly config: Configuration;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration?: ProxyConfiguration;
     /**
      * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
      */
     browserPool: BrowserPool<InternalBrowserPoolOptions>;
     launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
-    protected
+    protected readonly ignoreShadowRoots: boolean;
+    protected readonly ignoreIframes: boolean;
     protected navigationTimeoutMillis: number;
-    protected requestHandlerTimeoutInnerMillis: number;
     protected preNavigationHooks: BrowserHook<Context>[];
     protected postNavigationHooks: BrowserHook<Context>[];
     protected persistCookiesPerSession: boolean;
@@ -218,9 +227,9 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
     // @ts-ignore optional peer dependency or compatibility with es2022
     proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
-
+    contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
-
+    extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
     requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
@@ -241,6 +250,8 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
     maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
     maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
+    // @ts-ignore optional peer dependency or compatibility with es2022
+    maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
     autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
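
A `maxCrawlDepth` predicate joins the validated options shape here. This hunk only shows the validation; assuming it behaves like the other numeric limits (capping how deep newly enqueued links may go), usage would look like:

import { PlaywrightCrawler } from '@crawlee/playwright'; // assumed concrete subclass

const crawler = new PlaywrightCrawler({
    maxRequestsPerCrawl: 1000,
    maxCrawlDepth: 3, // new option validated above; depth-limit semantics assumed
    async requestHandler({ enqueueLinks }) {
        await enqueueLinks();
    },
});
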
@@ -250,7 +261,7 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
     // @ts-ignore optional peer dependency or compatibility with es2022
     retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
-    respectRobotsTxtFile: import("ow").
+    respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
     // @ts-ignore optional peer dependency or compatibility with es2022
     onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
     // @ts-ignore optional peer dependency or compatibility with es2022
@@ -273,30 +284,27 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
     /**
      * All `BrowserCrawler` parameters are passed via an options object.
      */
-    protected constructor(options
-
+    protected constructor(options: BrowserCrawlerOptions<Page, Response, ProvidedController, Context, ContextExtension, ExtendedContext> & {
+        contextPipelineBuilder: () => ContextPipeline<CrawlingContext, Context>;
+    }, config?: Configuration);
+    protected buildContextPipeline(): ContextPipeline<CrawlingContext, BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>>;
     private containsSelectors;
-    protected isRequestBlocked(crawlingContext:
-
-
-
-
-    protected
-    protected _handleNavigation(crawlingContext: Context): Promise<void>;
-    protected _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string): Promise<void>;
+    protected isRequestBlocked(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>): Promise<string | false>;
+    private preparePage;
+    private performNavigation;
+    private handleBlockedRequestByContent;
+    private restoreRequestState;
+    protected _applyCookies({ session, request, page, browserController }: BrowserCrawlingContext, preHooksCookies: string, postHooksCookies: string): Promise<void>;
     /**
      * Marks session bad in case of navigation timeout.
      */
-    protected _handleNavigationTimeout(crawlingContext:
+    protected _handleNavigationTimeout(crawlingContext: BrowserCrawlingContext, error: Error): Promise<void>;
     /**
      * Transforms proxy-related errors to `SessionError`.
      */
     protected _throwIfProxyError(error: Error): void;
-    protected abstract _navigationHandler(crawlingContext:
-
-     * Should be overridden in case of different automation library that does not support this response API.
-     */
-    protected _responseHandler(crawlingContext: Context): Promise<void>;
+    protected abstract _navigationHandler(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;
+    private processResponse;
     protected _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void>;
     protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void;
     /**
@@ -316,8 +324,15 @@ interface EnqueueLinksInternalOptions {
     finalRequestUrl?: string;
 }
 /** @internal */
-
-
+interface BoundEnqueueLinksInternalOptions {
+    enqueueLinks: BasicCrawlingContext['enqueueLinks'];
+    options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
+    originalRequestUrl: string;
+    finalRequestUrl?: string;
+    page: CommonPage;
+}
+/** @internal */
+export declare function browserCrawlerEnqueueLinks(options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions): Promise<unknown>;
 /**
  * Extracts URLs from a given page.
  * @ignore
package/dist/internals/browser-crawler.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,EACZ,aAAa,
+{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,oBAAoB,EACpB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,EACZ,aAAa,EAEb,OAAO,EACP,cAAc,EACd,eAAe,EAEf,sBAAsB,EACzB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EAEZ,aAAa,EACb,eAAe,EAUlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EACR,iBAAiB,EACjB,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,UAAU,EACV,uBAAuB,EACvB,aAAa,EAChB,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EAA6B,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAC/E,OAAO,KAAK,EAAE,sBAAsB,EAA0B,MAAM,gBAAgB,CAAC;AACrF,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAI9C,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAElE,UAAU,YAAY;IAClB,MAAM,IAAI,MAAM,CAAC;CACpB;AAID,MAAM,WAAW,sBAAsB,CACnC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,GAAG,iBAAiB,EACtC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAC1C,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,iBAAiB,EAAE,kBAAkB,CAAC;IAEtC;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IAE1C;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;OAEG;IACH,YAAY,EAAE,CAAC,OAAO,CAAC,EAAE,mBAAmB,KAAK,OAAO,CAAC,sBAAsB,CAAC,CAAC;CACpF;AAED,MAAM,MAAM,WAAW,CAAC,OAAO,GAAG,sBAAsB,EAAE,WAAW,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,IAAI,CACjH,eAAe,EAAE,OAAO,EACxB,WAAW,EAAE,WAAW,KACvB,SAAS,CAAC,IAAI,CAAC,CAAC;AAErB,MAAM,WAAW,qBAAqB,CAClC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,gBAAgB,SAAS,aAAa,EAAE,GAAG,uBAAuB,CAAC,0BAA0B,CAAC,gBAAgB,CAAC,CAAC,EAChH,yBAAyB,SAAS,iBAAiB,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC,EAC9G,qBAAqB,SAAS,aAAa,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAC3G,SAAQ,IAAI,CACN,mBAAmB,CAAC,OAAO,EAAE,eAAe,CAAC,EAE7C,gBAAgB,GAAG,sBAAsB,GAAG,cAAc,CAC7D;IACD,aAAa,CAAC,EAAE,oBAAoB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAE/C;;;;;;;;;;;;;;;;;;;;;;;;;;OA0BG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC,eAAe,CAAC,CAAC;IAEjD;;;;;;;;;OASG;IACH,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAE9D;;;;;;;;OAQG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAEtE;;;OAGG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC,GAC5C,OAAO,CAAC,gBAAgB,CAAC,yBAAyB,EAAE,qBAAqB,CAAC,CAAC,CAAC;IAEhF;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,kBAAkB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE5C;;;;;;;;;;;;;;;;OAgBG;IACH,mBAAmB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE7C;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;OAGG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IAEnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,KAAK,CAAC;IAEnC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AACH,8BAAsB,cAAc,CAChC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,aAAa,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,EACzD,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,WAAW,SAAS,UAAU,GAAG,UAAU,CAC7C,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;aA8CxC,MAAM;IA7C5B;;OAEG;IACH,WAAW,EAAE,WAAW,CAAC,0BAA0B,CAAC,CAAC;IAErD,aAAa,EAAE,oBAAoB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;IAE5D,SAAS,CAAC,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IAC9C,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAE1C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,kBAAkB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACrD,SAAS,CAAC,mBAAmB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACtD,SAAS,CAAC,wBAAwB,EAAE,OAAO,CAAC;IAE5C,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAcpC;IAEF;;OAEG;IACH,SAAS,aACL,OAAO,EAAE,qBAAqB,CAC1B,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,OAAO,EACP,gBAAgB,EAChB,eAAe,CAClB,GAAG;QACA,sBAAsB,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,OAAO,CAAC,CAAC;KAC3E,EACiB,MAAM,gBAAkC;IAuE9D,SAAS,CAAC,oBAAoB,IAAI,eAAe,CAC7C,eAAe,EACf,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,CACzE;YAgBa,iBAAiB;cASf,gBAAgB,CAC5B,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,GAC5E,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;YA8BZ,WAAW;YAoEX,iBAAiB;YA0EjB,6BAA6B;YAW7B,mBAAmB;cAKjB,aAAa,CACzB,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,iBAAiB,EAAE,EAAE,sBAAsB,EACrE,eAAe,EAAE,MAAM,EACvB,gBAAgB,EAAE,MAAM;IAc5B;;OAEG;cACa,wBAAwB,CAAC,eAAe,EAAE,sBAAsB,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9G;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,KAAK,EAAE,KAAK;IAMzC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,CACjC,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,EAC3E,WAAW,EAAE,WAAW,GACzB,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,GAAG,SAAS,CAAC;YAEpC,eAAe;cAuBb,oBAAoB,CAAC,OAAO,EAAE,MAAM,EAAE,aAAa,EAAE,aAAa,GAAG,OAAO,CAAC,IAAI,CAAC;IA+BlG,SAAS,CAAC,+BAA+B,CAAC,OAAO,EAAE,MAAM,EAAE,iBAAiB,EAAE,OAAO,CAAC,mBAAmB,CAAC,GAAG,IAAI;IAoBjH;;;OAGG;IACY,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;CAI3C;AAED,gBAAgB;AAChB,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,IAAI,EAAE,UAAU,CAAC;IACjB,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,UAAU,gCAAgC;IACtC,YAAY,EAAE,oBAAoB,CAAC,cAAc,CAAC,CAAC;IACnD,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,IAAI,EAAE,UAAU,CAAC;CACpB;AASD,gBAAgB;AAChB,wBAAsB,0BAA0B,CAC5C,OAAO,EAAE,2BAA2B,GAAG,gCAAgC,oBAiC1E;AAED;;;GAGG;AACH,wBAAsB,mBAAmB,CAErC,IAAI,EAAE;IAAE,MAAM,EAAE,QAAQ,CAAA;CAAE,EAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,EAAE,CAAC,CA0BnB"}
package/dist/internals/browser-crawler.js
CHANGED
@@ -1,8 +1,8 @@
-import {
+import { BasicCrawler, BLOCKED_STATUS_CODES as DEFAULT_BLOCKED_STATUS_CODES, Configuration, ContextPipeline, cookieStringToToughCookie, enqueueLinks, EVENT_SESSION_RETIRED, handleRequestTimeout, RequestState, resolveBaseUrlForEnqueueLinksFiltering, SessionError, tryAbsoluteURL, validators, } from '@crawlee/basic';
 import { BrowserPool } from '@crawlee/browser-pool';
 import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
 import ow from 'ow';
-import {
+import { tryCancel } from '@apify/timeout';
 /**
  * Provides a simple framework for parallel crawling of web pages
 * using headless browsers with [Puppeteer](https://github.com/puppeteer/puppeteer)
@@ -44,19 +44,14 @@ import { addTimeoutToPromise, tryCancel } from '@apify/timeout';
  */
 export class BrowserCrawler extends BasicCrawler {
     config;
-    /**
-     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
-     * Only available if used by the crawler.
-     */
-    proxyConfiguration;
     /**
      * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
      */
     browserPool;
     launchContext;
-
+    ignoreShadowRoots;
+    ignoreIframes;
     navigationTimeoutMillis;
-    requestHandlerTimeoutInnerMillis;
     preNavigationHooks;
     postNavigationHooks;
     persistCookiesPerSession;
@@ -72,34 +67,33 @@ export class BrowserCrawler extends BasicCrawler {
         persistCookiesPerSession: ow.optional.boolean,
         useSessionPool: ow.optional.boolean,
         proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
-        ignoreShadowRoots: ow.optional.boolean,
-        ignoreIframes: ow.optional.boolean,
     };
     /**
      * All `BrowserCrawler` parameters are passed via an options object.
      */
-    constructor(options
+    constructor(options, config = Configuration.getGlobalConfig()) {
         ow(options, 'BrowserCrawlerOptions', ow.object.exactShape(BrowserCrawler.optionsShape));
-        const { navigationTimeoutSecs = 60,
+        const { navigationTimeoutSecs = 60, persistCookiesPerSession, launchContext = {}, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], headless, ignoreIframes = false, ignoreShadowRoots = false, contextPipelineBuilder, extendContext, proxyConfiguration, ...basicCrawlerOptions } = options;
         super({
             ...basicCrawlerOptions,
-
-
+            contextPipelineBuilder: () => contextPipelineBuilder()
+                .compose({ action: this.performNavigation.bind(this) })
+                .compose({ action: this.handleBlockedRequestByContent.bind(this) })
+                .compose({ action: this.restoreRequestState.bind(this) }),
+            extendContext: extendContext,
         }, config);
         this.config = config;
-        // FIXME any
-        this.userProvidedRequestHandler = requestHandler ?? this.router;
-        this.failedRequestHandler = failedRequestHandler; // FIXME is this even needed?
         // Cookies should be persisted per session only if session pool is used
         if (!this.useSessionPool && persistCookiesPerSession) {
             throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
         }
         this.launchContext = launchContext;
         this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
-        this.requestHandlerTimeoutInnerMillis = requestHandlerTimeoutSecs * 1000;
         this.proxyConfiguration = proxyConfiguration;
         this.preNavigationHooks = preNavigationHooks;
         this.postNavigationHooks = postNavigationHooks;
+        this.ignoreIframes = ignoreIframes;
+        this.ignoreShadowRoots = ignoreShadowRoots;
         if (headless != null) {
             this.launchContext.launchOptions ??= {};
             this.launchContext.launchOptions.headless = headless;
@@ -122,12 +116,17 @@ export class BrowserCrawler extends BasicCrawler {
             postLaunchHooks: [this._maybeAddSessionRetiredListener.bind(this), ...postLaunchHooks],
         });
     }
-
-
-
-
-
-
+    buildContextPipeline() {
+        return ContextPipeline.create().compose({
+            action: this.preparePage.bind(this),
+            cleanup: async (context) => {
+                context.registerDeferredCleanup(async () => {
+                    await context.page
+                        .close()
+                        .catch((error) => this.log.debug('Error while closing page', { error }));
+                });
+            },
+        });
     }
     async containsSelectors(page, selectors) {
         const foundSelectors = (await Promise.all(selectors.map((selector) => page.$(selector))))
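
`buildContextPipeline` shows the new composition model: `ContextPipeline.create()` starts a pipeline, each `compose` step's `action` returns properties to merge into the crawling context, and `cleanup` can register deferred teardown (closing the page above). A toy pipeline in the same style; the step contract is inferred from this diff rather than from documentation:

import { ContextPipeline } from '@crawlee/basic';

const pipeline = ContextPipeline.create()
    .compose({
        // Contributes a `startedAt` property to the context.
        action: async () => ({ startedAt: Date.now() }),
    })
    .compose({
        // Later steps can read what earlier steps contributed.
        action: async (context: { startedAt: number }) => ({
            elapsedMs: () => Date.now() - context.startedAt,
        }),
    });
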
@@ -161,23 +160,17 @@ export class BrowserCrawler extends BasicCrawler {
             return `Received blocked status code: ${blockedStatusCode}`;
         return false;
     }
-
-     * Wrapper around requestHandler that opens and closes pages etc.
-     */
-    async _runRequestHandler(crawlingContext) {
+    async preparePage(crawlingContext) {
         const newPageOptions = {
             id: crawlingContext.id,
         };
         const useIncognitoPages = this.launchContext?.useIncognitoPages;
-        if (
-            const
-            const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, {
-                request: crawlingContext.request,
-            });
+        if (crawlingContext.session?.proxyInfo) {
+            const proxyInfo = crawlingContext.session.proxyInfo;
             crawlingContext.proxyInfo = proxyInfo;
             newPageOptions.proxyUrl = proxyInfo?.url;
             newPageOptions.proxyTier = proxyInfo?.proxyTier;
-            if (
+            if (proxyInfo?.ignoreTlsErrors) {
                 /**
                  * @see https://playwright.dev/docs/api/class-browser/#browser-new-context
                  * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
@@ -190,76 +183,49 @@ export class BrowserCrawler extends BasicCrawler {
         }
         const page = (await this.browserPool.newPage(newPageOptions));
         tryCancel();
-        this._enhanceCrawlingContextWithPageInfo(crawlingContext, page, useIncognitoPages);
-        // DO NOT MOVE THIS LINE ABOVE!
-        // `enhanceCrawlingContextWithPageInfo` gives us a valid session.
-        // For example, `sessionPoolOptions.sessionOptions.maxUsageCount` can be `1`.
-        // So we must not save the session prior to making sure it was used only once, otherwise we would use it twice.
-        const { request, session } = crawlingContext;
-        if (!request.skipNavigation) {
-            await this._handleNavigation(crawlingContext);
-            tryCancel();
-            await this._responseHandler(crawlingContext);
-            tryCancel();
-            // save cookies
-            // TODO: Should we save the cookies also after/only the handle page?
-            if (this.persistCookiesPerSession) {
-                const cookies = await crawlingContext.browserController.getCookies(page);
-                tryCancel();
-                session?.setCookies(cookies, request.loadedUrl);
-            }
-        }
-        if (!this.requestMatchesEnqueueStrategy(request)) {
-            this.log.debug(
-            // eslint-disable-next-line dot-notation
-            `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
-            request.noRetry = true;
-            request.state = RequestState.SKIPPED;
-            return;
-        }
-        if (this.retryOnBlocked) {
-            const error = await this.isRequestBlocked(crawlingContext);
-            if (error)
-                throw new SessionError(error);
-        }
-        request.state = RequestState.REQUEST_HANDLER;
-        try {
-            await addTimeoutToPromise(async () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), this.requestHandlerTimeoutInnerMillis, `requestHandler timed out after ${this.requestHandlerTimeoutInnerMillis / 1000} seconds.`);
-            request.state = RequestState.DONE;
-        }
-        catch (e) {
-            request.state = RequestState.ERROR;
-            throw e;
-        }
-        tryCancel();
-    }
-    _enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession) {
-        crawlingContext.page = page;
-        // This switch is because the crawlingContexts are created on per request basis.
-        // However, we need to add the proxy info and session from browser, which is created based on the browser-pool configuration.
-        // We would not have to do this switch if the proxy and configuration worked as in CheerioCrawler,
-        // which configures proxy and session for every new request
         const browserControllerInstance = this.browserPool.getBrowserControllerByPage(page);
-
-
-        crawlingContext.session
-
-
-
-
-
-
-
-
-
-
-
-
-
+        const contextEnqueueLinks = crawlingContext.enqueueLinks;
+        const session = useIncognitoPages
+            ? crawlingContext.session
+            : browserControllerInstance.launchContext.session;
+        return {
+            page,
+            get response() {
+                throw new Error("The `response` property is not available. This might mean that you're trying to access it before navigation or that navigation resulted in `null` (this should only happen with `about:` URLs)");
+            },
+            browserController: browserControllerInstance,
+            session,
+            proxyInfo: session?.proxyInfo,
+            enqueueLinks: async (enqueueOptions = {}) => {
+                return (await browserCrawlerEnqueueLinks({
+                    options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) },
+                    page,
+                    requestQueue: await this.getRequestQueue(),
+                    robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+                    onSkippedRequest: this.handleSkippedRequest,
+                    originalRequestUrl: crawlingContext.request.url,
+                    finalRequestUrl: crawlingContext.request.loadedUrl,
+                    enqueueLinks: contextEnqueueLinks,
+                })); // TODO make this type safe
+            },
         };
     }
-    async
+    async performNavigation(crawlingContext) {
+        if (crawlingContext.request.skipNavigation) {
+            return {
+                request: new Proxy(crawlingContext.request, {
+                    get(target, propertyName, receiver) {
+                        if (propertyName === 'loadedUrl') {
+                            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
+                        }
+                        return Reflect.get(target, propertyName, receiver);
+                    },
+                }),
+                get response() {
+                    throw new Error('The `response` property is not available - `skipNavigation` was used');
+                },
+            };
+        }
         const gotoOptions = { timeout: this.navigationTimeoutMillis };
         const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
         crawlingContext.request.state = RequestState.BEFORE_NAV;
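
Note the guard in `performNavigation`: with `skipNavigation`, the request is wrapped in a `Proxy` so that reading `loadedUrl` (which only exists after navigation) throws instead of silently yielding `undefined`. The trap pattern in isolation:

const request = { url: 'https://example.com', loadedUrl: undefined };

const guarded = new Proxy(request, {
    get(target, propertyName, receiver) {
        if (propertyName === 'loadedUrl') {
            throw new Error('The `request.loadedUrl` property is not available - `skipNavigation` was used');
        }
        // Every other property behaves normally.
        return Reflect.get(target, propertyName, receiver);
    },
});

console.log(guarded.url); // 'https://example.com'
// guarded.loadedUrl;     // throws
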
@@ -267,8 +233,9 @@ export class BrowserCrawler extends BasicCrawler {
         tryCancel();
         const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
         await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
+        let response;
         try {
-
+            response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
         }
         catch (error) {
             await this._handleNavigationTimeout(crawlingContext, error);
@@ -279,6 +246,36 @@ export class BrowserCrawler extends BasicCrawler {
         tryCancel();
         crawlingContext.request.state = RequestState.AFTER_NAV;
         await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);
+        await this.processResponse(response, crawlingContext);
+        tryCancel();
+        // save cookies
+        // TODO: Should we save the cookies also after/only the handle page?
+        if (this.persistCookiesPerSession) {
+            const cookies = await crawlingContext.browserController.getCookies(crawlingContext.page);
+            tryCancel();
+            crawlingContext.session?.setCookies(cookies, crawlingContext.request.loadedUrl);
+        }
+        if (response !== undefined) {
+            return {
+                request: crawlingContext.request,
+                response,
+            };
+        }
+        return {
+            request: crawlingContext.request,
+        };
+    }
+    async handleBlockedRequestByContent(crawlingContext) {
+        if (this.retryOnBlocked) {
+            const error = await this.isRequestBlocked(crawlingContext);
+            if (error)
+                throw new SessionError(error);
+        }
+        return {};
+    }
+    async restoreRequestState(crawlingContext) {
+        crawlingContext.request.state = RequestState.REQUEST_HANDLER;
+        return {};
     }
     async _applyCookies({ session, request, page, browserController }, preHooksCookies, postHooksCookies) {
         const sessionCookie = session?.getCookies(request.url) ?? [];
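
`handleBlockedRequestByContent` is now a standalone pipeline step: with `retryOnBlocked` enabled, a detected block throws `SessionError`, which retires the session and retries the request. From the caller's side this remains a single flag (sketch; `PlaywrightCrawler` assumed as the concrete subclass):

import { PlaywrightCrawler } from '@crawlee/playwright';

const crawler = new PlaywrightCrawler({
    // Blocked status codes or CAPTCHA selectors raise SessionError,
    // so the request is retried with a fresh session.
    retryOnBlocked: true,
    async requestHandler({ request }) {
        console.log(`Not blocked: ${request.loadedUrl}`);
    },
});
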
@@ -306,11 +303,8 @@ export class BrowserCrawler extends BasicCrawler {
             throw new SessionError(this._getMessageFromError(error));
         }
     }
-
-
-     */
-    async _responseHandler(crawlingContext) {
-        const { response, session, request, page } = crawlingContext;
+    async processResponse(response, crawlingContext) {
+        const { session, request, page } = crawlingContext;
         if (typeof response === 'object' && typeof response.status === 'function') {
             const status = response.status();
             this.stats.registerStatusCode(status);
@@ -328,16 +322,18 @@ export class BrowserCrawler extends BasicCrawler {
     async _extendLaunchContext(_pageId, launchContext) {
         const launchContextExtends = {};
         if (this.sessionPool) {
-            launchContextExtends.session = await this.sessionPool.
-
-
-
-            proxyTier: launchContext.proxyTier ?? undefined,
+            launchContextExtends.session = await this.sessionPool.newSession({
+                proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+                    // cannot pass a request here, since session is created on browser launch
+                }),
             });
+        }
+        if (!launchContext.proxyUrl && launchContextExtends.session?.proxyInfo) {
+            const proxyInfo = launchContextExtends.session.proxyInfo;
             launchContext.proxyUrl = proxyInfo?.url;
             launchContextExtends.proxyInfo = proxyInfo;
             // Disable SSL verification for MITM proxies
-            if (
+            if (proxyInfo?.ignoreTlsErrors) {
                 /**
                  * @see https://playwright.dev/docs/api/class-browser/#browser-new-context
                  * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
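
Sessions (and their proxies) are now resolved when a browser launches rather than per request, which is why `newProxyInfo` above cannot receive a request. Supplying proxies is unchanged from the caller's perspective (sketch; the concrete subclass is assumed):

import { ProxyConfiguration } from '@crawlee/core';
import { PlaywrightCrawler } from '@crawlee/playwright';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://proxy-a.example.com:8000', 'http://proxy-b.example.com:8000'],
});

const crawler = new PlaywrightCrawler({
    proxyConfiguration, // each launched browser gets a session whose proxy is chosen here
    async requestHandler({ proxyInfo }) {
        console.log(`Fetched via ${proxyInfo?.url}`);
    },
});
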
@@ -372,21 +368,33 @@ export class BrowserCrawler extends BasicCrawler {
     }
 }
 /** @internal */
-
+function containsEnqueueLinks(options) {
+    return !!options.enqueueLinks;
+}
+/** @internal */
+export async function browserCrawlerEnqueueLinks(options) {
+    const { options: enqueueLinksOptions, finalRequestUrl, originalRequestUrl, page } = options;
     const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
-        enqueueStrategy:
+        enqueueStrategy: enqueueLinksOptions?.strategy,
         finalRequestUrl,
         originalRequestUrl,
-        userProvidedBaseUrl:
+        userProvidedBaseUrl: enqueueLinksOptions?.baseUrl,
     });
-    const urls = await extractUrlsFromPage(page,
+    const urls = await extractUrlsFromPage(page, enqueueLinksOptions?.selector ?? 'a', enqueueLinksOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
+    if (containsEnqueueLinks(options)) {
+        return options.enqueueLinks({
+            urls,
+            baseUrl,
+            ...enqueueLinksOptions,
+        });
+    }
     return enqueueLinks({
-        requestQueue,
-        robotsTxtFile,
-        onSkippedRequest,
+        requestQueue: options.requestQueue,
+        robotsTxtFile: options.robotsTxtFile,
+        onSkippedRequest: options.onSkippedRequest,
         urls,
         baseUrl,
-        ...
+        ...enqueueLinksOptions,
     });
 }
 /**