@crawlee/browser 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/internals/browser-crawler.d.ts +81 -60
- package/internals/browser-crawler.d.ts.map +1 -1
- package/internals/browser-crawler.js +150 -197
- package/internals/browser-crawler.js.map +1 -1
- package/internals/browser-launcher.d.ts +7 -0
- package/internals/browser-launcher.d.ts.map +1 -1
- package/internals/browser-launcher.js +4 -3
- package/internals/browser-launcher.js.map +1 -1
- package/package.json +7 -7
- package/tsconfig.build.tsbuildinfo +0 -1
package/README.md
CHANGED
|
@@ -9,6 +9,10 @@
|
|
|
9
9
|
<small>A web scraping and browser automation library</small>
|
|
10
10
|
</h1>
|
|
11
11
|
|
|
12
|
+
<p align=center>
|
|
13
|
+
<a href="https://trendshift.io/repositories/5179" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5179" alt="apify%2Fcrawlee | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
14
|
+
</p>
|
|
15
|
+
|
|
12
16
|
<p align=center>
|
|
13
17
|
<a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/v/@crawlee/core.svg" alt="NPM latest version" data-canonical-src="https://img.shields.io/npm/v/@crawlee/core/next.svg" style="max-width: 100%;"></a>
|
|
14
18
|
<a href="https://www.npmjs.com/package/@crawlee/core" rel="nofollow"><img src="https://img.shields.io/npm/dm/@crawlee/core.svg" alt="Downloads" data-canonical-src="https://img.shields.io/npm/dm/@crawlee/core.svg" style="max-width: 100%;"></a>
|
|
@@ -24,7 +28,7 @@ Crawlee is available as the [`crawlee`](https://www.npmjs.com/package/crawlee) N
|
|
|
24
28
|
|
|
25
29
|
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev)** 👈
|
|
26
30
|
|
|
27
|
-
>
|
|
31
|
+
> Do you prefer 🐍 Python instead of JavaScript? [👉 Check out Crawlee for Python 👈](https://github.com/apify/crawlee-python).
|
|
28
32
|
|
|
29
33
|
## Installation
|
|
30
34
|
|
|
@@ -85,7 +89,7 @@ By default, Crawlee stores data to `./storage` in the current working directory.
|
|
|
85
89
|
We provide automated beta builds for every merged code change in Crawlee. You can find them in the npm [list of releases](https://www.npmjs.com/package/crawlee?activeTab=versions). If you want to test new features or bug fixes before we release them, feel free to install a beta build like this:
|
|
86
90
|
|
|
87
91
|
```bash
|
|
88
|
-
npm install crawlee@
|
|
92
|
+
npm install crawlee@next
|
|
89
93
|
```
|
|
90
94
|
|
|
91
95
|
If you also use the [Apify SDK](https://github.com/apify/apify-sdk-js), you need to specify dependency overrides in your `package.json` file so that you don't end up with multiple versions of Crawlee installed:
|
|
@@ -94,9 +98,9 @@ If you also use the [Apify SDK](https://github.com/apify/apify-sdk-js), you need
|
|
|
94
98
|
{
|
|
95
99
|
"overrides": {
|
|
96
100
|
"apify": {
|
|
97
|
-
"@crawlee/core": "
|
|
98
|
-
"@crawlee/types": "
|
|
99
|
-
"@crawlee/utils": "
|
|
101
|
+
"@crawlee/core": "$crawlee",
|
|
102
|
+
"@crawlee/types": "$crawlee",
|
|
103
|
+
"@crawlee/utils": "$crawlee"
|
|
100
104
|
}
|
|
101
105
|
}
|
|
102
106
|
}
|
|
@@ -1,19 +1,38 @@
|
|
|
1
|
-
import type { Awaitable, BasicCrawlerOptions, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler,
|
|
2
|
-
import { BasicCrawler,
|
|
1
|
+
import type { Awaitable, BasicCrawlerOptions, BasicCrawlingContext, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedRequest, Request, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
|
|
2
|
+
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
|
|
3
3
|
import type { BrowserController, BrowserPlugin, BrowserPoolHooks, BrowserPoolOptions, CommonPage, InferBrowserPluginArray, LaunchContext } from '@crawlee/browser-pool';
|
|
4
4
|
import { BrowserPool } from '@crawlee/browser-pool';
|
|
5
|
+
import type { BatchAddRequestsResult } from '@crawlee/types';
|
|
5
6
|
import type { RobotsTxtFile } from '@crawlee/utils';
|
|
6
7
|
import type { ReadonlyDeep } from 'type-fest';
|
|
7
8
|
import type { BrowserLaunchContext } from './browser-launcher.js';
|
|
8
|
-
|
|
9
|
+
interface BaseResponse {
|
|
10
|
+
status(): number;
|
|
11
|
+
}
|
|
12
|
+
export interface BrowserCrawlingContext<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
|
|
13
|
+
/**
|
|
14
|
+
* An instance of the {@link BrowserController} that manages the browser instance and provides access to its API.
|
|
15
|
+
*/
|
|
9
16
|
browserController: ProvidedController;
|
|
17
|
+
/**
|
|
18
|
+
* The browser page object where the web page is loaded and rendered.
|
|
19
|
+
*/
|
|
10
20
|
page: Page;
|
|
11
|
-
|
|
21
|
+
/**
|
|
22
|
+
* The request object that was successfully loaded and navigated to, including the {@link Request.loadedUrl|`loadedUrl`} property.
|
|
23
|
+
*/
|
|
24
|
+
request: LoadedRequest<Request<UserData>>;
|
|
25
|
+
/**
|
|
26
|
+
* The HTTP response object returned by the browser's navigation.
|
|
27
|
+
*/
|
|
28
|
+
response: Response;
|
|
29
|
+
/**
|
|
30
|
+
* Helper function for extracting URLs from the current page and adding them to the request queue.
|
|
31
|
+
*/
|
|
32
|
+
enqueueLinks: (options?: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>;
|
|
12
33
|
}
|
|
13
|
-
export type BrowserRequestHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = RequestHandler<Context>;
|
|
14
|
-
export type BrowserErrorHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = ErrorHandler<Context>;
|
|
15
34
|
export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable<void>;
|
|
16
|
-
export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext = BrowserCrawlingContext, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
|
|
35
|
+
export interface BrowserCrawlerOptions<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions<Context, ExtendedContext>, 'requestHandler' | 'failedRequestHandler' | 'errorHandler'> {
|
|
17
36
|
launchContext?: BrowserLaunchContext<any, any>;
|
|
18
37
|
/**
|
|
19
38
|
* Function that is called to process each request.
|
|
@@ -42,7 +61,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
42
61
|
* The exceptions are logged to the request using the
|
|
43
62
|
* {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
|
|
44
63
|
*/
|
|
45
|
-
requestHandler?:
|
|
64
|
+
requestHandler?: RequestHandler<ExtendedContext>;
|
|
46
65
|
/**
|
|
47
66
|
* User-provided function that allows modifying the request object before it gets retried by the crawler.
|
|
48
67
|
* It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
|
|
@@ -53,7 +72,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
53
72
|
* Second argument is the `Error` instance that
|
|
54
73
|
* represents the last error thrown during processing of the request.
|
|
55
74
|
*/
|
|
56
|
-
errorHandler?:
|
|
75
|
+
errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
57
76
|
/**
|
|
58
77
|
* A function to handle requests that failed more than `option.maxRequestRetries` times.
|
|
59
78
|
*
|
|
@@ -63,17 +82,12 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
63
82
|
* Second argument is the `Error` instance that
|
|
64
83
|
* represents the last error thrown during processing of the request.
|
|
65
84
|
*/
|
|
66
|
-
failedRequestHandler?:
|
|
85
|
+
failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
67
86
|
/**
|
|
68
87
|
* Custom options passed to the underlying {@link BrowserPool} constructor.
|
|
69
88
|
* We can tweak those to fine-tune browser management.
|
|
70
89
|
*/
|
|
71
90
|
browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
|
|
72
|
-
/**
|
|
73
|
-
* If set, the crawler will be configured for all connections to use
|
|
74
|
-
* the Proxy URLs provided and rotated according to the configuration.
|
|
75
|
-
*/
|
|
76
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
77
91
|
/**
|
|
78
92
|
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
79
93
|
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
|
|
@@ -119,10 +133,9 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
119
133
|
*/
|
|
120
134
|
navigationTimeoutSecs?: number;
|
|
121
135
|
/**
|
|
122
|
-
* Defines whether the cookies should be persisted for sessions.
|
|
123
|
-
* This can only be used when `useSessionPool` is set to `true`.
|
|
136
|
+
* Defines whether the cookies should be persisted for sessions. Enabled by default.
|
|
124
137
|
*/
|
|
125
|
-
|
|
138
|
+
saveResponseCookies?: boolean;
|
|
126
139
|
/**
|
|
127
140
|
* Whether to run browser in headless mode. Defaults to `true`.
|
|
128
141
|
* Can be also set via {@link Configuration}.
|
|
@@ -178,24 +191,18 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
|
|
|
178
191
|
*
|
|
179
192
|
* @category Crawlers
|
|
180
193
|
*/
|
|
181
|
-
export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
|
|
182
|
-
readonly config: Configuration;
|
|
183
|
-
/**
|
|
184
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
185
|
-
* Only available if used by the crawler.
|
|
186
|
-
*/
|
|
187
|
-
proxyConfiguration?: ProxyConfiguration;
|
|
194
|
+
export declare abstract class BrowserCrawler<Page extends CommonPage = CommonPage, Response extends BaseResponse = BaseResponse, ProvidedController extends BrowserController = BrowserController, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary> = BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
|
|
188
195
|
/**
|
|
189
196
|
* A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
|
|
190
197
|
*/
|
|
191
198
|
browserPool: BrowserPool<InternalBrowserPoolOptions>;
|
|
192
199
|
launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
|
|
193
|
-
protected
|
|
200
|
+
protected readonly ignoreShadowRoots: boolean;
|
|
201
|
+
protected readonly ignoreIframes: boolean;
|
|
194
202
|
protected navigationTimeoutMillis: number;
|
|
195
|
-
protected requestHandlerTimeoutInnerMillis: number;
|
|
196
203
|
protected preNavigationHooks: BrowserHook<Context>[];
|
|
197
204
|
protected postNavigationHooks: BrowserHook<Context>[];
|
|
198
|
-
protected
|
|
205
|
+
protected saveResponseCookies: boolean;
|
|
199
206
|
protected static optionsShape: {
|
|
200
207
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
201
208
|
navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
@@ -210,17 +217,13 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
210
217
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
211
218
|
browserPoolOptions: import("ow").ObjectPredicate<object>;
|
|
212
219
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
213
|
-
|
|
214
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
215
|
-
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
216
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
217
|
-
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
220
|
+
saveResponseCookies: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
218
221
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
219
222
|
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
220
223
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
221
|
-
|
|
224
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
222
225
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
223
|
-
|
|
226
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
224
227
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
225
228
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
226
229
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -237,24 +240,40 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
237
240
|
maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
238
241
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
239
242
|
sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
240
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
241
|
-
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
242
243
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
243
244
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
245
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
246
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
244
247
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
245
248
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
249
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
250
|
+
sessionPool: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
246
251
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
247
252
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
248
253
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
249
254
|
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
255
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
256
|
+
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
257
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
258
|
+
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
259
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
260
|
+
blockedStatusCodes: import("ow").ArrayPredicate<number>;
|
|
250
261
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
251
262
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
252
263
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
253
|
-
respectRobotsTxtFile: import("ow").
|
|
264
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
254
265
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
255
266
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
256
267
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
257
268
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
269
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
270
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
271
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
272
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
273
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
274
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
275
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
276
|
+
logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
258
277
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
259
278
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
260
279
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -263,42 +282,37 @@ export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends
|
|
|
263
282
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
264
283
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
265
284
|
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
266
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
267
|
-
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
268
285
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
269
286
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
270
287
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
271
288
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
289
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
290
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
272
291
|
};
|
|
273
292
|
/**
|
|
274
293
|
* All `BrowserCrawler` parameters are passed via an options object.
|
|
275
294
|
*/
|
|
276
|
-
protected constructor(options
|
|
277
|
-
|
|
295
|
+
protected constructor(options: BrowserCrawlerOptions<Page, Response, ProvidedController, Context, ContextExtension, ExtendedContext> & {
|
|
296
|
+
contextPipelineBuilder: () => ContextPipeline<CrawlingContext, Context>;
|
|
297
|
+
});
|
|
298
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>>;
|
|
278
299
|
private containsSelectors;
|
|
279
|
-
protected isRequestBlocked(crawlingContext:
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
protected
|
|
285
|
-
protected _handleNavigation(crawlingContext: Context): Promise<void>;
|
|
286
|
-
protected _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string): Promise<void>;
|
|
300
|
+
protected isRequestBlocked(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>): Promise<string | false>;
|
|
301
|
+
private preparePage;
|
|
302
|
+
private performNavigation;
|
|
303
|
+
private handleBlockedRequestByContent;
|
|
304
|
+
private restoreRequestState;
|
|
305
|
+
protected _applyCookies({ session, request, page, browserController }: BrowserCrawlingContext, preHooksCookies: string, postHooksCookies: string): Promise<void>;
|
|
287
306
|
/**
|
|
288
307
|
* Marks session bad in case of navigation timeout.
|
|
289
308
|
*/
|
|
290
|
-
protected _handleNavigationTimeout(crawlingContext:
|
|
309
|
+
protected _handleNavigationTimeout(crawlingContext: BrowserCrawlingContext, error: Error): Promise<void>;
|
|
291
310
|
/**
|
|
292
311
|
* Transforms proxy-related errors to `SessionError`.
|
|
293
312
|
*/
|
|
294
313
|
protected _throwIfProxyError(error: Error): void;
|
|
295
|
-
protected abstract _navigationHandler(crawlingContext:
|
|
296
|
-
|
|
297
|
-
* Should be overridden in case of different automation library that does not support this response API.
|
|
298
|
-
*/
|
|
299
|
-
protected _responseHandler(crawlingContext: Context): Promise<void>;
|
|
300
|
-
protected _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void>;
|
|
301
|
-
protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void;
|
|
314
|
+
protected abstract _navigationHandler(crawlingContext: BrowserCrawlingContext<Page, Response, ProvidedController>, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;
|
|
315
|
+
private processResponse;
|
|
302
316
|
/**
|
|
303
317
|
* Function for cleaning up after all requests are processed.
|
|
304
318
|
* @ignore
|
|
@@ -316,8 +330,15 @@ interface EnqueueLinksInternalOptions {
|
|
|
316
330
|
finalRequestUrl?: string;
|
|
317
331
|
}
|
|
318
332
|
/** @internal */
|
|
319
|
-
|
|
320
|
-
|
|
333
|
+
interface BoundEnqueueLinksInternalOptions {
|
|
334
|
+
enqueueLinks: BasicCrawlingContext['enqueueLinks'];
|
|
335
|
+
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
|
|
336
|
+
originalRequestUrl: string;
|
|
337
|
+
finalRequestUrl?: string;
|
|
338
|
+
page: CommonPage;
|
|
339
|
+
}
|
|
340
|
+
/** @internal */
|
|
341
|
+
export declare function browserCrawlerEnqueueLinks(options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions): Promise<unknown>;
|
|
321
342
|
/**
|
|
322
343
|
* Extracts URLs from a given page.
|
|
323
344
|
* @ignore
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,
|
|
1
|
+
{"version":3,"file":"browser-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/browser-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACR,SAAS,EACT,mBAAmB,EACnB,oBAAoB,EACpB,eAAe,EACf,UAAU,EACV,mBAAmB,EACnB,YAAY,EACZ,aAAa,EACb,OAAO,EACP,cAAc,EACd,eAAe,EACf,sBAAsB,EACzB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACH,YAAY,EAEZ,eAAe,EAWlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,KAAK,EACR,iBAAiB,EACjB,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,UAAU,EACV,uBAAuB,EACvB,aAAa,EAChB,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AACpD,OAAO,KAAK,EAAE,sBAAsB,EAAoC,MAAM,gBAAgB,CAAC;AAC/F,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAGpD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAI9C,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAElE,UAAU,YAAY;IAClB,MAAM,IAAI,MAAM,CAAC;CACpB;AAID,MAAM,WAAW,sBAAsB,CACnC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,GAAG,iBAAiB,EACtC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAC1C,SAAQ,eAAe,CAAC,QAAQ,CAAC;IAC/B;;OAEG;IACH,iBAAiB,EAAE,kBAAkB,CAAC;IAEtC;;OAEG;IACH,IAAI,EAAE,IAAI,CAAC;IAEX;;OAEG;IACH,OAAO,EAAE,aAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IAE1C;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;OAEG;IACH,YAAY,EAAE,CAAC,OAAO,CAAC,EAAE,mBAAmB,KAAK,OAAO,CAAC,sBAAsB,CAAC,CAAC;CACpF;AAED,MAAM,MAAM,WAAW,CAAC,OAAO,GAAG,sBAAsB,EAAE,WAAW,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,IAAI,CACjH,eAAe,EAAE,OAAO,EACxB,WAAW,EAAE,WAAW,KACvB,SAAS,CAAC,IAAI,CAAC,CAAC;AAErB,MAAM,WAAW,qBAAqB,CAClC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,gBAAgB,SAAS,aAAa,EAAE,GAAG,uBAAuB,CAAC,0BAA0B,CAAC,gBAAgB,CAAC,CAAC,EAChH,yBAAyB,SAAS,iBAAiB,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC,EAC9G,qBAAqB,SAAS,aAAa,GAAG,UAAU,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAC3G,SAAQ,IAAI,CACV,
mBAAmB,CAAC,OAAO,EAAE,eAAe,CAAC,EAE7C,gBAAgB,GAAG,sBAAsB,GAAG,cAAc,CAC7D;IACG,aAAa,CAAC,EAAE,oBAAoB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAE/C;;;;;;;;;;;;;;;;;;;;;;;;;;OA0BG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC,eAAe,CAAC,CAAC;IAEjD;;;;;;;;;OASG;IACH,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAE9D;;;;;;;;OAQG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAEtE;;;OAGG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC,GAC5C,OAAO,CAAC,gBAAgB,CAAC,yBAAyB,EAAE,qBAAqB,CAAC,CAAC,CAAC;IAEhF;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,kBAAkB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE5C;;;;;;;;;;;;;;;;OAgBG;IACH,mBAAmB,CAAC,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IAE7C;;OAEG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAE9B;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,KAAK,CAAC;IAEnC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AACH,8BAAsB,cAAc,CAChC,IAAI,SAAS,UAAU,GAAG,UAAU,EACpC,QAAQ,SAAS,YAAY,GAAG,YAAY,EAC5C,kBAAkB,SAAS,iBAAiB,GAAG,iBAAiB,EAChE,0BAA0B,SAAS,kBAAkB,GAAG,kBAAkB,EAC1E,aAAa,SAAS,UAAU,GAAG,SAAS,GAAG,UAAU,EACzD,OAAO,SAAS,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,GAAG,sBAAsB,CAC3G,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,UAAU,CACb,EACD,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,EACpC,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB,EAC5D,WAAW,SAAS,UAAU,GAAG,UAAU,CAC7C,SAAQ,YAAY,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC;IAC9D;;OAEG;IACH,WAAW,EAAE,WAAW,CAAC,0BAA0B,CAAC,CAAC;IAErD,aAAa,EAAE,oBAAoB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;IAE5D,SAAS,CAAC,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IAC9C,SAAS,CAAC,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAE1C,SAAS,CAAC,uBAAuB,EAAE,MAAM,CAAC;IAC1C,SAAS,CAAC,kBAAkB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACrD,SAAS,CAAC,mBAAmB,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;IACtD,SAAS,CAAC,mBAAmB,EAAE,OAAO,CAAC;IAEvC,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAYpC;IAEF;;OAEG;IACH,SAAS,aACL,OAAO,EAAE,qBAAqB,CAC1B,IAAI,EACJ,QAAQ,EACR,kBAAkB,EAClB,OAAO,EACP,gBAAgB,EAChB,eAAe,CAClB,G
AAG;QACA,sBAAsB,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,OAAO,CAAC,CAAC;KAC3E;cAqDc,oBAAoB,IAAI,eAAe,CACtD,eAAe,EACf,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,EAAE,UAAU,CAAC,CACzE;YA4Ba,iBAAiB;cASf,gBAAgB,CAC5B,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,GAC5E,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC;YAuBZ,WAAW;YAgDX,iBAAiB;YAqFjB,6BAA6B;YAW7B,mBAAmB;cAKjB,aAAa,CACzB,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,iBAAiB,EAAE,EAAE,sBAAsB,EACrE,eAAe,EAAE,MAAM,EACvB,gBAAgB,EAAE,MAAM;IAc5B;;OAEG;cACa,wBAAwB,CAAC,eAAe,EAAE,sBAAsB,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9G;;OAEG;IACH,SAAS,CAAC,kBAAkB,CAAC,KAAK,EAAE,KAAK;IAMzC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,CACjC,eAAe,EAAE,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,kBAAkB,CAAC,EAC3E,WAAW,EAAE,WAAW,GACzB,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,IAAI,GAAG,SAAS,CAAC;YAEpC,eAAe;IA+B7B;;;OAGG;IACY,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;CAI3C;AAED,gBAAgB;AAChB,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,IAAI,EAAE,UAAU,CAAC;IACjB,YAAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAC1C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,UAAU,gCAAgC;IACtC,YAAY,EAAE,oBAAoB,CAAC,cAAc,CAAC,CAAC;IACnD,OAAO,CAAC,EAAE,YAAY,CAAC,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC,GAAG,IAAI,CAAC,mBAAmB,EAAE,cAAc,CAAC,CAAC;IAC9G,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,IAAI,EAAE,UAAU,CAAC;CACpB;AASD,gBAAgB;AAChB,wBAAsB,0BAA0B,CAC5C,OAAO,EAAE,2BAA2B,GAAG,gCAAgC,oBAiC1E;AAED;;;GAGG;AACH,wBAAsB,mBAAmB,CAErC,IAAI,EAAE;IAAE,MAAM,EAAE,QAAQ,CAAA;CAAE,EAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,EAAE,CAAC,CA0BnB"}
|