@crawlee/browser 3.13.3-beta.8 → 3.13.3
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -18,15 +18,15 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
 /**
  * Function that is called to process each request.
  *
- * The function receives the {@
+ * The function receives the {@link BrowserCrawlingContext}
  * (actual context will be enhanced with the crawler specific properties) as an argument, where:
- * - {@
+ * - {@link BrowserCrawlingContext.request|`request`} is an instance of the {@link Request} object
  * with details about the URL to open, HTTP method etc;
- * - {@
+ * - {@link BrowserCrawlingContext.page|`page`} is an instance of the
  * Puppeteer [Page](https://pptr.dev/api/puppeteer.page) or
  * Playwright [Page](https://playwright.dev/docs/api/class-page);
- * - {@
- * - {@
+ * - {@link BrowserCrawlingContext.browserController|`browserController`} is an instance of the {@link BrowserController};
+ * - {@link BrowserCrawlingContext.response|`response`} is an instance of the
  * Puppeteer [Response](https://pptr.dev/api/puppeteer.httpresponse) or
  * Playwright [Response](https://playwright.dev/docs/api/class-response),
  * which is the main resource response as returned by the respective `page.goto()` function.
@@ -34,27 +34,27 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  * The function must return a promise, which is then awaited by the crawler.
  *
  * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to the {@
+ * request later, up to the {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
  * If all the retries fail, the crawler calls the function
- * provided to the {@
+ * provided to the {@link BrowserCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
  * To make this work, we should **always**
  * let our function throw exceptions rather than catch them.
  * The exceptions are logged to the request using the
- * {@
+ * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
  */
 requestHandler?: BrowserRequestHandler<LoadedContext<Context>>;
 /**
  * Function that is called to process each request.
  *
- * The function receives the {@
+ * The function receives the {@link BrowserCrawlingContext}
  * (actual context will be enhanced with the crawler specific properties) as an argument, where:
- * - {@
+ * - {@link BrowserCrawlingContext.request|`request`} is an instance of the {@link Request} object
  * with details about the URL to open, HTTP method etc;
- * - {@
+ * - {@link BrowserCrawlingContext.page|`page`} is an instance of the
  * Puppeteer [Page](https://pptr.dev/api/puppeteer.page) or
  * Playwright [Page](https://playwright.dev/docs/api/class-page);
- * - {@
- * - {@
+ * - {@link BrowserCrawlingContext.browserController|`browserController`} is an instance of the {@link BrowserController};
+ * - {@link BrowserCrawlingContext.response|`response`} is an instance of the
  * Puppeteer [Response](https://pptr.dev/api/puppeteer.httpresponse) or
  * Playwright [Response](https://playwright.dev/docs/api/class-response),
  * which is the main resource response as returned by the respective `page.goto()` function.
@@ -62,13 +62,13 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  * The function must return a promise, which is then awaited by the crawler.
  *
  * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to the {@
+ * request later, up to the {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
  * If all the retries fail, the crawler calls the function
- * provided to the {@
+ * provided to the {@link BrowserCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
  * To make this work, we should **always**
  * let our function throw exceptions rather than catch them.
  * The exceptions are logged to the request using the
- * {@
+ * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
  *
  * @deprecated `handlePageFunction` has been renamed to `requestHandler` and will be removed in a future version.
  * @ignore
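The JSDoc restored above spells out the `requestHandler` contract: the handler receives the crawling context (`request`, `page`, `response`, `browserController`), its returned promise is awaited, and thrown errors trigger retries that are recorded via `Request.pushErrorMessage()`. A minimal sketch of such a handler, using the `PlaywrightCrawler` subclass from the `crawlee` metapackage since `BrowserCrawler` itself is abstract (the URL and pushed fields are illustrative):

```ts
import { Dataset, PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    maxRequestRetries: 3,
    // The returned promise is awaited by the crawler; a thrown error schedules
    // a retry and is recorded on the request via Request.pushErrorMessage().
    async requestHandler({ request, page, response, browserController, log }) {
        // `response` is the main navigation response returned by page.goto().
        log.info(`Processing ${request.url} (HTTP ${response?.status()})`);
        // `page` is a Playwright Page here (a Puppeteer Page in PuppeteerCrawler),
        // and `browserController` manages the browser instance that owns it.
        const title = await page.title();
        await Dataset.pushData({ url: request.url, title });
    },
});

await crawler.run(['https://crawlee.dev']);
```

Any `BrowserCrawler` subclass (`PuppeteerCrawler`, `PlaywrightCrawler`) accepts the same options.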
@@ -76,11 +76,11 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
 handlePageFunction?: BrowserRequestHandler<LoadedContext<Context>>;
 /**
  * User-provided function that allows modifying the request object before it gets retried by the crawler.
- * It's executed before each retry for the requests that failed less than {@
+ * It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
  *
- * The function receives the {@
+ * The function receives the {@link BrowserCrawlingContext}
  * (actual context will be enhanced with the crawler specific properties) as the first argument,
- * where the {@
+ * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the request to be retried.
  * Second argument is the `Error` instance that
  * represents the last error thrown during processing of the request.
  */
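This block documents the hook that can mutate a request before its next retry; in current Crawlee that hook is exposed as the `errorHandler` option (the property declaration itself falls outside the hunk, so treat the name as an assumption). A sketch:

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    maxRequestRetries: 3,
    // Runs before each retry of a request that has failed fewer than
    // `maxRequestRetries` times; the request object may be modified here.
    async errorHandler({ request, session }, error) {
        request.userData.lastError = error.message;
        // Illustrative: retire the session so the retry uses a fresh one.
        session?.retire();
    },
    async requestHandler({ page }) {
        await page.waitForSelector('#content', { timeout: 5_000 });
    },
});
```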
@@ -88,9 +88,9 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
 /**
  * A function to handle requests that failed more than `option.maxRequestRetries` times.
  *
- * The function receives the {@
+ * The function receives the {@link BrowserCrawlingContext}
  * (actual context will be enhanced with the crawler specific properties) as the first argument,
- * where the {@
+ * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the failed request.
  * Second argument is the `Error` instance that
  * represents the last error thrown during processing of the request.
  */
@@ -98,9 +98,9 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
 /**
  * A function to handle requests that failed more than `option.maxRequestRetries` times.
  *
- * The function receives the {@
+ * The function receives the {@link BrowserCrawlingContext}
  * (actual context will be enhanced with the crawler specific properties) as the first argument,
- * where the {@
+ * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the failed request.
  * Second argument is the `Error` instance that
  * represents the last error thrown during processing of the request.
  *
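For requests that exhaust `maxRequestRetries`, the handler described above receives the failed request plus the last thrown error. A sketch of a `failedRequestHandler` that records the failure (the pushed field names are illustrative):

```ts
import { Dataset, PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    maxRequestRetries: 2,
    async requestHandler({ page }) {
        await page.waitForSelector('.product', { timeout: 10_000 });
    },
    // Called once a request has exhausted all retries; `error` is the last
    // error thrown while processing it.
    async failedRequestHandler({ request }, error) {
        await Dataset.pushData({
            url: request.url,
            retryCount: request.retryCount,
            errorMessages: request.errorMessages,
            lastError: error.message,
        });
    },
});
```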
@@ -109,7 +109,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  */
 handleFailedRequestFunction?: BrowserErrorHandler<Context>;
 /**
- * Custom options passed to the underlying {@
+ * Custom options passed to the underlying {@link BrowserPool} constructor.
  * We can tweak those to fine-tune browser management.
  */
 browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
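`browserPoolOptions` is forwarded to the `BrowserPool` constructor from `@crawlee/browser-pool`. A hedged sketch of typical tuning; the option names below are taken from `BrowserPoolOptions` as I understand them and should be verified against the installed version:

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    browserPoolOptions: {
        // Rotate browsers after this many pages to keep memory in check.
        retireBrowserAfterPageCount: 100,
        // Cap the number of pages a single browser may have open at once.
        maxOpenPagesPerBrowser: 20,
        // Fingerprint generation/injection (enabled by default in recent versions).
        useFingerprints: true,
    },
    async requestHandler({ page, log }) {
        log.info(await page.title());
    },
});
```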
@@ -137,7 +137,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  * ```
  *
  * Modyfing `pageOptions` is supported only in Playwright incognito.
- * See {@
+ * See {@link PrePageCreateHook}
  */
 preNavigationHooks?: BrowserHook<Context>[];
 /**
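`preNavigationHooks` run before each `page.goto()`; a hook receives the crawling context and the `gotoOptions` object it may mutate (per the restored `{@link PrePageCreateHook}` reference, changing `pageOptions` instead belongs to the browser pool's page-creation hooks). A minimal sketch with illustrative tweaks:

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    preNavigationHooks: [
        async ({ page }, gotoOptions) => {
            // Tweak the upcoming page.goto() call for this request.
            if (gotoOptions) {
                gotoOptions.timeout = 30_000;
                gotoOptions.waitUntil = 'domcontentloaded';
            }
            // Work that needs the page before navigation also belongs here.
            await page.setViewportSize({ width: 1280, height: 720 });
        },
    ],
    async requestHandler({ page, log }) {
        log.info(`Loaded ${await page.title()}`);
    },
});
```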
@@ -169,7 +169,7 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
 persistCookiesPerSession?: boolean;
 /**
  * Whether to run browser in headless mode. Defaults to `true`.
- * Can be also set via {@
+ * Can be also set via {@link Configuration}.
  */
 headless?: boolean | 'new' | 'old';
 /**
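`headless` defaults to `true` and, per the restored link, can also be driven by the global `Configuration` (or the `CRAWLEE_HEADLESS` environment variable). A sketch of both routes; treat the exact configuration key as an assumption to verify:

```ts
import { Configuration, PlaywrightCrawler } from 'crawlee';

// Per-crawler: run a visible (headful) browser for this instance only.
const crawler = new PlaywrightCrawler({
    headless: false,
    async requestHandler({ page, log }) {
        log.info(await page.title());
    },
});

// Global alternative: the shared Configuration instance (also settable
// through the CRAWLEE_HEADLESS environment variable).
Configuration.getGlobalConfig().set('headless', false);
```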
@@ -192,45 +192,45 @@ export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext =
  *
  * Since `BrowserCrawler` uses headless (or even headful) browsers to download web pages and extract data,
  * it is useful for crawling of websites that require to execute JavaScript.
- * If the target website doesn't need JavaScript, we should consider using the {@
+ * If the target website doesn't need JavaScript, we should consider using the {@link CheerioCrawler},
  * which downloads the pages using raw HTTP requests and is about 10x faster.
  *
- * The source URLs are represented by the {@
- * provided by the {@
+ * The source URLs are represented by the {@link Request} objects that are fed from the {@link RequestList} or {@link RequestQueue} instances
+ * provided by the {@link BrowserCrawlerOptions.requestList|`requestList`} or {@link BrowserCrawlerOptions.requestQueue|`requestQueue`}
  * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided,
- * the crawler will open the default request queue either when the {@
- * or if `requests` parameter (representing the initial requests) of the {@
+ * the crawler will open the default request queue either when the {@link BrowserCrawler.addRequests|`crawler.addRequests()`} function is called,
+ * or if `requests` parameter (representing the initial requests) of the {@link BrowserCrawler.run|`crawler.run()`} function is provided.
  *
- * If both {@
- * the instance first processes URLs from the {@
- * to the {@
+ * If both {@link BrowserCrawlerOptions.requestList|`requestList`} and {@link BrowserCrawlerOptions.requestQueue|`requestQueue`} options are used,
+ * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
+ * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
  *
- * The crawler finishes when there are no more {@
+ * The crawler finishes when there are no more {@link Request} objects to crawl.
  *
- * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@
- * and then calls the function provided by user as the {@
+ * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@link Request} object to crawl
+ * and then calls the function provided by user as the {@link BrowserCrawlerOptions.requestHandler|`requestHandler`} option.
  *
  * New pages are only opened when there is enough free CPU and memory available,
- * using the functionality provided by the {@
- * All {@
+ * using the functionality provided by the {@link AutoscaledPool} class.
+ * All {@link AutoscaledPool} configuration options can be passed to the {@link BrowserCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`}
  * parameter of the `BrowserCrawler` constructor.
- * For user convenience, the {@
- * {@
- * underlying {@
+ * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and
+ * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the
+ * underlying {@link AutoscaledPool} constructor are available directly in the `BrowserCrawler` constructor.
  *
- * > *NOTE:* the pool of browser instances is internally managed by the {@
+ * > *NOTE:* the pool of browser instances is internally managed by the {@link BrowserPool} class.
  *
  * @category Crawlers
  */
 export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
 readonly config: Configuration;
 /**
- * A reference to the underlying {@
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
  * Only available if used by the crawler.
  */
 proxyConfiguration?: ProxyConfiguration;
 /**
- * A reference to the underlying {@
+ * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
  */
 browserPool: BrowserPool<InternalBrowserPoolOptions>;
 launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
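The restored class-level JSDoc explains where requests come from (`requestList`/`requestQueue`, `crawler.addRequests()`, or the `requests` argument of `crawler.run()`) and that concurrency is scaled by the `AutoscaledPool`, with `minConcurrency`/`maxConcurrency` exposed directly on the constructor. A small end-to-end sketch using `PlaywrightCrawler` (an assumption, since `BrowserCrawler` is abstract; URLs are illustrative):

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    // Convenience pass-through to the underlying AutoscaledPool.
    minConcurrency: 1,
    maxConcurrency: 10,
    async requestHandler({ request, page, enqueueLinks, log }) {
        log.info(`Visited ${request.url}: ${await page.title()}`);
        // Newly discovered links land in the (implicitly opened) default RequestQueue.
        await enqueueLinks();
    },
});

// Requests can be added explicitly (this opens the default request queue)...
await crawler.addRequests(['https://crawlee.dev/docs']);
// ...and/or passed as the initial batch to run(); the crawler finishes
// once there are no more requests to process.
await crawler.run(['https://crawlee.dev']);
```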
@@ -18,33 +18,33 @@ const timeout_1 = require("@apify/timeout");
  *
  * Since `BrowserCrawler` uses headless (or even headful) browsers to download web pages and extract data,
  * it is useful for crawling of websites that require to execute JavaScript.
- * If the target website doesn't need JavaScript, we should consider using the {@
+ * If the target website doesn't need JavaScript, we should consider using the {@link CheerioCrawler},
  * which downloads the pages using raw HTTP requests and is about 10x faster.
  *
- * The source URLs are represented by the {@
- * provided by the {@
+ * The source URLs are represented by the {@link Request} objects that are fed from the {@link RequestList} or {@link RequestQueue} instances
+ * provided by the {@link BrowserCrawlerOptions.requestList|`requestList`} or {@link BrowserCrawlerOptions.requestQueue|`requestQueue`}
  * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided,
- * the crawler will open the default request queue either when the {@
- * or if `requests` parameter (representing the initial requests) of the {@
+ * the crawler will open the default request queue either when the {@link BrowserCrawler.addRequests|`crawler.addRequests()`} function is called,
+ * or if `requests` parameter (representing the initial requests) of the {@link BrowserCrawler.run|`crawler.run()`} function is provided.
  *
- * If both {@
- * the instance first processes URLs from the {@
- * to the {@
+ * If both {@link BrowserCrawlerOptions.requestList|`requestList`} and {@link BrowserCrawlerOptions.requestQueue|`requestQueue`} options are used,
+ * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
+ * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
  *
- * The crawler finishes when there are no more {@
+ * The crawler finishes when there are no more {@link Request} objects to crawl.
  *
- * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@
- * and then calls the function provided by user as the {@
+ * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@link Request} object to crawl
+ * and then calls the function provided by user as the {@link BrowserCrawlerOptions.requestHandler|`requestHandler`} option.
  *
  * New pages are only opened when there is enough free CPU and memory available,
- * using the functionality provided by the {@
- * All {@
+ * using the functionality provided by the {@link AutoscaledPool} class.
+ * All {@link AutoscaledPool} configuration options can be passed to the {@link BrowserCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`}
  * parameter of the `BrowserCrawler` constructor.
- * For user convenience, the {@
- * {@
- * underlying {@
+ * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and
+ * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the
+ * underlying {@link AutoscaledPool} constructor are available directly in the `BrowserCrawler` constructor.
  *
- * > *NOTE:* the pool of browser instances is internally managed by the {@
+ * > *NOTE:* the pool of browser instances is internally managed by the {@link BrowserPool} class.
  *
  * @category Crawlers
  */
@@ -70,7 +70,7 @@ class BrowserCrawler extends basic_1.BasicCrawler {
 value: config
 });
 /**
- * A reference to the underlying {@
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
  * Only available if used by the crawler.
  */
 Object.defineProperty(this, "proxyConfiguration", {
@@ -80,7 +80,7 @@ class BrowserCrawler extends basic_1.BasicCrawler {
 value: void 0
 });
 /**
- * A reference to the underlying {@
+ * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
  */
 Object.defineProperty(this, "browserPool", {
 enumerable: true,
@@ -11,7 +11,7 @@ export interface BrowserLaunchContext<TOptions, Launcher> extends BrowserPluginO
  */
 proxyUrl?: string;
 /**
- * If `true` and the `executablePath` option of {@
+ * If `true` and the `executablePath` option of {@link BrowserLaunchContext.launchOptions|`launchOptions`} is not set,
  * the launcher will launch full Google Chrome browser available on the machine
  * rather than the bundled Chromium. The path to Chrome executable
  * is taken from the `CRAWLEE_CHROME_EXECUTABLE_PATH` environment variable if provided,
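This last code hunk documents `BrowserLaunchContext.useChrome`: when `launchOptions.executablePath` is not set, the launcher prefers a locally installed Google Chrome, resolving its path from `CRAWLEE_CHROME_EXECUTABLE_PATH` when provided. A sketch of passing it through a crawler's `launchContext` (the extra launch argument is illustrative):

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    launchContext: {
        // Prefer a locally installed Google Chrome over the bundled Chromium.
        // The executable path comes from CRAWLEE_CHROME_EXECUTABLE_PATH when set,
        // otherwise from the usual per-platform install locations.
        useChrome: true,
        launchOptions: {
            // Setting executablePath here explicitly would take precedence.
            args: ['--disable-gpu'],
        },
    },
    async requestHandler({ page, log }) {
        log.info(await page.title());
    },
});
```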
package/package.json  (CHANGED)

@@ -1,6 +1,6 @@
 {
 "name": "@crawlee/browser",
-"version": "3.13.3-beta.8",
+"version": "3.13.3",
 "description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
 "engines": {
 "node": ">=16.0.0"
@@ -54,10 +54,10 @@
 },
 "dependencies": {
 "@apify/timeout": "^0.3.0",
-"@crawlee/basic": "3.13.3
-"@crawlee/browser-pool": "3.13.3
-"@crawlee/types": "3.13.3
-"@crawlee/utils": "3.13.3
+"@crawlee/basic": "3.13.3",
+"@crawlee/browser-pool": "3.13.3",
+"@crawlee/types": "3.13.3",
+"@crawlee/utils": "3.13.3",
 "ow": "^0.28.1",
 "tslib": "^2.4.0",
 "type-fest": "^4.0.0"
@@ -81,5 +81,5 @@
 }
 }
 },
-"gitHead": "
+"gitHead": "279cadbd3cd6342f36cc4d841e07b999e472420d"
 }