@crawlee/playwright 3.0.3-beta.9 → 3.0.4-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/index.mjs +1 -0
- package/internals/playwright-crawler.d.ts +29 -25
- package/internals/playwright-crawler.d.ts.map +1 -1
- package/internals/playwright-crawler.js +25 -19
- package/internals/playwright-crawler.js.map +1 -1
- package/internals/playwright-launcher.d.ts +2 -1
- package/internals/playwright-launcher.d.ts.map +1 -1
- package/internals/playwright-launcher.js +1 -1
- package/internals/playwright-launcher.js.map +1 -1
- package/internals/utils/playwright-utils.d.ts +59 -1
- package/internals/utils/playwright-utils.d.ts.map +1 -1
- package/internals/utils/playwright-utils.js +60 -2
- package/internals/utils/playwright-utils.js.map +1 -1
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +1 -1
package/README.md
CHANGED
|
@@ -63,7 +63,7 @@ Additionally, the package provides various helper functions to simplify running
|
|
|
63
63
|
|
|
64
64
|
## Quick Start
|
|
65
65
|
|
|
66
|
-
This short tutorial will set you up to start using Crawlee in a minute or two. If you want to learn more, proceed to the [Getting Started](https://crawlee.dev/docs/
|
|
66
|
+
This short tutorial will set you up to start using Crawlee in a minute or two. If you want to learn more, proceed to the [Getting Started](https://crawlee.dev/docs/introduction) tutorial that will take you step by step through creating your first scraper.
|
|
67
67
|
|
|
68
68
|
### Local stand-alone usage
|
|
69
69
|
|
package/index.mjs
CHANGED
|
@@ -37,6 +37,7 @@ export const REQUESTS_PERSISTENCE_KEY = mod.REQUESTS_PERSISTENCE_KEY;
|
|
|
37
37
|
export const Request = mod.Request;
|
|
38
38
|
export const RequestList = mod.RequestList;
|
|
39
39
|
export const RequestQueue = mod.RequestQueue;
|
|
40
|
+
export const RetryRequestError = mod.RetryRequestError;
|
|
40
41
|
export const Router = mod.Router;
|
|
41
42
|
export const STATE_PERSISTENCE_KEY = mod.STATE_PERSISTENCE_KEY;
|
|
42
43
|
export const STORAGE_CONSISTENCY_DELAY_MILLIS = mod.STORAGE_CONSISTENCY_DELAY_MILLIS;
|
|
@@ -18,14 +18,14 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
18
18
|
browserPlugins: [PlaywrightPlugin];
|
|
19
19
|
}> {
|
|
20
20
|
/**
|
|
21
|
-
* The same options as used by {@
|
|
21
|
+
* The same options as used by {@apilink launchPlaywright}.
|
|
22
22
|
*/
|
|
23
23
|
launchContext?: PlaywrightLaunchContext;
|
|
24
24
|
/**
|
|
25
25
|
* Function that is called to process each request.
|
|
26
26
|
*
|
|
27
|
-
* The function receives the {@
|
|
28
|
-
* - `request` is an instance of the {@
|
|
27
|
+
* The function receives the {@apilink PlaywrightCrawlingContext} as an argument, where:
|
|
28
|
+
* - `request` is an instance of the {@apilink Request} object with details about the URL to open, HTTP method etc.
|
|
29
29
|
* - `page` is an instance of the `Playwright`
|
|
30
30
|
* [`Page`](https://playwright.dev/docs/api/class-page)
|
|
31
31
|
* - `browserController` is an instance of the
|
|
@@ -43,14 +43,14 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
43
43
|
* To make this work, you should **always**
|
|
44
44
|
* let your function throw exceptions rather than catch them.
|
|
45
45
|
* The exceptions are logged to the request using the
|
|
46
|
-
* {@
|
|
46
|
+
* {@apilink Request.pushErrorMessage} function.
|
|
47
47
|
*/
|
|
48
48
|
requestHandler?: PlaywrightRequestHandler;
|
|
49
49
|
/**
|
|
50
50
|
* Function that is called to process each request.
|
|
51
51
|
*
|
|
52
|
-
* The function receives the {@
|
|
53
|
-
* - `request` is an instance of the {@
|
|
52
|
+
* The function receives the {@apilink PlaywrightCrawlingContext} as an argument, where:
|
|
53
|
+
* - `request` is an instance of the {@apilink Request} object with details about the URL to open, HTTP method etc.
|
|
54
54
|
* - `page` is an instance of the `Playwright`
|
|
55
55
|
* [`Page`](https://playwright.dev/docs/api/class-page)
|
|
56
56
|
* - `browserController` is an instance of the
|
|
@@ -68,7 +68,7 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
68
68
|
* To make this work, you should **always**
|
|
69
69
|
* let your function throw exceptions rather than catch them.
|
|
70
70
|
* The exceptions are logged to the request using the
|
|
71
|
-
* {@
|
|
71
|
+
* {@apilink Request.pushErrorMessage} function.
|
|
72
72
|
*
|
|
73
73
|
* @deprecated `handlePageFunction` has been renamed to `requestHandler` and will be removed in a future version.
|
|
74
74
|
* @ignore
|
|
@@ -113,27 +113,27 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
113
113
|
*
|
|
114
114
|
* Since `Playwright` uses headless browser to download web pages and extract data,
|
|
115
115
|
* it is useful for crawling of websites that require to execute JavaScript.
|
|
116
|
-
* If the target website doesn't need JavaScript, consider using {@
|
|
116
|
+
* If the target website doesn't need JavaScript, consider using {@apilink CheerioCrawler},
|
|
117
117
|
* which downloads the pages using raw HTTP requests and is about 10x faster.
|
|
118
118
|
*
|
|
119
|
-
* The source URLs are represented using {@
|
|
120
|
-
* {@
|
|
121
|
-
* or {@
|
|
119
|
+
* The source URLs are represented using {@apilink Request} objects that are fed from
|
|
120
|
+
* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink PlaywrightCrawlerOptions.requestList}
|
|
121
|
+
* or {@apilink PlaywrightCrawlerOptions.requestQueue} constructor options, respectively.
|
|
122
122
|
*
|
|
123
|
-
* If both {@
|
|
124
|
-
* the instance first processes URLs from the {@
|
|
125
|
-
* to {@
|
|
123
|
+
* If both {@apilink PlaywrightCrawlerOptions.requestList} and {@apilink PlaywrightCrawlerOptions.requestQueue} are used,
|
|
124
|
+
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
|
|
125
|
+
* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
126
126
|
*
|
|
127
|
-
* The crawler finishes when there are no more {@
|
|
127
|
+
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
|
|
128
128
|
*
|
|
129
|
-
* `PlaywrightCrawler` opens a new Chrome page (i.e. tab) for each {@
|
|
130
|
-
* and then calls the function provided by user as the {@
|
|
129
|
+
* `PlaywrightCrawler` opens a new Chrome page (i.e. tab) for each {@apilink Request} object to crawl
|
|
130
|
+
* and then calls the function provided by user as the {@apilink PlaywrightCrawlerOptions.requestHandler} option.
|
|
131
131
|
*
|
|
132
132
|
* New pages are only opened when there is enough free CPU and memory available,
|
|
133
|
-
* using the functionality provided by the {@
|
|
134
|
-
* All {@
|
|
133
|
+
* using the functionality provided by the {@apilink AutoscaledPool} class.
|
|
134
|
+
* All {@apilink AutoscaledPool} configuration options can be passed to the {@apilink PlaywrightCrawlerOptions.autoscaledPoolOptions}
|
|
135
135
|
* parameter of the `PlaywrightCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
136
|
-
* {@
|
|
136
|
+
* {@apilink AutoscaledPoolOptions} are available directly in the `PlaywrightCrawler` constructor.
|
|
137
137
|
*
|
|
138
138
|
* Note that the pool of Playwright instances is internally managed by the [BrowserPool](https://github.com/apify/browser-pool) class.
|
|
139
139
|
*
|
|
@@ -141,7 +141,6 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
141
141
|
*
|
|
142
142
|
* ```javascript
|
|
143
143
|
* const crawler = new PlaywrightCrawler({
|
|
144
|
-
* requestList,
|
|
145
144
|
* async requestHandler({ page, request }) {
|
|
146
145
|
* // This function is called to extract data from a single web page
|
|
147
146
|
* // 'page' is an instance of Playwright.Page with page.goto(request.url) already called
|
|
@@ -162,7 +161,10 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<Playwrig
|
|
|
162
161
|
* },
|
|
163
162
|
* });
|
|
164
163
|
*
|
|
165
|
-
* await crawler.run(
|
|
164
|
+
* await crawler.run([
|
|
165
|
+
* 'http://www.example.com/page-1',
|
|
166
|
+
* 'http://www.example.com/page-2',
|
|
167
|
+
* ]);
|
|
166
168
|
* ```
|
|
167
169
|
* @category Crawlers
|
|
168
170
|
*/
|
|
@@ -178,6 +180,7 @@ export declare class PlaywrightCrawler extends BrowserCrawler<{
|
|
|
178
180
|
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
179
181
|
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
|
|
180
182
|
launchContext: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
183
|
+
headless: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
181
184
|
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
182
185
|
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
183
186
|
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
@@ -197,6 +200,7 @@ export declare class PlaywrightCrawler extends BrowserCrawler<{
|
|
|
197
200
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
198
201
|
maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
199
202
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
203
|
+
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
200
204
|
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
201
205
|
};
|
|
202
206
|
/**
|
|
@@ -207,9 +211,9 @@ export declare class PlaywrightCrawler extends BrowserCrawler<{
|
|
|
207
211
|
protected _navigationHandler(crawlingContext: PlaywrightCrawlingContext, gotoOptions: DirectNavigationOptions): Promise<Response | null>;
|
|
208
212
|
}
|
|
209
213
|
/**
|
|
210
|
-
* Creates new {@
|
|
211
|
-
* This instance can then serve as a `requestHandler` of your {@
|
|
212
|
-
* Defaults to the {@
|
|
214
|
+
* Creates new {@apilink Router} instance that works based on request labels.
|
|
215
|
+
* This instance can then serve as a `requestHandler` of your {@apilink PlaywrightCrawler}.
|
|
216
|
+
* Defaults to the {@apilink PlaywrightCrawlingContext}.
|
|
213
217
|
*
|
|
214
218
|
* > Serves as a shortcut for using `Router.create<PlaywrightCrawlingContext>()`.
|
|
215
219
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/playwright-crawler.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAChE,OAAO,KAAK,EAAsB,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACxG,OAAO,KAAK,EAAE,qBAAqB,EAAE,sBAAsB,EAAE,qBAAqB,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC1H,OAAO,EAAE,cAAc,EAAE,aAAa,EAAU,MAAM,kBAAkB,CAAC;AACzE,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAErE,OAAO,KAAK,EAAE,uBAAuB,EAAE,sBAAsB,EAAE,MAAM,0BAA0B,CAAC;AAGhG,MAAM,WAAW,yBAAyB,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAAE,SACjF,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,oBAAoB,EAAE,QAAQ,CAAC,EAAE,sBAAsB;CAAG;AACrG,MAAM,WAAW,cAAe,SAAQ,WAAW,CAAC,yBAAyB,EAAE,qBAAqB,CAAC;CAAG;AACxG,MAAM,WAAW,wBAAyB,SAAQ,qBAAqB,CAAC,yBAAyB,CAAC;CAAG;AACrG,oBAAY,qBAAqB,GAAG,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAEhE,MAAM,WAAW,wBAAyB,SAAQ,qBAAqB,CACnE,yBAAyB,EACzB;IAAE,cAAc,EAAE,CAAC,gBAAgB,CAAC,CAAA;CAAE,CACzC;IACG;;OAEG;IACH,aAAa,CAAC,EAAE,uBAAuB,CAAC;IAExC;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,cAAc,CAAC,EAAE,wBAAwB,CAAC;IAE1C;;;;;;;;;;;;;;;;;;;;;;;;;;OA0BG;IACH,kBAAkB,CAAC,EAAE,wBAAwB,CAAC;IAE9C;;;;;;;;;;;;OAYG;IACH,kBAAkB,CAAC,EAAE,cAAc,EAAE,CAAC;IAEtC;;;;;;;;;;;;;;OAcG;IACH,mBAAmB,CAAC,EAAE,cAAc,EAAE,CAAC;CAC1C;AAED
|
|
1
|
+
{"version":3,"file":"playwright-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/playwright-crawler.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAChE,OAAO,KAAK,EAAsB,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACxG,OAAO,KAAK,EAAE,qBAAqB,EAAE,sBAAsB,EAAE,qBAAqB,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC1H,OAAO,EAAE,cAAc,EAAE,aAAa,EAAU,MAAM,kBAAkB,CAAC;AACzE,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAErE,OAAO,KAAK,EAAE,uBAAuB,EAAE,sBAAsB,EAAE,MAAM,0BAA0B,CAAC;AAGhG,MAAM,WAAW,yBAAyB,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAAE,SACjF,sBAAsB,CAAC,IAAI,EAAE,QAAQ,EAAE,oBAAoB,EAAE,QAAQ,CAAC,EAAE,sBAAsB;CAAG;AACrG,MAAM,WAAW,cAAe,SAAQ,WAAW,CAAC,yBAAyB,EAAE,qBAAqB,CAAC;CAAG;AACxG,MAAM,WAAW,wBAAyB,SAAQ,qBAAqB,CAAC,yBAAyB,CAAC;CAAG;AACrG,oBAAY,qBAAqB,GAAG,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAEhE,MAAM,WAAW,wBAAyB,SAAQ,qBAAqB,CACnE,yBAAyB,EACzB;IAAE,cAAc,EAAE,CAAC,gBAAgB,CAAC,CAAA;CAAE,CACzC;IACG;;OAEG;IACH,aAAa,CAAC,EAAE,uBAAuB,CAAC;IAExC;;;;;;;;;;;;;;;;;;;;;;;OAuBG;IACH,cAAc,CAAC,EAAE,wBAAwB,CAAC;IAE1C;;;;;;;;;;;;;;;;;;;;;;;;;;OA0BG;IACH,kBAAkB,CAAC,EAAE,wBAAwB,CAAC;IAE9C;;;;;;;;;;;;OAYG;IACH,kBAAkB,CAAC,EAAE,cAAc,EAAE,CAAC;IAEtC;;;;;;;;;;;;;;OAcG;IACH,mBAAmB,CAAC,EAAE,cAAc,EAAE,CAAC;CAC1C;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8DG;AACH,qBAAa,iBAAkB,SAAQ,cAAc,CAAC;IAAE,cAAc,EAAE,CAAC,gBAAgB,CAAC,CAAA;CAAE,EAAE,aAAa,EAAE,yBAAyB,CAAC;aAU7D,MAAM;IAT5E,iBAA0B,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAIpC;IAEF;;OAEG;gBACS,OAAO,GAAE,wBAA6B,EAAoB,MAAM,gBAAkC;cA6BrF,kBAAkB,CAAC,OAAO,EAAE,yBAAyB;cAMrD,kBAAkB,CAAC,eAAe,EAAE,yBAAyB,EAAE,WAAW,EAAE,uBAAuB;CAG/H;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,sBAAsB,CAAC,OAAO,SAAS,yBAAyB,GAAG,yBAAyB,uDAE3G"}
|
|
@@ -14,27 +14,27 @@ const playwright_utils_1 = require("./utils/playwright-utils");
|
|
|
14
14
|
*
|
|
15
15
|
* Since `Playwright` uses headless browser to download web pages and extract data,
|
|
16
16
|
* it is useful for crawling of websites that require to execute JavaScript.
|
|
17
|
-
* If the target website doesn't need JavaScript, consider using {@
|
|
17
|
+
* If the target website doesn't need JavaScript, consider using {@apilink CheerioCrawler},
|
|
18
18
|
* which downloads the pages using raw HTTP requests and is about 10x faster.
|
|
19
19
|
*
|
|
20
|
-
* The source URLs are represented using {@
|
|
21
|
-
* {@
|
|
22
|
-
* or {@
|
|
20
|
+
* The source URLs are represented using {@apilink Request} objects that are fed from
|
|
21
|
+
* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink PlaywrightCrawlerOptions.requestList}
|
|
22
|
+
* or {@apilink PlaywrightCrawlerOptions.requestQueue} constructor options, respectively.
|
|
23
23
|
*
|
|
24
|
-
* If both {@
|
|
25
|
-
* the instance first processes URLs from the {@
|
|
26
|
-
* to {@
|
|
24
|
+
* If both {@apilink PlaywrightCrawlerOptions.requestList} and {@apilink PlaywrightCrawlerOptions.requestQueue} are used,
|
|
25
|
+
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
|
|
26
|
+
* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
27
27
|
*
|
|
28
|
-
* The crawler finishes when there are no more {@
|
|
28
|
+
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
|
|
29
29
|
*
|
|
30
|
-
* `PlaywrightCrawler` opens a new Chrome page (i.e. tab) for each {@
|
|
31
|
-
* and then calls the function provided by user as the {@
|
|
30
|
+
* `PlaywrightCrawler` opens a new Chrome page (i.e. tab) for each {@apilink Request} object to crawl
|
|
31
|
+
* and then calls the function provided by user as the {@apilink PlaywrightCrawlerOptions.requestHandler} option.
|
|
32
32
|
*
|
|
33
33
|
* New pages are only opened when there is enough free CPU and memory available,
|
|
34
|
-
* using the functionality provided by the {@
|
|
35
|
-
* All {@
|
|
34
|
+
* using the functionality provided by the {@apilink AutoscaledPool} class.
|
|
35
|
+
* All {@apilink AutoscaledPool} configuration options can be passed to the {@apilink PlaywrightCrawlerOptions.autoscaledPoolOptions}
|
|
36
36
|
* parameter of the `PlaywrightCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
37
|
-
* {@
|
|
37
|
+
* {@apilink AutoscaledPoolOptions} are available directly in the `PlaywrightCrawler` constructor.
|
|
38
38
|
*
|
|
39
39
|
* Note that the pool of Playwright instances is internally managed by the [BrowserPool](https://github.com/apify/browser-pool) class.
|
|
40
40
|
*
|
|
@@ -42,7 +42,6 @@ const playwright_utils_1 = require("./utils/playwright-utils");
|
|
|
42
42
|
*
|
|
43
43
|
* ```javascript
|
|
44
44
|
* const crawler = new PlaywrightCrawler({
|
|
45
|
-
* requestList,
|
|
46
45
|
* async requestHandler({ page, request }) {
|
|
47
46
|
* // This function is called to extract data from a single web page
|
|
48
47
|
* // 'page' is an instance of Playwright.Page with page.goto(request.url) already called
|
|
@@ -63,7 +62,10 @@ const playwright_utils_1 = require("./utils/playwright-utils");
|
|
|
63
62
|
* },
|
|
64
63
|
* });
|
|
65
64
|
*
|
|
66
|
-
* await crawler.run(
|
|
65
|
+
* await crawler.run([
|
|
66
|
+
* 'http://www.example.com/page-1',
|
|
67
|
+
* 'http://www.example.com/page-2',
|
|
68
|
+
* ]);
|
|
67
69
|
* ```
|
|
68
70
|
* @category Crawlers
|
|
69
71
|
*/
|
|
@@ -73,11 +75,15 @@ class PlaywrightCrawler extends browser_1.BrowserCrawler {
|
|
|
73
75
|
*/
|
|
74
76
|
constructor(options = {}, config = browser_1.Configuration.getGlobalConfig()) {
|
|
75
77
|
(0, ow_1.default)(options, 'PlaywrightCrawlerOptions', ow_1.default.object.exactShape(PlaywrightCrawler.optionsShape));
|
|
76
|
-
const { launchContext = {}, browserPoolOptions = {}, ...browserCrawlerOptions } = options;
|
|
78
|
+
const { launchContext = {}, headless, browserPoolOptions = {}, ...browserCrawlerOptions } = options;
|
|
77
79
|
if (launchContext.proxyUrl) {
|
|
78
80
|
throw new Error('PlaywrightCrawlerOptions.launchContext.proxyUrl is not allowed in PlaywrightCrawler.'
|
|
79
81
|
+ 'Use PlaywrightCrawlerOptions.proxyConfiguration');
|
|
80
82
|
}
|
|
83
|
+
if (headless != null) {
|
|
84
|
+
launchContext.launchOptions ?? (launchContext.launchOptions = {});
|
|
85
|
+
launchContext.launchOptions.headless = headless;
|
|
86
|
+
}
|
|
81
87
|
const playwrightLauncher = new playwright_launcher_1.PlaywrightLauncher(launchContext, config);
|
|
82
88
|
browserPoolOptions.browserPlugins = [
|
|
83
89
|
playwrightLauncher.createBrowserPlugin(),
|
|
@@ -111,9 +117,9 @@ Object.defineProperty(PlaywrightCrawler, "optionsShape", {
|
|
|
111
117
|
}
|
|
112
118
|
});
|
|
113
119
|
/**
|
|
114
|
-
* Creates new {@
|
|
115
|
-
* This instance can then serve as a `requestHandler` of your {@
|
|
116
|
-
* Defaults to the {@
|
|
120
|
+
* Creates new {@apilink Router} instance that works based on request labels.
|
|
121
|
+
* This instance can then serve as a `requestHandler` of your {@apilink PlaywrightCrawler}.
|
|
122
|
+
* Defaults to the {@apilink PlaywrightCrawlingContext}.
|
|
117
123
|
*
|
|
118
124
|
* > Serves as a shortcut for using `Router.create<PlaywrightCrawlingContext>()`.
|
|
119
125
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-crawler.js","sourceRoot":"","sources":["../../src/internals/playwright-crawler.ts"],"names":[],"mappings":";;;;AAAA,oDAAoB;AAIpB,8CAAyE;AAGzE,+DAA2D;AAE3D,+DAAgF;AAyGhF
|
|
1
|
+
{"version":3,"file":"playwright-crawler.js","sourceRoot":"","sources":["../../src/internals/playwright-crawler.ts"],"names":[],"mappings":";;;;AAAA,oDAAoB;AAIpB,8CAAyE;AAGzE,+DAA2D;AAE3D,+DAAgF;AAyGhF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8DG;AACH,MAAa,iBAAkB,SAAQ,wBAAgG;IAOnI;;OAEG;IACH,YAAY,UAAoC,EAAE,EAAoB,SAAS,uBAAa,CAAC,eAAe,EAAE;QAC1G,IAAA,YAAE,EAAC,OAAO,EAAE,0BAA0B,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC,CAAC;QAE9F,MAAM,EACF,aAAa,GAAG,EAAE,EAClB,QAAQ,EACR,kBAAkB,GAAG,EAA0C,EAC/D,GAAG,qBAAqB,EAC3B,GAAG,OAAO,CAAC;QAEZ,IAAI,aAAa,CAAC,QAAQ,EAAE;YACxB,MAAM,IAAI,KAAK,CAAC,sFAAsF;kBAChG,iDAAiD,CAAC,CAAC;SAC5D;QAED,IAAI,QAAQ,IAAI,IAAI,EAAE;YAClB,aAAa,CAAC,aAAa,KAA3B,aAAa,CAAC,aAAa,GAAK,EAAmB,EAAC;YACpD,aAAa,CAAC,aAAa,CAAC,QAAQ,GAAG,QAAQ,CAAC;SACnD;QAED,MAAM,kBAAkB,GAAG,IAAI,wCAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAEzE,kBAAkB,CAAC,cAAc,GAAG;YAChC,kBAAkB,CAAC,mBAAmB,EAAE;SAC3C,CAAC;QAEF,KAAK,CAAC,EAAE,GAAG,qBAAqB,EAAE,aAAa,EAAE,kBAAkB,EAAE,EAAE,MAAM,CAAC,CAAC;;;;;mBA1Bb;;IA2BtE,CAAC;IAEkB,KAAK,CAAC,kBAAkB,CAAC,OAAkC;QAC1E,IAAA,yCAAsB,EAAC,OAAO,CAAC,CAAC;QAChC,gDAAgD;QAChD,MAAM,KAAK,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;IAEkB,KAAK,CAAC,kBAAkB,CAAC,eAA0C,EAAE,WAAoC;QACxH,OAAO,IAAA,+BAAY,EAAC,eAAe,CAAC,IAAI,EAAE,eAAe,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;IACpF,CAAC;;AA/CL,8CAgDC;AA/CG;;;;WAAyC;QACrC,GAAG,wBAAc,CAAC,YAAY;QAC9B,kBAAkB,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;QACtC,QAAQ,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;KAC/B;GAAC;AA6CN;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,SAAgB,sBAAsB;IAClC,OAAO,gBAAM,CAAC,MAAM,EAAW,CAAC;AACpC,CAAC;AAFD,wDAEC"}
|
|
@@ -53,6 +53,7 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext<LaunchOpti
|
|
|
53
53
|
/**
|
|
54
54
|
* @experimental
|
|
55
55
|
* Like `useIncognitoPages`, but for persistent contexts, so cache is used for faster loading.
|
|
56
|
+
* Works best with Firefox. Unstable on Chromium.
|
|
56
57
|
*/
|
|
57
58
|
experimentalContainers?: boolean;
|
|
58
59
|
/**
|
|
@@ -116,7 +117,7 @@ export declare class PlaywrightLauncher extends BrowserLauncher<PlaywrightPlugin
|
|
|
116
117
|
* @param [launchContext]
|
|
117
118
|
* Optional settings passed to `browserType.launch()`. In addition to
|
|
118
119
|
* [Playwright's options](https://playwright.dev/docs/api/class-browsertype?_highlight=launch#browsertypelaunchoptions)
|
|
119
|
-
* the object may contain our own {@
|
|
120
|
+
* the object may contain our own {@apilink PlaywrightLaunchContext} that enable additional features.
|
|
120
121
|
* @param [config]
|
|
121
122
|
* @returns
|
|
122
123
|
* Promise that resolves to Playwright's `Browser` instance.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-launcher.d.ts","sourceRoot":"","sources":["../../src/internals/playwright-launcher.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AAC7D,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAElE;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,WAAW,uBAAwB,SAAQ,oBAAoB,CAAC,aAAa,EAAE,WAAW,CAAC;IAC7F,4GAA4G;IAC5G,aAAa,CAAC,EAAE,aAAa,CAAC;IAE9B;;;;;OAKG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;;OAQG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;;MAIE;IACF,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B
|
|
1
|
+
{"version":3,"file":"playwright-launcher.d.ts","sourceRoot":"","sources":["../../src/internals/playwright-launcher.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AAC7D,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAElE;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,WAAW,uBAAwB,SAAQ,oBAAoB,CAAC,aAAa,EAAE,WAAW,CAAC;IAC7F,4GAA4G;IAC5G,aAAa,CAAC,EAAE,aAAa,CAAC;IAE9B;;;;;OAKG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;;OAQG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;;MAIE;IACF,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B;;;;MAIE;IACF,sBAAsB,CAAC,EAAE,OAAO,CAAC;IAEjC;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,QAAQ,CAAC,EAAE,WAAW,CAAC;CAC1B;AAED;;;GAGG;AACH,qBAAa,kBAAmB,SAAQ,eAAe,CAAC,gBAAgB,CAAC;aAW/C,MAAM;IAV5B,iBAA0B,YAAY;;;;;;;;;MAGpC;IAEF;;OAEG;gBAEC,aAAa,GAAE,uBAA4B,EACzB,MAAM,gBAAkC;CAqBjE;AA0BD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAsB,gBAAgB,CAAC,aAAa,CAAC,EAAE,uBAAuB,EAAE,MAAM,gBAAkC,GAAG,OAAO,CAAC,OAAO,CAAC,CAI1I"}
|
|
@@ -91,7 +91,7 @@ function getDefaultExecutablePath(launchContext, config) {
|
|
|
91
91
|
* @param [launchContext]
|
|
92
92
|
* Optional settings passed to `browserType.launch()`. In addition to
|
|
93
93
|
* [Playwright's options](https://playwright.dev/docs/api/class-browsertype?_highlight=launch#browsertypelaunchoptions)
|
|
94
|
-
* the object may contain our own {@
|
|
94
|
+
* the object may contain our own {@apilink PlaywrightLaunchContext} that enable additional features.
|
|
95
95
|
* @param [config]
|
|
96
96
|
* @returns
|
|
97
97
|
* Promise that resolves to Playwright's `Browser` instance.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-launcher.js","sourceRoot":"","sources":["../../src/internals/playwright-launcher.ts"],"names":[],"mappings":";;;;AAAA,oDAAoB;AAEpB,wDAAyD;AAEzD,8CAAkE;
|
|
1
|
+
{"version":3,"file":"playwright-launcher.js","sourceRoot":"","sources":["../../src/internals/playwright-launcher.ts"],"names":[],"mappings":";;;;AAAA,oDAAoB;AAEpB,wDAAyD;AAEzD,8CAAkE;AA0ElE;;;GAGG;AACH,MAAa,kBAAmB,SAAQ,yBAAiC;IAMrE;;OAEG;IACH,YACI,gBAAyC,EAAE,EACzB,SAAS,uBAAa,CAAC,eAAe,EAAE;QAE1D,IAAA,YAAE,EAAC,aAAa,EAAE,2BAA2B,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC,CAAC;QAEtG,MAAM,EACF,QAAQ,GAAG,yBAAe,CAAC,sBAAsB,CAA8B,YAAY,EAAE,+BAA+B,CAAC,CAAC,QAAQ,GACzI,GAAG,aAAa,CAAC;QAElB,MAAM,EAAE,aAAa,GAAG,EAAE,EAAE,GAAG,IAAI,EAAE,GAAG,aAAa,CAAC;QAEtD,KAAK,CAAC;YACF,GAAG,IAAI;YACP,aAAa,EAAE;gBACX,GAAG,aAAa;gBAChB,cAAc,EAAE,wBAAwB,CAAC,aAAa,EAAE,MAAM,CAAC;aAClE;YACD,QAAQ;SACX,EAAE,MAAM,CAAC,CAAC;;;;;mBAjBO;;QAmBlB,IAAI,CAAC,MAAM,GAAG,+BAAgB,CAAC;IACnC,CAAC;;AA/BL,gDAgCC;AA/BG;;;;WAAyC;QACrC,GAAG,yBAAe,CAAC,YAAY;QAC/B,QAAQ,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;KAC/B;GAAC;AA8BN;;;;GAIG;AACH,SAAS,wBAAwB,CAAC,aAAsC,EAAE,MAAqB;IAC3F,MAAM,uBAAuB,GAAG,MAAM,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,EAAE,aAAa,GAAG,EAAE,EAAE,GAAG,aAAa,CAAC;IAE7C,IAAI,aAAa,CAAC,cAAc,EAAE;QAC9B,OAAO,aAAa,CAAC,cAAc,CAAC;KACvC;IAED,IAAI,aAAa,CAAC,SAAS,EAAE;QACzB,OAAO,SAAS,CAAC;KACpB;IAED,IAAI,uBAAuB,EAAE;QACzB,OAAO,uBAAuB,CAAC;KAClC;IAED,OAAO,SAAS,CAAC;AACrB,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACI,KAAK,UAAU,gBAAgB,CAAC,aAAuC,EAAE,MAAM,GAAG,uBAAa,CAAC,eAAe,EAAE;IACpH,MAAM,kBAAkB,GAAG,IAAI,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;IAEzE,OAAO,kBAAkB,CAAC,MAAM,EAAE,CAAC;AACvC,CAAC;AAJD,4CAIC"}
|
|
@@ -30,6 +30,19 @@ export interface InjectFileOptions {
|
|
|
30
30
|
*/
|
|
31
31
|
surviveNavigations?: boolean;
|
|
32
32
|
}
|
|
33
|
+
export interface BlockRequestsOptions {
|
|
34
|
+
/**
|
|
35
|
+
* The patterns of URLs to block from being loaded by the browser.
|
|
36
|
+
* Only `*` can be used as a wildcard. It is also automatically added to the beginning
|
|
37
|
+
* and end of the pattern. This limitation is enforced by the DevTools protocol.
|
|
38
|
+
* `.png` is the same as `*.png*`.
|
|
39
|
+
*/
|
|
40
|
+
urlPatterns?: string[];
|
|
41
|
+
/**
|
|
42
|
+
* If you just want to append to the default blocked patterns, use this property.
|
|
43
|
+
*/
|
|
44
|
+
extraUrlPatterns?: string[];
|
|
45
|
+
}
|
|
33
46
|
/**
|
|
34
47
|
* Injects a JavaScript file into a Playwright page.
|
|
35
48
|
* Unlike Playwright's `addScriptTag` function, this function works on pages
|
|
@@ -102,7 +115,50 @@ export interface DirectNavigationOptions {
|
|
|
102
115
|
*/
|
|
103
116
|
export declare function gotoExtended(page: Page, request: Request, gotoOptions?: DirectNavigationOptions): Promise<Response | null>;
|
|
104
117
|
/**
|
|
105
|
-
*
|
|
118
|
+
* Forces the Playwright browser tab to block loading URLs that match a provided pattern.
|
|
119
|
+
* This is useful to speed up crawling of websites, since it reduces the amount
|
|
120
|
+
* of data that needs to be downloaded from the web, but it may break some websites
|
|
121
|
+
* or unexpectedly prevent loading of resources.
|
|
122
|
+
*
|
|
123
|
+
* By default, the function will block all URLs including the following patterns:
|
|
124
|
+
*
|
|
125
|
+
* ```json
|
|
126
|
+
* [".css", ".jpg", ".jpeg", ".png", ".svg", ".gif", ".woff", ".pdf", ".zip"]
|
|
127
|
+
* ```
|
|
128
|
+
*
|
|
129
|
+
* If you want to extend this list further, use the `extraUrlPatterns` option,
|
|
130
|
+
* which will keep blocking the default patterns, as well as add your custom ones.
|
|
131
|
+
* If you would like to block only specific patterns, use the `urlPatterns` option,
|
|
132
|
+
* which will override the defaults and block only URLs with your custom patterns.
|
|
133
|
+
*
|
|
134
|
+
* This function does not use Playwright's request interception and therefore does not interfere
|
|
135
|
+
* with browser cache. It's also faster than blocking requests using interception,
|
|
136
|
+
* because the blocking happens directly in the browser without the round-trip to Node.js,
|
|
137
|
+
* but it does not provide the extra benefits of request interception.
|
|
138
|
+
*
|
|
139
|
+
* The function will never block main document loads and their respective redirects.
|
|
140
|
+
*
|
|
141
|
+
* **Example usage**
|
|
142
|
+
* ```javascript
|
|
143
|
+
* import { launchPlaywright, playwrightUtils } from 'crawlee';
|
|
144
|
+
*
|
|
145
|
+
* const browser = await launchPlaywright();
|
|
146
|
+
* const page = await browser.newPage();
|
|
147
|
+
*
|
|
148
|
+
* // Block all requests to URLs that include `adsbygoogle.js` and also all defaults.
|
|
149
|
+
* await playwrightUtils.blockRequests(page, {
|
|
150
|
+
* extraUrlPatterns: ['adsbygoogle.js'],
|
|
151
|
+
* });
|
|
152
|
+
*
|
|
153
|
+
* await page.goto('https://cnn.com');
|
|
154
|
+
* ```
|
|
155
|
+
*
|
|
156
|
+
* @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
|
|
157
|
+
* @param [options]
|
|
158
|
+
*/
|
|
159
|
+
export declare function blockRequests(page: Page, options?: BlockRequestsOptions): Promise<void>;
|
|
160
|
+
/**
|
|
161
|
+
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@apilink CheerioCrawler}.
|
|
106
162
|
*
|
|
107
163
|
* **Example usage:**
|
|
108
164
|
* ```javascript
|
|
@@ -116,6 +172,7 @@ export declare function parseWithCheerio(page: Page): Promise<CheerioRoot>;
|
|
|
116
172
|
export interface PlaywrightContextUtils {
|
|
117
173
|
injectFile(filePath: string, options?: InjectFileOptions): Promise<unknown>;
|
|
118
174
|
injectJQuery(): Promise<unknown>;
|
|
175
|
+
blockRequests(options?: BlockRequestsOptions): Promise<void>;
|
|
119
176
|
parseWithCheerio(): Promise<CheerioRoot>;
|
|
120
177
|
}
|
|
121
178
|
export declare function registerUtilsToContext(context: PlaywrightCrawlingContext): void;
|
|
@@ -124,6 +181,7 @@ export declare const playwrightUtils: {
|
|
|
124
181
|
injectFile: typeof injectFile;
|
|
125
182
|
injectJQuery: typeof injectJQuery;
|
|
126
183
|
gotoExtended: typeof gotoExtended;
|
|
184
|
+
blockRequests: typeof blockRequests;
|
|
127
185
|
parseWithCheerio: typeof parseWithCheerio;
|
|
128
186
|
};
|
|
129
187
|
//# sourceMappingURL=playwright-utils.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-utils.d.ts","sourceRoot":"","sources":["../../../src/internals/utils/playwright-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAS,MAAM,YAAY,CAAC;AAGxD,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAE7C,OAAO,KAAK,EAAE,WAAW,EAAc,MAAM,gBAAgB,CAAC;AAE9D,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"playwright-utils.d.ts","sourceRoot":"","sources":["../../../src/internals/utils/playwright-utils.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAS,MAAM,YAAY,CAAC;AAGxD,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAE7C,OAAO,KAAK,EAAE,WAAW,EAAc,MAAM,gBAAgB,CAAC;AAE9D,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,uBAAuB,CAAC;AASvE,MAAM,WAAW,iBAAiB;IAC9B;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;CAChC;AAED,MAAM,WAAW,oBAAoB;IACjC;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IAEvB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC/B;AAOD;;;;;;;;;;GAUG;AACH,wBAAsB,UAAU,CAAC,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,GAAE,iBAAsB,GAAG,OAAO,CAAC,OAAO,CAAC,CAqBhH;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,CAGzD;AAED,MAAM,WAAW,uBAAuB;IACpC;;;;;OAKG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,kBAAkB,GAAG,MAAM,GAAG,aAAa,CAAC;IAExD;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,GAAE,uBAA4B,GAAG,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC,CA0CpI;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,EAAE,OAAO,GAAE,oBAAyB,GAAG,OAAO,CAAC,IAAI,CAAC,CAkBjG;AAED;;;;;;;;;;GAUG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAAC,WAAW,CAAC,CAIvE;AAED,MAAM,WAAW,sBAAsB;IACnC,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;IAC5E,YAAY,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IACjC,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC7D,gBAAgB,IAAI,OAAO,CAAC,WAAW,CAAC,CAAC;CAC5C;AAED,wBAAgB,sBAAsB,CAAC,OAAO,EAAE,yBAAyB,GAAG,IAAI,CAK/E;AAED,gBAAgB;AAChB,eAAO,MAAM,eAAe;;;;;;CAM3B,CAAC"}
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
* @module playwrightUtils
|
|
20
20
|
*/
|
|
21
21
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
-
exports.playwrightUtils = exports.registerUtilsToContext = exports.parseWithCheerio = exports.gotoExtended = exports.injectJQuery = exports.injectFile = void 0;
|
|
22
|
+
exports.playwrightUtils = exports.registerUtilsToContext = exports.parseWithCheerio = exports.blockRequests = exports.gotoExtended = exports.injectJQuery = exports.injectFile = void 0;
|
|
23
23
|
const tslib_1 = require("tslib");
|
|
24
24
|
const promises_1 = require("node:fs/promises");
|
|
25
25
|
const ow_1 = tslib_1.__importDefault(require("ow"));
|
|
@@ -30,6 +30,7 @@ const cheerio = tslib_1.__importStar(require("cheerio"));
|
|
|
30
30
|
const log = log_1.default.child({ prefix: 'Playwright Utils' });
|
|
31
31
|
const jqueryPath = require.resolve('jquery');
|
|
32
32
|
const MAX_INJECT_FILE_CACHE_SIZE = 10;
|
|
33
|
+
const DEFAULT_BLOCK_REQUEST_URL_PATTERNS = ['.css', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'];
|
|
33
34
|
/**
|
|
34
35
|
* Cache contents of previously injected files to limit file system access.
|
|
35
36
|
*/
|
|
@@ -149,7 +150,62 @@ async function gotoExtended(page, request, gotoOptions = {}) {
|
|
|
149
150
|
}
|
|
150
151
|
exports.gotoExtended = gotoExtended;
|
|
151
152
|
/**
|
|
152
|
-
*
|
|
153
|
+
* Forces the Playwright browser tab to block loading URLs that match a provided pattern.
|
|
154
|
+
* This is useful to speed up crawling of websites, since it reduces the amount
|
|
155
|
+
* of data that needs to be downloaded from the web, but it may break some websites
|
|
156
|
+
* or unexpectedly prevent loading of resources.
|
|
157
|
+
*
|
|
158
|
+
* By default, the function will block all URLs that include any of the following patterns:
|
|
159
|
+
*
|
|
160
|
+
* ```json
|
|
161
|
+
* [".css", ".jpg", ".jpeg", ".png", ".svg", ".gif", ".woff", ".pdf", ".zip"]
|
|
162
|
+
* ```
|
|
163
|
+
*
|
|
164
|
+
* If you want to extend this list further, use the `extraUrlPatterns` option,
|
|
165
|
+
* which will keep blocking the default patterns, as well as add your custom ones.
|
|
166
|
+
* If you would like to block only specific patterns, use the `urlPatterns` option,
|
|
167
|
+
* which will override the defaults and block only URLs with your custom patterns.
|
|
168
|
+
*
|
|
169
|
+
* This function does not use Playwright's request interception and therefore does not interfere
|
|
170
|
+
* with browser cache. It's also faster than blocking requests using interception,
|
|
171
|
+
* because the blocking happens directly in the browser without the round-trip to Node.js,
|
|
172
|
+
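* but it does not provide the extra benefits of request interception. Because it relies on a Chrome DevTools Protocol (CDP) session, it works only with Chromium-based browsers.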
|
|
173
|
+
*
|
|
174
|
+
* The function will never block main document loads and their respective redirects.
|
|
175
|
+
*
|
|
176
|
+
* **Example usage**
|
|
177
|
+
* ```javascript
|
|
178
|
+
* import { launchPlaywright, playwrightUtils } from 'crawlee';
|
|
179
|
+
*
|
|
180
|
+
* const browser = await launchPlaywright();
|
|
181
|
+
* const page = await browser.newPage();
|
|
182
|
+
*
|
|
183
|
+
* // Block all requests to URLs that include `adsbygoogle.js` and also all defaults.
|
|
184
|
+
* await playwrightUtils.blockRequests(page, {
|
|
185
|
+
* extraUrlPatterns: ['adsbygoogle.js'],
|
|
186
|
+
* });
|
|
187
|
+
*
|
|
188
|
+
* await page.goto('https://cnn.com');
|
|
189
|
+
* ```
|
|
190
|
+
*
|
|
191
|
+
* @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
|
|
192
|
+
* @param [options]
|
|
193
|
+
*/
|
|
194
|
+
async function blockRequests(page, options = {}) {
|
|
195
|
+
(0, ow_1.default)(page, ow_1.default.object.validate(core_1.validators.browserPage));
|
|
196
|
+
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
197
|
+
urlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string),
|
|
198
|
+
extraUrlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string),
|
|
199
|
+
}));
|
|
200
|
+
const { urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, extraUrlPatterns = [], } = options;
|
|
201
|
+
const patternsToBlock = [...urlPatterns, ...extraUrlPatterns];
|
|
202
|
+
const client = await page.context().newCDPSession(page);
|
|
203
|
+
await client.send('Network.enable');
|
|
204
|
+
await client.send('Network.setBlockedURLs', { urls: patternsToBlock });
|
|
205
|
+
}
|
|
206
|
+
exports.blockRequests = blockRequests;
|
|
207
|
+
/**
|
|
208
|
+
* Returns a Cheerio handle for `page.content()`, allowing you to work with the data the same way as with {@apilink CheerioCrawler}.
|
|
153
209
|
*
|
|
154
210
|
* **Example usage:**
|
|
155
211
|
* ```javascript
|
|
@@ -168,6 +224,7 @@ exports.parseWithCheerio = parseWithCheerio;
|
|
|
168
224
|
function registerUtilsToContext(context) {
|
|
169
225
|
context.injectFile = (filePath, options) => injectFile(context.page, filePath, options);
|
|
170
226
|
context.injectJQuery = () => injectJQuery(context.page);
|
|
227
|
+
context.blockRequests = (options) => blockRequests(context.page, options);
|
|
171
228
|
context.parseWithCheerio = () => parseWithCheerio(context.page);
|
|
172
229
|
}
|
|
173
230
|
exports.registerUtilsToContext = registerUtilsToContext;
|
|
@@ -176,6 +233,7 @@ exports.playwrightUtils = {
|
|
|
176
233
|
injectFile,
|
|
177
234
|
injectJQuery,
|
|
178
235
|
gotoExtended,
|
|
236
|
+
blockRequests,
|
|
179
237
|
parseWithCheerio,
|
|
180
238
|
};
|
|
181
239
|
//# sourceMappingURL=playwright-utils.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"playwright-utils.js","sourceRoot":"","sources":["../../../src/internals/utils/playwright-utils.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;GAkBG;;;;AAEH,+CAA4C;AAC5C,oDAAoB;AAEpB,0DAAiD;AACjD,6DAA8B;AAE9B,wCAA2C;AAE3C,yDAAmC;AAGnC,MAAM,GAAG,GAAG,aAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;AAE7C,MAAM,0BAA0B,GAAG,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"playwright-utils.js","sourceRoot":"","sources":["../../../src/internals/utils/playwright-utils.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;GAkBG;;;;AAEH,+CAA4C;AAC5C,oDAAoB;AAEpB,0DAAiD;AACjD,6DAA8B;AAE9B,wCAA2C;AAE3C,yDAAmC;AAGnC,MAAM,GAAG,GAAG,aAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;AAE7C,MAAM,0BAA0B,GAAG,EAAE,CAAC;AACtC,MAAM,kCAAkC,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;AA0BtH;;GAEG;AACH,MAAM,kBAAkB,GAAG,IAAI,yBAAQ,CAAC,EAAE,SAAS,EAAE,0BAA0B,EAAE,CAAC,CAAC;AAEnF;;;;;;;;;;GAUG;AACI,KAAK,UAAU,UAAU,CAAC,IAAU,EAAE,QAAgB,EAAE,UAA6B,EAAE;IAC1F,IAAA,YAAE,EAAC,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAU,CAAC,WAAW,CAAC,CAAC,CAAC;IACrD,IAAA,YAAE,EAAC,QAAQ,EAAE,YAAE,CAAC,MAAM,CAAC,CAAC;IACxB,IAAA,YAAE,EAAC,OAAO,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC;QAC7B,kBAAkB,EAAE,YAAE,CAAC,QAAQ,CAAC,OAAO;KAC1C,CAAC,CAAC,CAAC;IAEJ,IAAI,QAAQ,GAAG,kBAAkB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IAChD,IAAI,CAAC,QAAQ,EAAE;QACX,QAAQ,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC5C,kBAAkB,CAAC,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;KAC9C;IACD,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,kBAAkB,EAAE;QAC5B,IAAI,CAAC,EAAE,CAAC,gBAAgB,EACpB,GAAG,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;aACxB,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gDAAgD,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC;KACxG;IAED,OAAO,KAAK,CAAC;AACjB,CAAC;AArBD,gCAqBC;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,SAAgB,YAAY,CAAC,IAAU;IACnC,IAAA,YAAE,EAAC,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAU,CAAC,WAAW,CAAC,CAAC,CAAC;IACrD,OAAO,UAAU,CAAC,IAAI,EAAE,UAAU,EAAE,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAC;AACtE,CAAC;AAHD,oCAGC;AAyBD;;;;;;;;;;;GAWG;AACI,KAAK,UAAU,YAAY,CAAC,IAAU,EAAE,OAAgB,EAAE,cAAuC,EAAE;IACtG,IAAA,YAAE,EAAC,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAU,CAAC,WAAW,CAAC,CAAC,CAAC;IACrD,IAAA,YAAE,EAAC,OAAO,EAAE,YAAE,CAAC,MAAM,CAAC,YAAY,CAAC;QAC
/B,GAAG,EAAE,YAAE,CAAC,MAAM,CAAC,GAAG;QAClB,MAAM,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;QAC1B,OAAO,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;QAC3B,OAAO,EAAE,YAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,YAAE,CAAC,MAAM,EAAE,YAAE,CAAC,MAAM,CAAC;KACjD,CAAC,CAAC,CAAC;IACJ,IAAA,YAAE,EAAC,WAAW,EAAE,YAAE,CAAC,MAAM,CAAC,CAAC;IAE3B,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IAClD,MAAM,OAAO,GAAG,CAAC,CAAU,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC;IAElE,IAAI,MAAM,KAAK,KAAK,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE;QAClD,sDAAsD;QACtD,GAAG,CAAC,UAAU,CAAC,+GAA+G;cACxH,4DAA4D,CAAC,CAAC;QACpE,IAAI,SAAS,GAAG,KAAK,CAAC;QACtB,MAAM,uBAAuB,GAAG,KAAK,EAAE,KAAY,EAAE,EAAE;YACnD,IAAI;gBACA,oGAAoG;gBACpG,uDAAuD;gBACvD,IAAI,SAAS,EAAE;oBACX,OAAO,MAAM,KAAK,CAAC,QAAQ,EAAE,CAAC;iBACjC;gBAED,SAAS,GAAG,IAAI,CAAC;gBACjB,MAAM,SAAS,GAAe,EAAE,CAAC;gBAEjC,IAAI,MAAM,KAAK,KAAK;oBAAE,SAAS,CAAC,MAAM,GAAG,MAAM,CAAC;gBAChD,IAAI,OAAO;oBAAE,SAAS,CAAC,QAAQ,GAAG,OAAO,CAAC;gBAC1C,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC;oBAAE,SAAS,CAAC,OAAO,GAAG,OAAO,CAAC;gBACnD,MAAM,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;aACnC;YAAC,OAAO,KAAK,EAAE;gBACZ,GAAG,CAAC,KAAK,CAAC,kCAAkC,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;aAC5D;QACL,CAAC,CAAC;QAEF,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;KACrD;IAED,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;AACvC,CAAC;AA1CD,oCA0CC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AACI,KAAK,UAAU,aAAa,CAAC,IAAU,EAAE,UAAgC,EAAE;IAC9E,IAAA,YAAE,EAAC,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAU,CAAC,WAAW,CAAC,CAAC,CAAC;IACrD,IAAA,YAAE,EAAC,OAAO,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC;QAC7B,WAAW,EAAE,YAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,YAAE,CAAC,MAAM,CAAC;QAChD,gBAAgB,EAAE,YAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,YAAE,CAAC,MAAM,CAAC;KACxD,CAAC,CAAC,CAAC;IAEJ,MAAM,EACF,WAAW,GAAG,kCAAkC,EAChD,gBAAgB,GAAG,EAAE,GACxB,GAAG,OAAO,CAAC;IAEZ,MAAM,eAAe,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,gBAAgB,CAAC,CAAC;IAE9D,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;IAExD,MAAM,MAAM,CAA
C,IAAI,CAAC,gBAAgB,CAAC,CAAC;IACpC,MAAM,MAAM,CAAC,IAAI,CAAC,wBAAwB,EAAE,EAAE,IAAI,EAAE,eAAe,EAAE,CAAC,CAAC;AAC3E,CAAC;AAlBD,sCAkBC;AAED;;;;;;;;;;GAUG;AACI,KAAK,UAAU,gBAAgB,CAAC,IAAU;IAC7C,IAAA,YAAE,EAAC,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAU,CAAC,WAAW,CAAC,CAAC,CAAC;IACrD,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;IACzC,OAAO,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;AACrC,CAAC;AAJD,4CAIC;AASD,SAAgB,sBAAsB,CAAC,OAAkC;IACrE,OAAO,CAAC,UAAU,GAAG,CAAC,QAAgB,EAAE,OAA2B,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IACpH,OAAO,CAAC,YAAY,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,aAAa,GAAG,CAAC,OAA8B,EAAE,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACjG,OAAO,CAAC,gBAAgB,GAAG,GAAG,EAAE,CAAC,gBAAgB,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;AACpE,CAAC;AALD,wDAKC;AAED,gBAAgB;AACH,QAAA,eAAe,GAAG;IAC3B,UAAU;IACV,YAAY;IACZ,YAAY;IACZ,aAAa;IACb,gBAAgB;CACnB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/playwright",
|
|
3
|
-
"version": "3.0.3-beta.9",
|
|
3
|
+
"version": "3.0.4-beta.1",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -55,10 +55,10 @@
|
|
|
55
55
|
"dependencies": {
|
|
56
56
|
"@apify/log": "^2.0.0",
|
|
57
57
|
"@apify/datastructures": "^2.0.0",
|
|
58
|
-
"@crawlee/browser": "^3.0.3-beta.9",
|
|
59
|
-
"@crawlee/browser-pool": "^3.0.3-beta.9",
|
|
60
|
-
"@crawlee/core": "^3.0.3-beta.9",
|
|
61
|
-
"@crawlee/utils": "^3.0.3-beta.9",
|
|
58
|
+
"@crawlee/browser": "^3.0.4-beta.1",
|
|
59
|
+
"@crawlee/browser-pool": "^3.0.4-beta.1",
|
|
60
|
+
"@crawlee/core": "^3.0.4-beta.1",
|
|
61
|
+
"@crawlee/utils": "^3.0.4-beta.1",
|
|
62
62
|
"cheerio": "1.0.0-rc.12",
|
|
63
63
|
"jquery": "^3.6.0",
|
|
64
64
|
"ow": "^0.28.1"
|