apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
|
@@ -1,434 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.clickElements = exports.isTargetRelevant = exports.clickElementsAndInterceptNavigationRequests = exports.enqueueLinksByClickingElements = void 0;
|
|
4
|
-
const tslib_1 = require("tslib");
|
|
5
|
-
const ow_1 = (0, tslib_1.__importDefault)(require("ow"));
|
|
6
|
-
const url_1 = require("url");
|
|
7
|
-
const utils_log_1 = (0, tslib_1.__importDefault)(require("../utils_log"));
|
|
8
|
-
const puppeteer_request_interception_1 = require("../puppeteer_request_interception");
|
|
9
|
-
const shared_1 = require("./shared");
|
|
10
|
-
/* eslint-enable import/named,no-unused-vars,import/order */
|
|
11
|
-
const STARTING_Z_INDEX = 2147400000;
|
|
12
|
-
/**
|
|
13
|
-
* The function finds elements matching a specific CSS selector in a Puppeteer page,
|
|
14
|
-
* clicks all those elements using a mouse move and a left mouse button click and intercepts
|
|
15
|
-
* all the navigation requests that are subsequently produced by the page. The intercepted
|
|
16
|
-
* requests, including their methods, headers and payloads are then enqueued to a provided
|
|
17
|
-
* {@link RequestQueue}. This is useful to crawl JavaScript heavy pages where links are not available
|
|
18
|
-
* in `href` elements, but rather navigations are triggered in click handlers.
|
|
19
|
-
* If you're looking to find URLs in `href` attributes of the page, see {@link utils#enqueueLinks}.
|
|
20
|
-
*
|
|
21
|
-
* Optionally, the function allows you to filter the target links' URLs using an array of {@link PseudoUrl} objects
|
|
22
|
-
* and override settings of the enqueued {@link Request} objects.
|
|
23
|
-
*
|
|
24
|
-
* **IMPORTANT**: To be able to do this, this function uses various mutations on the page,
|
|
25
|
-
* such as changing the Z-index of elements being clicked and their visibility. Therefore,
|
|
26
|
-
* it is recommended to only use this function as the last operation in the page.
|
|
27
|
-
*
|
|
28
|
-
* **USING HEADFUL BROWSER**: When using a headful browser, this function will only be able to click elements
|
|
29
|
-
* in the focused tab, effectively limiting concurrency to 1. In headless mode, full concurrency can be achieved.
|
|
30
|
-
*
|
|
31
|
-
* **PERFORMANCE**: Clicking elements with a mouse and intercepting requests is not a low level operation
|
|
32
|
-
* that takes nanoseconds. It's not very CPU intensive, but it takes time. We strongly recommend limiting
|
|
33
|
-
* the scope of the clicking as much as possible by using a specific selector that targets only the elements
|
|
34
|
-
* that you assume or know will produce a navigation. You can certainly click everything by using
|
|
35
|
-
* the `*` selector, but be prepared to wait minutes to get results on a large and complex page.
|
|
36
|
-
*
|
|
37
|
-
* **Example usage**
|
|
38
|
-
*
|
|
39
|
-
* ```javascript
|
|
40
|
-
* await Apify.utils.puppeteer.enqueueLinksByClickingElements({
|
|
41
|
-
* page,
|
|
42
|
-
* requestQueue,
|
|
43
|
-
* selector: 'a.product-detail',
|
|
44
|
-
* pseudoUrls: [
|
|
45
|
-
* 'https://www.example.com/handbags/[.*]',
|
|
46
|
-
* 'https://www.example.com/purses/[.*]'
|
|
47
|
-
* ],
|
|
48
|
-
* });
|
|
49
|
-
* ```
|
|
50
|
-
* @param {object} options
|
|
51
|
-
* All `enqueueLinksByClickingElements()` parameters are passed
|
|
52
|
-
* via an options object with the following keys:
|
|
53
|
-
* @param {Page} options.page
|
|
54
|
-
* Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
|
|
55
|
-
* @param {RequestQueue} options.requestQueue
|
|
56
|
-
* A request queue to which the URLs will be enqueued.
|
|
57
|
-
* @param {string} options.selector
|
|
58
|
-
* A CSS selector matching elements to be clicked on. Unlike in {@link utils#enqueueLinks}, there is no default
|
|
59
|
-
* value. This is to prevent suboptimal use of this function by using it too broadly.
|
|
60
|
-
* @param {Array<(string|RegExp|Object<string, *>)>} [options.pseudoUrls]
|
|
61
|
-
* An array of {@link PseudoUrl}s matching the URLs to be enqueued,
|
|
62
|
-
* or an array of strings or RegExps or plain Objects from which the {@link PseudoUrl}s can be constructed.
|
|
63
|
-
*
|
|
64
|
-
* The plain objects must include at least the `purl` property, which holds the pseudo-URL string or RegExp.
|
|
65
|
-
* All remaining keys will be used as the `requestTemplate` argument of the {@link PseudoUrl} constructor,
|
|
66
|
-
* which lets you specify special properties for the enqueued {@link Request} objects.
|
|
67
|
-
*
|
|
68
|
-
* If `pseudoUrls` is an empty array, `null` or `undefined`, then the function
|
|
69
|
-
* enqueues all links found on the page.
|
|
70
|
-
* @param {object} [options.clickOptions]
|
|
71
|
-
* click options for use in Puppeteer's click handler
|
|
72
|
-
* @param {number} [options.clickOptions.clickCount]
|
|
73
|
-
* Number of clicks to be executed. Defaults to 1
|
|
74
|
-
* @param {number} [options.clickOptions.delay]
|
|
75
|
-
* Time to wait between mousedown and mouseup in milliseconds. Defaults to 0
|
|
76
|
-
* @param {RequestTransform} [options.transformRequestFunction]
|
|
77
|
-
* Just before a new {@link Request} is constructed and enqueued to the {@link RequestQueue}, this function can be used
|
|
78
|
-
* to remove it or modify its contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful
|
|
79
|
-
* when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
|
|
80
|
-
* or to dynamically update or create `userData`.
|
|
81
|
-
*
|
|
82
|
-
* For example: by adding `useExtendedUniqueKey: true` to the `request` object, `uniqueKey` will be computed from
|
|
83
|
-
* a combination of `url`, `method` and `payload` which enables crawling of websites that navigate using form submits
|
|
84
|
-
* (POST requests).
|
|
85
|
-
*
|
|
86
|
-
* **Example:**
|
|
87
|
-
* ```javascript
|
|
88
|
-
* {
|
|
89
|
-
* transformRequestFunction: (request) => {
|
|
90
|
-
* request.userData.foo = 'bar';
|
|
91
|
-
* request.useExtendedUniqueKey = true;
|
|
92
|
-
* return request;
|
|
93
|
-
* }
|
|
94
|
-
* }
|
|
95
|
-
* ```
|
|
96
|
-
* @param {number} [options.waitForPageIdleSecs=1]
|
|
97
|
-
* Clicking in the page triggers various asynchronous operations that lead to new URLs being shown
|
|
98
|
-
* by the browser. It could be a simple JavaScript redirect or opening of a new tab in the browser.
|
|
99
|
-
* These events often happen only some time after the actual click. Requests typically take milliseconds
|
|
100
|
-
* while new tabs open in hundreds of milliseconds.
|
|
101
|
-
*
|
|
102
|
-
* To be able to capture all those events, the `enqueueLinksByClickingElements()` function repeatedly waits
|
|
103
|
-
* for the `waitForPageIdleSecs`. By repeatedly we mean that whenever a relevant event is triggered, the timer
|
|
104
|
-
* is restarted. As long as new events keep coming, the function will not return, unless
|
|
105
|
-
* the below `maxWaitForPageIdleSecs` timeout is reached.
|
|
106
|
-
*
|
|
107
|
-
* You may want to reduce this for example when you're sure that your clicks do not open new tabs,
|
|
108
|
-
* or increase when you're not getting all the expected URLs.
|
|
109
|
-
* @param {number} [options.maxWaitForPageIdleSecs=5]
|
|
110
|
-
* This is the maximum period for which the function will keep tracking events, even if more events keep coming.
|
|
111
|
-
* Its purpose is to prevent a deadlock in the page by periodic events, often unrelated to the clicking itself.
|
|
112
|
-
* See `waitForPageIdleSecs` above for an explanation.
|
|
113
|
-
* @return {Promise<Array<QueueOperationInfo>>}
|
|
114
|
-
* Promise that resolves to an array of {@link QueueOperationInfo} objects.
|
|
115
|
-
* @memberOf puppeteer
|
|
116
|
-
* @name enqueueLinksByClickingElements
|
|
117
|
-
* @function
|
|
118
|
-
*/
|
|
119
|
-
async function enqueueLinksByClickingElements(options) {
|
|
120
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
121
|
-
page: ow_1.default.object.hasKeys('goto', 'evaluate'),
|
|
122
|
-
requestQueue: ow_1.default.object.hasKeys('fetchNextRequest', 'addRequest'),
|
|
123
|
-
clickOptions: ow_1.default.optional.object.hasKeys('clickCount', 'delay'),
|
|
124
|
-
selector: ow_1.default.string,
|
|
125
|
-
pseudoUrls: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.regExp, ow_1.default.object.hasKeys('purl'))),
|
|
126
|
-
transformRequestFunction: ow_1.default.optional.function,
|
|
127
|
-
waitForPageIdleSecs: ow_1.default.optional.number,
|
|
128
|
-
maxWaitForPageIdleSecs: ow_1.default.optional.number,
|
|
129
|
-
}));
|
|
130
|
-
const { page, requestQueue, selector, pseudoUrls, transformRequestFunction, waitForPageIdleSecs = 1, maxWaitForPageIdleSecs = 5, clickOptions, } = options;
|
|
131
|
-
const waitForPageIdleMillis = waitForPageIdleSecs * 1000;
|
|
132
|
-
const maxWaitForPageIdleMillis = maxWaitForPageIdleSecs * 1000;
|
|
133
|
-
const pseudoUrlInstances = (0, shared_1.constructPseudoUrlInstances)(pseudoUrls || []);
|
|
134
|
-
const interceptedRequests = await clickElementsAndInterceptNavigationRequests({
|
|
135
|
-
page,
|
|
136
|
-
selector,
|
|
137
|
-
waitForPageIdleMillis,
|
|
138
|
-
maxWaitForPageIdleMillis,
|
|
139
|
-
clickOptions,
|
|
140
|
-
});
|
|
141
|
-
let requestOptions = (0, shared_1.createRequestOptions)(interceptedRequests);
|
|
142
|
-
if (transformRequestFunction) {
|
|
143
|
-
requestOptions = requestOptions.map(transformRequestFunction).filter((r) => !!r);
|
|
144
|
-
}
|
|
145
|
-
const requests = (0, shared_1.createRequests)(requestOptions, pseudoUrlInstances);
|
|
146
|
-
return (0, shared_1.addRequestsToQueueInBatches)(requests, requestQueue);
|
|
147
|
-
}
|
|
148
|
-
exports.enqueueLinksByClickingElements = enqueueLinksByClickingElements;
|
|
149
|
-
/**
|
|
150
|
-
* Clicks all elements of given page matching given selector.
|
|
151
|
-
* Catches and intercepts all initiated navigation requests and opened pages.
|
|
152
|
-
* Returns a list of all target URLs.
|
|
153
|
-
*
|
|
154
|
-
* @param {object} options
|
|
155
|
-
* @param {Page} options.page
|
|
156
|
-
* @param {string} options.selector
|
|
157
|
-
* @param {number} [options.waitForPageIdleMillis]
|
|
158
|
-
* @param {number} [options.maxWaitForPageIdleMillis]
|
|
159
|
-
* @param {object} [options.clickOptions]
|
|
160
|
-
* @param {number} [options.clickOptions.clickCount]
|
|
161
|
-
* @param {number} [options.clickOptions.delay]
|
|
162
|
-
* @return {Promise<Array<*>>}
|
|
163
|
-
* @ignore
|
|
164
|
-
*/
|
|
165
|
-
async function clickElementsAndInterceptNavigationRequests(options) {
|
|
166
|
-
const { page, selector, waitForPageIdleMillis, maxWaitForPageIdleMillis, clickOptions, } = options;
|
|
167
|
-
const uniqueRequests = new Set();
|
|
168
|
-
const browser = page.browser();
|
|
169
|
-
const onInterceptedRequest = createInterceptRequestHandler(page, uniqueRequests);
|
|
170
|
-
const onTargetCreated = createTargetCreatedHandler(page, uniqueRequests);
|
|
171
|
-
const onFrameNavigated = createFrameNavigatedHandler(page, uniqueRequests);
|
|
172
|
-
await (0, puppeteer_request_interception_1.addInterceptRequestHandler)(page, onInterceptedRequest);
|
|
173
|
-
browser.on('targetcreated', onTargetCreated);
|
|
174
|
-
page.on('framenavigated', onFrameNavigated);
|
|
175
|
-
await preventHistoryNavigation(page);
|
|
176
|
-
await clickElements(page, selector, clickOptions);
|
|
177
|
-
await waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis });
|
|
178
|
-
await restoreHistoryNavigationAndSaveCapturedUrls(page, uniqueRequests);
|
|
179
|
-
browser.removeListener('targetcreated', onTargetCreated);
|
|
180
|
-
page.removeListener('framenavigated', onFrameNavigated);
|
|
181
|
-
await (0, puppeteer_request_interception_1.removeInterceptRequestHandler)(page, onInterceptedRequest);
|
|
182
|
-
const serializedRequests = Array.from(uniqueRequests);
|
|
183
|
-
return serializedRequests.map((r) => JSON.parse(r));
|
|
184
|
-
}
|
|
185
|
-
exports.clickElementsAndInterceptNavigationRequests = clickElementsAndInterceptNavigationRequests;
|
|
186
|
-
/**
|
|
187
|
-
* @param {Page} page
|
|
188
|
-
* @param {Set<*>} requests
|
|
189
|
-
* @return {Function}
|
|
190
|
-
* @ignore
|
|
191
|
-
*/
|
|
192
|
-
function createInterceptRequestHandler(page, requests) {
|
|
193
|
-
return function onInterceptedRequest(req) {
|
|
194
|
-
if (!isTopFrameNavigationRequest(page, req))
|
|
195
|
-
return req.continue();
|
|
196
|
-
const url = req.url();
|
|
197
|
-
requests.add(JSON.stringify({
|
|
198
|
-
url,
|
|
199
|
-
headers: req.headers(),
|
|
200
|
-
method: req.method(),
|
|
201
|
-
payload: req.postData(),
|
|
202
|
-
}));
|
|
203
|
-
if (req.redirectChain().length) {
|
|
204
|
-
req.respond({ body: '' }); // Prevents 301/302 redirect
|
|
205
|
-
}
|
|
206
|
-
else {
|
|
207
|
-
req.abort('aborted'); // Prevents navigation by js
|
|
208
|
-
}
|
|
209
|
-
};
|
|
210
|
-
}
|
|
211
|
-
/**
|
|
212
|
-
* @param {Page} page
|
|
213
|
-
* @param {PuppeteerRequest} req
|
|
214
|
-
* @return {boolean}
|
|
215
|
-
* @ignore
|
|
216
|
-
*/
|
|
217
|
-
function isTopFrameNavigationRequest(page, req) {
|
|
218
|
-
return req.isNavigationRequest()
|
|
219
|
-
&& req.frame() === page.mainFrame();
|
|
220
|
-
}
|
|
221
|
-
/**
|
|
222
|
-
* @param {Page} page
|
|
223
|
-
* @param {Set<*>} requests
|
|
224
|
-
* @return {Function}
|
|
225
|
-
* @ignore
|
|
226
|
-
*/
|
|
227
|
-
function createTargetCreatedHandler(page, requests) {
|
|
228
|
-
return async function onTargetCreated(target) {
|
|
229
|
-
if (!isTargetRelevant(page, target))
|
|
230
|
-
return;
|
|
231
|
-
const url = target.url();
|
|
232
|
-
requests.add(JSON.stringify({ url }));
|
|
233
|
-
// We want to close the page but don't care about
|
|
234
|
-
// possible errors like target closed.
|
|
235
|
-
try {
|
|
236
|
-
const createdPage = await target.page();
|
|
237
|
-
await createdPage.close();
|
|
238
|
-
}
|
|
239
|
-
catch (err) {
|
|
240
|
-
utils_log_1.default.debug('enqueueLinksByClickingElements: Could not close spawned page.', { error: err.stack });
|
|
241
|
-
}
|
|
242
|
-
};
|
|
243
|
-
}
|
|
244
|
-
/**
|
|
245
|
-
* We're only interested in pages created by the page we're currently clicking in.
|
|
246
|
-
* There will generally be a lot of other targets being created in the browser.
|
|
247
|
-
* @param {Page} page
|
|
248
|
-
* @param {Target} target
|
|
249
|
-
* @return {boolean}
|
|
250
|
-
*/
|
|
251
|
-
function isTargetRelevant(page, target) {
|
|
252
|
-
return target.type() === 'page'
|
|
253
|
-
&& page.target() === target.opener();
|
|
254
|
-
}
|
|
255
|
-
exports.isTargetRelevant = isTargetRelevant;
|
|
256
|
-
/**
|
|
257
|
-
* @param {Page} page
|
|
258
|
-
* @param {Set<*>} requests
|
|
259
|
-
* @return {Function}
|
|
260
|
-
* @ignore
|
|
261
|
-
*/
|
|
262
|
-
function createFrameNavigatedHandler(page, requests) {
|
|
263
|
-
return function onFrameNavigated(frame) {
|
|
264
|
-
if (frame !== page.mainFrame())
|
|
265
|
-
return;
|
|
266
|
-
const url = frame.url();
|
|
267
|
-
requests.add(JSON.stringify({ url }));
|
|
268
|
-
};
|
|
269
|
-
}
|
|
270
|
-
/**
|
|
271
|
-
* @param {Page} page
|
|
272
|
-
* @return {Promise<*>}
|
|
273
|
-
* @ignore
|
|
274
|
-
*/
|
|
275
|
-
async function preventHistoryNavigation(page) {
|
|
276
|
-
/* istanbul ignore next */
|
|
277
|
-
return page.evaluate(() => {
|
|
278
|
-
window.__originalHistory__ = window.history; // eslint-disable-line no-underscore-dangle
|
|
279
|
-
delete window.history; // Simple override does not work.
|
|
280
|
-
window.history = {
|
|
281
|
-
stateHistory: [],
|
|
282
|
-
length: 0,
|
|
283
|
-
state: {},
|
|
284
|
-
go() { },
|
|
285
|
-
back() { },
|
|
286
|
-
forward() { },
|
|
287
|
-
pushState(...args) {
|
|
288
|
-
this.stateHistory.push(args);
|
|
289
|
-
},
|
|
290
|
-
replaceState(...args) {
|
|
291
|
-
this.stateHistory.push(args);
|
|
292
|
-
},
|
|
293
|
-
};
|
|
294
|
-
});
|
|
295
|
-
}
|
|
296
|
-
/**
|
|
297
|
-
* Click all elements matching the given selector. To be able to do this using
|
|
298
|
-
* Puppeteer's `.click()` we need to make sure the elements are reachable by mouse,
|
|
299
|
-
* so we first move them to the top of the page's stacking context and then click.
|
|
300
|
-
* We do all in series to prevent elements from hiding one another. Therefore,
|
|
301
|
-
* for large element sets, this will take a considerable amount of time.
|
|
302
|
-
*
|
|
303
|
-
* @param {Page} page
|
|
304
|
-
* @param {string} selector
|
|
305
|
-
* @param {object} [clickOptions]
|
|
306
|
-
* @param {number} [clickOptions.clickCount]
|
|
307
|
-
* @param {number} [clickOptions.delay]
|
|
308
|
-
* @return {Promise<void>}
|
|
309
|
-
* @ignore
|
|
310
|
-
*/
|
|
311
|
-
async function clickElements(page, selector, clickOptions) {
|
|
312
|
-
const elementHandles = await page.$$(selector);
|
|
313
|
-
utils_log_1.default.debug(`enqueueLinksByClickingElements: There are ${elementHandles.length} elements to click.`);
|
|
314
|
-
let clickedElementsCount = 0;
|
|
315
|
-
let zIndex = STARTING_Z_INDEX;
|
|
316
|
-
let shouldLogWarning = true;
|
|
317
|
-
for (const handle of elementHandles) {
|
|
318
|
-
try {
|
|
319
|
-
await page.evaluate(updateElementCssToEnableMouseClick, handle, zIndex++);
|
|
320
|
-
await handle.click(clickOptions);
|
|
321
|
-
clickedElementsCount++;
|
|
322
|
-
}
|
|
323
|
-
catch (err) {
|
|
324
|
-
if (shouldLogWarning && err.stack.includes('is detached from document')) {
|
|
325
|
-
utils_log_1.default.warning(`An element with selector ${selector} that you're trying to click has been removed from the page. `
|
|
326
|
-
+ 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. '
|
|
327
|
-
+ 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.');
|
|
328
|
-
shouldLogWarning = false;
|
|
329
|
-
}
|
|
330
|
-
utils_log_1.default.debug('enqueueLinksByClickingElements: Click failed.', { stack: err.stack });
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
utils_log_1.default.debug(`enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`);
|
|
334
|
-
}
|
|
335
|
-
exports.clickElements = clickElements;
|
|
336
|
-
/* istanbul ignore next */
|
|
337
|
-
/**
|
|
338
|
-
* This is an in browser function!
|
|
339
|
-
* @param {Element} el
|
|
340
|
-
* @param {number} zIndex
|
|
341
|
-
*/
|
|
342
|
-
function updateElementCssToEnableMouseClick(el, zIndex) {
|
|
343
|
-
el.style.visibility = 'visible';
|
|
344
|
-
el.style.display = 'block';
|
|
345
|
-
el.style.position = 'fixed';
|
|
346
|
-
el.style.zIndex = zIndex;
|
|
347
|
-
el.style.left = 0;
|
|
348
|
-
el.style.top = 0;
|
|
349
|
-
const boundingRect = el.getBoundingClientRect();
|
|
350
|
-
if (!boundingRect.height)
|
|
351
|
-
el.style.height = '10px';
|
|
352
|
-
if (!boundingRect.width)
|
|
353
|
-
el.style.width = '10px';
|
|
354
|
-
}
|
|
355
|
-
/**
|
|
356
|
-
* This function tracks whether any requests, frame navigations or targets were emitted
|
|
357
|
-
* in the past `waitForPageIdleMillis` and whenever the interval registers no activity,
|
|
358
|
-
* the function returns.
|
|
359
|
-
*
|
|
360
|
-
* It will also return when a final timeout, represented by the `maxWaitForPageIdleMillis` option,
|
|
361
|
-
* is reached, to prevent blocking on pages with constant network activity.
|
|
362
|
-
*
|
|
363
|
-
* We need this to make sure we don't finish too soon when intercepting requests triggered
|
|
364
|
-
* by clicking in the page. They often get registered by the Node.js process only some
|
|
365
|
-
* milliseconds after clicking and we would lose those requests. This is especially prevalent
|
|
366
|
-
* when there's only a single element to click.
|
|
367
|
-
*
|
|
368
|
-
* @param {Object} options
|
|
369
|
-
* @param {Page} options.page
|
|
370
|
-
* @param {number} options.waitForPageIdleMillis
|
|
371
|
-
* @param {number} options.maxWaitForPageIdleMillis
|
|
372
|
-
* @return {Promise<void>}
|
|
373
|
-
* @ignore
|
|
374
|
-
*/
|
|
375
|
-
async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis }) {
|
|
376
|
-
return new Promise((resolve) => {
|
|
377
|
-
let timeout;
|
|
378
|
-
let maxTimeout;
|
|
379
|
-
const context = page.browserContext();
|
|
380
|
-
function newTabTracker(target) {
|
|
381
|
-
if (isTargetRelevant(page, target))
|
|
382
|
-
activityHandler();
|
|
383
|
-
}
|
|
384
|
-
function activityHandler() {
|
|
385
|
-
clearTimeout(timeout);
|
|
386
|
-
timeout = setTimeout(() => {
|
|
387
|
-
clearTimeout(maxTimeout);
|
|
388
|
-
finish();
|
|
389
|
-
}, waitForPageIdleMillis);
|
|
390
|
-
}
|
|
391
|
-
function maxTimeoutHandler() {
|
|
392
|
-
utils_log_1.default.debug(`enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. `
|
|
393
|
-
+ 'This is probably due to the website itself dispatching requests, but some links may also have been missed.');
|
|
394
|
-
finish();
|
|
395
|
-
}
|
|
396
|
-
function finish() {
|
|
397
|
-
page.removeListener('request', activityHandler);
|
|
398
|
-
page.removeListener('framenavigated', activityHandler);
|
|
399
|
-
context.removeListener('targetcreated', newTabTracker);
|
|
400
|
-
resolve();
|
|
401
|
-
}
|
|
402
|
-
maxTimeout = setTimeout(maxTimeoutHandler, maxWaitForPageIdleMillis);
|
|
403
|
-
timeout = activityHandler(); // We call this once manually in case there would be no requests at all.
|
|
404
|
-
page.on('request', activityHandler);
|
|
405
|
-
page.on('framenavigated', activityHandler);
|
|
406
|
-
context.on('targetcreated', newTabTracker);
|
|
407
|
-
});
|
|
408
|
-
}
|
|
409
|
-
/**
|
|
410
|
-
* @param {Page} page
|
|
411
|
-
* @param {Set<*>} requests
|
|
412
|
-
* @return {Promise<void>}
|
|
413
|
-
* @ignore
|
|
414
|
-
*/
|
|
415
|
-
async function restoreHistoryNavigationAndSaveCapturedUrls(page, requests) {
|
|
416
|
-
/* eslint-disable no-shadow */
|
|
417
|
-
/* istanbul ignore next */
|
|
418
|
-
const stateHistory = await page.evaluate(() => {
|
|
419
|
-
const { stateHistory } = window.history;
|
|
420
|
-
window.history = window.__originalHistory__; // eslint-disable-line no-underscore-dangle
|
|
421
|
-
return stateHistory;
|
|
422
|
-
});
|
|
423
|
-
stateHistory.forEach((args) => {
|
|
424
|
-
try {
|
|
425
|
-
const stateUrl = args[args.length - 1];
|
|
426
|
-
const url = new url_1.URL(stateUrl, page.url()).href;
|
|
427
|
-
requests.add(JSON.stringify({ url }));
|
|
428
|
-
}
|
|
429
|
-
catch (err) {
|
|
430
|
-
utils_log_1.default.debug('enqueueLinksByClickingElements: Failed to ', { error: err.stack });
|
|
431
|
-
}
|
|
432
|
-
});
|
|
433
|
-
}
|
|
434
|
-
//# sourceMappingURL=click_elements.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"click_elements.js","sourceRoot":"","sources":["../../src/enqueue_links/click_elements.js"],"names":[],"mappings":";;;;AAAA,yDAAoB;AACpB,6BAA0B;AAC1B,0EAA+B;AAE/B,sFAA8G;AAG9G,qCAMkB;AAClB,4DAA4D;AAE5D,MAAM,gBAAgB,GAAG,UAAU,CAAC;AAEpC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0GG;AACI,KAAK,UAAU,8BAA8B,CAAC,OAAO;IACxD,IAAA,YAAE,EAAC,OAAO,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC;QAC7B,IAAI,EAAE,YAAE,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,UAAU,CAAC;QAC3C,YAAY,EAAE,YAAE,CAAC,MAAM,CAAC,OAAO,CAAC,kBAAkB,EAAE,YAAY,CAAC;QACjE,YAAY,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC;QAC/D,QAAQ,EAAE,YAAE,CAAC,MAAM;QACnB,UAAU,EAAE,YAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,YAAE,CAAC,GAAG,CAAC,YAAE,CAAC,MAAM,EAAE,YAAE,CAAC,MAAM,EAAE,YAAE,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAC7F,wBAAwB,EAAE,YAAE,CAAC,QAAQ,CAAC,QAAQ;QAC9C,mBAAmB,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;QACvC,sBAAsB,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;KAC7C,CAAC,CAAC,CAAC;IAEJ,MAAM,EACF,IAAI,EACJ,YAAY,EACZ,QAAQ,EACR,UAAU,EACV,wBAAwB,EACxB,mBAAmB,GAAG,CAAC,EACvB,sBAAsB,GAAG,CAAC,EAC1B,YAAY,GACf,GAAG,OAAO,CAAC;IAEZ,MAAM,qBAAqB,GAAG,mBAAmB,GAAG,IAAI,CAAC;IACzD,MAAM,wBAAwB,GAAG,sBAAsB,GAAG,IAAI,CAAC;IAE/D,MAAM,kBAAkB,GAAG,IAAA,oCAA2B,EAAC,UAAU,IAAI,EAAE,CAAC,CAAC;IACzE,MAAM,mBAAmB,GAAG,MAAM,2CAA2C,CAAC;QAC1E,IAAI;QACJ,QAAQ;QACR,qBAAqB;QACrB,wBAAwB;QACxB,YAAY;KACf,CAAC,CAAC;IACH,IAAI,cAAc,GAAG,IAAA,6BAAoB,EAAC,mBAAmB,CAAC,CAAC;IAC/D,IAAI,wBAAwB,EAAE;QAC1B,cAAc,GAAG,cAAc,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;KACpF;IACD,MAAM,QAAQ,GAAG,IAAA,uBAAc,EAAC,cAAc,EAAE,kBAAkB,CAAC,CAAC;IACpE,OAAO,IAAA,oCAA2B,EAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;AAC/D,CAAC;AAxCD,wEAwCC;AAED;;;;;;;;;;;;;;;GAeG;AACI,KAAK,UAAU,2CAA2C,CAAC,OAAO;IACrE,MAAM,EACF,IAAI,EACJ,QAAQ,EACR,qBAAqB,EACrB,wBAAwB,EACxB,YAAY,GACf,GAAG,OAAO,CAAC;IAEZ,MAAM,cAAc,GAAG,IAAI,GAAG,EAAE,CAAC;IACjC,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;IAE/B,MAAM,oBA
AoB,GAAG,6BAA6B,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;IACjF,MAAM,eAAe,GAAG,0BAA0B,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;IACzE,MAAM,gBAAgB,GAAG,2BAA2B,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;IAE3E,MAAM,IAAA,2DAA0B,EAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAC7C,IAAI,CAAC,EAAE,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC;IAE5C,MAAM,wBAAwB,CAAC,IAAI,CAAC,CAAC;IAErC,MAAM,aAAa,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,MAAM,eAAe,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,wBAAwB,EAAE,CAAC,CAAC;IAEjF,MAAM,2CAA2C,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;IAExE,OAAO,CAAC,cAAc,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IACzD,IAAI,CAAC,cAAc,CAAC,gBAAgB,EAAE,gBAAgB,CAAC,CAAC;IACxD,MAAM,IAAA,8DAA6B,EAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;IAEhE,MAAM,kBAAkB,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IACtD,OAAO,kBAAkB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AACxD,CAAC;AAjCD,kGAiCC;AAED;;;;;GAKG;AACH,SAAS,6BAA6B,CAAC,IAAI,EAAE,QAAQ;IACjD,OAAO,SAAS,oBAAoB,CAAC,GAAG;QACpC,IAAI,CAAC,2BAA2B,CAAC,IAAI,EAAE,GAAG,CAAC;YAAE,OAAO,GAAG,CAAC,QAAQ,EAAE,CAAC;QACnE,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC;QACtB,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC;YACxB,GAAG;YACH,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE;YACtB,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE;YACpB,OAAO,EAAE,GAAG,CAAC,QAAQ,EAAE;SAC1B,CAAC,CAAC,CAAC;QAEJ,IAAI,GAAG,CAAC,aAAa,EAAE,CAAC,MAAM,EAAE;YAC5B,GAAG,CAAC,OAAO,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,4BAA4B;SAC1D;aAAM;YACH,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,4BAA4B;SACrD;IACL,CAAC,CAAC;AACN,CAAC;AAED;;;;;GAKG;AACH,SAAS,2BAA2B,CAAC,IAAI,EAAE,GAAG;IAC1C,OAAO,GAAG,CAAC,mBAAmB,EAAE;WACzB,GAAG,CAAC,KAAK,EAAE,KAAK,IAAI,CAAC,SAAS,EAAE,CAAC;AAC5C,CAAC;AAED;;;;;GAKG;AACH,SAAS,0BAA0B,CAAC,IAAI,EAAE,QAAQ;IAC9C,OAAO,KAAK,UAAU,eAAe,CAAC,MAAM;QACxC,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC;YAAE,OAAO;QAC5C,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,EAAE,CAAC;QACzB,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;QAEtC,iDAAiD;QACjD,sCAAsC;QACtC,IAAI;YACA,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;YACxC,M
AAM,WAAW,CAAC,KAAK,EAAE,CAAC;SAC7B;QAAC,OAAO,GAAG,EAAE;YACV,mBAAG,CAAC,KAAK,CAAC,+DAA+D,EAAE,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC;SACpG;IACL,CAAC,CAAC;AACN,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM;IACzC,OAAO,MAAM,CAAC,IAAI,EAAE,KAAK,MAAM;WACxB,IAAI,CAAC,MAAM,EAAE,KAAK,MAAM,CAAC,MAAM,EAAE,CAAC;AAC7C,CAAC;AAHD,4CAGC;AAED;;;;;GAKG;AACH,SAAS,2BAA2B,CAAC,IAAI,EAAE,QAAQ;IAC/C,OAAO,SAAS,gBAAgB,CAAC,KAAK;QAClC,IAAI,KAAK,KAAK,IAAI,CAAC,SAAS,EAAE;YAAE,OAAO;QACvC,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,EAAE,CAAC;QACxB,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;IAC1C,CAAC,CAAC;AACN,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,wBAAwB,CAAC,IAAI;IACxC,0BAA0B;IAC1B,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;QACtB,MAAM,CAAC,mBAAmB,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,2CAA2C;QACxF,OAAO,MAAM,CAAC,OAAO,CAAC,CAAC,iCAAiC;QACxD,MAAM,CAAC,OAAO,GAAG;YACb,YAAY,EAAE,EAAE;YAChB,MAAM,EAAE,CAAC;YACT,KAAK,EAAE,EAAE;YACT,EAAE,KAAI,CAAC;YACP,IAAI,KAAI,CAAC;YACT,OAAO,KAAI,CAAC;YACZ,SAAS,CAAC,GAAG,IAAI;gBACb,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,CAAC;YACD,YAAY,CAAC,GAAG,IAAI;gBAChB,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,CAAC;SACJ,CAAC;IACN,CAAC,CAAC,CAAC;AACP,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACI,KAAK,UAAU,aAAa,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY;IAC5D,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC;IAC/C,mBAAG,CAAC,KAAK,CAAC,6CAA6C,cAAc,CAAC,MAAM,qBAAqB,CAAC,CAAC;IACnG,IAAI,oBAAoB,GAAG,CAAC,CAAC;IAC7B,IAAI,MAAM,GAAG,gBAAgB,CAAC;IAC9B,IAAI,gBAAgB,GAAG,IAAI,CAAC;IAC5B,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE;QACjC,IAAI;YACA,MAAM,IAAI,CAAC,QAAQ,CAAC,kCAAkC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAC1E,MAAM,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;YACjC,oBAAoB,EAAE,CAAC;SAC1B;QAAC,OAAO,GAAG,EAAE;YACV,IAAI,gBAAgB,IAAI,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,2BAA2B,CAAC,EAAE;gBACrE,mBAAG,CAAC,OAAO,CAAC,4BAA4B,QAAQ,+DAA+D;sBACzG,qHAAqH;sBACrH,uHAAuH,CAAC,CAAC;gBAC/H,gBAAgB,GAAG,KAAK,CAAC;aAC5B;YACD,mBAAG,CAAC,KAAK,CAAC,+CAA+C,EAAE,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC;SACpF;KACJ;IACD,mBAAG,CAAC,
KAAK,CAAC,wDAAwD,oBAAoB,oBAAoB,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC;AACvI,CAAC;AAtBD,sCAsBC;AAED,0BAA0B;AAC1B;;;;GAIG;AACH,SAAS,kCAAkC,CAAC,EAAE,EAAE,MAAM;IAClD,EAAE,CAAC,KAAK,CAAC,UAAU,GAAG,SAAS,CAAC;IAChC,EAAE,CAAC,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;IAC3B,EAAE,CAAC,KAAK,CAAC,QAAQ,GAAG,OAAO,CAAC;IAC5B,EAAE,CAAC,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC;IACzB,EAAE,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,CAAC;IAClB,EAAE,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC;IACjB,MAAM,YAAY,GAAG,EAAE,CAAC,qBAAqB,EAAE,CAAC;IAChD,IAAI,CAAC,YAAY,CAAC,MAAM;QAAE,EAAE,CAAC,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC;IACnD,IAAI,CAAC,YAAY,CAAC,KAAK;QAAE,EAAE,CAAC,KAAK,CAAC,KAAK,GAAG,MAAM,CAAC;AACrD,CAAC;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,KAAK,UAAU,eAAe,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,wBAAwB,EAAE;IACpF,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC3B,IAAI,OAAO,CAAC;QACZ,IAAI,UAAU,CAAC;QACf,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAEtC,SAAS,aAAa,CAAC,MAAM;YACzB,IAAI,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC;gBAAE,eAAe,EAAE,CAAC;QAC1D,CAAC;QAED,SAAS,eAAe;YACpB,YAAY,CAAC,OAAO,CAAC,CAAC;YACtB,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE;gBACtB,YAAY,CAAC,UAAU,CAAC,CAAC;gBACzB,MAAM,EAAE,CAAC;YACb,CAAC,EAAE,qBAAqB,CAAC,CAAC;QAC9B,CAAC;QAED,SAAS,iBAAiB;YACtB,mBAAG,CAAC,KAAK,CAAC,oEAAoE,wBAAwB,MAAM;kBACtG,4GAA4G,CAAC,CAAC;YACpH,MAAM,EAAE,CAAC;QACb,CAAC;QAED,SAAS,MAAM;YACX,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;YAChD,IAAI,CAAC,cAAc,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;YACvD,OAAO,CAAC,cAAc,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YACvD,OAAO,EAAE,CAAC;QACd,CAAC;QAED,UAAU,GAAG,UAAU,CAAC,iBAAiB,EAAE,wBAAwB,CAAC,CAAC;QACrE,OAAO,GAAG,eAAe,EAAE,CAAC,CAAC,wEAAwE;QACrG,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;QACpC,IAAI,CAAC,EAAE,CAAC,gBAAgB,EAAE,eAAe,CAAC,CAAC;QAC3C,OAAO,CAAC,EAAE,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;AACP,CAAC;AAED;;;;;GAKG;AACH,KAAK,UAAU,2CAA2C,CAAC,IAAI,EAAE,QAAQ;IACrE,8BAA8B;IAC9B,0BAA0B;IAC1B,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,GAAG,EAAE;QAC1C,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC;QACxC,MAAM,CAAC,OAAO,GAAG,MAAM,CAAC,mBAAmB,CAAC,CAAC,2CAA2C;QACxF,OAAO
,YAAY,CAAC;IACxB,CAAC,CAAC,CAAC;IACH,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;QAC1B,IAAI;YACA,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACvC,MAAM,GAAG,GAAG,IAAI,SAAG,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC;YAC/C,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;SACzC;QAAC,OAAO,GAAG,EAAE;YACV,mBAAG,CAAC,KAAK,CAAC,4CAA4C,EAAE,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC;SACjF;IACL,CAAC,CAAC,CAAC;AACP,CAAC"}
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
/**
 * Finds elements that match a CSS selector (HTML anchors, `<a>`, by default) either in a
 * browser page or in a Cheerio object (parsed HTML), and enqueues the URLs found in their
 * `href` attributes to the given {@link RequestQueue}.
 *
 * For JavaScript heavy pages where links are not present in `href` attributes and
 * navigation is instead triggered from click handlers,
 * see {@link puppeteer#enqueueLinksByClickingElements}.
 *
 * The target URLs can optionally be filtered with an array of {@link PseudoUrl} objects,
 * and the settings of the enqueued {@link Request} objects can be overridden.
 *
 * **Example usage**
 *
 * ```javascript
 * await Apify.utils.enqueueLinks({
 *   page,
 *   requestQueue,
 *   selector: 'a.product-detail',
 *   pseudoUrls: [
 *     'https://www.example.com/handbags/[.*]',
 *     'https://www.example.com/purses/[.*]'
 *   ],
 * });
 * ```
 *
 * @param {object} options
 *   Every `enqueueLinks()` parameter is passed through this options object,
 *   using the keys listed below.
 * @param {PuppeteerPage|PlaywrightPage} [options.page]
 *   Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object
 *   (the declared type also accepts a Playwright page).
 *   One of the `page` and `$` options must be provided.
 * @param {number} [options.limit]
 *   Upper bound on the number of URLs that actually get enqueued.
 *   Useful for testing across the entire crawling scope.
 * @param {CheerioAPI} [options.$]
 *   [`Cheerio`](https://github.com/cheeriojs/cheerio) function with loaded HTML.
 *   One of the `page` and `$` options must be provided.
 * @param {RequestQueue} options.requestQueue
 *   The request queue that receives the URLs.
 * @param {string} [options.selector='a']
 *   CSS selector matching the links to enqueue.
 * @param {string} [options.baseUrl]
 *   Base URL used to resolve relative URLs when using Cheerio. Ignored when using Puppeteer,
 *   because the browser resolves relative URLs automatically.
 * @param {Array<Object<string, *>>|Array<string>} [options.pseudoUrls]
 *   An array of {@link PseudoUrl}s matching the URLs to enqueue, or an array of strings,
 *   RegExps or plain objects from which the {@link PseudoUrl}s can be constructed.
 *
 *   Plain objects must contain at least the `purl` property holding the pseudo-URL string or RegExp.
 *   All their remaining keys are used as the `requestTemplate` argument of the {@link PseudoUrl}
 *   constructor, which lets you set special properties on the enqueued {@link Request} objects.
 *
 *   When `pseudoUrls` is an empty array, `null` or `undefined`, the function
 *   enqueues all links found on the page.
 * @param {RequestTransform} [options.transformRequestFunction]
 *   Called just before a new {@link Request} is constructed and enqueued to the {@link RequestQueue}.
 *   It can remove the request or modify its contents such as `userData`, `payload` or, most
 *   importantly, `uniqueKey`. This helps when several `Requests` sharing one URL but differing in
 *   method or payload have to be enqueued, or when `userData` should be created or updated dynamically.
 *
 *   For example: by adding `keepUrlFragment: true` to the `request` object, URL fragments will not
 *   be removed when `uniqueKey` is computed.
 *
 *   **Example:**
 *   ```javascript
 *   {
 *     transformRequestFunction: (request) => {
 *       request.userData.foo = 'bar';
 *       request.keepUrlFragment = true;
 *       return request;
 *     }
 *   }
 *   ```
 * @return {Promise<Array<QueueOperationInfo>>}
 *   Promise resolving to an array of {@link QueueOperationInfo} objects.
 * @memberOf utils
 * @name enqueueLinks
 * @function
 */
export function enqueueLinks(options: {
    page?: PuppeteerPage | PlaywrightPage | undefined;
    limit?: number | undefined;
    $?: CheerioAPI | undefined;
    requestQueue: RequestQueue;
    selector?: string | undefined;
    baseUrl?: string | undefined;
    pseudoUrls?: Array<string> | Array<{ [x: string]: any }> | undefined;
    transformRequestFunction?: RequestTransform | undefined;
}): Promise<QueueOperationInfo[]>;
|
|
91
|
-
/**
 * Extracts URLs from the given browser page; the declared type accepts both a
 * Puppeteer and a Playwright `Page`.
 *
 * @param {PuppeteerPage|PlaywrightPage} page
 *   Page to extract the URLs from.
 * @param {string} selector
 *   Selector used for the extraction (presumably a CSS selector matching the link elements).
 * @return {Promise<Array<string>>}
 *   Promise resolving to the extracted URLs.
 * @ignore
 */
export function extractUrlsFromPage(
    page: PuppeteerPage | PlaywrightPage,
    selector: string,
): Promise<string[]>;
|
|
100
|
-
/**
 * Extracts URLs from the given Cheerio object.
 *
 * @param {CheerioAPI} $
 *   Cheerio function to extract the URLs from.
 * @param {string} selector
 *   Selector used for the extraction (presumably a CSS selector matching the link elements).
 * @param {string} baseUrl
 *   Base URL (presumably used to resolve relative URLs - confirm against the implementation).
 * @return {string[]}
 *   The extracted URLs.
 * @ignore
 */
export function extractUrlsFromCheerio(
    $: CheerioAPI,
    selector: string,
    baseUrl: string,
): Array<string>;
|
|
110
|
-
// @ts-ignore optional peer dependency
|
|
111
|
-
import { Page as PuppeteerPage } from "puppeteer";
|
|
112
|
-
import { Page as PlaywrightPage } from "playwright-core";
|
|
113
|
-
import { CheerioAPI } from "cheerio/lib/load";
|
|
114
|
-
import { RequestQueue } from "../storages/request_queue";
|
|
115
|
-
import { RequestTransform } from "./shared";
|
|
116
|
-
import { QueueOperationInfo } from "../storages/request_queue";
|
|
117
|
-
//# sourceMappingURL=enqueue_links.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"enqueue_links.d.ts","sourceRoot":"","sources":["../../src/enqueue_links/enqueue_links.js"],"names":[],"mappings":"AAkBA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6EG;AACH;IAlDkD,IAAI;IAG1B,KAAK;IAED,CAAC;IAGA,YAAY,EAAlC,YAAY;IAEK,QAAQ;IAER,OAAO;IAGyB,UAAU;;;IAUhC,wBAAwB;IAmBlD,QAAQ,MAAM,kBAAkB,CAAC,CAAC,CAsD7C;AAED;;;;;;;GAOG;AACH,0CALW,aAAa,GAAC,cAAc,YAC5B,MAAM,GACL,QAAQ,MAAM,MAAM,CAAC,CAAC,CAMjC;AAED;;;;;;;;GAQG;AACH,0CANW,UAAU,YACV,MAAM,WACN,MAAM,GACL,MAAM,EAAE,CAmBnB"}
|