apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
|
@@ -1,931 +0,0 @@
|
|
|
1
|
-
/// <reference types="node" />
|
|
2
|
-
export default CheerioCrawler;
|
|
3
|
-
export type CheerioCrawlerOptions = {
|
|
4
|
-
/**
|
|
5
|
-
* User-provided function that performs the logic of the crawler. It is called for each page
|
|
6
|
-
* loaded and parsed by the crawler.
|
|
7
|
-
*
|
|
8
|
-
* The function receives the following object as an argument:
|
|
9
|
-
* ```
|
|
10
|
-
* {
|
|
11
|
-
* // The Cheerio object's function with the parsed HTML.
|
|
12
|
-
* $: Cheerio,
|
|
13
|
-
*
|
|
14
|
-
* // The request body of the web page, whose type depends on the content type.
|
|
15
|
-
* body: String|Buffer,
|
|
16
|
-
*
|
|
17
|
-
* // The parsed object from JSON for responses with the "application/json" content types.
|
|
18
|
-
* // For other content types it's null.
|
|
19
|
-
* json: Object,
|
|
20
|
-
*
|
|
21
|
-
* // Apify.Request object with details of the requested web page
|
|
22
|
-
* request: Request,
|
|
23
|
-
*
|
|
24
|
-
* // Parsed Content-Type HTTP header: { type, encoding }
|
|
25
|
-
* contentType: Object,
|
|
26
|
-
*
|
|
27
|
-
* // An instance of Node's http.IncomingMessage object,
|
|
28
|
-
* response: Object,
|
|
29
|
-
*
|
|
30
|
-
* // Session object, useful to work around anti-scraping protections
|
|
31
|
-
* session: Session
|
|
32
|
-
*
|
|
33
|
-
* // ProxyInfo object with information about currently used proxy
|
|
34
|
-
* proxyInfo: ProxyInfo
|
|
35
|
-
*
|
|
36
|
-
* // The running cheerio crawler instance.
|
|
37
|
-
* crawler: CheerioCrawler
|
|
38
|
-
* }
|
|
39
|
-
* ```
|
|
40
|
-
*
|
|
41
|
-
* Type of `body` depends on the `Content-Type` header of the web page:
|
|
42
|
-
* - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
|
|
43
|
-
* - Buffer for other MIME content types
|
|
44
|
-
*
|
|
45
|
-
* Parsed `Content-Type` header using
|
|
46
|
-
* [content-type package](https://www.npmjs.com/package/content-type)
|
|
47
|
-
* is stored in `contentType`.
|
|
48
|
-
*
|
|
49
|
-
* Cheerio is available only for HTML and XML content types.
|
|
50
|
-
*
|
|
51
|
-
* With the {@link Request } object representing the URL to crawl.
|
|
52
|
-
*
|
|
53
|
-
* If the function returns, the returned promise is awaited by the crawler.
|
|
54
|
-
*
|
|
55
|
-
* If the function throws an exception, the crawler will try to re-crawl the
|
|
56
|
-
* request later, up to `option.maxRequestRetries` times.
|
|
57
|
-
* If all the retries fail, the crawler calls the function
|
|
58
|
-
* provided to the `handleFailedRequestFunction` parameter.
|
|
59
|
-
* To make this work, you should **always**
|
|
60
|
-
* let your function throw exceptions rather than catch them.
|
|
61
|
-
* The exceptions are logged to the request using the
|
|
62
|
-
* {@link Request.pushErrorMessage } function.
|
|
63
|
-
*/
|
|
64
|
-
handlePageFunction: CheerioHandlePage;
|
|
65
|
-
/**
|
|
66
|
-
* Static list of URLs to be processed.
|
|
67
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
68
|
-
*/
|
|
69
|
-
requestList?: RequestList | undefined;
|
|
70
|
-
/**
|
|
71
|
-
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
|
|
72
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
73
|
-
*/
|
|
74
|
-
requestQueue?: RequestQueue | undefined;
|
|
75
|
-
/**
|
|
76
|
-
* > This option is deprecated, use `preNavigationHooks` instead.
|
|
77
|
-
*
|
|
78
|
-
* A function that executes before the HTTP request is made to the target resource.
|
|
79
|
-
* This function is suitable for setting dynamic properties such as cookies to the {@link Request }.
|
|
80
|
-
*
|
|
81
|
-
* The function receives the following object as an argument:
|
|
82
|
-
* ```
|
|
83
|
-
* {
|
|
84
|
-
* request: Request,
|
|
85
|
-
* session: Session,
|
|
86
|
-
* proxyInfo: ProxyInfo,
|
|
87
|
-
* crawler: CheerioCrawler,
|
|
88
|
-
* }
|
|
89
|
-
* ```
|
|
90
|
-
* where the {@link Request } instance corresponds to the initialized request
|
|
91
|
-
* and the {@link Session } instance corresponds to used session.
|
|
92
|
-
*
|
|
93
|
-
* The function should modify the properties of the passed {@link Request } instance
|
|
94
|
-
* in place because there are already earlier references to it. Making a copy and returning it from
|
|
95
|
-
* this function is therefore not supported, because it would create inconsistencies where
|
|
96
|
-
* different parts of SDK would have access to a different {@link Request } instance.
|
|
97
|
-
*/
|
|
98
|
-
prepareRequestFunction?: PrepareRequest | undefined;
|
|
99
|
-
/**
|
|
100
|
-
* > This option is deprecated, use `postNavigationHooks` instead.
|
|
101
|
-
*
|
|
102
|
-
* A function that executes right after the HTTP request is made to the target resource and response is returned.
|
|
103
|
-
* This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
|
|
104
|
-
*
|
|
105
|
-
* **Example usage:**
|
|
106
|
-
*
|
|
107
|
-
* ```javascript
|
|
108
|
-
* const cheerioCrawlerOptions = {
|
|
109
|
-
* // ...
|
|
110
|
-
* postResponseFunction: ({ request, response }) => {
|
|
111
|
-
* if (request.userData.parseAsJSON) {
|
|
112
|
-
* response.headers['content-type'] = 'application/json; charset=utf-8';
|
|
113
|
-
* }
|
|
114
|
-
* }
|
|
115
|
-
* }
|
|
116
|
-
* ```
|
|
117
|
-
* The function receives the following object as an argument:
|
|
118
|
-
* ```
|
|
119
|
-
* {
|
|
120
|
-
* response: Object,
|
|
121
|
-
* request: Request,
|
|
122
|
-
* session: Session,
|
|
123
|
-
* proxyInfo: ProxyInfo,
|
|
124
|
-
* crawler: CheerioCrawler,
|
|
125
|
-
* }
|
|
126
|
-
* ```
|
|
127
|
-
* The response is an instance of Node's http.IncomingMessage object.
|
|
128
|
-
*/
|
|
129
|
-
postResponseFunction?: PostResponse | undefined;
|
|
130
|
-
/**
|
|
131
|
-
* Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
|
|
132
|
-
*/
|
|
133
|
-
handlePageTimeoutSecs?: number | undefined;
|
|
134
|
-
/**
|
|
135
|
-
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
136
|
-
*/
|
|
137
|
-
requestTimeoutSecs?: number | undefined;
|
|
138
|
-
/**
|
|
139
|
-
* If set to true, SSL certificate errors will be ignored.
|
|
140
|
-
*/
|
|
141
|
-
ignoreSslErrors?: boolean | undefined;
|
|
142
|
-
/**
|
|
143
|
-
* If set, `CheerioCrawler` will be configured for all connections to use
|
|
144
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
145
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
146
|
-
*/
|
|
147
|
-
proxyConfiguration?: ProxyConfiguration | undefined;
|
|
148
|
-
/**
|
|
149
|
-
* A function to handle requests that failed more than `option.maxRequestRetries` times.
|
|
150
|
-
* The function receives the following object as an argument:
|
|
151
|
-
* ```
|
|
152
|
-
* {
|
|
153
|
-
* error: Error,
|
|
154
|
-
* request: Request,
|
|
155
|
-
* session: Session,
|
|
156
|
-
* $: Cheerio,
|
|
157
|
-
* body: String|Buffer,
|
|
158
|
-
* json: Object,
|
|
159
|
-
* contentType: Object,
|
|
160
|
-
* response: Object,
|
|
161
|
-
* proxyInfo: ProxyInfo,
|
|
162
|
-
* crawler: CheerioCrawler,
|
|
163
|
-
* }
|
|
164
|
-
* ```
|
|
165
|
-
* where the {@link Request } instance corresponds to the failed request, and the `Error` instance
|
|
166
|
-
* represents the last error thrown during processing of the request.
|
|
167
|
-
*
|
|
168
|
-
* See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
|
|
169
|
-
* for the default implementation of this function.
|
|
170
|
-
*/
|
|
171
|
-
handleFailedRequestFunction?: HandleFailedRequest | undefined;
|
|
172
|
-
/**
|
|
173
|
-
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
174
|
-
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
|
|
175
|
-
* which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
176
|
-
* Example:
|
|
177
|
-
* ```
|
|
178
|
-
* preNavigationHooks: [
|
|
179
|
-
* async (crawlingContext, requestAsBrowserOptions) => {
|
|
180
|
-
* requestAsBrowserOptions.forceUrlEncoding = true;
|
|
181
|
-
* },
|
|
182
|
-
* ]
|
|
183
|
-
* ```
|
|
184
|
-
*/
|
|
185
|
-
preNavigationHooks?: Hook[] | undefined;
|
|
186
|
-
/**
|
|
187
|
-
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
188
|
-
* The function accepts `crawlingContext` as the only parameter.
|
|
189
|
-
* Example:
|
|
190
|
-
* ```
|
|
191
|
-
* postNavigationHooks: [
|
|
192
|
-
* async (crawlingContext) => {
|
|
193
|
-
* // ...
|
|
194
|
-
* },
|
|
195
|
-
* ]
|
|
196
|
-
* ```
|
|
197
|
-
*/
|
|
198
|
-
postNavigationHooks?: Hook[] | undefined;
|
|
199
|
-
/**
|
|
200
|
-
* An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
|
|
201
|
-
* target="_blank">MIME types</a> you want the crawler to load and process.
|
|
202
|
-
* By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
|
|
203
|
-
*/
|
|
204
|
-
additionalMimeTypes?: string[] | undefined;
|
|
205
|
-
/**
|
|
206
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
|
|
207
|
-
* Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
|
|
208
|
-
* If those sites actually use a different encoding, the response will be corrupted. You can use
|
|
209
|
-
* `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
|
|
210
|
-
* To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding }```
|
|
211
|
-
* // Will fall back to windows-1250 encoding if none found
|
|
212
|
-
* suggestResponseEncoding: 'windows-1250'
|
|
213
|
-
* ```
|
|
214
|
-
*/
|
|
215
|
-
suggestResponseEncoding?: string | undefined;
|
|
216
|
-
/**
|
|
217
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
|
|
218
|
-
* to force a certain encoding, disregarding the response headers.
|
|
219
|
-
* To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding }```
|
|
220
|
-
* // Will force windows-1250 encoding even if headers say otherwise
|
|
221
|
-
* forceResponseEncoding: 'windows-1250'
|
|
222
|
-
* ```
|
|
223
|
-
*/
|
|
224
|
-
forceResponseEncoding?: string | undefined;
|
|
225
|
-
/**
|
|
226
|
-
* Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
|
|
227
|
-
*/
|
|
228
|
-
maxRequestRetries?: number | undefined;
|
|
229
|
-
/**
|
|
230
|
-
* Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
|
|
231
|
-
* Always set this value in order to prevent infinite loops in misconfigured crawlers.
|
|
232
|
-
* Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
|
|
233
|
-
*/
|
|
234
|
-
maxRequestsPerCrawl?: number | undefined;
|
|
235
|
-
/**
|
|
236
|
-
* Custom options passed to the underlying {@link AutoscaledPool } constructor.
|
|
237
|
-
* Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
|
|
238
|
-
* are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter }
|
|
239
|
-
* and {@link SystemStatus } defaults are provided to account for the fact that `cheerio`
|
|
240
|
-
* parses HTML synchronously and therefore blocks the event loop.
|
|
241
|
-
*/
|
|
242
|
-
autoscaledPoolOptions?: AutoscaledPoolOptions | undefined;
|
|
243
|
-
/**
|
|
244
|
-
* Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
|
|
245
|
-
*
|
|
246
|
-
* *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
|
|
247
|
-
* If you're not sure, just keep the default value and the concurrency will scale up automatically.
|
|
248
|
-
*/
|
|
249
|
-
minConcurrency?: number | undefined;
|
|
250
|
-
/**
|
|
251
|
-
* Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
|
|
252
|
-
*/
|
|
253
|
-
maxConcurrency?: number | undefined;
|
|
254
|
-
/**
|
|
255
|
-
* If set to true Crawler will automatically use Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
|
|
256
|
-
* It also marks Session as bad after a request timeout.
|
|
257
|
-
*/
|
|
258
|
-
useSessionPool?: boolean | undefined;
|
|
259
|
-
/**
|
|
260
|
-
* Custom options passed to the underlying {@link SessionPool } constructor.
|
|
261
|
-
*/
|
|
262
|
-
sessionPoolOptions?: SessionPoolOptions | undefined;
|
|
263
|
-
/**
|
|
264
|
-
* Automatically saves cookies to Session. Works only if Session Pool is used.
|
|
265
|
-
*
|
|
266
|
-
* It parses cookies from the response "set-cookie" header and saves or updates them in the session. Once the session is used for the next request,
|
|
267
|
-
* it passes the "Cookie" header to the request with the session cookies.
|
|
268
|
-
*/
|
|
269
|
-
persistCookiesPerSession?: boolean | undefined;
|
|
270
|
-
};
|
|
271
|
-
export type PrepareRequestInputs = {
|
|
272
|
-
/**
|
|
273
|
-
* Original instance of the {@link Request } object. Must be modified in-place.
|
|
274
|
-
*/
|
|
275
|
-
request: Request;
|
|
276
|
-
/**
|
|
277
|
-
* The current session
|
|
278
|
-
*/
|
|
279
|
-
session?: Session | undefined;
|
|
280
|
-
/**
|
|
281
|
-
* An object with information about currently used proxy by the crawler
|
|
282
|
-
* and configured by the {@link ProxyConfiguration } class.
|
|
283
|
-
*/
|
|
284
|
-
proxyInfo?: ProxyInfo | undefined;
|
|
285
|
-
crawler?: CheerioCrawler | undefined;
|
|
286
|
-
};
|
|
287
|
-
export type PrepareRequest = (inputs: PrepareRequestInputs) => (void | Promise<void>);
|
|
288
|
-
export type PostResponseInputs = {
|
|
289
|
-
/**
|
|
290
|
-
* stream
|
|
291
|
-
*/
|
|
292
|
-
response: (IncomingMessage | Readable);
|
|
293
|
-
/**
|
|
294
|
-
* Original instance of the {@link Request } object. Must be modified in-place.
|
|
295
|
-
*/
|
|
296
|
-
request: Request;
|
|
297
|
-
/**
|
|
298
|
-
* The current session
|
|
299
|
-
*/
|
|
300
|
-
session?: Session | undefined;
|
|
301
|
-
/**
|
|
302
|
-
* An object with information about currently used proxy by the crawler
|
|
303
|
-
* and configured by the {@link ProxyConfiguration } class.
|
|
304
|
-
*/
|
|
305
|
-
proxyInfo?: ProxyInfo | undefined;
|
|
306
|
-
crawler: CheerioCrawler;
|
|
307
|
-
};
|
|
308
|
-
export type PostResponse = (inputs: PostResponseInputs) => (void | Promise<void>);
|
|
309
|
-
export type CheerioHandlePageInputs = {
|
|
310
|
-
/**
|
|
311
|
-
* The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
|
|
312
|
-
*/
|
|
313
|
-
$: CheerioAPI;
|
|
314
|
-
/**
|
|
315
|
-
* The request body of the web page.
|
|
316
|
-
*/
|
|
317
|
-
body: (string | Buffer);
|
|
318
|
-
/**
|
|
319
|
-
* The parsed object from JSON string if the response contains the content type application/json.
|
|
320
|
-
*/
|
|
321
|
-
json: any;
|
|
322
|
-
/**
|
|
323
|
-
* The original {@link Request } object.
|
|
324
|
-
*/
|
|
325
|
-
request: Request;
|
|
326
|
-
/**
|
|
327
|
-
* Parsed `Content-Type` header: `{ type, encoding }`.
|
|
328
|
-
*/
|
|
329
|
-
contentType: {
|
|
330
|
-
type: string;
|
|
331
|
-
encoding: string;
|
|
332
|
-
};
|
|
333
|
-
/**
|
|
334
|
-
* An instance of Node's [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object.
|
|
335
|
-
*/
|
|
336
|
-
response: IncomingMessage;
|
|
337
|
-
session: Session;
|
|
338
|
-
/**
|
|
339
|
-
* An object with information about currently used proxy by the crawler
|
|
340
|
-
* and configured by the {@link ProxyConfiguration } class.
|
|
341
|
-
*/
|
|
342
|
-
proxyInfo: ProxyInfo;
|
|
343
|
-
crawler: CheerioCrawler;
|
|
344
|
-
};
|
|
345
|
-
export type CheerioHandlePage = (inputs: CheerioHandlePageInputs) => Promise<void>;
|
|
346
|
-
/**
|
|
347
|
-
* @typedef CheerioCrawlerOptions
|
|
348
|
-
* @property {CheerioHandlePage} handlePageFunction
|
|
349
|
-
* User-provided function that performs the logic of the crawler. It is called for each page
|
|
350
|
-
* loaded and parsed by the crawler.
|
|
351
|
-
*
|
|
352
|
-
* The function receives the following object as an argument:
|
|
353
|
-
* ```
|
|
354
|
-
* {
|
|
355
|
-
* // The Cheerio object's function with the parsed HTML.
|
|
356
|
-
* $: Cheerio,
|
|
357
|
-
*
|
|
358
|
-
* // The response body of the web page, whose type depends on the content type.
|
|
359
|
-
* body: String|Buffer,
|
|
360
|
-
*
|
|
361
|
-
* // The parsed object from JSON for responses with the "application/json" content types.
|
|
362
|
-
* // For other content types it's null.
|
|
363
|
-
* json: Object,
|
|
364
|
-
*
|
|
365
|
-
* // Apify.Request object with details of the requested web page
|
|
366
|
-
* request: Request,
|
|
367
|
-
*
|
|
368
|
-
* // Parsed Content-Type HTTP header: { type, encoding }
|
|
369
|
-
* contentType: Object,
|
|
370
|
-
*
|
|
371
|
-
* // An instance of Node's http.IncomingMessage object,
|
|
372
|
-
* response: Object,
|
|
373
|
-
*
|
|
374
|
-
* // Session object, useful to work around anti-scraping protections
|
|
375
|
-
* session: Session
|
|
376
|
-
*
|
|
377
|
-
* // ProxyInfo object with information about currently used proxy
|
|
378
|
-
* proxyInfo: ProxyInfo
|
|
379
|
-
*
|
|
380
|
-
* // The running cheerio crawler instance.
|
|
381
|
-
* crawler: CheerioCrawler
|
|
382
|
-
* }
|
|
383
|
-
* ```
|
|
384
|
-
*
|
|
385
|
-
* Type of `body` depends on the `Content-Type` header of the web page:
|
|
386
|
-
* - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
|
|
387
|
-
* - Buffer for other MIME content types
|
|
388
|
-
*
|
|
389
|
-
* Parsed `Content-Type` header using
|
|
390
|
-
* [content-type package](https://www.npmjs.com/package/content-type)
|
|
391
|
-
* is stored in `contentType`.
|
|
392
|
-
*
|
|
393
|
-
* Cheerio is available only for HTML and XML content types.
|
|
394
|
-
*
|
|
395
|
-
* With the {@link Request} object representing the URL to crawl.
|
|
396
|
-
*
|
|
397
|
-
* If the function returns a promise, that promise is awaited by the crawler.
|
|
398
|
-
*
|
|
399
|
-
* If the function throws an exception, the crawler will try to re-crawl the
|
|
400
|
-
* request later, up to `option.maxRequestRetries` times.
|
|
401
|
-
* If all the retries fail, the crawler calls the function
|
|
402
|
-
* provided to the `handleFailedRequestFunction` parameter.
|
|
403
|
-
* To make this work, you should **always**
|
|
404
|
-
* let your function throw exceptions rather than catch them.
|
|
405
|
-
* The exceptions are logged to the request using the
|
|
406
|
-
* {@link Request#pushErrorMessage} function.
|
|
407
|
-
* @property {RequestList} [requestList]
|
|
408
|
-
* Static list of URLs to be processed.
|
|
409
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
410
|
-
* @property {RequestQueue} [requestQueue]
|
|
411
|
-
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
|
|
412
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
413
|
-
* @property {PrepareRequest} [prepareRequestFunction]
|
|
414
|
-
* > This option is deprecated, use `preNavigationHooks` instead.
|
|
415
|
-
*
|
|
416
|
-
* A function that executes before the HTTP request is made to the target resource.
|
|
417
|
-
* This function is suitable for setting dynamic properties such as cookies to the {@link Request}.
|
|
418
|
-
*
|
|
419
|
-
* The function receives the following object as an argument:
|
|
420
|
-
* ```
|
|
421
|
-
* {
|
|
422
|
-
* request: Request,
|
|
423
|
-
* session: Session,
|
|
424
|
-
* proxyInfo: ProxyInfo,
|
|
425
|
-
* crawler: CheerioCrawler,
|
|
426
|
-
* }
|
|
427
|
-
* ```
|
|
428
|
-
* where the {@link Request} instance corresponds to the initialized request
|
|
429
|
-
* and the {@link Session} instance corresponds to used session.
|
|
430
|
-
*
|
|
431
|
-
* The function should modify the properties of the passed {@link Request} instance
|
|
432
|
-
* in place because there are already earlier references to it. Making a copy and returning it from
|
|
433
|
-
* this function is therefore not supported, because it would create inconsistencies where
|
|
434
|
-
* different parts of SDK would have access to a different {@link Request} instance.
|
|
435
|
-
*
|
|
436
|
-
* @property {PostResponse} [postResponseFunction]
|
|
437
|
-
* > This option is deprecated, use `postNavigationHooks` instead.
|
|
438
|
-
*
|
|
439
|
-
* A function that executes right after the HTTP request is made to the target resource and response is returned.
|
|
440
|
-
* This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
|
|
441
|
-
*
|
|
442
|
-
* **Example usage:**
|
|
443
|
-
*
|
|
444
|
-
* ```javascript
|
|
445
|
-
* const cheerioCrawlerOptions = {
|
|
446
|
-
* // ...
|
|
447
|
-
* postResponseFunction: ({ request, response }) => {
|
|
448
|
-
* if (request.userData.parseAsJSON) {
|
|
449
|
-
* response.headers['content-type'] = 'application/json; charset=utf-8';
|
|
450
|
-
* }
|
|
451
|
-
* }
|
|
452
|
-
* }
|
|
453
|
-
* ```
|
|
454
|
-
* The function receives the following object as an argument:
|
|
455
|
-
* ```
|
|
456
|
-
* {
|
|
457
|
-
* response: Object,
|
|
458
|
-
* request: Request,
|
|
459
|
-
* session: Session,
|
|
460
|
-
* proxyInfo: ProxyInfo,
|
|
461
|
-
* crawler: CheerioCrawler,
|
|
462
|
-
* }
|
|
463
|
-
* ```
|
|
464
|
-
* The response is an instance of Node's http.IncomingMessage object.
|
|
465
|
-
*
|
|
466
|
-
* @property {number} [handlePageTimeoutSecs=60]
|
|
467
|
-
* Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
|
|
468
|
-
* @property {number} [requestTimeoutSecs=30]
|
|
469
|
-
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
470
|
-
* @property {boolean} [ignoreSslErrors=true]
|
|
471
|
-
* If set to true, SSL certificate errors will be ignored.
|
|
472
|
-
* @property {ProxyConfiguration} [proxyConfiguration]
|
|
473
|
-
* If set, `CheerioCrawler` will be configured for all connections to use
|
|
474
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
475
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
476
|
-
* @property {HandleFailedRequest} [handleFailedRequestFunction]
|
|
477
|
-
* A function to handle requests that failed more than `option.maxRequestRetries` times.
|
|
478
|
-
* The function receives the following object as an argument:
|
|
479
|
-
* ```
|
|
480
|
-
* {
|
|
481
|
-
* error: Error,
|
|
482
|
-
* request: Request,
|
|
483
|
-
* session: Session,
|
|
484
|
-
* $: Cheerio,
|
|
485
|
-
* body: String|Buffer,
|
|
486
|
-
* json: Object,
|
|
487
|
-
* contentType: Object,
|
|
488
|
-
* response: Object,
|
|
489
|
-
* proxyInfo: ProxyInfo,
|
|
490
|
-
* crawler: CheerioCrawler,
|
|
491
|
-
* }
|
|
492
|
-
* ```
|
|
493
|
-
* where the {@link Request} instance corresponds to the failed request, and the `Error` instance
|
|
494
|
-
* represents the last error thrown during processing of the request.
|
|
495
|
-
*
|
|
496
|
-
* See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
|
|
497
|
-
* for the default implementation of this function.
|
|
498
|
-
* @property {Array<Hook>} [preNavigationHooks]
|
|
499
|
-
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
500
|
-
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
|
|
501
|
-
* which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
502
|
-
* Example:
|
|
503
|
-
* ```
|
|
504
|
-
* preNavigationHooks: [
|
|
505
|
-
* async (crawlingContext, requestAsBrowserOptions) => {
|
|
506
|
-
* requestAsBrowserOptions.forceUrlEncoding = true;
|
|
507
|
-
* },
|
|
508
|
-
* ]
|
|
509
|
-
* ```
|
|
510
|
-
* @property {Array<Hook>} [postNavigationHooks]
|
|
511
|
-
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
512
|
-
* The function accepts `crawlingContext` as the only parameter.
|
|
513
|
-
* Example:
|
|
514
|
-
* ```
|
|
515
|
-
* postNavigationHooks: [
|
|
516
|
-
* async (crawlingContext) => {
|
|
517
|
-
* // ...
|
|
518
|
-
* },
|
|
519
|
-
* ]
|
|
520
|
-
* ```
|
|
521
|
-
* @property {string[]} [additionalMimeTypes]
|
|
522
|
-
* An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
|
|
523
|
-
* target="_blank">MIME types</a> you want the crawler to load and process.
|
|
524
|
-
* By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
|
|
525
|
-
* @property {string} [suggestResponseEncoding]
|
|
526
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
|
|
527
|
-
* Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
|
|
528
|
-
* If those sites actually use a different encoding, the response will be corrupted. You can use
|
|
529
|
-
* `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
|
|
530
|
-
* To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
|
|
531
|
-
* ```
|
|
532
|
-
* // Will fall back to windows-1250 encoding if none found
|
|
533
|
-
* suggestResponseEncoding: 'windows-1250'
|
|
534
|
-
* ```
|
|
535
|
-
* @property {string} [forceResponseEncoding]
|
|
536
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
|
|
537
|
-
* to force a certain encoding, disregarding the response headers.
|
|
538
|
-
* To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
|
|
539
|
-
* ```
|
|
540
|
-
* // Will force windows-1250 encoding even if headers say otherwise
|
|
541
|
-
* forceResponseEncoding: 'windows-1250'
|
|
542
|
-
* ```
|
|
543
|
-
* @property {number} [maxRequestRetries=3]
|
|
544
|
-
* Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
|
|
545
|
-
* @property {number} [maxRequestsPerCrawl]
|
|
546
|
-
* Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
|
|
547
|
-
* Always set this value in order to prevent infinite loops in misconfigured crawlers.
|
|
548
|
-
* Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
|
|
549
|
-
* @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
|
|
550
|
-
* Custom options passed to the underlying {@link AutoscaledPool} constructor.
|
|
551
|
-
* Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
|
|
552
|
-
* are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter}
|
|
553
|
-
* and {@link SystemStatus} defaults are provided to account for the fact that `cheerio`
|
|
554
|
-
* parses HTML synchronously and therefore blocks the event loop.
|
|
555
|
-
* @property {number} [minConcurrency=1]
|
|
556
|
-
* Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
|
|
557
|
-
*
|
|
558
|
-
* *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
|
|
559
|
-
* If you're not sure, just keep the default value and the concurrency will scale up automatically.
|
|
560
|
-
* @property {number} [maxConcurrency=1000]
|
|
561
|
-
* Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
|
|
562
|
-
* @property {boolean} [useSessionPool=true]
|
|
563
|
-
* If set to true, the crawler will automatically use the Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
|
|
564
|
-
* It also marks Session as bad after a request timeout.
|
|
565
|
-
* @property {SessionPoolOptions} [sessionPoolOptions]
|
|
566
|
-
* Custom options passed to the underlying {@link SessionPool} constructor.
|
|
567
|
-
* @property {boolean} [persistCookiesPerSession]
|
|
568
|
-
* Automatically saves cookies to Session. Works only if Session Pool is used.
|
|
569
|
-
*
|
|
570
|
-
* It parses cookies from the response "set-cookie" header and saves or updates them for the session. Once the session is used for the next request,
|
|
571
|
-
* it passes the "Cookie" header to the request with the session cookies.
|
|
572
|
-
*/
|
|
573
|
-
/**
|
|
574
|
-
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
|
|
575
|
-
* [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
|
|
576
|
-
* The URLs to crawl are fed either from a static list of URLs
|
|
577
|
-
* or from a dynamic queue of URLs enabling recursive crawling of websites.
|
|
578
|
-
*
|
|
579
|
-
* Since `CheerioCrawler` uses raw HTTP requests to download web pages,
|
|
580
|
-
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
|
|
581
|
-
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
|
|
582
|
-
* because they load the pages using a full-featured headless Chrome browser.
|
|
583
|
-
*
|
|
584
|
-
* `CheerioCrawler` downloads each URL using a plain HTTP request,
|
|
585
|
-
* parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
|
|
586
|
-
* and then invokes the user-provided {@link CheerioCrawlerOptions.handlePageFunction} to extract page data
|
|
587
|
-
* using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
|
|
588
|
-
*
|
|
589
|
-
* The source URLs are represented using {@link Request} objects that are fed from
|
|
590
|
-
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
|
|
591
|
-
* or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
592
|
-
*
|
|
593
|
-
* If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
|
|
594
|
-
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
|
|
595
|
-
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
596
|
-
*
|
|
597
|
-
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
598
|
-
*
|
|
599
|
-
* `CheerioCrawler` downloads the web pages using the `{@link utils#requestAsBrowser}` utility function.
|
|
600
|
-
* As opposed to the browser-based crawlers, which automatically encode the URLs, the
|
|
601
|
-
* `{@link utils#requestAsBrowser}` function will not do so. We either need to manually encode the URLs
|
|
602
|
-
* via `encodeURI()` function, or set `forceUrlEncoding: true` in the `requestAsBrowserOptions`,
|
|
603
|
-
* which will automatically encode all the URLs before accessing them.
|
|
604
|
-
*
|
|
605
|
-
* > We can either use `forceUrlEncoding` or encode manually, but not both - it would
|
|
606
|
-
* > result in double encoding and therefore lead to invalid URLs.
|
|
607
|
-
*
|
|
608
|
-
* We can use the `preNavigationHooks` to adjust `requestAsBrowserOptions`:
|
|
609
|
-
*
|
|
610
|
-
* ```
|
|
611
|
-
* preNavigationHooks: [
|
|
612
|
-
* (crawlingContext, requestAsBrowserOptions) => {
|
|
613
|
-
* requestAsBrowserOptions.forceUrlEncoding = true;
|
|
614
|
-
* },
|
|
615
|
-
* ]
|
|
616
|
-
* ```
|
|
617
|
-
*
|
|
618
|
-
* By default, `CheerioCrawler` only processes web pages with the `text/html`
|
|
619
|
-
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
620
|
-
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
621
|
-
* use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
622
|
-
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
623
|
-
* For details, see {@link CheerioCrawlerOptions.handlePageFunction}.
|
|
624
|
-
*
|
|
625
|
-
* New requests are only dispatched when there is enough free CPU and memory available,
|
|
626
|
-
* using the functionality provided by the {@link AutoscaledPool} class.
|
|
627
|
-
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
628
|
-
* parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
629
|
-
* {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
630
|
-
*
|
|
631
|
-
* **Example usage:**
|
|
632
|
-
*
|
|
633
|
-
* ```javascript
|
|
634
|
-
* // Prepare a list of URLs to crawl
|
|
635
|
-
* const requestList = new Apify.RequestList({
|
|
636
|
-
* sources: [
|
|
637
|
-
* { url: 'http://www.example.com/page-1' },
|
|
638
|
-
* { url: 'http://www.example.com/page-2' },
|
|
639
|
-
* ],
|
|
640
|
-
* });
|
|
641
|
-
* await requestList.initialize();
|
|
642
|
-
*
|
|
643
|
-
* // Crawl the URLs
|
|
644
|
-
* const crawler = new Apify.CheerioCrawler({
|
|
645
|
-
* requestList,
|
|
646
|
-
* handlePageFunction: async ({ request, response, body, contentType, $ }) => {
|
|
647
|
-
* const data = [];
|
|
648
|
-
*
|
|
649
|
-
* // Do some data extraction from the page with Cheerio.
|
|
650
|
-
* $('.some-collection').each((index, el) => {
|
|
651
|
-
* data.push({ title: $(el).find('.some-title').text() });
|
|
652
|
-
* });
|
|
653
|
-
*
|
|
654
|
-
* // Save the data to dataset.
|
|
655
|
-
* await Apify.pushData({
|
|
656
|
-
* url: request.url,
|
|
657
|
-
* html: body,
|
|
658
|
-
* data,
|
|
659
|
-
* })
|
|
660
|
-
* },
|
|
661
|
-
* });
|
|
662
|
-
*
|
|
663
|
-
* await crawler.run();
|
|
664
|
-
* ```
|
|
665
|
-
* @property {Statistics} stats
|
|
666
|
-
* Contains statistics about the current run.
|
|
667
|
-
* @property {?RequestList} requestList
|
|
668
|
-
* A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
|
|
669
|
-
* Only available if used by the crawler.
|
|
670
|
-
* @property {?RequestQueue} requestQueue
|
|
671
|
-
* A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
|
|
672
|
-
* Only available if used by the crawler.
|
|
673
|
-
* @property {?SessionPool} sessionPool
|
|
674
|
-
* A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
|
|
675
|
-
* Only available if used by the crawler.
|
|
676
|
-
* @property {?ProxyConfiguration} proxyConfiguration
|
|
677
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
678
|
-
* Only available if used by the crawler.
|
|
679
|
-
* @property {AutoscaledPool} autoscaledPool
|
|
680
|
-
* A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
|
|
681
|
-
* Note that this property is only initialized after calling the {@link CheerioCrawler#run} function.
|
|
682
|
-
* You can use it to change the concurrency settings on the fly,
|
|
683
|
-
* to pause the crawler by calling {@link AutoscaledPool#pause}
|
|
684
|
-
* or to abort it by calling {@link AutoscaledPool#abort}.
|
|
685
|
-
*/
|
|
686
|
-
declare class CheerioCrawler extends BasicCrawler {
|
|
687
|
-
/**
|
|
688
|
-
* @param {CheerioCrawlerOptions} options
|
|
689
|
-
* All `CheerioCrawler` parameters are passed via an options object.
|
|
690
|
-
*/
|
|
691
|
-
constructor(options: CheerioCrawlerOptions);
|
|
692
|
-
supportedMimeTypes: Set<string>;
|
|
693
|
-
handlePageTimeoutMillis: number;
|
|
694
|
-
requestTimeoutMillis: number;
|
|
695
|
-
ignoreSslErrors: boolean;
|
|
696
|
-
suggestResponseEncoding: string | undefined;
|
|
697
|
-
forceResponseEncoding: string | undefined;
|
|
698
|
-
prepareRequestFunction: PrepareRequest | undefined;
|
|
699
|
-
postResponseFunction: PostResponse | undefined;
|
|
700
|
-
proxyConfiguration: ProxyConfiguration | undefined;
|
|
701
|
-
/**
|
|
702
|
-
* @type {Array<any>}
|
|
703
|
-
* @ignore
|
|
704
|
-
* */
|
|
705
|
-
preNavigationHooks: Array<any>;
|
|
706
|
-
/**
|
|
707
|
-
* @type {Array<any>}
|
|
708
|
-
* @ignore
|
|
709
|
-
* */
|
|
710
|
-
postNavigationHooks: Array<any>;
|
|
711
|
-
persistCookiesPerSession: boolean;
|
|
712
|
-
/**
|
|
713
|
-
* **EXPERIMENTAL**
|
|
714
|
-
* Function for attaching CrawlerExtensions such as the Unblockers.
|
|
715
|
-
* @param {CrawlerExtension} extension - Crawler extension that overrides the crawler configuration.
|
|
716
|
-
*/
|
|
717
|
-
use(extension: CrawlerExtension): void;
|
|
718
|
-
/**
|
|
719
|
-
* @param {CrawlingContext} crawlingContext
|
|
720
|
-
* @ignore
|
|
721
|
-
* @protected
|
|
722
|
-
* @internal
|
|
723
|
-
*/
|
|
724
|
-
protected _handleNavigation(crawlingContext: CrawlingContext): Promise<void>;
|
|
725
|
-
/**
|
|
726
|
-
* When users change `request.headers.cookie` inside preNavigationHook, the change would be ignored,
|
|
727
|
-
* as `request.headers` are already merged into the `requestAsBrowserOptions`. This method is using
|
|
728
|
-
* old `request.headers` snapshot (before hooks are executed), makes a diff with the cookie value
|
|
729
|
-
* after hooks are executed, and merges any new cookies back to `requestAsBrowserOptions`.
|
|
730
|
-
*
|
|
731
|
-
* This way we can still use either `requestAsBrowserOptions` or `context.request` in the hooks (but not both at once).
|
|
732
|
-
*
|
|
733
|
-
* @param {Request} request
|
|
734
|
-
* @param {string} cookieSnapshot
|
|
735
|
-
* @param {RequestAsBrowserOptions} requestAsBrowserOptions
|
|
736
|
-
* @private
|
|
737
|
-
* @ignore
|
|
738
|
-
* @internal
|
|
739
|
-
*/
|
|
740
|
-
private _mergeRequestCookieDiff;
|
|
741
|
-
/**
|
|
742
|
-
* Function to make the HTTP request. It performs optimizations
|
|
743
|
-
* on the request such as only downloading the response body if the
|
|
744
|
-
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
745
|
-
*
|
|
746
|
-
* @param {object} options
|
|
747
|
-
* @param {Request} options.request
|
|
748
|
-
* @param {Session} options.session
|
|
749
|
-
* @param {string} options.proxyUrl
|
|
750
|
-
* @param {RequestAsBrowserOptions} options.requestAsBrowserOptions
|
|
751
|
-
* @returns {Promise<IncomingMessage|Readable>}
|
|
752
|
-
* @ignore
|
|
753
|
-
* @protected
|
|
754
|
-
* @internal
|
|
755
|
-
*/
|
|
756
|
-
protected _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }: {
|
|
757
|
-
request: Request;
|
|
758
|
-
session: Session;
|
|
759
|
-
proxyUrl: string;
|
|
760
|
-
requestAsBrowserOptions: RequestAsBrowserOptions;
|
|
761
|
-
}): Promise<IncomingMessage | Readable>;
|
|
762
|
-
/**
|
|
763
|
-
* Sets the cookie header to `requestAsBrowserOptions` based on provided session and request. If some cookies were already set,
|
|
764
|
-
* the session cookie will be merged with them. User provided cookies on `request` object have precedence.
|
|
765
|
-
*
|
|
766
|
-
* @param {CrawlingContext} crawlingContext
|
|
767
|
-
* @param {RequestAsBrowserOptions} requestAsBrowserOptions
|
|
768
|
-
* @return {void}
|
|
769
|
-
* @ignore
|
|
770
|
-
* @private
|
|
771
|
-
* @internal
|
|
772
|
-
*/
|
|
773
|
-
private _applySessionCookie;
|
|
774
|
-
/**
|
|
775
|
-
* Encodes and parses response according to the provided content type
|
|
776
|
-
* @param {Request} request
|
|
777
|
-
* @param {IncomingMessage|Readable} responseStream
|
|
778
|
-
* @returns {Promise<object>}
|
|
779
|
-
* @ignore
|
|
780
|
-
* @protected
|
|
781
|
-
* @internal
|
|
782
|
-
*/
|
|
783
|
-
protected _parseResponse(request: Request, responseStream: IncomingMessage | Readable): Promise<object>;
|
|
784
|
-
/**
|
|
785
|
-
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
786
|
-
* @param {Request} request
|
|
787
|
-
* @param {Session} [session]
|
|
788
|
-
* @param {string} [proxyUrl]
|
|
789
|
-
* @param {RequestAsBrowserOptions} [requestAsBrowserOptions]
|
|
790
|
-
* @ignore
|
|
791
|
-
* @protected
|
|
792
|
-
* @internal
|
|
793
|
-
*/
|
|
794
|
-
protected _getRequestOptions(request: Request, session?: Session | undefined, proxyUrl?: string | undefined, requestAsBrowserOptions?: RequestAsBrowserOptions | undefined): {
|
|
795
|
-
headers: {
|
|
796
|
-
[x: string]: string;
|
|
797
|
-
};
|
|
798
|
-
https: any;
|
|
799
|
-
isStream: boolean;
|
|
800
|
-
/**
|
|
801
|
-
* URL of the target endpoint. Supports both HTTP and HTTPS schemes.
|
|
802
|
-
*/
|
|
803
|
-
url: string;
|
|
804
|
-
/**
|
|
805
|
-
* HTTP method.
|
|
806
|
-
*/
|
|
807
|
-
method: string;
|
|
808
|
-
/**
|
|
809
|
-
* An HTTP proxy to be passed down to the HTTP request. Supports proxy authentication with Basic Auth.
|
|
810
|
-
*/
|
|
811
|
-
proxyUrl: string | undefined;
|
|
812
|
-
/**
|
|
813
|
-
* Configuration to be used for generating correct browser headers.
|
|
814
|
-
* See the [`header-generator`](https://github.com/apify/header-generator) library.
|
|
815
|
-
*/
|
|
816
|
-
headerGeneratorOptions?: object | undefined;
|
|
817
|
-
/**
|
|
818
|
-
* Two-letter ISO 639 language code.
|
|
819
|
-
*/
|
|
820
|
-
languageCode?: string | undefined;
|
|
821
|
-
/**
|
|
822
|
-
* Two-letter ISO 3166 country code.
|
|
823
|
-
*/
|
|
824
|
-
countryCode?: string | undefined;
|
|
825
|
-
/**
|
|
826
|
-
* If `true`, the function uses User-Agent of a mobile browser.
|
|
827
|
-
*/
|
|
828
|
-
useMobileVersion?: boolean | undefined;
|
|
829
|
-
/**
|
|
830
|
-
* If set to true, SSL/TLS certificate errors will be ignored.
|
|
831
|
-
*/
|
|
832
|
-
ignoreSslErrors?: boolean | undefined;
|
|
833
|
-
/**
|
|
834
|
-
* Node.js' HTTP parser is stricter than parsers used by web browsers, which prevents scraping of websites
|
|
835
|
-
* whose servers do not comply with HTTP specs, either by accident or due to some anti-scraping protections,
|
|
836
|
-
* causing e.g. the `invalid header value char` error. The `useInsecureHttpParser` option forces
|
|
837
|
-
* the HTTP parser to ignore certain errors which lets you scrape such websites.
|
|
838
|
-
* However, it will also open your application to some security vulnerabilities,
|
|
839
|
-
* although the risk should be negligible as these vulnerabilities mainly relate to server applications, not clients.
|
|
840
|
-
* Learn more in this [blog post](https://snyk.io/blog/node-js-release-fixes-a-critical-http-security-vulnerability/).
|
|
841
|
-
*/
|
|
842
|
-
useInsecureHttpParser?: boolean | undefined;
|
|
843
|
-
/**
|
|
844
|
-
* Function accepts `response` object as a single parameter and should return `true` or `false`.
|
|
845
|
-
* If function returns true, request gets aborted.
|
|
846
|
-
*/
|
|
847
|
-
abortFunction?: import("../utils_request").AbortFunction | undefined;
|
|
848
|
-
/**
|
|
849
|
-
* If set to false, it will prevent use of HTTP2 requests. This is strongly discouraged. Websites
|
|
850
|
-
* expect HTTP2 connections, because browsers use HTTP2 by default. It will automatically downgrade
|
|
851
|
-
* to HTTP/1.1 for websites that do not support HTTP2.
|
|
852
|
-
*/
|
|
853
|
-
useHttp2?: boolean | undefined;
|
|
854
|
-
/**
|
|
855
|
-
* A unique object used to generate browser headers. By default, new headers are generated on every call.
|
|
856
|
-
* Set this option to make these headers persistent.
|
|
857
|
-
*/
|
|
858
|
-
sessionToken: object | Session | undefined;
|
|
859
|
-
timeout: {
|
|
860
|
-
request: number;
|
|
861
|
-
};
|
|
862
|
-
};
|
|
863
|
-
/**
|
|
864
|
-
* @param {*} request
|
|
865
|
-
* @param {*} response
|
|
866
|
-
* @param {*} encoding
|
|
867
|
-
* @ignore
|
|
868
|
-
* @protected
|
|
869
|
-
* @internal
|
|
870
|
-
*/
|
|
871
|
-
protected _encodeResponse(request: any, response: any, encoding: any): {
|
|
872
|
-
response: any;
|
|
873
|
-
encoding: string;
|
|
874
|
-
};
|
|
875
|
-
/**
|
|
876
|
-
* @param {*} response
|
|
877
|
-
* @ignore
|
|
878
|
-
* @protected
|
|
879
|
-
* @internal
|
|
880
|
-
*/
|
|
881
|
-
protected _parseHtmlToDom(response: any): Promise<any>;
|
|
882
|
-
/**
|
|
883
|
-
* Checks and extends supported mime types
|
|
884
|
-
* @param {Array<(string|Object)>} additionalMimeTypes
|
|
885
|
-
* @ignore
|
|
886
|
-
* @protected
|
|
887
|
-
* @internal
|
|
888
|
-
*/
|
|
889
|
-
protected _extendSupportedMimeTypes(additionalMimeTypes: Array<(string | Object)>): void;
|
|
890
|
-
/**
|
|
891
|
-
* Handles blocked request
|
|
892
|
-
* @param {Session} session
|
|
893
|
-
* @param {number} statusCode
|
|
894
|
-
* @ignore
|
|
895
|
-
* @protected
|
|
896
|
-
* @internal
|
|
897
|
-
*/
|
|
898
|
-
protected _throwOnBlockedRequest(session: Session, statusCode: number): void;
|
|
899
|
-
/**
|
|
900
|
-
* Handles timeout request
|
|
901
|
-
* @param {Session} session
|
|
902
|
-
* @ignore
|
|
903
|
-
* @protected
|
|
904
|
-
* @internal
|
|
905
|
-
*/
|
|
906
|
-
protected _handleRequestTimeout(session: Session): void;
|
|
907
|
-
/**
|
|
908
|
-
* @param {Request} request
|
|
909
|
-
* @param {IncomingMessage|Readable} response
|
|
910
|
-
* @private
|
|
911
|
-
*/
|
|
912
|
-
private _abortDownloadOfBody;
|
|
913
|
-
}
|
|
914
|
-
import { RequestList } from "../request_list";
|
|
915
|
-
import { RequestQueue } from "../storages/request_queue";
|
|
916
|
-
import { ProxyConfiguration } from "../proxy_configuration";
|
|
917
|
-
import { HandleFailedRequest } from "./basic_crawler";
|
|
918
|
-
import { Hook } from "./browser_crawler";
|
|
919
|
-
import { AutoscaledPoolOptions } from "../autoscaling/autoscaled_pool";
|
|
920
|
-
import { SessionPoolOptions } from "../session_pool/session_pool";
|
|
921
|
-
import Request from "../request";
|
|
922
|
-
import { Session } from "../session_pool/session";
|
|
923
|
-
import { ProxyInfo } from "../proxy_configuration";
|
|
924
|
-
import { IncomingMessage } from "http";
|
|
925
|
-
import { Readable } from "stream";
|
|
926
|
-
import { CheerioAPI } from "cheerio/lib/load";
|
|
927
|
-
import { BasicCrawler } from "./basic_crawler";
|
|
928
|
-
import CrawlerExtension from "./crawler_extension";
|
|
929
|
-
import { CrawlingContext } from "./basic_crawler";
|
|
930
|
-
import { RequestAsBrowserOptions } from "../utils_request";
|
|
931
|
-
//# sourceMappingURL=cheerio_crawler.d.ts.map
|