apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
|
@@ -1,913 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
const tslib_1 = require("tslib");
|
|
4
|
-
/* eslint-disable class-methods-use-this */
|
|
5
|
-
const utilities_1 = require("@apify/utilities");
|
|
6
|
-
const timeout_1 = require("@apify/timeout");
|
|
7
|
-
const cheerio_1 = (0, tslib_1.__importDefault)(require("cheerio")); // eslint-disable-line import/no-duplicates
|
|
8
|
-
const content_type_1 = (0, tslib_1.__importDefault)(require("content-type"));
|
|
9
|
-
const htmlparser2_1 = require("htmlparser2");
|
|
10
|
-
const WritableStream_1 = require("htmlparser2/lib/WritableStream");
|
|
11
|
-
const iconv_lite_1 = (0, tslib_1.__importDefault)(require("iconv-lite"));
|
|
12
|
-
const ow_1 = (0, tslib_1.__importDefault)(require("ow"));
|
|
13
|
-
const util_1 = (0, tslib_1.__importDefault)(require("util"));
|
|
14
|
-
const got_scraping_1 = require("got-scraping");
|
|
15
|
-
const constants_1 = require("../constants");
|
|
16
|
-
const utils_1 = require("../utils");
|
|
17
|
-
const utils_request_1 = require("../utils_request"); // eslint-disable-line import/no-duplicates
|
|
18
|
-
const crawler_utils_1 = require("./crawler_utils");
|
|
19
|
-
const basic_crawler_1 = require("./basic_crawler"); // eslint-disable-line import/no-duplicates
|
|
20
|
-
const crawler_extension_1 = (0, tslib_1.__importDefault)(require("./crawler_extension"));
|
|
21
|
-
const validators_1 = require("../validators");
|
|
22
|
-
/* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
|
|
23
|
-
/**
|
|
24
|
-
* Default mime types, which CheerioScraper supports.
|
|
25
|
-
*/
|
|
26
|
-
const HTML_AND_XML_MIME_TYPES = ['text/html', 'text/xml', 'application/xhtml+xml', 'application/xml'];
|
|
27
|
-
const APPLICATION_JSON_MIME_TYPE = 'application/json';
|
|
28
|
-
const CHEERIO_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
|
|
29
|
-
snapshotterOptions: {
|
|
30
|
-
eventLoopSnapshotIntervalSecs: 2,
|
|
31
|
-
maxBlockedMillis: 100,
|
|
32
|
-
},
|
|
33
|
-
systemStatusOptions: {
|
|
34
|
-
maxEventLoopOverloadedRatio: 0.7,
|
|
35
|
-
},
|
|
36
|
-
};
|
|
37
|
-
/**
|
|
38
|
-
* @typedef CheerioCrawlerOptions
|
|
39
|
-
* @property {CheerioHandlePage} handlePageFunction
|
|
40
|
-
* User-provided function that performs the logic of the crawler. It is called for each page
|
|
41
|
-
* loaded and parsed by the crawler.
|
|
42
|
-
*
|
|
43
|
-
* The function receives the following object as an argument:
|
|
44
|
-
* ```
|
|
45
|
-
* {
|
|
46
|
-
* // The Cheerio object's function with the parsed HTML.
|
|
47
|
-
* $: Cheerio,
|
|
48
|
-
*
|
|
49
|
-
* // The response body of the web page, whose type depends on the content type.
|
|
50
|
-
* body: String|Buffer,
|
|
51
|
-
*
|
|
52
|
-
* // The parsed object from JSON for responses with the "application/json" content types.
|
|
53
|
-
* // For other content types it's null.
|
|
54
|
-
* json: Object,
|
|
55
|
-
*
|
|
56
|
-
* // Apify.Request object with details of the requested web page
|
|
57
|
-
* request: Request,
|
|
58
|
-
*
|
|
59
|
-
* // Parsed Content-Type HTTP header: { type, encoding }
|
|
60
|
-
* contentType: Object,
|
|
61
|
-
*
|
|
62
|
-
* // An instance of Node's http.IncomingMessage object,
|
|
63
|
-
* response: Object,
|
|
64
|
-
*
|
|
65
|
-
* // Session object, useful to work around anti-scraping protections
|
|
66
|
-
* session: Session
|
|
67
|
-
*
|
|
68
|
-
* // ProxyInfo object with information about currently used proxy
|
|
69
|
-
* proxyInfo: ProxyInfo
|
|
70
|
-
*
|
|
71
|
-
* // The running cheerio crawler instance.
|
|
72
|
-
* crawler: CheerioCrawler
|
|
73
|
-
* }
|
|
74
|
-
* ```
|
|
75
|
-
*
|
|
76
|
-
* Type of `body` depends on the `Content-Type` header of the web page:
|
|
77
|
-
* - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
|
|
78
|
-
* - Buffer for other MIME content types
|
|
79
|
-
*
|
|
80
|
-
* Parsed `Content-Type` header using
|
|
81
|
-
* [content-type package](https://www.npmjs.com/package/content-type)
|
|
82
|
-
* is stored in `contentType`.
|
|
83
|
-
*
|
|
84
|
-
* Cheerio is available only for HTML and XML content types.
|
|
85
|
-
*
|
|
86
|
-
* With the {@link Request} object representing the URL to crawl.
|
|
87
|
-
*
|
|
88
|
-
* If the function returns a promise, that promise is awaited by the crawler.
|
|
89
|
-
*
|
|
90
|
-
* If the function throws an exception, the crawler will try to re-crawl the
|
|
91
|
-
* request later, up to `option.maxRequestRetries` times.
|
|
92
|
-
* If all the retries fail, the crawler calls the function
|
|
93
|
-
* provided to the `handleFailedRequestFunction` parameter.
|
|
94
|
-
* To make this work, you should **always**
|
|
95
|
-
* let your function throw exceptions rather than catch them.
|
|
96
|
-
* The exceptions are logged to the request using the
|
|
97
|
-
* {@link Request#pushErrorMessage} function.
|
|
98
|
-
* @property {RequestList} [requestList]
|
|
99
|
-
* Static list of URLs to be processed.
|
|
100
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
101
|
-
* @property {RequestQueue} [requestQueue]
|
|
102
|
-
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
|
|
103
|
-
* Either `requestList` or `requestQueue` option must be provided (or both).
|
|
104
|
-
* @property {PrepareRequest} [prepareRequestFunction]
|
|
105
|
-
* > This option is deprecated, use `preNavigationHooks` instead.
|
|
106
|
-
*
|
|
107
|
-
* A function that executes before the HTTP request is made to the target resource.
|
|
108
|
-
* This function is suitable for setting dynamic properties such as cookies to the {@link Request}.
|
|
109
|
-
*
|
|
110
|
-
* The function receives the following object as an argument:
|
|
111
|
-
* ```
|
|
112
|
-
* {
|
|
113
|
-
* request: Request,
|
|
114
|
-
* session: Session,
|
|
115
|
-
* proxyInfo: ProxyInfo,
|
|
116
|
-
* crawler: CheerioCrawler,
|
|
117
|
-
* }
|
|
118
|
-
* ```
|
|
119
|
-
* where the {@link Request} instance corresponds to the initialized request
|
|
120
|
-
* and the {@link Session} instance corresponds to used session.
|
|
121
|
-
*
|
|
122
|
-
* The function should modify the properties of the passed {@link Request} instance
|
|
123
|
-
* in place because there are already earlier references to it. Making a copy and returning it from
|
|
124
|
-
* this function is therefore not supported, because it would create inconsistencies where
|
|
125
|
-
* different parts of SDK would have access to a different {@link Request} instance.
|
|
126
|
-
*
|
|
127
|
-
* @property {PostResponse} [postResponseFunction]
|
|
128
|
-
* > This option is deprecated, use `postNavigationHooks` instead.
|
|
129
|
-
*
|
|
130
|
-
* A function that executes right after the HTTP request is made to the target resource and response is returned.
|
|
131
|
-
* This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
|
|
132
|
-
*
|
|
133
|
-
* **Example usage:**
|
|
134
|
-
*
|
|
135
|
-
* ```javascript
|
|
136
|
-
* const cheerioCrawlerOptions = {
|
|
137
|
-
* // ...
|
|
138
|
-
* postResponseFunction: ({ request, response }) => {
|
|
139
|
-
* if (request.userData.parseAsJSON) {
|
|
140
|
-
* response.headers['content-type'] = 'application/json; charset=utf-8';
|
|
141
|
-
* }
|
|
142
|
-
* }
|
|
143
|
-
* }
|
|
144
|
-
* ```
|
|
145
|
-
* The function receives the following object as an argument:
|
|
146
|
-
* ```
|
|
147
|
-
* {
|
|
148
|
-
* response: Object,
|
|
149
|
-
* request: Request,
|
|
150
|
-
* session: Session,
|
|
151
|
-
* proxyInfo: ProxyInfo,
|
|
152
|
-
* crawler: CheerioCrawler,
|
|
153
|
-
* }
|
|
154
|
-
* ```
|
|
155
|
-
* The response is an instance of Node's http.IncomingMessage object.
|
|
156
|
-
*
|
|
157
|
-
* @property {number} [handlePageTimeoutSecs=60]
|
|
158
|
-
* Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
|
|
159
|
-
* @property {number} [requestTimeoutSecs=30]
|
|
160
|
-
* Timeout in which the HTTP request to the resource needs to finish, given in seconds.
|
|
161
|
-
* @property {boolean} [ignoreSslErrors=true]
|
|
162
|
-
* If set to true, SSL certificate errors will be ignored.
|
|
163
|
-
* @property {ProxyConfiguration} [proxyConfiguration]
|
|
164
|
-
* If set, `CheerioCrawler` will be configured for all connections to use
|
|
165
|
-
* [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
|
|
166
|
-
* For more information, see the [documentation](https://docs.apify.com/proxy).
|
|
167
|
-
* @property {HandleFailedRequest} [handleFailedRequestFunction]
|
|
168
|
-
* A function to handle requests that failed more than `option.maxRequestRetries` times.
|
|
169
|
-
* The function receives the following object as an argument:
|
|
170
|
-
* ```
|
|
171
|
-
* {
|
|
172
|
-
* error: Error,
|
|
173
|
-
* request: Request,
|
|
174
|
-
* session: Session,
|
|
175
|
-
* $: Cheerio,
|
|
176
|
-
* body: String|Buffer,
|
|
177
|
-
* json: Object,
|
|
178
|
-
* contentType: Object,
|
|
179
|
-
* response: Object,
|
|
180
|
-
* proxyInfo: ProxyInfo,
|
|
181
|
-
* crawler: CheerioCrawler,
|
|
182
|
-
* }
|
|
183
|
-
* ```
|
|
184
|
-
* where the {@link Request} instance corresponds to the failed request, and the `Error` instance
|
|
185
|
-
* represents the last error thrown during processing of the request.
|
|
186
|
-
*
|
|
187
|
-
* See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
|
|
188
|
-
* for the default implementation of this function.
|
|
189
|
-
* @property {Array<Hook>} [preNavigationHooks]
|
|
190
|
-
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
|
|
191
|
-
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
|
|
192
|
-
* which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
|
|
193
|
-
* Example:
|
|
194
|
-
* ```
|
|
195
|
-
* preNavigationHooks: [
|
|
196
|
-
* async (crawlingContext, requestAsBrowserOptions) => {
|
|
197
|
-
* requestAsBrowserOptions.forceUrlEncoding = true;
|
|
198
|
-
* },
|
|
199
|
-
* ]
|
|
200
|
-
* ```
|
|
201
|
-
* @property {Array<Hook>} [postNavigationHooks]
|
|
202
|
-
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
|
|
203
|
-
* The function accepts `crawlingContext` as the only parameter.
|
|
204
|
-
* Example:
|
|
205
|
-
* ```
|
|
206
|
-
* postNavigationHooks: [
|
|
207
|
-
* async (crawlingContext) => {
|
|
208
|
-
* // ...
|
|
209
|
-
* },
|
|
210
|
-
* ]
|
|
211
|
-
* ```
|
|
212
|
-
* @property {string[]} [additionalMimeTypes]
|
|
213
|
-
* An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
|
|
214
|
-
* target="_blank">MIME types</a> you want the crawler to load and process.
|
|
215
|
-
* By default, the `text/html`, `text/xml`, `application/xhtml+xml`, `application/xml` and `application/json` MIME types are supported.
|
|
216
|
-
* @property {string} [suggestResponseEncoding]
|
|
217
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
|
|
218
|
-
* Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
|
|
219
|
-
* If those sites actually use a different encoding, the response will be corrupted. You can use
|
|
220
|
-
* `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
|
|
221
|
-
* To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
|
|
222
|
-
* ```
|
|
223
|
-
* // Will fall back to windows-1250 encoding if none found
|
|
224
|
-
* suggestResponseEncoding: 'windows-1250'
|
|
225
|
-
* ```
|
|
226
|
-
* @property {string} [forceResponseEncoding]
|
|
227
|
-
* By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
|
|
228
|
-
* to force a certain encoding, disregarding the response headers.
|
|
229
|
-
* To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
|
|
230
|
-
* ```
|
|
231
|
-
* // Will force windows-1250 encoding even if headers say otherwise
|
|
232
|
-
* forceResponseEncoding: 'windows-1250'
|
|
233
|
-
* ```
|
|
234
|
-
* @property {number} [maxRequestRetries=3]
|
|
235
|
-
* Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
|
|
236
|
-
* @property {number} [maxRequestsPerCrawl]
|
|
237
|
-
* Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
|
|
238
|
-
* Always set this value in order to prevent infinite loops in misconfigured crawlers.
|
|
239
|
-
* Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
|
|
240
|
-
* @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
|
|
241
|
-
* Custom options passed to the underlying {@link AutoscaledPool} constructor.
|
|
242
|
-
* Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
|
|
243
|
-
* are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter}
|
|
244
|
-
* and {@link SystemStatus} defaults are provided to account for the fact that `cheerio`
|
|
245
|
-
* parses HTML synchronously and therefore blocks the event loop.
|
|
246
|
-
* @property {number} [minConcurrency=1]
|
|
247
|
-
* Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
|
|
248
|
-
*
|
|
249
|
-
* *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
|
|
250
|
-
* If you're not sure, just keep the default value and the concurrency will scale up automatically.
|
|
251
|
-
* @property {number} [maxConcurrency=1000]
|
|
252
|
-
* Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
|
|
253
|
-
* @property {boolean} [useSessionPool=true]
|
|
254
|
-
* If set to true Crawler will automatically use Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
|
|
255
|
-
* It also marks Session as bad after a request timeout.
|
|
256
|
-
* @property {SessionPoolOptions} [sessionPoolOptions]
|
|
257
|
-
* Custom options passed to the underlying {@link SessionPool} constructor.
|
|
258
|
-
* @property {boolean} [persistCookiesPerSession]
|
|
259
|
-
* Automatically saves cookies to Session. Works only if Session Pool is used.
|
|
260
|
-
*
|
|
261
|
-
* It parses cookies from the response "set-cookie" header and saves or updates the cookies for the session. Once the session is used for the next request,
|
|
262
|
-
* it passes the "Cookie" header to the request with the session cookies.
|
|
263
|
-
*/
|
|
264
|
-
/**
|
|
265
|
-
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
|
|
266
|
-
* [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
|
|
267
|
-
* The URLs to crawl are fed either from a static list of URLs
|
|
268
|
-
* or from a dynamic queue of URLs enabling recursive crawling of websites.
|
|
269
|
-
*
|
|
270
|
-
* Since `CheerioCrawler` uses raw HTTP requests to download web pages,
|
|
271
|
-
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
|
|
272
|
-
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
|
|
273
|
-
* because they load the pages using a full-featured headless Chrome browser.
|
|
274
|
-
*
|
|
275
|
-
* `CheerioCrawler` downloads each URL using a plain HTTP request,
|
|
276
|
-
* parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
|
|
277
|
-
* and then invokes the user-provided {@link CheerioCrawlerOptions.handlePageFunction} to extract page data
|
|
278
|
-
* using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
|
|
279
|
-
*
|
|
280
|
-
* The source URLs are represented using {@link Request} objects that are fed from
|
|
281
|
-
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
|
|
282
|
-
* or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
|
|
283
|
-
*
|
|
284
|
-
* If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
|
|
285
|
-
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
|
|
286
|
-
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
|
|
287
|
-
*
|
|
288
|
-
* The crawler finishes when there are no more {@link Request} objects to crawl.
|
|
289
|
-
*
|
|
290
|
-
* `CheerioCrawler` downloads the web pages using the `{@link utils#requestAsBrowser}` utility function.
|
|
291
|
-
* As opposed to the browser based crawlers that are automatically encoding the URLs, the
|
|
292
|
-
* `{@link utils#requestAsBrowser}` function will not do so. We either need to manually encode the URLs
|
|
293
|
-
* via `encodeURI()` function, or set `forceUrlEncoding: true` in the `requestAsBrowserOptions`,
|
|
294
|
-
* which will automatically encode all the URLs before accessing them.
|
|
295
|
-
*
|
|
296
|
-
* > We can either use `forceUrlEncoding` or encode manually, but not both - it would
|
|
297
|
-
* > result in double encoding and therefore lead to invalid URLs.
|
|
298
|
-
*
|
|
299
|
-
* We can use the `preNavigationHooks` to adjust `requestAsBrowserOptions`:
|
|
300
|
-
*
|
|
301
|
-
* ```
|
|
302
|
-
* preNavigationHooks: [
|
|
303
|
-
* (crawlingContext, requestAsBrowserOptions) => {
|
|
304
|
-
* requestAsBrowserOptions.forceUrlEncoding = true;
|
|
305
|
-
* },
|
|
306
|
-
* ]
|
|
307
|
-
* ```
|
|
308
|
-
*
|
|
309
|
-
* By default, `CheerioCrawler` only processes web pages with the `text/html`, `text/xml`, `application/xml`, `application/json`
|
|
310
|
-
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
|
|
311
|
-
* and skips pages with other content types. If you want the crawler to process other content types,
|
|
312
|
-
* use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
|
|
313
|
-
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
|
|
314
|
-
* For details, see {@link CheerioCrawlerOptions.handlePageFunction}.
|
|
315
|
-
*
|
|
316
|
-
* New requests are only dispatched when there is enough free CPU and memory available,
|
|
317
|
-
* using the functionality provided by the {@link AutoscaledPool} class.
|
|
318
|
-
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
|
|
319
|
-
* parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
|
|
320
|
-
* {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
|
|
321
|
-
*
|
|
322
|
-
* **Example usage:**
|
|
323
|
-
*
|
|
324
|
-
* ```javascript
|
|
325
|
-
* // Prepare a list of URLs to crawl
|
|
326
|
-
* const requestList = new Apify.RequestList({
|
|
327
|
-
* sources: [
|
|
328
|
-
* { url: 'http://www.example.com/page-1' },
|
|
329
|
-
* { url: 'http://www.example.com/page-2' },
|
|
330
|
-
* ],
|
|
331
|
-
* });
|
|
332
|
-
* await requestList.initialize();
|
|
333
|
-
*
|
|
334
|
-
* // Crawl the URLs
|
|
335
|
-
* const crawler = new Apify.CheerioCrawler({
|
|
336
|
-
* requestList,
|
|
337
|
-
* handlePageFunction: async ({ request, response, body, contentType, $ }) => {
|
|
338
|
-
* const data = [];
|
|
339
|
-
*
|
|
340
|
-
* // Do some data extraction from the page with Cheerio.
|
|
341
|
-
* $('.some-collection').each((index, el) => {
|
|
342
|
-
* data.push({ title: $(el).find('.some-title').text() });
|
|
343
|
-
* });
|
|
344
|
-
*
|
|
345
|
-
* // Save the data to dataset.
|
|
346
|
-
* await Apify.pushData({
|
|
347
|
-
* url: request.url,
|
|
348
|
-
* html: body,
|
|
349
|
-
* data,
|
|
350
|
-
* })
|
|
351
|
-
* },
|
|
352
|
-
* });
|
|
353
|
-
*
|
|
354
|
-
* await crawler.run();
|
|
355
|
-
* ```
|
|
356
|
-
* @property {Statistics} stats
|
|
357
|
-
* Contains statistics about the current run.
|
|
358
|
-
* @property {?RequestList} requestList
|
|
359
|
-
* A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
|
|
360
|
-
* Only available if used by the crawler.
|
|
361
|
-
* @property {?RequestQueue} requestQueue
|
|
362
|
-
* A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
|
|
363
|
-
* Only available if used by the crawler.
|
|
364
|
-
* @property {?SessionPool} sessionPool
|
|
365
|
-
* A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
|
|
366
|
-
* Only available if used by the crawler.
|
|
367
|
-
* @property {?ProxyConfiguration} proxyConfiguration
|
|
368
|
-
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
369
|
-
* Only available if used by the crawler.
|
|
370
|
-
* @property {AutoscaledPool} autoscaledPool
|
|
371
|
-
* A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
|
|
372
|
-
* Note that this property is only initialized after calling the {@link CheerioCrawler#run} function.
|
|
373
|
-
* You can use it to change the concurrency settings on the fly,
|
|
374
|
-
* to pause the crawler by calling {@link AutoscaledPool#pause}
|
|
375
|
-
* or to abort it by calling {@link AutoscaledPool#abort}.
|
|
376
|
-
*/
|
|
377
|
-
class CheerioCrawler extends basic_crawler_1.BasicCrawler {
|
|
378
|
-
/**
|
|
379
|
-
* @param {CheerioCrawlerOptions} options
|
|
380
|
-
* All `CheerioCrawler` parameters are passed via an options object.
|
|
381
|
-
*/
|
|
382
|
-
constructor(options) {
|
|
383
|
-
(0, ow_1.default)(options, 'CheerioCrawlerOptions', ow_1.default.object.exactShape(CheerioCrawler.optionsShape));
|
|
384
|
-
const { handlePageFunction, requestTimeoutSecs = 30, handlePageTimeoutSecs = 60, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, prepareRequestFunction, postResponseFunction, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [],
|
|
385
|
-
// BasicCrawler
|
|
386
|
-
autoscaledPoolOptions = CHEERIO_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
|
|
387
|
-
super({
|
|
388
|
-
...basicCrawlerOptions,
|
|
389
|
-
// TODO temporary until the API is unified in V2
|
|
390
|
-
handleRequestFunction: handlePageFunction,
|
|
391
|
-
autoscaledPoolOptions,
|
|
392
|
-
// We need to add some time for internal functions to finish,
|
|
393
|
-
// but not too much so that we would stall the crawler.
|
|
394
|
-
handleRequestTimeoutSecs: requestTimeoutSecs + handlePageTimeoutSecs + constants_1.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
|
|
395
|
-
});
|
|
396
|
-
// Cookies should be persisted per session only if session pool is used
|
|
397
|
-
if (!this.useSessionPool && persistCookiesPerSession) {
|
|
398
|
-
throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
|
|
399
|
-
}
|
|
400
|
-
this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
|
|
401
|
-
if (additionalMimeTypes.length)
|
|
402
|
-
this._extendSupportedMimeTypes(additionalMimeTypes);
|
|
403
|
-
if (suggestResponseEncoding && forceResponseEncoding) {
|
|
404
|
-
this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
|
|
405
|
-
}
|
|
406
|
-
this.handlePageTimeoutMillis = handlePageTimeoutSecs * 1000;
|
|
407
|
-
this.requestTimeoutMillis = requestTimeoutSecs * 1000;
|
|
408
|
-
this.ignoreSslErrors = ignoreSslErrors;
|
|
409
|
-
this.suggestResponseEncoding = suggestResponseEncoding;
|
|
410
|
-
this.forceResponseEncoding = forceResponseEncoding;
|
|
411
|
-
this.prepareRequestFunction = prepareRequestFunction;
|
|
412
|
-
this.postResponseFunction = postResponseFunction;
|
|
413
|
-
this.proxyConfiguration = proxyConfiguration;
|
|
414
|
-
/**
|
|
415
|
-
* @type {Array<any>}
|
|
416
|
-
* @ignore
|
|
417
|
-
* */
|
|
418
|
-
this.preNavigationHooks = preNavigationHooks;
|
|
419
|
-
/**
|
|
420
|
-
* @type {Array<any>}
|
|
421
|
-
* @ignore
|
|
422
|
-
* */
|
|
423
|
-
this.postNavigationHooks = [
|
|
424
|
-
({ request, response }) => this._abortDownloadOfBody(request, response),
|
|
425
|
-
...postNavigationHooks,
|
|
426
|
-
];
|
|
427
|
-
if (this.useSessionPool) {
|
|
428
|
-
this.persistCookiesPerSession = persistCookiesPerSession !== undefined ? persistCookiesPerSession : true;
|
|
429
|
-
}
|
|
430
|
-
else {
|
|
431
|
-
this.persistCookiesPerSession = false;
|
|
432
|
-
}
|
|
433
|
-
}
|
|
434
|
-
/**
|
|
435
|
-
* **EXPERIMENTAL**
|
|
436
|
-
* Function for attaching CrawlerExtensions such as the Unblockers.
|
|
437
|
-
* @param {CrawlerExtension} extension - Crawler extension that overrides the crawler configuration.
|
|
438
|
-
*/
|
|
439
|
-
use(extension) {
|
|
440
|
-
(0, ow_1.default)(extension, ow_1.default.object.instanceOf(crawler_extension_1.default));
|
|
441
|
-
const extensionOptions = extension.getCrawlerOptions();
|
|
442
|
-
// TODO temporary until the API is unified in V2
|
|
443
|
-
extensionOptions.userProvidedHandler = extensionOptions.handlePageFunction;
|
|
444
|
-
delete extensionOptions.handlePageFunction;
|
|
445
|
-
for (const [key, value] of Object.entries(extensionOptions)) {
|
|
446
|
-
const isConfigurable = this.hasOwnProperty(key); // eslint-disable-line
|
|
447
|
-
const originalType = typeof this[key];
|
|
448
|
-
const extensionType = typeof value; // What if we want to null something? It is really needed?
|
|
449
|
-
const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
|
|
450
|
-
const exists = this[key] != null;
|
|
451
|
-
if (!isConfigurable) { // Test if the property can be configured on the crawler
|
|
452
|
-
throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on CheerioCrawler instance.`);
|
|
453
|
-
}
|
|
454
|
-
if (!isSameType && exists) { // Assuming that extensions will only add up configuration
|
|
455
|
-
throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "CheerioCrawler.${key}: ${originalType}".`);
|
|
456
|
-
}
|
|
457
|
-
this.log.warning(`${extension.name} is overriding "CheerioCrawler.${key}: ${originalType}" with ${value}.`);
|
|
458
|
-
this[key] = value;
|
|
459
|
-
}
|
|
460
|
-
}
|
|
461
|
-
/**
|
|
462
|
-
* Wrapper around handlePageFunction that opens and closes pages etc.
|
|
463
|
-
*
|
|
464
|
-
* @param {CrawlingContext} crawlingContext
|
|
465
|
-
* @ignore
|
|
466
|
-
* @protected
|
|
467
|
-
* @internal
|
|
468
|
-
*/
|
|
469
|
-
async _handleRequestFunction(crawlingContext) {
|
|
470
|
-
const { request, session } = crawlingContext;
|
|
471
|
-
if (this.proxyConfiguration) {
|
|
472
|
-
const sessionId = session ? session.id : undefined;
|
|
473
|
-
crawlingContext.proxyInfo = this.proxyConfiguration.newProxyInfo(sessionId);
|
|
474
|
-
}
|
|
475
|
-
await this._handleNavigation(crawlingContext);
|
|
476
|
-
(0, timeout_1.tryCancel)();
|
|
477
|
-
const { dom, isXml, body, contentType, response } = await this._parseResponse(request, crawlingContext.response);
|
|
478
|
-
(0, timeout_1.tryCancel)();
|
|
479
|
-
if (this.useSessionPool) {
|
|
480
|
-
this._throwOnBlockedRequest(session, response.statusCode);
|
|
481
|
-
}
|
|
482
|
-
if (this.persistCookiesPerSession) {
|
|
483
|
-
session.setCookiesFromResponse(response);
|
|
484
|
-
}
|
|
485
|
-
request.loadedUrl = response.url;
|
|
486
|
-
const $ = dom
|
|
487
|
-
? cheerio_1.default.load(dom, {
|
|
488
|
-
xmlMode: isXml,
|
|
489
|
-
// Recent versions of cheerio use parse5 as the HTML parser/serializer. It's more strict than htmlparser2
|
|
490
|
-
// and not good for scraping. It also does not have a great streaming interface.
|
|
491
|
-
// Here we tell cheerio to use htmlparser2 for serialization, otherwise the conflict produces weird errors.
|
|
492
|
-
_useHtmlParser2: true,
|
|
493
|
-
})
|
|
494
|
-
: null;
|
|
495
|
-
crawlingContext.$ = $;
|
|
496
|
-
crawlingContext.contentType = contentType;
|
|
497
|
-
crawlingContext.response = response;
|
|
498
|
-
Object.defineProperty(crawlingContext, 'json', {
|
|
499
|
-
get() {
|
|
500
|
-
if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
|
|
501
|
-
return null;
|
|
502
|
-
const jsonString = body.toString(contentType.encoding);
|
|
503
|
-
return JSON.parse(jsonString);
|
|
504
|
-
},
|
|
505
|
-
});
|
|
506
|
-
Object.defineProperty(crawlingContext, 'body', {
|
|
507
|
-
get() {
|
|
508
|
-
// NOTE: For XML/HTML documents, we don't store the original body and only reconstruct it from Cheerio's DOM.
|
|
509
|
-
// This is to save memory for high-concurrency crawls. The downside is that changes
|
|
510
|
-
// made to DOM are reflected in the HTML, but we can live with that...
|
|
511
|
-
if (dom) {
|
|
512
|
-
return isXml ? $.xml() : $.html({ decodeEntities: false });
|
|
513
|
-
}
|
|
514
|
-
return body;
|
|
515
|
-
},
|
|
516
|
-
});
|
|
517
|
-
return (0, timeout_1.addTimeoutToPromise)(() => this.userProvidedHandler(crawlingContext), this.handlePageTimeoutMillis, `handlePageFunction timed out after ${this.handlePageTimeoutMillis / 1000} seconds.`);
|
|
518
|
-
}
|
|
519
|
-
/**
|
|
520
|
-
* @param {CrawlingContext} crawlingContext
|
|
521
|
-
* @ignore
|
|
522
|
-
* @protected
|
|
523
|
-
* @internal
|
|
524
|
-
*/
|
|
525
|
-
async _handleNavigation(crawlingContext) {
|
|
526
|
-
if (this.prepareRequestFunction) {
|
|
527
|
-
this.log.deprecated('Option "prepareRequestFunction" is deprecated. Use "preNavigationHooks" instead.');
|
|
528
|
-
await this.prepareRequestFunction(crawlingContext);
|
|
529
|
-
(0, timeout_1.tryCancel)();
|
|
530
|
-
}
|
|
531
|
-
const requestAsBrowserOptions = {};
|
|
532
|
-
if (this.useSessionPool) {
|
|
533
|
-
this._applySessionCookie(crawlingContext, requestAsBrowserOptions);
|
|
534
|
-
}
|
|
535
|
-
const { request, session } = crawlingContext;
|
|
536
|
-
const cookieSnapshot = request.headers.Cookie ?? request.headers.cookie;
|
|
537
|
-
await this._executeHooks(this.preNavigationHooks, crawlingContext, requestAsBrowserOptions);
|
|
538
|
-
(0, timeout_1.tryCancel)();
|
|
539
|
-
const proxyUrl = crawlingContext.proxyInfo && crawlingContext.proxyInfo.url;
|
|
540
|
-
this._mergeRequestCookieDiff(request, cookieSnapshot, requestAsBrowserOptions);
|
|
541
|
-
crawlingContext.response = await (0, timeout_1.addTimeoutToPromise)(() => this._requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }), this.requestTimeoutMillis, `request timed out after ${this.requestTimeoutMillis / 1000} seconds.`);
|
|
542
|
-
(0, timeout_1.tryCancel)();
|
|
543
|
-
await this._executeHooks(this.postNavigationHooks, crawlingContext, requestAsBrowserOptions);
|
|
544
|
-
(0, timeout_1.tryCancel)();
|
|
545
|
-
if (this.postResponseFunction) {
|
|
546
|
-
this.log.deprecated('Option "postResponseFunction" is deprecated. Use "postNavigationHooks" instead.');
|
|
547
|
-
await this.postResponseFunction(crawlingContext);
|
|
548
|
-
(0, timeout_1.tryCancel)();
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
/**
|
|
552
|
-
* When users change `request.headers.cookie` inside preNavigationHook, the change would be ignored,
|
|
553
|
-
* as `request.headers` are already merged into the `requestAsBrowserOptions`. This method is using
|
|
554
|
-
* old `request.headers` snapshot (before hooks are executed), makes a diff with the cookie value
|
|
555
|
-
* after hooks are executed, and merges any new cookies back to `requestAsBrowserOptions`.
|
|
556
|
-
*
|
|
557
|
-
* This way cookies can be set in the hooks either via `requestAsBrowserOptions` or via `context.request` (but not via both at once).
|
|
558
|
-
*
|
|
559
|
-
* @param {Request} request
|
|
560
|
-
* @param {string} cookieSnapshot
|
|
561
|
-
* @param {RequestAsBrowserOptions} requestAsBrowserOptions
|
|
562
|
-
* @private
|
|
563
|
-
* @ignore
|
|
564
|
-
* @internal
|
|
565
|
-
*/
|
|
566
|
-
_mergeRequestCookieDiff(request, cookieSnapshot, requestAsBrowserOptions) {
|
|
567
|
-
const cookieDiff = (0, crawler_utils_1.diffCookies)(request.url, cookieSnapshot, request.headers.Cookie ?? request.headers.cookie);
|
|
568
|
-
if (cookieDiff.length > 0) {
|
|
569
|
-
requestAsBrowserOptions.headers ?? (requestAsBrowserOptions.headers = {});
|
|
570
|
-
requestAsBrowserOptions.headers.Cookie = (0, crawler_utils_1.mergeCookies)(request.url, [
|
|
571
|
-
requestAsBrowserOptions.headers.Cookie,
|
|
572
|
-
cookieDiff,
|
|
573
|
-
]);
|
|
574
|
-
}
|
|
575
|
-
}
|
|
576
|
-
/**
|
|
577
|
-
* Function to make the HTTP request. It performs optimizations
|
|
578
|
-
* on the request such as only downloading the response body if the
|
|
579
|
-
* received content type matches text/html, application/xml, application/xhtml+xml.
|
|
580
|
-
*
|
|
581
|
-
* @param {object} options
|
|
582
|
-
* @param {Request} options.request
|
|
583
|
-
* @param {Session} options.session
|
|
584
|
-
* @param {string} options.proxyUrl
|
|
585
|
-
* @param {RequestAsBrowserOptions} options.requestAsBrowserOptions
|
|
586
|
-
* @returns {Promise<IncomingMessage|Readable>}
|
|
587
|
-
* @ignore
|
|
588
|
-
* @protected
|
|
589
|
-
* @internal
|
|
590
|
-
*/
|
|
591
|
-
async _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }) {
|
|
592
|
-
const opts = this._getRequestOptions(request, session, proxyUrl, requestAsBrowserOptions);
|
|
593
|
-
let responseWithStream;
|
|
594
|
-
try {
|
|
595
|
-
responseWithStream = await (0, utils_request_1.requestAsBrowser)(opts);
|
|
596
|
-
}
|
|
597
|
-
catch (e) {
|
|
598
|
-
if (e instanceof got_scraping_1.TimeoutError) {
|
|
599
|
-
this._handleRequestTimeout(session);
|
|
600
|
-
}
|
|
601
|
-
else {
|
|
602
|
-
throw e;
|
|
603
|
-
}
|
|
604
|
-
}
|
|
605
|
-
return responseWithStream;
|
|
606
|
-
}
|
|
607
|
-
/**
|
|
608
|
-
* Sets the cookie header to `requestAsBrowserOptions` based on provided session and request. If some cookies were already set,
|
|
609
|
-
* the session cookie will be merged with them. User provided cookies on `request` object have precedence.
|
|
610
|
-
*
|
|
611
|
-
* @param {CrawlingContext} crawlingContext
|
|
612
|
-
* @param {RequestAsBrowserOptions} requestAsBrowserOptions
|
|
613
|
-
* @return {void}
|
|
614
|
-
* @ignore
|
|
615
|
-
* @private
|
|
616
|
-
* @internal
|
|
617
|
-
*/
|
|
618
|
-
_applySessionCookie({ request, session }, requestAsBrowserOptions) {
|
|
619
|
-
const userCookie = request.headers.Cookie ?? request.headers.cookie;
|
|
620
|
-
const sessionCookie = session.getCookieString(request.url);
|
|
621
|
-
const mergedCookies = (0, crawler_utils_1.mergeCookies)(request.url, [sessionCookie, userCookie]);
|
|
622
|
-
// merge cookies from all possible sources
|
|
623
|
-
if (mergedCookies) {
|
|
624
|
-
requestAsBrowserOptions.headers ?? (requestAsBrowserOptions.headers = {});
|
|
625
|
-
requestAsBrowserOptions.headers.Cookie = mergedCookies;
|
|
626
|
-
}
|
|
627
|
-
}
|
|
628
|
-
/**
|
|
629
|
-
* Encodes and parses response according to the provided content type
|
|
630
|
-
* @param {Request} request
|
|
631
|
-
* @param {IncomingMessage|Readable} responseStream
|
|
632
|
-
* @returns {Promise<object>}
|
|
633
|
-
* @ignore
|
|
634
|
-
* @protected
|
|
635
|
-
* @internal
|
|
636
|
-
*/
|
|
637
|
-
async _parseResponse(request, responseStream) {
|
|
638
|
-
const { statusCode } = responseStream;
|
|
639
|
-
const { type, charset } = (0, utils_1.parseContentTypeFromResponse)(responseStream);
|
|
640
|
-
const { response, encoding } = this._encodeResponse(request, responseStream, charset);
|
|
641
|
-
const contentType = { type, encoding };
|
|
642
|
-
if (statusCode >= 500) {
|
|
643
|
-
const body = await (0, utilities_1.readStreamToString)(response, encoding);
|
|
644
|
-
// Errors are often sent as JSON, so attempt to parse them,
|
|
645
|
-
// despite Accept header being set to text/html.
|
|
646
|
-
if (type === APPLICATION_JSON_MIME_TYPE) {
|
|
647
|
-
const errorResponse = JSON.parse(body);
|
|
648
|
-
let { message } = errorResponse;
|
|
649
|
-
if (!message)
|
|
650
|
-
message = util_1.default.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
|
|
651
|
-
throw new Error(`${statusCode} - ${message}`);
|
|
652
|
-
}
|
|
653
|
-
// It's not a JSON so it's probably some text. Get the first 100 chars of it.
|
|
654
|
-
throw new Error(`${statusCode} - Internal Server Error: ${body.substr(0, 100)}`);
|
|
655
|
-
}
|
|
656
|
-
else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
|
|
657
|
-
const dom = await this._parseHtmlToDom(response);
|
|
658
|
-
return ({ dom, isXml: type.includes('xml'), response, contentType });
|
|
659
|
-
}
|
|
660
|
-
else {
|
|
661
|
-
const body = await (0, utilities_1.concatStreamToBuffer)(response);
|
|
662
|
-
return { body, response, contentType };
|
|
663
|
-
}
|
|
664
|
-
}
|
|
665
|
-
/**
|
|
666
|
-
* Combines the provided `requestOptions` with mandatory (non-overridable) values.
|
|
667
|
-
* @param {Request} request
|
|
668
|
-
* @param {Session} [session]
|
|
669
|
-
* @param {string} [proxyUrl]
|
|
670
|
-
* @param {RequestAsBrowserOptions} [requestAsBrowserOptions]
|
|
671
|
-
* @ignore
|
|
672
|
-
* @protected
|
|
673
|
-
* @internal
|
|
674
|
-
*/
|
|
675
|
-
_getRequestOptions(request, session, proxyUrl, requestAsBrowserOptions) {
|
|
676
|
-
const requestOptions = {
|
|
677
|
-
url: request.url,
|
|
678
|
-
method: request.method,
|
|
679
|
-
proxyUrl,
|
|
680
|
-
timeout: { request: this.requestTimeoutMillis },
|
|
681
|
-
sessionToken: session,
|
|
682
|
-
...requestAsBrowserOptions,
|
|
683
|
-
headers: { ...request.headers, ...requestAsBrowserOptions.headers },
|
|
684
|
-
https: {
|
|
685
|
-
...requestAsBrowserOptions.https,
|
|
686
|
-
rejectUnauthorized: !this.ignoreSslErrors,
|
|
687
|
-
},
|
|
688
|
-
isStream: true,
|
|
689
|
-
};
|
|
690
|
-
// TODO this is incorrect, the check for man in the middle needs to be done
|
|
691
|
-
// on individual proxy level, not on the `proxyConfiguration` level,
|
|
692
|
-
// because users can use normal + MITM proxies in a single configuration.
|
|
693
|
-
// Disable SSL verification for MITM proxies
|
|
694
|
-
if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
|
|
695
|
-
requestOptions.https = {
|
|
696
|
-
...requestOptions.https,
|
|
697
|
-
rejectUnauthorized: false,
|
|
698
|
-
};
|
|
699
|
-
}
|
|
700
|
-
if (/PATCH|POST|PUT/.test(request.method))
|
|
701
|
-
requestOptions.body = request.payload;
|
|
702
|
-
return requestOptions;
|
|
703
|
-
}
|
|
704
|
-
/**
|
|
705
|
-
* @param {*} request
|
|
706
|
-
* @param {*} response
|
|
707
|
-
* @param {*} encoding
|
|
708
|
-
* @ignore
|
|
709
|
-
* @protected
|
|
710
|
-
* @internal
|
|
711
|
-
*/
|
|
712
|
-
_encodeResponse(request, response, encoding) {
|
|
713
|
-
if (this.forceResponseEncoding) {
|
|
714
|
-
encoding = this.forceResponseEncoding;
|
|
715
|
-
}
|
|
716
|
-
else if (!encoding && this.suggestResponseEncoding) {
|
|
717
|
-
encoding = this.suggestResponseEncoding;
|
|
718
|
-
}
|
|
719
|
-
// Fall back to utf-8 if we still don't have encoding.
|
|
720
|
-
const utf8 = 'utf8';
|
|
721
|
-
if (!encoding)
|
|
722
|
-
return { response, encoding: utf8 };
|
|
723
|
-
// This means that the encoding is one of Node.js supported
|
|
724
|
-
// encodings and we don't need to re-encode it.
|
|
725
|
-
if (Buffer.isEncoding(encoding))
|
|
726
|
-
return { response, encoding };
|
|
727
|
-
// Try to re-encode a variety of unsupported encodings to utf-8
|
|
728
|
-
if (iconv_lite_1.default.encodingExists(encoding)) {
|
|
729
|
-
const encodeStream = iconv_lite_1.default.encodeStream(utf8);
|
|
730
|
-
const decodeStream = iconv_lite_1.default.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
|
|
731
|
-
response.on('error', (err) => decodeStream.emit('error', err));
|
|
732
|
-
const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
|
|
733
|
-
encodedResponse.statusCode = response.statusCode;
|
|
734
|
-
encodedResponse.headers = response.headers;
|
|
735
|
-
encodedResponse.url = response.url;
|
|
736
|
-
return {
|
|
737
|
-
response: encodedResponse,
|
|
738
|
-
encoding: utf8,
|
|
739
|
-
};
|
|
740
|
-
}
|
|
741
|
-
throw new Error(`Resource ${request.url} served with unsupported charset/encoding: ${encoding}`);
|
|
742
|
-
}
|
|
743
|
-
/**
|
|
744
|
-
* @param {*} response
|
|
745
|
-
* @ignore
|
|
746
|
-
* @protected
|
|
747
|
-
* @internal
|
|
748
|
-
*/
|
|
749
|
-
async _parseHtmlToDom(response) {
|
|
750
|
-
return new Promise((resolve, reject) => {
|
|
751
|
-
const domHandler = new htmlparser2_1.DomHandler((err, dom) => {
|
|
752
|
-
if (err)
|
|
753
|
-
reject(err);
|
|
754
|
-
else
|
|
755
|
-
resolve(dom);
|
|
756
|
-
});
|
|
757
|
-
const parser = new WritableStream_1.WritableStream(domHandler, { decodeEntities: true });
|
|
758
|
-
parser.on('error', reject);
|
|
759
|
-
response
|
|
760
|
-
.on('error', reject)
|
|
761
|
-
.pipe(parser);
|
|
762
|
-
});
|
|
763
|
-
}
|
|
764
|
-
/**
|
|
765
|
-
* Checks and extends supported mime types
|
|
766
|
-
* @param {Array<(string|Object)>} additionalMimeTypes
|
|
767
|
-
* @ignore
|
|
768
|
-
* @protected
|
|
769
|
-
* @internal
|
|
770
|
-
*/
|
|
771
|
-
_extendSupportedMimeTypes(additionalMimeTypes) {
|
|
772
|
-
additionalMimeTypes.forEach((mimeType) => {
|
|
773
|
-
try {
|
|
774
|
-
const parsedType = content_type_1.default.parse(mimeType);
|
|
775
|
-
this.supportedMimeTypes.add(parsedType.type);
|
|
776
|
-
}
|
|
777
|
-
catch (err) {
|
|
778
|
-
throw new Error(`Can not parse mime type ${mimeType} from "options.additionalMimeTypes".`);
|
|
779
|
-
}
|
|
780
|
-
});
|
|
781
|
-
}
|
|
782
|
-
/**
|
|
783
|
-
* Handles blocked request
|
|
784
|
-
* @param {Session} session
|
|
785
|
-
* @param {number} statusCode
|
|
786
|
-
* @ignore
|
|
787
|
-
* @protected
|
|
788
|
-
* @internal
|
|
789
|
-
*/
|
|
790
|
-
_throwOnBlockedRequest(session, statusCode) {
|
|
791
|
-
const isBlocked = session.retireOnBlockedStatusCodes(statusCode);
|
|
792
|
-
if (isBlocked) {
|
|
793
|
-
throw new Error(`Request blocked - received ${statusCode} status code`);
|
|
794
|
-
}
|
|
795
|
-
}
|
|
796
|
-
/**
|
|
797
|
-
* Handles timeout request
|
|
798
|
-
* @param {Session} session
|
|
799
|
-
* @ignore
|
|
800
|
-
* @protected
|
|
801
|
-
* @internal
|
|
802
|
-
*/
|
|
803
|
-
_handleRequestTimeout(session) {
|
|
804
|
-
if (session)
|
|
805
|
-
session.markBad();
|
|
806
|
-
throw new Error(`request timed out after ${this.handlePageTimeoutMillis / 1000} seconds.`);
|
|
807
|
-
}
|
|
808
|
-
/**
|
|
809
|
-
* @param {Request} request
|
|
810
|
-
* @param {IncomingMessage|Readable} response
|
|
811
|
-
* @private
|
|
812
|
-
*/
|
|
813
|
-
_abortDownloadOfBody(request, response) {
|
|
814
|
-
const { statusCode } = response;
|
|
815
|
-
const { type } = (0, utils_1.parseContentTypeFromResponse)(response);
|
|
816
|
-
if (statusCode === 406) {
|
|
817
|
-
request.noRetry = true;
|
|
818
|
-
throw new Error(`Resource ${request.url} is not available in the format requested by the Accept header. Skipping resource.`);
|
|
819
|
-
}
|
|
820
|
-
if (!this.supportedMimeTypes.has(type) && statusCode < 500) {
|
|
821
|
-
request.noRetry = true;
|
|
822
|
-
throw new Error(`Resource ${request.url} served Content-Type ${type}, `
|
|
823
|
-
+ `but only ${Array.from(this.supportedMimeTypes).join(', ')} are allowed. Skipping resource.`);
|
|
824
|
-
}
|
|
825
|
-
}
|
|
826
|
-
}
|
|
827
|
-
/**
|
|
828
|
-
* @internal
|
|
829
|
-
* @type any
|
|
830
|
-
*/
|
|
831
|
-
Object.defineProperty(CheerioCrawler, "optionsShape", {
    value: {
        // Start from everything BasicCrawler accepts...
        ...basic_crawler_1.BasicCrawler.optionsShape,
        // TODO temporary until the API is unified in V2
        // ...but `handleRequestFunction` must stay unset; `handlePageFunction` is required instead.
        handleRequestFunction: ow_1.default.undefined,
        handlePageFunction: ow_1.default.function,
        // Timeouts and TLS handling
        requestTimeoutSecs: ow_1.default.optional.number,
        handlePageTimeoutSecs: ow_1.default.optional.number,
        ignoreSslErrors: ow_1.default.optional.boolean,
        // Content type and encoding handling
        additionalMimeTypes: ow_1.default.optional.array.ofType(ow_1.default.string),
        suggestResponseEncoding: ow_1.default.optional.string,
        forceResponseEncoding: ow_1.default.optional.string,
        proxyConfiguration: ow_1.default.optional.object.validate(validators_1.validators.proxyConfiguration),
        // Callbacks and hooks around the request
        prepareRequestFunction: ow_1.default.optional.function,
        postResponseFunction: ow_1.default.optional.function,
        persistCookiesPerSession: ow_1.default.optional.boolean,
        preNavigationHooks: ow_1.default.optional.array,
        postNavigationHooks: ow_1.default.optional.array,
    },
    writable: true,
    enumerable: true,
    configurable: true,
});
exports.default = CheerioCrawler;
|
|
855
|
-
/**
|
|
856
|
-
* @typedef PrepareRequestInputs
|
|
857
|
-
* @property {Request} request
|
|
858
|
-
* Original instance of the {@link Request} object. Must be modified in-place.
|
|
859
|
-
* @property {Session} [session]
|
|
860
|
-
* The current session
|
|
861
|
-
* @property {ProxyInfo} [proxyInfo]
|
|
862
|
-
* An object with information about currently used proxy by the crawler
|
|
863
|
-
* and configured by the {@link ProxyConfiguration} class.
|
|
864
|
-
* @property {CheerioCrawler} [crawler]
|
|
865
|
-
*/
|
|
866
|
-
/**
|
|
867
|
-
* @callback PrepareRequest
|
|
868
|
-
* @param {PrepareRequestInputs} inputs Arguments passed to this callback.
|
|
869
|
-
* @returns {(void|Promise<void>)}
|
|
870
|
-
*/
|
|
871
|
-
/**
|
|
872
|
-
* @typedef PostResponseInputs
|
|
873
|
-
* @property {(IncomingMessage|Readable)} response stream
|
|
874
|
-
* @property {Request} request
|
|
875
|
-
* Original instance of the {@link Request} object. Must be modified in-place.
|
|
876
|
-
* @property {Session} [session]
|
|
877
|
-
* The current session
|
|
878
|
-
* @property {ProxyInfo} [proxyInfo]
|
|
879
|
-
* An object with information about currently used proxy by the crawler
|
|
880
|
-
* and configured by the {@link ProxyConfiguration} class.
|
|
881
|
-
* @property {CheerioCrawler} crawler
|
|
882
|
-
*/
|
|
883
|
-
/**
|
|
884
|
-
* @callback PostResponse
|
|
885
|
-
* @param {PostResponseInputs} inputs Arguments passed to this callback.
|
|
886
|
-
* @returns {(void|Promise<void>)}
|
|
887
|
-
*/
|
|
888
|
-
/**
|
|
889
|
-
* @typedef CheerioHandlePageInputs
|
|
890
|
-
* @property {CheerioAPI} $
|
|
891
|
-
* The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
|
|
892
|
-
* @property {(string|Buffer)} body
|
|
893
|
-
* The response body of the web page.
|
|
894
|
-
* @property {*} json
|
|
895
|
-
* The parsed object from JSON string if the response contains the content type application/json.
|
|
896
|
-
* @property {Request} request
|
|
897
|
-
* The original {@link Request} object.
|
|
898
|
-
* @property {{ type: string, encoding: string }} contentType
|
|
899
|
-
* Parsed `Content-Type` header: `{ type, encoding }`.
|
|
900
|
-
* @property {IncomingMessage} response
|
|
901
|
-
* An instance of Node's [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object.
|
|
902
|
-
* @property {Session} session
|
|
903
|
-
* @property {ProxyInfo} proxyInfo
|
|
904
|
-
* An object with information about currently used proxy by the crawler
|
|
905
|
-
* and configured by the {@link ProxyConfiguration} class.
|
|
906
|
-
* @property {CheerioCrawler} crawler
|
|
907
|
-
*/
|
|
908
|
-
/**
|
|
909
|
-
* @callback CheerioHandlePage
|
|
910
|
-
* @param {CheerioHandlePageInputs} inputs Arguments passed to this callback.
|
|
911
|
-
* @returns {Promise<void>}
|
|
912
|
-
*/
|
|
913
|
-
//# sourceMappingURL=cheerio_crawler.js.map
|