apify 2.3.1-beta.4 → 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/README.md +6 -5
  2. package/package.json +69 -128
  3. package/build/actor.d.ts +0 -113
  4. package/build/actor.d.ts.map +0 -1
  5. package/build/actor.js +0 -582
  6. package/build/actor.js.map +0 -1
  7. package/build/apify.d.ts +0 -752
  8. package/build/apify.d.ts.map +0 -1
  9. package/build/apify.js +0 -877
  10. package/build/apify.js.map +0 -1
  11. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  12. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  13. package/build/autoscaling/autoscaled_pool.js +0 -557
  14. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  15. package/build/autoscaling/snapshotter.d.ts +0 -278
  16. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  17. package/build/autoscaling/snapshotter.js +0 -447
  18. package/build/autoscaling/snapshotter.js.map +0 -1
  19. package/build/autoscaling/system_status.d.ts +0 -224
  20. package/build/autoscaling/system_status.d.ts.map +0 -1
  21. package/build/autoscaling/system_status.js +0 -228
  22. package/build/autoscaling/system_status.js.map +0 -1
  23. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  24. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  25. package/build/browser_launchers/browser_launcher.js +0 -160
  26. package/build/browser_launchers/browser_launcher.js.map +0 -1
  27. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  28. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  29. package/build/browser_launchers/browser_plugin.js +0 -25
  30. package/build/browser_launchers/browser_plugin.js.map +0 -1
  31. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  32. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  33. package/build/browser_launchers/playwright_launcher.js +0 -150
  34. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  35. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  36. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  37. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  38. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  39. package/build/cache_container.d.ts +0 -31
  40. package/build/cache_container.d.ts.map +0 -1
  41. package/build/cache_container.js +0 -48
  42. package/build/cache_container.js.map +0 -1
  43. package/build/configuration.d.ts +0 -226
  44. package/build/configuration.d.ts.map +0 -1
  45. package/build/configuration.js +0 -325
  46. package/build/configuration.js.map +0 -1
  47. package/build/constants.d.ts +0 -37
  48. package/build/constants.d.ts.map +0 -1
  49. package/build/constants.js +0 -41
  50. package/build/constants.js.map +0 -1
  51. package/build/crawlers/basic_crawler.d.ts +0 -443
  52. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  53. package/build/crawlers/basic_crawler.js +0 -664
  54. package/build/crawlers/basic_crawler.js.map +0 -1
  55. package/build/crawlers/browser_crawler.d.ts +0 -512
  56. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  57. package/build/crawlers/browser_crawler.js +0 -540
  58. package/build/crawlers/browser_crawler.js.map +0 -1
  59. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  60. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  61. package/build/crawlers/cheerio_crawler.js +0 -913
  62. package/build/crawlers/cheerio_crawler.js.map +0 -1
  63. package/build/crawlers/crawler_extension.d.ts +0 -10
  64. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  65. package/build/crawlers/crawler_extension.js +0 -19
  66. package/build/crawlers/crawler_extension.js.map +0 -1
  67. package/build/crawlers/crawler_utils.d.ts +0 -34
  68. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  69. package/build/crawlers/crawler_utils.js +0 -87
  70. package/build/crawlers/crawler_utils.js.map +0 -1
  71. package/build/crawlers/playwright_crawler.d.ts +0 -448
  72. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  73. package/build/crawlers/playwright_crawler.js +0 -299
  74. package/build/crawlers/playwright_crawler.js.map +0 -1
  75. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  76. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  77. package/build/crawlers/puppeteer_crawler.js +0 -299
  78. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  79. package/build/crawlers/statistics.d.ts +0 -185
  80. package/build/crawlers/statistics.d.ts.map +0 -1
  81. package/build/crawlers/statistics.js +0 -331
  82. package/build/crawlers/statistics.js.map +0 -1
  83. package/build/enqueue_links/click_elements.d.ts +0 -179
  84. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  85. package/build/enqueue_links/click_elements.js +0 -434
  86. package/build/enqueue_links/click_elements.js.map +0 -1
  87. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  88. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  89. package/build/enqueue_links/enqueue_links.js +0 -163
  90. package/build/enqueue_links/enqueue_links.js.map +0 -1
  91. package/build/enqueue_links/shared.d.ts +0 -42
  92. package/build/enqueue_links/shared.d.ts.map +0 -1
  93. package/build/enqueue_links/shared.js +0 -121
  94. package/build/enqueue_links/shared.js.map +0 -1
  95. package/build/errors.d.ts +0 -29
  96. package/build/errors.d.ts.map +0 -1
  97. package/build/errors.js +0 -38
  98. package/build/errors.js.map +0 -1
  99. package/build/events.d.ts +0 -11
  100. package/build/events.d.ts.map +0 -1
  101. package/build/events.js +0 -147
  102. package/build/events.js.map +0 -1
  103. package/build/index.d.ts +0 -4
  104. package/build/index.d.ts.map +0 -1
  105. package/build/index.js +0 -7
  106. package/build/index.js.map +0 -1
  107. package/build/main.d.ts +0 -179
  108. package/build/main.d.ts.map +0 -1
  109. package/build/main.js +0 -81
  110. package/build/main.js.map +0 -1
  111. package/build/playwright_utils.d.ts +0 -9
  112. package/build/playwright_utils.d.ts.map +0 -1
  113. package/build/playwright_utils.js +0 -90
  114. package/build/playwright_utils.js.map +0 -1
  115. package/build/proxy_configuration.d.ts +0 -411
  116. package/build/proxy_configuration.d.ts.map +0 -1
  117. package/build/proxy_configuration.js +0 -517
  118. package/build/proxy_configuration.js.map +0 -1
  119. package/build/pseudo_url.d.ts +0 -86
  120. package/build/pseudo_url.d.ts.map +0 -1
  121. package/build/pseudo_url.js +0 -153
  122. package/build/pseudo_url.js.map +0 -1
  123. package/build/puppeteer_request_interception.d.ts +0 -8
  124. package/build/puppeteer_request_interception.d.ts.map +0 -1
  125. package/build/puppeteer_request_interception.js +0 -235
  126. package/build/puppeteer_request_interception.js.map +0 -1
  127. package/build/puppeteer_utils.d.ts +0 -250
  128. package/build/puppeteer_utils.d.ts.map +0 -1
  129. package/build/puppeteer_utils.js +0 -551
  130. package/build/puppeteer_utils.js.map +0 -1
  131. package/build/request.d.ts +0 -180
  132. package/build/request.d.ts.map +0 -1
  133. package/build/request.js +0 -261
  134. package/build/request.js.map +0 -1
  135. package/build/request_list.d.ts +0 -581
  136. package/build/request_list.d.ts.map +0 -1
  137. package/build/request_list.js +0 -826
  138. package/build/request_list.js.map +0 -1
  139. package/build/serialization.d.ts +0 -5
  140. package/build/serialization.d.ts.map +0 -1
  141. package/build/serialization.js +0 -139
  142. package/build/serialization.js.map +0 -1
  143. package/build/session_pool/errors.d.ts +0 -11
  144. package/build/session_pool/errors.d.ts.map +0 -1
  145. package/build/session_pool/errors.js +0 -18
  146. package/build/session_pool/errors.js.map +0 -1
  147. package/build/session_pool/events.d.ts +0 -5
  148. package/build/session_pool/events.d.ts.map +0 -1
  149. package/build/session_pool/events.js +0 -6
  150. package/build/session_pool/events.js.map +0 -1
  151. package/build/session_pool/session.d.ts +0 -286
  152. package/build/session_pool/session.d.ts.map +0 -1
  153. package/build/session_pool/session.js +0 -355
  154. package/build/session_pool/session.js.map +0 -1
  155. package/build/session_pool/session_pool.d.ts +0 -280
  156. package/build/session_pool/session_pool.d.ts.map +0 -1
  157. package/build/session_pool/session_pool.js +0 -393
  158. package/build/session_pool/session_pool.js.map +0 -1
  159. package/build/session_pool/session_utils.d.ts +0 -4
  160. package/build/session_pool/session_utils.d.ts.map +0 -1
  161. package/build/session_pool/session_utils.js +0 -24
  162. package/build/session_pool/session_utils.js.map +0 -1
  163. package/build/stealth/hiding_tricks.d.ts +0 -22
  164. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  165. package/build/stealth/hiding_tricks.js +0 -308
  166. package/build/stealth/hiding_tricks.js.map +0 -1
  167. package/build/stealth/stealth.d.ts +0 -56
  168. package/build/stealth/stealth.d.ts.map +0 -1
  169. package/build/stealth/stealth.js +0 -125
  170. package/build/stealth/stealth.js.map +0 -1
  171. package/build/storages/dataset.d.ts +0 -288
  172. package/build/storages/dataset.d.ts.map +0 -1
  173. package/build/storages/dataset.js +0 -480
  174. package/build/storages/dataset.js.map +0 -1
  175. package/build/storages/key_value_store.d.ts +0 -243
  176. package/build/storages/key_value_store.d.ts.map +0 -1
  177. package/build/storages/key_value_store.js +0 -462
  178. package/build/storages/key_value_store.js.map +0 -1
  179. package/build/storages/request_queue.d.ts +0 -318
  180. package/build/storages/request_queue.d.ts.map +0 -1
  181. package/build/storages/request_queue.js +0 -636
  182. package/build/storages/request_queue.js.map +0 -1
  183. package/build/storages/storage_manager.d.ts +0 -87
  184. package/build/storages/storage_manager.d.ts.map +0 -1
  185. package/build/storages/storage_manager.js +0 -150
  186. package/build/storages/storage_manager.js.map +0 -1
  187. package/build/tsconfig.tsbuildinfo +0 -1
  188. package/build/typedefs.d.ts +0 -146
  189. package/build/typedefs.d.ts.map +0 -1
  190. package/build/typedefs.js +0 -88
  191. package/build/typedefs.js.map +0 -1
  192. package/build/utils.d.ts +0 -175
  193. package/build/utils.d.ts.map +0 -1
  194. package/build/utils.js +0 -731
  195. package/build/utils.js.map +0 -1
  196. package/build/utils_log.d.ts +0 -41
  197. package/build/utils_log.d.ts.map +0 -1
  198. package/build/utils_log.js +0 -192
  199. package/build/utils_log.js.map +0 -1
  200. package/build/utils_request.d.ts +0 -77
  201. package/build/utils_request.d.ts.map +0 -1
  202. package/build/utils_request.js +0 -385
  203. package/build/utils_request.js.map +0 -1
  204. package/build/utils_social.d.ts +0 -210
  205. package/build/utils_social.d.ts.map +0 -1
  206. package/build/utils_social.js +0 -787
  207. package/build/utils_social.js.map +0 -1
  208. package/build/validators.d.ts +0 -23
  209. package/build/validators.d.ts.map +0 -1
  210. package/build/validators.js +0 -29
  211. package/build/validators.js.map +0 -1
@@ -1,913 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- const tslib_1 = require("tslib");
- /* eslint-disable class-methods-use-this */
- const utilities_1 = require("@apify/utilities");
- const timeout_1 = require("@apify/timeout");
- const cheerio_1 = (0, tslib_1.__importDefault)(require("cheerio")); // eslint-disable-line import/no-duplicates
- const content_type_1 = (0, tslib_1.__importDefault)(require("content-type"));
- const htmlparser2_1 = require("htmlparser2");
- const WritableStream_1 = require("htmlparser2/lib/WritableStream");
- const iconv_lite_1 = (0, tslib_1.__importDefault)(require("iconv-lite"));
- const ow_1 = (0, tslib_1.__importDefault)(require("ow"));
- const util_1 = (0, tslib_1.__importDefault)(require("util"));
- const got_scraping_1 = require("got-scraping");
- const constants_1 = require("../constants");
- const utils_1 = require("../utils");
- const utils_request_1 = require("../utils_request"); // eslint-disable-line import/no-duplicates
- const crawler_utils_1 = require("./crawler_utils");
- const basic_crawler_1 = require("./basic_crawler"); // eslint-disable-line import/no-duplicates
- const crawler_extension_1 = (0, tslib_1.__importDefault)(require("./crawler_extension"));
- const validators_1 = require("../validators");
- /* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
- /**
- * Default MIME types that CheerioCrawler supports.
- */
- const HTML_AND_XML_MIME_TYPES = ['text/html', 'text/xml', 'application/xhtml+xml', 'application/xml'];
- const APPLICATION_JSON_MIME_TYPE = 'application/json';
- const CHEERIO_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
- snapshotterOptions: {
- eventLoopSnapshotIntervalSecs: 2,
- maxBlockedMillis: 100,
- },
- systemStatusOptions: {
- maxEventLoopOverloadedRatio: 0.7,
- },
- };
- /**
- * @typedef CheerioCrawlerOptions
- * @property {CheerioHandlePage} handlePageFunction
- * User-provided function that performs the logic of the crawler. It is called for each page
- * loaded and parsed by the crawler.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * // The Cheerio object's function with the parsed HTML.
- * $: Cheerio,
- *
- * // The request body of the web page, whose type depends on the content type.
- * body: String|Buffer,
- *
- * // The parsed object from JSON for responses with the "application/json" content types.
- * // For other content types it's null.
- * json: Object,
- *
- * // Apify.Request object with details of the requested web page
- * request: Request,
- *
- * // Parsed Content-Type HTTP header: { type, encoding }
- * contentType: Object,
- *
- * // An instance of Node's http.IncomingMessage object,
- * response: Object,
- *
- * // Session object, useful to work around anti-scraping protections
- * session: Session
- *
- * // ProxyInfo object with information about currently used proxy
- * proxyInfo: ProxyInfo
- *
- * // The running cheerio crawler instance.
- * crawler: CheerioCrawler
- * }
- * ```
- *
- * Type of `body` depends on the `Content-Type` header of the web page:
- * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
- * - Buffer for other MIME content types
- *
- * Parsed `Content-Type` header using
- * [content-type package](https://www.npmjs.com/package/content-type)
- * is stored in `contentType`.
- *
- * Cheerio is available only for HTML and XML content types.
- *
- * With the {@link Request} object representing the URL to crawl.
- *
- * If the function returns a promise, it is awaited by the crawler.
- *
- * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to `option.maxRequestRetries` times.
- * If all the retries fail, the crawler calls the function
- * provided to the `handleFailedRequestFunction` parameter.
- * To make this work, you should **always**
- * let your function throw exceptions rather than catch them.
- * The exceptions are logged to the request using the
- * {@link Request#pushErrorMessage} function.
- * @property {RequestList} [requestList]
- * Static list of URLs to be processed.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {RequestQueue} [requestQueue]
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {PrepareRequest} [prepareRequestFunction]
- * > This option is deprecated, use `preNavigationHooks` instead.
- *
- * A function that executes before the HTTP request is made to the target resource.
- * This function is suitable for setting dynamic properties such as cookies on the {@link Request}.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request} instance corresponds to the initialized request
- * and the {@link Session} instance corresponds to the session used.
- *
- * The function should modify the properties of the passed {@link Request} instance
- * in place because there are already earlier references to it. Making a copy and returning it from
- * this function is therefore not supported, because it would create inconsistencies where
- * different parts of the SDK would have access to a different {@link Request} instance.
- *
- * @property {PostResponse} [postResponseFunction]
- * > This option is deprecated, use `postNavigationHooks` instead.
- *
- * A function that executes right after the HTTP request is made to the target resource and the response is returned.
- * This function is suitable for overriding properties of the response, e.g. setting headers that affect how the response is parsed.
- *
- * **Example usage:**
- *
- * ```javascript
- * const cheerioCrawlerOptions = {
- * // ...
- * postResponseFunction: ({ request, response }) => {
- * if (request.userData.parseAsJSON) {
- * response.headers['content-type'] = 'application/json; charset=utf-8';
- * }
- * }
- * }
- * ```
- * The function receives the following object as an argument:
- * ```
- * {
- * response: Object,
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * The response is an instance of Node's http.IncomingMessage object.
- *
- * @property {number} [handlePageTimeoutSecs=60]
- * Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
- * @property {number} [requestTimeoutSecs=30]
- * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
- * @property {boolean} [ignoreSslErrors=true]
- * If set to true, SSL certificate errors will be ignored.
- * @property {ProxyConfiguration} [proxyConfiguration]
- * If set, `CheerioCrawler` will be configured for all connections to use
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
- * For more information, see the [documentation](https://docs.apify.com/proxy).
- * @property {HandleFailedRequest} [handleFailedRequestFunction]
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
- * The function receives the following object as an argument:
- * ```
- * {
- * error: Error,
- * request: Request,
- * session: Session,
- * $: Cheerio,
- * body: String|Buffer,
- * json: Object,
- * contentType: Object,
- * response: Object,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
- * represents the last error thrown during processing of the request.
- *
- * See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
- * for the default implementation of this function.
- * @property {Array<Hook>} [preNavigationHooks]
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
- * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
- * Example:
- * ```
- * preNavigationHooks: [
- * async (crawlingContext, requestAsBrowserOptions) => {
- * requestAsBrowserOptions.forceUrlEncoding = true;
- * },
- * ]
- * ```
- * @property {Array<Hook>} [postNavigationHooks]
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
- * The function accepts `crawlingContext` as the only parameter.
- * Example:
- * ```
- * postNavigationHooks: [
- * async (crawlingContext) => {
- * // ...
- * },
- * ]
- * ```
- * @property {string[]} [additionalMimeTypes]
- * An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
- * target="_blank">MIME types</a> you want the crawler to load and process.
- * By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
- * @property {string} [suggestResponseEncoding]
- * By default `CheerioCrawler` will extract the correct encoding from the HTTP response headers.
- * Unfortunately, some websites use invalid headers, and those responses are decoded as UTF-8 by default.
- * If those sites actually use a different encoding, the response will be corrupted. You can use
- * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
- * To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
- * ```
- * // Will fall back to windows-1250 encoding if none found
- * suggestResponseEncoding: 'windows-1250'
- * ```
- * @property {string} [forceResponseEncoding]
- * By default `CheerioCrawler` will extract the correct encoding from the HTTP response headers. Use `forceResponseEncoding`
- * to force a certain encoding, disregarding the response headers.
- * To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
- * ```
- * // Will force windows-1250 encoding even if headers say otherwise
- * forceResponseEncoding: 'windows-1250'
- * ```
- * @property {number} [maxRequestRetries=3]
- * Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
- * @property {number} [maxRequestsPerCrawl]
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
- * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
- * Custom options passed to the underlying {@link AutoscaledPool} constructor.
- * Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
- * are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter}
- * and {@link SystemStatus} defaults are provided to account for the fact that `cheerio`
- * parses HTML synchronously and therefore blocks the event loop.
- * @property {number} [minConcurrency=1]
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- *
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
- * @property {number} [maxConcurrency=1000]
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- * @property {boolean} [useSessionPool=true]
- * If set to true, the crawler will automatically use the Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
- * It also marks the Session as bad after a request timeout.
- * @property {SessionPoolOptions} [sessionPoolOptions]
- * Custom options passed to the underlying {@link SessionPool} constructor.
- * @property {boolean} [persistCookiesPerSession]
- * Automatically saves cookies to the Session. Works only if the Session Pool is used.
- *
- * It parses cookies from the response "Set-Cookie" header, saves or updates them on the session, and once the session is used for the next request,
- * it passes the "Cookie" header to the request with the session cookies.
- */
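To see how these options combine, here is a minimal sketch of a `CheerioCrawlerOptions` object using only options described above; all values are illustrative placeholders, and `requestList` is assumed to be created elsewhere:

```javascript
const cheerioCrawlerOptions = {
    requestList, // and/or requestQueue
    maxRequestsPerCrawl: 100, // always set a limit to avoid runaway crawls
    maxConcurrency: 20,
    additionalMimeTypes: ['application/json'],
    suggestResponseEncoding: 'windows-1250', // fallback when headers omit the charset
    useSessionPool: true,
    persistCookiesPerSession: true, // only valid together with useSessionPool
    handlePageFunction: async ({ request, $ }) => {
        // extraction logic goes here
    },
    handleFailedRequestFunction: async ({ request, error }) => {
        console.warn(`Request ${request.url} failed too many times: ${error.message}`);
    },
};
```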
- /**
- * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
- * [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
- * The URLs to crawl are fed either from a static list of URLs
- * or from a dynamic queue of URLs enabling recursive crawling of websites.
- *
- * Since `CheerioCrawler` uses raw HTTP requests to download web pages,
- * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
- * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
- * because they load the pages using a full-featured headless Chrome browser.
- *
- * `CheerioCrawler` downloads each URL using a plain HTTP request,
- * parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
- * and then invokes the user-provided {@link CheerioCrawlerOptions.handlePageFunction} to extract page data
- * using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
- *
- * The source URLs are represented using {@link Request} objects that are fed from
- * {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
- * or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
- *
- * If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
- * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
- * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
- *
- * The crawler finishes when there are no more {@link Request} objects to crawl.
- *
- * `CheerioCrawler` downloads the web pages using the `{@link utils#requestAsBrowser}` utility function.
- * As opposed to the browser-based crawlers, which automatically encode the URLs, the
- * `{@link utils#requestAsBrowser}` function will not do so. We either need to manually encode the URLs
- * via the `encodeURI()` function, or set `forceUrlEncoding: true` in the `requestAsBrowserOptions`,
- * which will automatically encode all the URLs before accessing them.
- *
- * > We can either use `forceUrlEncoding` or encode manually, but not both - it would
- * > result in double encoding and therefore lead to invalid URLs.
- *
- * We can use the `preNavigationHooks` to adjust `requestAsBrowserOptions`:
- *
- * ```
- * preNavigationHooks: [
- * (crawlingContext, requestAsBrowserOptions) => {
- * requestAsBrowserOptions.forceUrlEncoding = true;
- * },
- * ]
- * ```
- *
- * By default, `CheerioCrawler` only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
- * and skips pages with other content types. If you want the crawler to process other content types,
- * use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
- * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
- * For details, see {@link CheerioCrawlerOptions.handlePageFunction}.
- *
- * New requests are only dispatched when there is enough free CPU and memory available,
- * using the functionality provided by the {@link AutoscaledPool} class.
- * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
- * parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
- * {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
- *
- * **Example usage:**
- *
- * ```javascript
- * // Prepare a list of URLs to crawl
- * const requestList = new Apify.RequestList({
- * sources: [
- * { url: 'http://www.example.com/page-1' },
- * { url: 'http://www.example.com/page-2' },
- * ],
- * });
- * await requestList.initialize();
- *
- * // Crawl the URLs
- * const crawler = new Apify.CheerioCrawler({
- * requestList,
- * handlePageFunction: async ({ request, response, body, contentType, $ }) => {
- * const data = [];
- *
- * // Do some data extraction from the page with Cheerio.
- * $('.some-collection').each((index, el) => {
- * data.push({ title: $(el).find('.some-title').text() });
- * });
- *
- * // Save the data to dataset.
- * await Apify.pushData({
- * url: request.url,
- * html: body,
- * data,
- * })
- * },
- * });
- *
- * await crawler.run();
- * ```
- * @property {Statistics} stats
- * Contains statistics about the current run.
- * @property {?RequestList} requestList
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {?RequestQueue} requestQueue
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {?SessionPool} sessionPool
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
- * Only available if used by the crawler.
- * @property {?ProxyConfiguration} proxyConfiguration
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
- * Only available if used by the crawler.
- * @property {AutoscaledPool} autoscaledPool
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
- * Note that this property is only initialized after calling the {@link CheerioCrawler#run} function.
- * You can use it to change the concurrency settings on the fly,
- * to pause the crawler by calling {@link AutoscaledPool#pause}
- * or to abort it by calling {@link AutoscaledPool#abort}.
- */
- class CheerioCrawler extends basic_crawler_1.BasicCrawler {
- /**
- * @param {CheerioCrawlerOptions} options
- * All `CheerioCrawler` parameters are passed via an options object.
- */
- constructor(options) {
- (0, ow_1.default)(options, 'CheerioCrawlerOptions', ow_1.default.object.exactShape(CheerioCrawler.optionsShape));
- const { handlePageFunction, requestTimeoutSecs = 30, handlePageTimeoutSecs = 60, ignoreSslErrors = true, additionalMimeTypes = [], suggestResponseEncoding, forceResponseEncoding, proxyConfiguration, prepareRequestFunction, postResponseFunction, persistCookiesPerSession, preNavigationHooks = [], postNavigationHooks = [],
- // BasicCrawler
- autoscaledPoolOptions = CHEERIO_OPTIMIZED_AUTOSCALED_POOL_OPTIONS, ...basicCrawlerOptions } = options;
- super({
- ...basicCrawlerOptions,
- // TODO temporary until the API is unified in V2
- handleRequestFunction: handlePageFunction,
- autoscaledPoolOptions,
- // We need to add some time for internal functions to finish,
- // but not too much so that we would stall the crawler.
- handleRequestTimeoutSecs: requestTimeoutSecs + handlePageTimeoutSecs + constants_1.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
- });
- // Cookies should be persisted per session only if session pool is used
- if (!this.useSessionPool && persistCookiesPerSession) {
- throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
- }
- this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
- if (additionalMimeTypes.length)
- this._extendSupportedMimeTypes(additionalMimeTypes);
- if (suggestResponseEncoding && forceResponseEncoding) {
- this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
- }
- this.handlePageTimeoutMillis = handlePageTimeoutSecs * 1000;
- this.requestTimeoutMillis = requestTimeoutSecs * 1000;
- this.ignoreSslErrors = ignoreSslErrors;
- this.suggestResponseEncoding = suggestResponseEncoding;
- this.forceResponseEncoding = forceResponseEncoding;
- this.prepareRequestFunction = prepareRequestFunction;
- this.postResponseFunction = postResponseFunction;
- this.proxyConfiguration = proxyConfiguration;
- /**
- * @type {Array<any>}
- * @ignore
- * */
- this.preNavigationHooks = preNavigationHooks;
- /**
- * @type {Array<any>}
- * @ignore
- * */
- this.postNavigationHooks = [
- ({ request, response }) => this._abortDownloadOfBody(request, response),
- ...postNavigationHooks,
- ];
- if (this.useSessionPool) {
- this.persistCookiesPerSession = persistCookiesPerSession !== undefined ? persistCookiesPerSession : true;
- }
- else {
- this.persistCookiesPerSession = false;
- }
- }
- /**
- * **EXPERIMENTAL**
- * Function for attaching CrawlerExtensions such as the Unblockers.
- * @param {CrawlerExtension} extension - Crawler extension that overrides the crawler configuration.
- */
- use(extension) {
- (0, ow_1.default)(extension, ow_1.default.object.instanceOf(crawler_extension_1.default));
- const extensionOptions = extension.getCrawlerOptions();
- // TODO temporary until the API is unified in V2
- extensionOptions.userProvidedHandler = extensionOptions.handlePageFunction;
- delete extensionOptions.handlePageFunction;
- for (const [key, value] of Object.entries(extensionOptions)) {
- const isConfigurable = this.hasOwnProperty(key); // eslint-disable-line
- const originalType = typeof this[key];
- const extensionType = typeof value; // What if we want to null something? Is it really needed?
- const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
- const exists = this[key] != null;
- if (!isConfigurable) { // Test if the property can be configured on the crawler
- throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on CheerioCrawler instance.`);
- }
- if (!isSameType && exists) { // Assuming that extensions will only add up configuration
- throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "CheerioCrawler.${key}: ${originalType}".`);
- }
- this.log.warning(`${extension.name} is overriding "CheerioCrawler.${key}: ${originalType}" with ${value}.`);
- this[key] = value;
- }
- }
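There is no extension example in this file, so the following hypothetical sketch only illustrates the contract that `use()` enforces: `getCrawlerOptions()` returns overrides for existing, same-typed crawler properties (a `handlePageFunction` key would be remapped to `userProvidedHandler`). The import path is a guess based on the build layout above; `CrawlerExtension` may not be part of the public API:

```javascript
// Hypothetical; the deep build path is an assumption, not a documented import.
const CrawlerExtension = require('apify/build/crawlers/crawler_extension').default;

class SlowDownExtension extends CrawlerExtension {
    getCrawlerOptions() {
        // Overrides must match the type of the existing crawler property,
        // otherwise use() throws.
        return { handlePageTimeoutMillis: 120 * 1000 };
    }
}

crawler.use(new SlowDownExtension()); // logs a warning for each overridden key
```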
- /**
- * Wrapper around handlePageFunction that opens and closes pages etc.
- *
- * @param {CrawlingContext} crawlingContext
- * @ignore
- * @protected
- * @internal
- */
- async _handleRequestFunction(crawlingContext) {
- const { request, session } = crawlingContext;
- if (this.proxyConfiguration) {
- const sessionId = session ? session.id : undefined;
- crawlingContext.proxyInfo = this.proxyConfiguration.newProxyInfo(sessionId);
- }
- await this._handleNavigation(crawlingContext);
- (0, timeout_1.tryCancel)();
- const { dom, isXml, body, contentType, response } = await this._parseResponse(request, crawlingContext.response);
- (0, timeout_1.tryCancel)();
- if (this.useSessionPool) {
- this._throwOnBlockedRequest(session, response.statusCode);
- }
- if (this.persistCookiesPerSession) {
- session.setCookiesFromResponse(response);
- }
- request.loadedUrl = response.url;
- const $ = dom
- ? cheerio_1.default.load(dom, {
- xmlMode: isXml,
- // Recent versions of cheerio use parse5 as the HTML parser/serializer. It's more strict than htmlparser2
- // and not good for scraping. It also does not have a great streaming interface.
- // Here we tell cheerio to use htmlparser2 for serialization, otherwise the conflict produces weird errors.
- _useHtmlParser2: true,
- })
- : null;
- crawlingContext.$ = $;
- crawlingContext.contentType = contentType;
- crawlingContext.response = response;
- Object.defineProperty(crawlingContext, 'json', {
- get() {
- if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
- return null;
- const jsonString = body.toString(contentType.encoding);
- return JSON.parse(jsonString);
- },
- });
- Object.defineProperty(crawlingContext, 'body', {
- get() {
- // NOTE: For XML/HTML documents, we don't store the original body and only reconstruct it from Cheerio's DOM.
- // This is to save memory for high-concurrency crawls. The downside is that changes
- // made to DOM are reflected in the HTML, but we can live with that...
- if (dom) {
- return isXml ? $.xml() : $.html({ decodeEntities: false });
- }
- return body;
- },
- });
- return (0, timeout_1.addTimeoutToPromise)(() => this.userProvidedHandler(crawlingContext), this.handlePageTimeoutMillis, `handlePageFunction timed out after ${this.handlePageTimeoutMillis / 1000} seconds.`);
- }
- /**
- * @param {CrawlingContext} crawlingContext
- * @ignore
- * @protected
- * @internal
- */
- async _handleNavigation(crawlingContext) {
- if (this.prepareRequestFunction) {
- this.log.deprecated('Option "prepareRequestFunction" is deprecated. Use "preNavigationHooks" instead.');
- await this.prepareRequestFunction(crawlingContext);
- (0, timeout_1.tryCancel)();
- }
- const requestAsBrowserOptions = {};
- if (this.useSessionPool) {
- this._applySessionCookie(crawlingContext, requestAsBrowserOptions);
- }
- const { request, session } = crawlingContext;
- const cookieSnapshot = request.headers.Cookie ?? request.headers.cookie;
- await this._executeHooks(this.preNavigationHooks, crawlingContext, requestAsBrowserOptions);
- (0, timeout_1.tryCancel)();
- const proxyUrl = crawlingContext.proxyInfo && crawlingContext.proxyInfo.url;
- this._mergeRequestCookieDiff(request, cookieSnapshot, requestAsBrowserOptions);
- crawlingContext.response = await (0, timeout_1.addTimeoutToPromise)(() => this._requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }), this.requestTimeoutMillis, `request timed out after ${this.requestTimeoutMillis / 1000} seconds.`);
- (0, timeout_1.tryCancel)();
- await this._executeHooks(this.postNavigationHooks, crawlingContext, requestAsBrowserOptions);
- (0, timeout_1.tryCancel)();
- if (this.postResponseFunction) {
- this.log.deprecated('Option "postResponseFunction" is deprecated. Use "postNavigationHooks" instead.');
- await this.postResponseFunction(crawlingContext);
- (0, timeout_1.tryCancel)();
- }
- }
- /**
- * When users change `request.headers.cookie` inside a preNavigationHook, the change would be ignored,
- * as `request.headers` are already merged into the `requestAsBrowserOptions`. This method takes
- * the old `request.headers` snapshot (before hooks are executed), makes a diff with the cookie value
- * after hooks are executed, and merges any new cookies back to `requestAsBrowserOptions`.
- *
- * This way we can use either `requestAsBrowserOptions` or `context.request` in the hooks (though not both at once).
- *
- * @param {Request} request
- * @param {string} cookieSnapshot
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
- * @private
- * @ignore
- * @internal
- */
- _mergeRequestCookieDiff(request, cookieSnapshot, requestAsBrowserOptions) {
- const cookieDiff = (0, crawler_utils_1.diffCookies)(request.url, cookieSnapshot, request.headers.Cookie ?? request.headers.cookie);
- if (cookieDiff.length > 0) {
- requestAsBrowserOptions.headers ?? (requestAsBrowserOptions.headers = {});
- requestAsBrowserOptions.headers.Cookie = (0, crawler_utils_1.mergeCookies)(request.url, [
- requestAsBrowserOptions.headers.Cookie,
- cookieDiff,
- ]);
- }
- }
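In practice this means a pre-navigation hook may add cookies through either object. A hedged sketch of a crawler options fragment relying on the diff-merge above (cookie name and value are placeholders):

```javascript
preNavigationHooks: [
    async ({ request }, requestAsBrowserOptions) => {
        // Works thanks to _mergeRequestCookieDiff: the new cookie is diffed
        // against the pre-hook snapshot and merged into the final options.
        request.headers.Cookie = 'consent=1';
        // Setting the same cookie on requestAsBrowserOptions.headers as well
        // would be redundant; use one object or the other.
    },
],
```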
- /**
- * Function to make the HTTP request. It performs optimizations
- * on the request such as only downloading the response body if the
- * received content type matches text/html, application/xml, application/xhtml+xml.
- *
- * @param {object} options
- * @param {Request} options.request
- * @param {Session} options.session
- * @param {string} options.proxyUrl
- * @param {RequestAsBrowserOptions} options.requestAsBrowserOptions
- * @returns {Promise<IncomingMessage|Readable>}
- * @ignore
- * @protected
- * @internal
- */
- async _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }) {
- const opts = this._getRequestOptions(request, session, proxyUrl, requestAsBrowserOptions);
- let responseWithStream;
- try {
- responseWithStream = await (0, utils_request_1.requestAsBrowser)(opts);
- }
- catch (e) {
- if (e instanceof got_scraping_1.TimeoutError) {
- this._handleRequestTimeout(session);
- }
- else {
- throw e;
- }
- }
- return responseWithStream;
- }
- /**
- * Sets the cookie header on `requestAsBrowserOptions` based on the provided session and request. If some cookies were already set,
- * the session cookie will be merged with them. User-provided cookies on the `request` object take precedence.
- *
- * @param {CrawlingContext} crawlingContext
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
- * @return {void}
- * @ignore
- * @private
- * @internal
- */
- _applySessionCookie({ request, session }, requestAsBrowserOptions) {
- const userCookie = request.headers.Cookie ?? request.headers.cookie;
- const sessionCookie = session.getCookieString(request.url);
- const mergedCookies = (0, crawler_utils_1.mergeCookies)(request.url, [sessionCookie, userCookie]);
- // merge cookies from all possible sources
- if (mergedCookies) {
- requestAsBrowserOptions.headers ?? (requestAsBrowserOptions.headers = {});
- requestAsBrowserOptions.headers.Cookie = mergedCookies;
- }
- }
- /**
- * Encodes and parses the response according to the provided content type
- * @param {Request} request
- * @param {IncomingMessage|Readable} responseStream
- * @returns {Promise<object>}
- * @ignore
- * @protected
- * @internal
- */
- async _parseResponse(request, responseStream) {
- const { statusCode } = responseStream;
- const { type, charset } = (0, utils_1.parseContentTypeFromResponse)(responseStream);
- const { response, encoding } = this._encodeResponse(request, responseStream, charset);
- const contentType = { type, encoding };
- if (statusCode >= 500) {
- const body = await (0, utilities_1.readStreamToString)(response, encoding);
- // Errors are often sent as JSON, so attempt to parse them,
- // despite Accept header being set to text/html.
- if (type === APPLICATION_JSON_MIME_TYPE) {
- const errorResponse = JSON.parse(body);
- let { message } = errorResponse;
- if (!message)
- message = util_1.default.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
- throw new Error(`${statusCode} - ${message}`);
- }
- // It's not JSON, so it's probably some text. Get the first 100 chars of it.
- throw new Error(`${statusCode} - Internal Server Error: ${body.substr(0, 100)}`);
- }
- else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
- const dom = await this._parseHtmlToDom(response);
- return ({ dom, isXml: type.includes('xml'), response, contentType });
- }
- else {
- const body = await (0, utilities_1.concatStreamToBuffer)(response);
- return { body, response, contentType };
- }
- }
- /**
- * Combines the provided `requestOptions` with mandatory (non-overridable) values.
- * @param {Request} request
- * @param {Session} [session]
- * @param {string} [proxyUrl]
- * @param {RequestAsBrowserOptions} [requestAsBrowserOptions]
- * @ignore
- * @protected
- * @internal
- */
- _getRequestOptions(request, session, proxyUrl, requestAsBrowserOptions) {
- const requestOptions = {
- url: request.url,
- method: request.method,
- proxyUrl,
- timeout: { request: this.requestTimeoutMillis },
- sessionToken: session,
- ...requestAsBrowserOptions,
- headers: { ...request.headers, ...requestAsBrowserOptions.headers },
- https: {
- ...requestAsBrowserOptions.https,
- rejectUnauthorized: !this.ignoreSslErrors,
- },
- isStream: true,
- };
- // TODO this is incorrect, the check for man in the middle needs to be done
- // on individual proxy level, not on the `proxyConfiguration` level,
- // because users can use normal + MITM proxies in a single configuration.
- // Disable SSL verification for MITM proxies
- if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
- requestOptions.https = {
- ...requestOptions.https,
- rejectUnauthorized: false,
- };
- }
- if (/PATCH|POST|PUT/.test(request.method))
- requestOptions.body = request.payload;
- return requestOptions;
- }
- /**
- * @param {*} request
- * @param {*} response
- * @param {*} encoding
- * @ignore
- * @protected
- * @internal
- */
- _encodeResponse(request, response, encoding) {
- if (this.forceResponseEncoding) {
- encoding = this.forceResponseEncoding;
- }
- else if (!encoding && this.suggestResponseEncoding) {
- encoding = this.suggestResponseEncoding;
- }
- // Fall back to utf-8 if we still don't have encoding.
- const utf8 = 'utf8';
- if (!encoding)
- return { response, encoding: utf8 };
- // This means that the encoding is one of Node.js supported
- // encodings and we don't need to re-encode it.
- if (Buffer.isEncoding(encoding))
- return { response, encoding };
- // Try to re-encode a variety of unsupported encodings to utf-8
- if (iconv_lite_1.default.encodingExists(encoding)) {
- const encodeStream = iconv_lite_1.default.encodeStream(utf8);
- const decodeStream = iconv_lite_1.default.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
- response.on('error', (err) => decodeStream.emit('error', err));
- const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
- encodedResponse.statusCode = response.statusCode;
- encodedResponse.headers = response.headers;
- encodedResponse.url = response.url;
- return {
- response: encodedResponse,
- encoding: utf8,
- };
- }
- throw new Error(`Resource ${request.url} served with unsupported charset/encoding: ${encoding}`);
- }
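The re-encoding branch above is the usual iconv-lite streaming pattern. A standalone sketch of the same pipeline (the function name is illustrative):

```javascript
const iconv = require('iconv-lite');

// Decode raw bytes from the detected charset, re-encode as UTF-8, and
// forward stream errors down the pipeline, as _encodeResponse does.
function reencodeToUtf8(responseStream, charset) {
    const decodeStream = iconv.decodeStream(charset); // bytes -> strings
    const encodeStream = iconv.encodeStream('utf8'); // strings -> UTF-8 bytes
    responseStream.on('error', (err) => decodeStream.emit('error', err));
    decodeStream.on('error', (err) => encodeStream.emit('error', err));
    return responseStream.pipe(decodeStream).pipe(encodeStream);
}
```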
- /**
- * @param {*} response
- * @ignore
- * @protected
- * @internal
- */
- async _parseHtmlToDom(response) {
- return new Promise((resolve, reject) => {
- const domHandler = new htmlparser2_1.DomHandler((err, dom) => {
- if (err)
- reject(err);
- else
- resolve(dom);
- });
- const parser = new WritableStream_1.WritableStream(domHandler, { decodeEntities: true });
- parser.on('error', reject);
- response
- .on('error', reject)
- .pipe(parser);
- });
- }
- /**
- * Checks and extends supported mime types
- * @param {Array<(string|Object)>} additionalMimeTypes
- * @ignore
- * @protected
- * @internal
- */
- _extendSupportedMimeTypes(additionalMimeTypes) {
- additionalMimeTypes.forEach((mimeType) => {
- try {
- const parsedType = content_type_1.default.parse(mimeType);
- this.supportedMimeTypes.add(parsedType.type);
- }
- catch (err) {
- throw new Error(`Cannot parse mime type ${mimeType} from "options.additionalMimeTypes".`);
- }
- });
- }
- /**
- * Handles a blocked request
- * @param {Session} session
- * @param {number} statusCode
- * @ignore
- * @protected
- * @internal
- */
- _throwOnBlockedRequest(session, statusCode) {
- const isBlocked = session.retireOnBlockedStatusCodes(statusCode);
- if (isBlocked) {
- throw new Error(`Request blocked - received ${statusCode} status code`);
- }
- }
- /**
- * Handles a request timeout
- * @param {Session} session
- * @ignore
- * @protected
- * @internal
- */
- _handleRequestTimeout(session) {
- if (session)
- session.markBad();
- throw new Error(`request timed out after ${this.requestTimeoutMillis / 1000} seconds.`);
- }
- /**
- * @param {Request} request
- * @param {IncomingMessage|Readable} response
- * @private
- */
- _abortDownloadOfBody(request, response) {
- const { statusCode } = response;
- const { type } = (0, utils_1.parseContentTypeFromResponse)(response);
- if (statusCode === 406) {
- request.noRetry = true;
- throw new Error(`Resource ${request.url} is not available in the format requested by the Accept header. Skipping resource.`);
- }
- if (!this.supportedMimeTypes.has(type) && statusCode < 500) {
- request.noRetry = true;
- throw new Error(`Resource ${request.url} served Content-Type ${type}, `
- + `but only ${Array.from(this.supportedMimeTypes).join(', ')} are allowed. Skipping resource.`);
- }
- }
- }
- /**
- * @internal
- * @type any
- */
- Object.defineProperty(CheerioCrawler, "optionsShape", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: {
- ...basic_crawler_1.BasicCrawler.optionsShape,
- // TODO temporary until the API is unified in V2
- handleRequestFunction: ow_1.default.undefined,
- handlePageFunction: ow_1.default.function,
- requestTimeoutSecs: ow_1.default.optional.number,
- handlePageTimeoutSecs: ow_1.default.optional.number,
- ignoreSslErrors: ow_1.default.optional.boolean,
- additionalMimeTypes: ow_1.default.optional.array.ofType(ow_1.default.string),
- suggestResponseEncoding: ow_1.default.optional.string,
- forceResponseEncoding: ow_1.default.optional.string,
- proxyConfiguration: ow_1.default.optional.object.validate(validators_1.validators.proxyConfiguration),
- prepareRequestFunction: ow_1.default.optional.function,
- postResponseFunction: ow_1.default.optional.function,
- persistCookiesPerSession: ow_1.default.optional.boolean,
- preNavigationHooks: ow_1.default.optional.array,
- postNavigationHooks: ow_1.default.optional.array,
- }
- });
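The shape above is consumed by `ow` in the constructor. A minimal sketch of how `ow.object.exactShape` rejects unknown keys, which is what surfaces misspelled option names (the shape here is reduced for illustration):

```javascript
const ow = require('ow');

const shape = {
    handlePageFunction: ow.function,
    requestTimeoutSecs: ow.optional.number,
};

// Passes: the required key is present and optional keys may be omitted.
ow({ handlePageFunction: async () => {} }, 'CheerioCrawlerOptions', ow.object.exactShape(shape));

// Throws an ArgumentError: "handlePageFunc" is not a key in the shape.
ow({ handlePageFunc: async () => {} }, 'CheerioCrawlerOptions', ow.object.exactShape(shape));
```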
- exports.default = CheerioCrawler;
- /**
- * @typedef PrepareRequestInputs
- * @property {Request} request
- * Original instance of the {@link Request} object. Must be modified in-place.
- * @property {Session} [session]
- * The current session
- * @property {ProxyInfo} [proxyInfo]
- * An object with information about the proxy currently used by the crawler
- * and configured by the {@link ProxyConfiguration} class.
- * @property {CheerioCrawler} [crawler]
- */
- /**
- * @callback PrepareRequest
- * @param {PrepareRequestInputs} inputs Arguments passed to this callback.
- * @returns {(void|Promise<void>)}
- */
- /**
- * @typedef PostResponseInputs
- * @property {(IncomingMessage|Readable)} response
- * The response stream.
- * @property {Request} request
- * Original instance of the {@link Request} object. Must be modified in-place.
- * @property {Session} [session]
- * The current session
- * @property {ProxyInfo} [proxyInfo]
- * An object with information about the proxy currently used by the crawler
- * and configured by the {@link ProxyConfiguration} class.
- * @property {CheerioCrawler} crawler
- */
- /**
- * @callback PostResponse
- * @param {PostResponseInputs} inputs Arguments passed to this callback.
- * @returns {(void|Promise<void>)}
- */
- /**
- * @typedef CheerioHandlePageInputs
- * @property {CheerioAPI} $
- * The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
- * @property {(string|Buffer)} body
- * The request body of the web page.
- * @property {*} json
- * The parsed object from the JSON string if the response has the application/json content type.
- * @property {Request} request
- * The original {@link Request} object.
- * @property {{ type: string, encoding: string }} contentType
- * Parsed `Content-Type` header: `{ type, encoding }`.
- * @property {IncomingMessage} response
- * An instance of Node's [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object.
- * @property {Session} session
- * @property {ProxyInfo} proxyInfo
- * An object with information about the proxy currently used by the crawler
- * and configured by the {@link ProxyConfiguration} class.
- * @property {CheerioCrawler} crawler
- */
- /**
- * @callback CheerioHandlePage
- * @param {CheerioHandlePageInputs} inputs Arguments passed to this callback.
- * @returns {Promise<void>}
- */
- //# sourceMappingURL=cheerio_crawler.js.map
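Tying the typedefs together, a hedged sketch of a `CheerioHandlePage` implementation that branches on the content type using the `$`, `json` and `contentType` properties described above:

```javascript
const handlePageFunction = async ({ request, contentType, $, json }) => {
    if (contentType.type === 'application/json') {
        // `json` is parsed lazily from the body; it is null for non-JSON responses.
        await Apify.pushData({ url: request.url, payload: json });
    } else if ($) {
        // `$` is only available for HTML and XML responses.
        await Apify.pushData({ url: request.url, title: $('title').text() });
    }
};
```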