apify 2.3.1-beta.4 → 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/README.md +6 -5
  2. package/package.json +69 -128
  3. package/build/actor.d.ts +0 -113
  4. package/build/actor.d.ts.map +0 -1
  5. package/build/actor.js +0 -582
  6. package/build/actor.js.map +0 -1
  7. package/build/apify.d.ts +0 -752
  8. package/build/apify.d.ts.map +0 -1
  9. package/build/apify.js +0 -877
  10. package/build/apify.js.map +0 -1
  11. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  12. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  13. package/build/autoscaling/autoscaled_pool.js +0 -557
  14. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  15. package/build/autoscaling/snapshotter.d.ts +0 -278
  16. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  17. package/build/autoscaling/snapshotter.js +0 -447
  18. package/build/autoscaling/snapshotter.js.map +0 -1
  19. package/build/autoscaling/system_status.d.ts +0 -224
  20. package/build/autoscaling/system_status.d.ts.map +0 -1
  21. package/build/autoscaling/system_status.js +0 -228
  22. package/build/autoscaling/system_status.js.map +0 -1
  23. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  24. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  25. package/build/browser_launchers/browser_launcher.js +0 -160
  26. package/build/browser_launchers/browser_launcher.js.map +0 -1
  27. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  28. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  29. package/build/browser_launchers/browser_plugin.js +0 -25
  30. package/build/browser_launchers/browser_plugin.js.map +0 -1
  31. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  32. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  33. package/build/browser_launchers/playwright_launcher.js +0 -150
  34. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  35. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  36. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  37. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  38. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  39. package/build/cache_container.d.ts +0 -31
  40. package/build/cache_container.d.ts.map +0 -1
  41. package/build/cache_container.js +0 -48
  42. package/build/cache_container.js.map +0 -1
  43. package/build/configuration.d.ts +0 -226
  44. package/build/configuration.d.ts.map +0 -1
  45. package/build/configuration.js +0 -325
  46. package/build/configuration.js.map +0 -1
  47. package/build/constants.d.ts +0 -37
  48. package/build/constants.d.ts.map +0 -1
  49. package/build/constants.js +0 -41
  50. package/build/constants.js.map +0 -1
  51. package/build/crawlers/basic_crawler.d.ts +0 -443
  52. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  53. package/build/crawlers/basic_crawler.js +0 -664
  54. package/build/crawlers/basic_crawler.js.map +0 -1
  55. package/build/crawlers/browser_crawler.d.ts +0 -512
  56. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  57. package/build/crawlers/browser_crawler.js +0 -540
  58. package/build/crawlers/browser_crawler.js.map +0 -1
  59. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  60. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  61. package/build/crawlers/cheerio_crawler.js +0 -913
  62. package/build/crawlers/cheerio_crawler.js.map +0 -1
  63. package/build/crawlers/crawler_extension.d.ts +0 -10
  64. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  65. package/build/crawlers/crawler_extension.js +0 -19
  66. package/build/crawlers/crawler_extension.js.map +0 -1
  67. package/build/crawlers/crawler_utils.d.ts +0 -34
  68. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  69. package/build/crawlers/crawler_utils.js +0 -87
  70. package/build/crawlers/crawler_utils.js.map +0 -1
  71. package/build/crawlers/playwright_crawler.d.ts +0 -448
  72. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  73. package/build/crawlers/playwright_crawler.js +0 -299
  74. package/build/crawlers/playwright_crawler.js.map +0 -1
  75. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  76. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  77. package/build/crawlers/puppeteer_crawler.js +0 -299
  78. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  79. package/build/crawlers/statistics.d.ts +0 -185
  80. package/build/crawlers/statistics.d.ts.map +0 -1
  81. package/build/crawlers/statistics.js +0 -331
  82. package/build/crawlers/statistics.js.map +0 -1
  83. package/build/enqueue_links/click_elements.d.ts +0 -179
  84. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  85. package/build/enqueue_links/click_elements.js +0 -434
  86. package/build/enqueue_links/click_elements.js.map +0 -1
  87. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  88. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  89. package/build/enqueue_links/enqueue_links.js +0 -163
  90. package/build/enqueue_links/enqueue_links.js.map +0 -1
  91. package/build/enqueue_links/shared.d.ts +0 -42
  92. package/build/enqueue_links/shared.d.ts.map +0 -1
  93. package/build/enqueue_links/shared.js +0 -121
  94. package/build/enqueue_links/shared.js.map +0 -1
  95. package/build/errors.d.ts +0 -29
  96. package/build/errors.d.ts.map +0 -1
  97. package/build/errors.js +0 -38
  98. package/build/errors.js.map +0 -1
  99. package/build/events.d.ts +0 -11
  100. package/build/events.d.ts.map +0 -1
  101. package/build/events.js +0 -147
  102. package/build/events.js.map +0 -1
  103. package/build/index.d.ts +0 -4
  104. package/build/index.d.ts.map +0 -1
  105. package/build/index.js +0 -7
  106. package/build/index.js.map +0 -1
  107. package/build/main.d.ts +0 -179
  108. package/build/main.d.ts.map +0 -1
  109. package/build/main.js +0 -81
  110. package/build/main.js.map +0 -1
  111. package/build/playwright_utils.d.ts +0 -9
  112. package/build/playwright_utils.d.ts.map +0 -1
  113. package/build/playwright_utils.js +0 -90
  114. package/build/playwright_utils.js.map +0 -1
  115. package/build/proxy_configuration.d.ts +0 -411
  116. package/build/proxy_configuration.d.ts.map +0 -1
  117. package/build/proxy_configuration.js +0 -517
  118. package/build/proxy_configuration.js.map +0 -1
  119. package/build/pseudo_url.d.ts +0 -86
  120. package/build/pseudo_url.d.ts.map +0 -1
  121. package/build/pseudo_url.js +0 -153
  122. package/build/pseudo_url.js.map +0 -1
  123. package/build/puppeteer_request_interception.d.ts +0 -8
  124. package/build/puppeteer_request_interception.d.ts.map +0 -1
  125. package/build/puppeteer_request_interception.js +0 -235
  126. package/build/puppeteer_request_interception.js.map +0 -1
  127. package/build/puppeteer_utils.d.ts +0 -250
  128. package/build/puppeteer_utils.d.ts.map +0 -1
  129. package/build/puppeteer_utils.js +0 -551
  130. package/build/puppeteer_utils.js.map +0 -1
  131. package/build/request.d.ts +0 -180
  132. package/build/request.d.ts.map +0 -1
  133. package/build/request.js +0 -261
  134. package/build/request.js.map +0 -1
  135. package/build/request_list.d.ts +0 -581
  136. package/build/request_list.d.ts.map +0 -1
  137. package/build/request_list.js +0 -826
  138. package/build/request_list.js.map +0 -1
  139. package/build/serialization.d.ts +0 -5
  140. package/build/serialization.d.ts.map +0 -1
  141. package/build/serialization.js +0 -139
  142. package/build/serialization.js.map +0 -1
  143. package/build/session_pool/errors.d.ts +0 -11
  144. package/build/session_pool/errors.d.ts.map +0 -1
  145. package/build/session_pool/errors.js +0 -18
  146. package/build/session_pool/errors.js.map +0 -1
  147. package/build/session_pool/events.d.ts +0 -5
  148. package/build/session_pool/events.d.ts.map +0 -1
  149. package/build/session_pool/events.js +0 -6
  150. package/build/session_pool/events.js.map +0 -1
  151. package/build/session_pool/session.d.ts +0 -286
  152. package/build/session_pool/session.d.ts.map +0 -1
  153. package/build/session_pool/session.js +0 -355
  154. package/build/session_pool/session.js.map +0 -1
  155. package/build/session_pool/session_pool.d.ts +0 -280
  156. package/build/session_pool/session_pool.d.ts.map +0 -1
  157. package/build/session_pool/session_pool.js +0 -393
  158. package/build/session_pool/session_pool.js.map +0 -1
  159. package/build/session_pool/session_utils.d.ts +0 -4
  160. package/build/session_pool/session_utils.d.ts.map +0 -1
  161. package/build/session_pool/session_utils.js +0 -24
  162. package/build/session_pool/session_utils.js.map +0 -1
  163. package/build/stealth/hiding_tricks.d.ts +0 -22
  164. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  165. package/build/stealth/hiding_tricks.js +0 -308
  166. package/build/stealth/hiding_tricks.js.map +0 -1
  167. package/build/stealth/stealth.d.ts +0 -56
  168. package/build/stealth/stealth.d.ts.map +0 -1
  169. package/build/stealth/stealth.js +0 -125
  170. package/build/stealth/stealth.js.map +0 -1
  171. package/build/storages/dataset.d.ts +0 -288
  172. package/build/storages/dataset.d.ts.map +0 -1
  173. package/build/storages/dataset.js +0 -480
  174. package/build/storages/dataset.js.map +0 -1
  175. package/build/storages/key_value_store.d.ts +0 -243
  176. package/build/storages/key_value_store.d.ts.map +0 -1
  177. package/build/storages/key_value_store.js +0 -462
  178. package/build/storages/key_value_store.js.map +0 -1
  179. package/build/storages/request_queue.d.ts +0 -318
  180. package/build/storages/request_queue.d.ts.map +0 -1
  181. package/build/storages/request_queue.js +0 -636
  182. package/build/storages/request_queue.js.map +0 -1
  183. package/build/storages/storage_manager.d.ts +0 -87
  184. package/build/storages/storage_manager.d.ts.map +0 -1
  185. package/build/storages/storage_manager.js +0 -150
  186. package/build/storages/storage_manager.js.map +0 -1
  187. package/build/tsconfig.tsbuildinfo +0 -1
  188. package/build/typedefs.d.ts +0 -146
  189. package/build/typedefs.d.ts.map +0 -1
  190. package/build/typedefs.js +0 -88
  191. package/build/typedefs.js.map +0 -1
  192. package/build/utils.d.ts +0 -175
  193. package/build/utils.d.ts.map +0 -1
  194. package/build/utils.js +0 -731
  195. package/build/utils.js.map +0 -1
  196. package/build/utils_log.d.ts +0 -41
  197. package/build/utils_log.d.ts.map +0 -1
  198. package/build/utils_log.js +0 -192
  199. package/build/utils_log.js.map +0 -1
  200. package/build/utils_request.d.ts +0 -77
  201. package/build/utils_request.d.ts.map +0 -1
  202. package/build/utils_request.js +0 -385
  203. package/build/utils_request.js.map +0 -1
  204. package/build/utils_social.d.ts +0 -210
  205. package/build/utils_social.d.ts.map +0 -1
  206. package/build/utils_social.js +0 -787
  207. package/build/utils_social.js.map +0 -1
  208. package/build/validators.d.ts +0 -23
  209. package/build/validators.d.ts.map +0 -1
  210. package/build/validators.js +0 -29
  211. package/build/validators.js.map +0 -1
package/build/crawlers/cheerio_crawler.d.ts
@@ -1,931 +0,0 @@
- /// <reference types="node" />
- export default CheerioCrawler;
- export type CheerioCrawlerOptions = {
- /**
- * User-provided function that performs the logic of the crawler. It is called for each page
- * loaded and parsed by the crawler.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * // The Cheerio object's function with the parsed HTML.
- * $: Cheerio,
- *
- * // The request body of the web page, whose type depends on the content type.
- * body: String|Buffer,
- *
- * // The parsed object from JSON for responses with the "application/json" content types.
- * // For other content types it's null.
- * json: Object,
- *
- * // Apify.Request object with details of the requested web page
- * request: Request,
- *
- * // Parsed Content-Type HTTP header: { type, encoding }
- * contentType: Object,
- *
- * // An instance of Node's http.IncomingMessage object,
- * response: Object,
- *
- * // Session object, useful to work around anti-scraping protections
- * session: Session
- *
- * // ProxyInfo object with information about currently used proxy
- * proxyInfo: ProxyInfo
- *
- * // The running cheerio crawler instance.
- * crawler: CheerioCrawler
- * }
- * ```
- *
- * Type of `body` depends on the `Content-Type` header of the web page:
- * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
- * - Buffer for other MIME content types
- *
- * Parsed `Content-Type` header using
- * [content-type package](https://www.npmjs.com/package/content-type)
- * is stored in `contentType`.
- *
- * Cheerio is available only for HTML and XML content types.
- *
- * With the {@link Request } object representing the URL to crawl.
- *
- * If the function returns a promise, it is awaited by the crawler.
- *
- * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to `option.maxRequestRetries` times.
- * If all the retries fail, the crawler calls the function
- * provided to the `handleFailedRequestFunction` parameter.
- * To make this work, you should **always**
- * let your function throw exceptions rather than catch them.
- * The exceptions are logged to the request using the
- * {@link Request#pushErrorMessage} function.
- */
- handlePageFunction: CheerioHandlePage;
- /**
- * Static list of URLs to be processed.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- */
- requestList?: RequestList | undefined;
- /**
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- */
- requestQueue?: RequestQueue | undefined;
- /**
- * > This option is deprecated, use `preNavigationHooks` instead.
- *
- * A function that executes before the HTTP request is made to the target resource.
- * This function is suitable for setting dynamic properties such as cookies to the {@link Request }.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request } instance corresponds to the initialized request
- * and the {@link Session } instance corresponds to the used session.
- *
- * The function should modify the properties of the passed {@link Request } instance
- * in place because there are already earlier references to it. Making a copy and returning it from
- * this function is therefore not supported, because it would create inconsistencies where
- * different parts of the SDK would have access to a different {@link Request } instance.
- */
- prepareRequestFunction?: PrepareRequest | undefined;
- /**
- * > This option is deprecated, use `postNavigationHooks` instead.
- *
- * A function that executes right after the HTTP request is made to the target resource and response is returned.
- * This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
- *
- * **Example usage:**
- *
- * ```javascript
- * const cheerioCrawlerOptions = {
- * // ...
- * postResponseFunction: ({ request, response }) => {
- * if (request.userData.parseAsJSON) {
- * response.headers['content-type'] = 'application/json; charset=utf-8';
- * }
- * }
- * }
- * ```
- * The function receives the following object as an argument:
- * ```
- * {
- * response: Object,
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * The response is an instance of Node's http.IncomingMessage object.
- */
- postResponseFunction?: PostResponse | undefined;
- /**
- * Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
- */
- handlePageTimeoutSecs?: number | undefined;
- /**
- * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
- */
- requestTimeoutSecs?: number | undefined;
- /**
- * If set to true, SSL certificate errors will be ignored.
- */
- ignoreSslErrors?: boolean | undefined;
- /**
- * If set, `CheerioCrawler` will be configured for all connections to use
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
- * For more information, see the [documentation](https://docs.apify.com/proxy).
- */
- proxyConfiguration?: ProxyConfiguration | undefined;
- /**
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
- * The function receives the following object as an argument:
- * ```
- * {
- * error: Error,
- * request: Request,
- * session: Session,
- * $: Cheerio,
- * body: String|Buffer,
- * json: Object,
- * contentType: Object,
- * response: Object,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request } instance corresponds to the failed request, and the `Error` instance
- * represents the last error thrown during processing of the request.
- *
- * See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
- * for the default implementation of this function.
- */
- handleFailedRequestFunction?: HandleFailedRequest | undefined;
- /**
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
- * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
- * Example:
- * ```
- * preNavigationHooks: [
- * async (crawlingContext, requestAsBrowserOptions) => {
- * requestAsBrowserOptions.forceUrlEncoding = true;
- * },
- * ]
- * ```
- */
- preNavigationHooks?: Hook[] | undefined;
- /**
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
- * The function accepts `crawlingContext` as the only parameter.
- * Example:
- * ```
- * postNavigationHooks: [
- * async (crawlingContext) => {
- * // ...
- * },
- * ]
- * ```
- */
- postNavigationHooks?: Hook[] | undefined;
- /**
- * An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
- * target="_blank">MIME types</a> you want the crawler to load and process.
- * By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
- */
- additionalMimeTypes?: string[] | undefined;
- /**
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
- * Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
- * If those sites actually use a different encoding, the response will be corrupted. You can use
- * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
- * To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
- * ```
- * // Will fall back to windows-1250 encoding if none found
- * suggestResponseEncoding: 'windows-1250'
- * ```
- */
- suggestResponseEncoding?: string | undefined;
- /**
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
- * to force a certain encoding, disregarding the response headers.
- * To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
- * ```
- * // Will force windows-1250 encoding even if headers say otherwise
- * forceResponseEncoding: 'windows-1250'
- * ```
- */
- forceResponseEncoding?: string | undefined;
- /**
- * Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
- */
- maxRequestRetries?: number | undefined;
- /**
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
- */
- maxRequestsPerCrawl?: number | undefined;
- /**
- * Custom options passed to the underlying {@link AutoscaledPool } constructor.
- * Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
- * are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter }
- * and {@link SystemStatus } defaults are provided to account for the fact that `cheerio`
- * parses HTML synchronously and therefore blocks the event loop.
- */
- autoscaledPoolOptions?: AutoscaledPoolOptions | undefined;
- /**
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
- *
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
- */
- minConcurrency?: number | undefined;
- /**
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
- */
- maxConcurrency?: number | undefined;
- /**
- * If set to true, the crawler will automatically use the Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
- * It also marks a Session as bad after a request timeout.
- */
- useSessionPool?: boolean | undefined;
- /**
- * Custom options passed to the underlying {@link SessionPool } constructor.
- */
- sessionPoolOptions?: SessionPoolOptions | undefined;
- /**
- * Automatically saves cookies to the Session. Works only if the Session Pool is used.
- *
- * It parses cookies from the response "Set-Cookie" header and saves or updates them on the session. Once the session is used for the next request,
- * it passes the "Cookie" header with the session cookies to the request.
- */
- persistCookiesPerSession?: boolean | undefined;
- };
- export type PrepareRequestInputs = {
- /**
- * Original instance of the {Request} object. Must be modified in-place.
- */
- request: Request;
- /**
- * The current session
- */
- session?: Session | undefined;
- /**
- * An object with information about currently used proxy by the crawler
- * and configured by the {@link ProxyConfiguration } class.
- */
- proxyInfo?: ProxyInfo | undefined;
- crawler?: CheerioCrawler | undefined;
- };
- export type PrepareRequest = (inputs: PrepareRequestInputs) => (void | Promise<void>);
- export type PostResponseInputs = {
- /**
- * stream
- */
- response: (IncomingMessage | Readable);
- /**
- * Original instance of the {Request} object. Must be modified in-place.
- */
- request: Request;
- /**
- * The current session
- */
- session?: Session | undefined;
- /**
- * An object with information about currently used proxy by the crawler
- * and configured by the {@link ProxyConfiguration } class.
- */
- proxyInfo?: ProxyInfo | undefined;
- crawler: CheerioCrawler;
- };
- export type PostResponse = (inputs: PostResponseInputs) => (void | Promise<void>);
- export type CheerioHandlePageInputs = {
- /**
- * The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
- */
- $: CheerioAPI;
- /**
- * The request body of the web page.
- */
- body: (string | Buffer);
- /**
- * The parsed object from JSON string if the response contains the content type application/json.
- */
- json: any;
- /**
- * The original {@link Request } object.
- */
- request: Request;
- /**
- * Parsed `Content-Type` header: `{ type, encoding }`.
- */
- contentType: {
- type: string;
- encoding: string;
- };
- /**
- * An instance of Node's [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object,
- */
- response: IncomingMessage;
- session: Session;
- /**
- * An object with information about currently used proxy by the crawler
- * and configured by the {@link ProxyConfiguration } class.
- */
- proxyInfo: ProxyInfo;
- crawler: CheerioCrawler;
- };
- export type CheerioHandlePage = (inputs: CheerioHandlePageInputs) => Promise<void>;
- /**
- * @typedef CheerioCrawlerOptions
- * @property {CheerioHandlePage} handlePageFunction
- * User-provided function that performs the logic of the crawler. It is called for each page
- * loaded and parsed by the crawler.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * // The Cheerio object's function with the parsed HTML.
- * $: Cheerio,
- *
- * // The request body of the web page, whose type depends on the content type.
- * body: String|Buffer,
- *
- * // The parsed object from JSON for responses with the "application/json" content types.
- * // For other content types it's null.
- * json: Object,
- *
- * // Apify.Request object with details of the requested web page
- * request: Request,
- *
- * // Parsed Content-Type HTTP header: { type, encoding }
- * contentType: Object,
- *
- * // An instance of Node's http.IncomingMessage object,
- * response: Object,
- *
- * // Session object, useful to work around anti-scraping protections
- * session: Session
- *
- * // ProxyInfo object with information about currently used proxy
- * proxyInfo: ProxyInfo
- *
- * // The running cheerio crawler instance.
- * crawler: CheerioCrawler
- * }
- * ```
- *
- * Type of `body` depends on the `Content-Type` header of the web page:
- * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
- * - Buffer for other MIME content types
- *
- * Parsed `Content-Type` header using
- * [content-type package](https://www.npmjs.com/package/content-type)
- * is stored in `contentType`.
- *
- * Cheerio is available only for HTML and XML content types.
- *
- * With the {@link Request} object representing the URL to crawl.
- *
- * If the function returns a promise, it is awaited by the crawler.
- *
- * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to `option.maxRequestRetries` times.
- * If all the retries fail, the crawler calls the function
- * provided to the `handleFailedRequestFunction` parameter.
- * To make this work, you should **always**
- * let your function throw exceptions rather than catch them.
- * The exceptions are logged to the request using the
- * {@link Request#pushErrorMessage} function.
- * @property {RequestList} [requestList]
- * Static list of URLs to be processed.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {RequestQueue} [requestQueue]
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {PrepareRequest} [prepareRequestFunction]
- * > This option is deprecated, use `preNavigationHooks` instead.
- *
- * A function that executes before the HTTP request is made to the target resource.
- * This function is suitable for setting dynamic properties such as cookies to the {@link Request}.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request} instance corresponds to the initialized request
- * and the {@link Session} instance corresponds to the used session.
- *
- * The function should modify the properties of the passed {@link Request} instance
- * in place because there are already earlier references to it. Making a copy and returning it from
- * this function is therefore not supported, because it would create inconsistencies where
- * different parts of the SDK would have access to a different {@link Request} instance.
- *
- * @property {PostResponse} [postResponseFunction]
- * > This option is deprecated, use `postNavigationHooks` instead.
- *
- * A function that executes right after the HTTP request is made to the target resource and response is returned.
- * This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
- *
- * **Example usage:**
- *
- * ```javascript
- * const cheerioCrawlerOptions = {
- * // ...
- * postResponseFunction: ({ request, response }) => {
- * if (request.userData.parseAsJSON) {
- * response.headers['content-type'] = 'application/json; charset=utf-8';
- * }
- * }
- * }
- * ```
- * The function receives the following object as an argument:
- * ```
- * {
- * response: Object,
- * request: Request,
- * session: Session,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * The response is an instance of Node's http.IncomingMessage object.
- *
- * @property {number} [handlePageTimeoutSecs=60]
- * Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
- * @property {number} [requestTimeoutSecs=30]
- * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
- * @property {boolean} [ignoreSslErrors=true]
- * If set to true, SSL certificate errors will be ignored.
- * @property {ProxyConfiguration} [proxyConfiguration]
- * If set, `CheerioCrawler` will be configured for all connections to use
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
- * For more information, see the [documentation](https://docs.apify.com/proxy).
- * @property {HandleFailedRequest} [handleFailedRequestFunction]
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
- * The function receives the following object as an argument:
- * ```
- * {
- * error: Error,
- * request: Request,
- * session: Session,
- * $: Cheerio,
- * body: String|Buffer,
- * json: Object,
- * contentType: Object,
- * response: Object,
- * proxyInfo: ProxyInfo,
- * crawler: CheerioCrawler,
- * }
- * ```
- * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
- * represents the last error thrown during processing of the request.
- *
- * See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
- * for the default implementation of this function.
- * @property {Array<Hook>} [preNavigationHooks]
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
- * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
- * Example:
- * ```
- * preNavigationHooks: [
- * async (crawlingContext, requestAsBrowserOptions) => {
- * requestAsBrowserOptions.forceUrlEncoding = true;
- * },
- * ]
- * ```
- * @property {Array<Hook>} [postNavigationHooks]
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
- * The function accepts `crawlingContext` as the only parameter.
- * Example:
- * ```
- * postNavigationHooks: [
- * async (crawlingContext) => {
- * // ...
- * },
- * ]
- * ```
- * @property {string[]} [additionalMimeTypes]
- * An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
- * target="_blank">MIME types</a> you want the crawler to load and process.
- * By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
- * @property {string} [suggestResponseEncoding]
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
- * Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
- * If those sites actually use a different encoding, the response will be corrupted. You can use
- * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
- * To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
- * ```
- * // Will fall back to windows-1250 encoding if none found
- * suggestResponseEncoding: 'windows-1250'
- * ```
- * @property {string} [forceResponseEncoding]
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
- * to force a certain encoding, disregarding the response headers.
- * To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
- * ```
- * // Will force windows-1250 encoding even if headers say otherwise
- * forceResponseEncoding: 'windows-1250'
- * ```
- * @property {number} [maxRequestRetries=3]
- * Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
- * @property {number} [maxRequestsPerCrawl]
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
- * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
- * Custom options passed to the underlying {@link AutoscaledPool} constructor.
- * Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
- * are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter}
- * and {@link SystemStatus} defaults are provided to account for the fact that `cheerio`
- * parses HTML synchronously and therefore blocks the event loop.
- * @property {number} [minConcurrency=1]
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- *
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
- * @property {number} [maxConcurrency=1000]
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- * @property {boolean} [useSessionPool=true]
- * If set to true, the crawler will automatically use the Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
- * It also marks a Session as bad after a request timeout.
- * @property {SessionPoolOptions} [sessionPoolOptions]
- * Custom options passed to the underlying {@link SessionPool} constructor.
- * @property {boolean} [persistCookiesPerSession]
- * Automatically saves cookies to the Session. Works only if the Session Pool is used.
- *
- * It parses cookies from the response "Set-Cookie" header and saves or updates them on the session. Once the session is used for the next request,
- * it passes the "Cookie" header with the session cookies to the request.
- */
- /**
- * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
- * [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
- * The URLs to crawl are fed either from a static list of URLs
- * or from a dynamic queue of URLs enabling recursive crawling of websites.
- *
- * Since `CheerioCrawler` uses raw HTTP requests to download web pages,
- * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
- * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
- * because those load the pages using a full-featured headless browser.
- *
- * `CheerioCrawler` downloads each URL using a plain HTTP request,
- * parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
- * and then invokes the user-provided {@link CheerioCrawlerOptions.handlePageFunction} to extract page data
- * using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
- *
- * The source URLs are represented using {@link Request} objects that are fed from
- * {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
- * or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
- *
- * If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
- * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
- * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
- *
- * The crawler finishes when there are no more {@link Request} objects to crawl.
- *
- * `CheerioCrawler` downloads the web pages using the {@link utils#requestAsBrowser} utility function.
- * As opposed to the browser-based crawlers that automatically encode the URLs, the
- * {@link utils#requestAsBrowser} function will not do so. We either need to manually encode the URLs
- * via the `encodeURI()` function, or set `forceUrlEncoding: true` in the `requestAsBrowserOptions`,
- * which will automatically encode all the URLs before accessing them.
- *
- * > We can either use `forceUrlEncoding` or encode manually, but not both - it would
- * > result in double encoding and therefore lead to invalid URLs.
- *
- * We can use the `preNavigationHooks` to adjust `requestAsBrowserOptions`:
- *
- * ```
- * preNavigationHooks: [
- * (crawlingContext, requestAsBrowserOptions) => {
- * requestAsBrowserOptions.forceUrlEncoding = true;
- * },
- * ]
- * ```
- *
- * By default, `CheerioCrawler` only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
- * and skips pages with other content types. If you want the crawler to process other content types,
- * use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
- * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
- * For details, see {@link CheerioCrawlerOptions.handlePageFunction}.
- *
- * New requests are only dispatched when there is enough free CPU and memory available,
- * using the functionality provided by the {@link AutoscaledPool} class.
- * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
- * parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
- * {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
- *
- * **Example usage:**
- *
- * ```javascript
- * // Prepare a list of URLs to crawl
- * const requestList = new Apify.RequestList({
- * sources: [
- * { url: 'http://www.example.com/page-1' },
- * { url: 'http://www.example.com/page-2' },
- * ],
- * });
- * await requestList.initialize();
- *
- * // Crawl the URLs
- * const crawler = new Apify.CheerioCrawler({
- * requestList,
- * handlePageFunction: async ({ request, response, body, contentType, $ }) => {
- * const data = [];
- *
- * // Do some data extraction from the page with Cheerio.
- * $('.some-collection').each((index, el) => {
- * data.push({ title: $(el).find('.some-title').text() });
- * });
- *
- * // Save the data to dataset.
- * await Apify.pushData({
- * url: request.url,
- * html: body,
- * data,
- * })
- * },
- * });
- *
- * await crawler.run();
- * ```
- * @property {Statistics} stats
- * Contains statistics about the current run.
- * @property {?RequestList} requestList
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {?RequestQueue} requestQueue
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {?SessionPool} sessionPool
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
- * Only available if used by the crawler.
- * @property {?ProxyConfiguration} proxyConfiguration
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
- * Only available if used by the crawler.
- * @property {AutoscaledPool} autoscaledPool
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
- * Note that this property is only initialized after calling the {@link CheerioCrawler#run} function.
- * You can use it to change the concurrency settings on the fly,
- * to pause the crawler by calling {@link AutoscaledPool#pause}
- * or to abort it by calling {@link AutoscaledPool#abort}.
- */
- declare class CheerioCrawler extends BasicCrawler {
- /**
- * @param {CheerioCrawlerOptions} options
- * All `CheerioCrawler` parameters are passed via an options object.
- */
- constructor(options: CheerioCrawlerOptions);
- supportedMimeTypes: Set<string>;
- handlePageTimeoutMillis: number;
- requestTimeoutMillis: number;
- ignoreSslErrors: boolean;
- suggestResponseEncoding: string | undefined;
- forceResponseEncoding: string | undefined;
- prepareRequestFunction: PrepareRequest | undefined;
- postResponseFunction: PostResponse | undefined;
- proxyConfiguration: ProxyConfiguration | undefined;
- /**
- * @type {Array<any>}
- * @ignore
- * */
- preNavigationHooks: Array<any>;
- /**
- * @type {Array<any>}
- * @ignore
- * */
- postNavigationHooks: Array<any>;
- persistCookiesPerSession: boolean;
- /**
- * **EXPERIMENTAL**
- * Function for attaching CrawlerExtensions such as the Unblockers.
- * @param {CrawlerExtension} extension - Crawler extension that overrides the crawler configuration.
- */
- use(extension: CrawlerExtension): void;
- /**
- * @param {CrawlingContext} crawlingContext
- * @ignore
- * @protected
- * @internal
- */
- protected _handleNavigation(crawlingContext: CrawlingContext): Promise<void>;
- /**
- * When users change `request.headers.cookie` inside preNavigationHook, the change would be ignored,
- * as `request.headers` are already merged into the `requestAsBrowserOptions`. This method is using
- * old `request.headers` snapshot (before hooks are executed), makes a diff with the cookie value
- * after hooks are executed, and merges any new cookies back to `requestAsBrowserOptions`.
- *
- * This way we can still use either `requestAsBrowserOptions` or `context.request` in the hooks (but not both at once).
- *
- * @param {Request} request
- * @param {string} cookieSnapshot
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
- * @private
- * @ignore
- * @internal
- */
- private _mergeRequestCookieDiff;
- /**
- * Function to make the HTTP request. It performs optimizations
- * on the request such as only downloading the request body if the
- * received content type matches text/html, application/xml, application/xhtml+xml.
- *
- * @param {object} options
- * @param {Request} options.request
- * @param {Session} options.session
- * @param {string} options.proxyUrl
- * @param {RequestAsBrowserOptions} options.requestAsBrowserOptions
- * @returns {Promise<IncomingMessage|Readable>}
- * @ignore
- * @protected
- * @internal
- */
- protected _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }: {
- request: Request;
- session: Session;
- proxyUrl: string;
- requestAsBrowserOptions: RequestAsBrowserOptions;
- }): Promise<IncomingMessage | Readable>;
- /**
- * Sets the cookie header to `requestAsBrowserOptions` based on provided session and request. If some cookies were already set,
- * the session cookie will be merged with them. User-provided cookies on the `request` object have precedence.
- *
- * @param {CrawlingContext} crawlingContext
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
- * @return {void}
- * @ignore
- * @private
- * @internal
- */
- private _applySessionCookie;
- /**
- * Encodes and parses response according to the provided content type
- * @param {Request} request
- * @param {IncomingMessage|Readable} responseStream
- * @returns {Promise<object>}
- * @ignore
- * @protected
- * @internal
- */
- protected _parseResponse(request: Request, responseStream: IncomingMessage | Readable): Promise<object>;
- /**
- * Combines the provided `requestOptions` with mandatory (non-overridable) values.
- * @param {Request} request
- * @param {Session} [session]
- * @param {string} [proxyUrl]
- * @param {RequestAsBrowserOptions} [requestAsBrowserOptions]
- * @ignore
- * @protected
- * @internal
- */
- protected _getRequestOptions(request: Request, session?: Session | undefined, proxyUrl?: string | undefined, requestAsBrowserOptions?: RequestAsBrowserOptions | undefined): {
- headers: {
- [x: string]: string;
- };
- https: any;
- isStream: boolean;
- /**
- * URL of the target endpoint. Supports both HTTP and HTTPS schemes.
- */
- url: string;
- /**
- * HTTP method.
- */
- method: string;
- /**
- * An HTTP proxy to be passed down to the HTTP request. Supports proxy authentication with Basic Auth.
- */
- proxyUrl: string | undefined;
- /**
- * Configuration to be used for generating correct browser headers.
- * See the [`header-generator`](https://github.com/apify/header-generator) library.
- */
- headerGeneratorOptions?: object | undefined;
- /**
- * Two-letter ISO 639 language code.
- */
- languageCode?: string | undefined;
- /**
- * Two-letter ISO 3166 country code.
- */
- countryCode?: string | undefined;
- /**
- * If `true`, the function uses the User-Agent of a mobile browser.
- */
- useMobileVersion?: boolean | undefined;
- /**
- * If set to true, SSL/TLS certificate errors will be ignored.
- */
- ignoreSslErrors?: boolean | undefined;
- /**
- * Node.js' HTTP parser is stricter than parsers used by web browsers, which prevents scraping of websites
- * whose servers do not comply with HTTP specs, either by accident or due to some anti-scraping protections,
- * causing e.g. the `invalid header value char` error. The `useInsecureHttpParser` option forces
- * the HTTP parser to ignore certain errors, which lets you scrape such websites.
- * However, it will also open your application to some security vulnerabilities,
- * although the risk should be negligible as these vulnerabilities mainly relate to server applications, not clients.
- * Learn more in this [blog post](https://snyk.io/blog/node-js-release-fixes-a-critical-http-security-vulnerability/).
- */
- useInsecureHttpParser?: boolean | undefined;
- /**
- * The function accepts the `response` object as a single parameter and should return `true` or `false`.
- * If the function returns true, the request gets aborted.
- */
- abortFunction?: import("../utils_request").AbortFunction | undefined;
- /**
- * If set to false, it will prevent the use of HTTP2 requests. This is strongly discouraged. Websites
- * expect HTTP2 connections, because browsers use HTTP2 by default. It will automatically downgrade
- * to HTTP/1.1 for websites that do not support HTTP2.
- */
- useHttp2?: boolean | undefined;
- /**
- * A unique object used to generate browser headers. By default, new headers are generated on every call.
- * Set this option to make these headers persistent.
- */
- sessionToken: object | Session | undefined;
- timeout: {
- request: number;
- };
- };
- /**
- * @param {*} request
- * @param {*} response
- * @param {*} encoding
- * @ignore
- * @protected
- * @internal
- */
- protected _encodeResponse(request: any, response: any, encoding: any): {
- response: any;
- encoding: string;
- };
- /**
- * @param {*} response
- * @ignore
- * @protected
- * @internal
- */
- protected _parseHtmlToDom(response: any): Promise<any>;
- /**
- * Checks and extends supported mime types
- * @param {Array<(string|Object)>} additionalMimeTypes
- * @ignore
- * @protected
- * @internal
- */
- protected _extendSupportedMimeTypes(additionalMimeTypes: Array<(string | Object)>): void;
- /**
- * Handles blocked request
- * @param {Session} session
- * @param {number} statusCode
- * @ignore
- * @protected
- * @internal
- */
- protected _throwOnBlockedRequest(session: Session, statusCode: number): void;
- /**
- * Handles a request timeout
- * @param {Session} session
- * @ignore
- * @protected
- * @internal
- */
- protected _handleRequestTimeout(session: Session): void;
- /**
- * @param {Request} request
- * @param {IncomingMessage|Readable} response
- * @private
- */
- private _abortDownloadOfBody;
- }
- import { RequestList } from "../request_list";
- import { RequestQueue } from "../storages/request_queue";
- import { ProxyConfiguration } from "../proxy_configuration";
- import { HandleFailedRequest } from "./basic_crawler";
- import { Hook } from "./browser_crawler";
- import { AutoscaledPoolOptions } from "../autoscaling/autoscaled_pool";
- import { SessionPoolOptions } from "../session_pool/session_pool";
- import Request from "../request";
- import { Session } from "../session_pool/session";
- import { ProxyInfo } from "../proxy_configuration";
- import { IncomingMessage } from "http";
- import { Readable } from "stream";
- import { CheerioAPI } from "cheerio/lib/load";
- import { BasicCrawler } from "./basic_crawler";
- import CrawlerExtension from "./crawler_extension";
- import { CrawlingContext } from "./basic_crawler";
- import { RequestAsBrowserOptions } from "../utils_request";
- //# sourceMappingURL=cheerio_crawler.d.ts.map
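
For context: the hunk above deletes the SDK v2 `CheerioCrawler` typings, and the file list shows the rest of the `build/` output going with it, consistent with the v3 line moving the crawling tools out of the `apify` package into the separate `crawlee` package. As a reference point for what 3.0.0-alpha.0 drops, here is a minimal sketch of the v2-era usage, reconstructed from the JSDoc example embedded in the deleted file; the `Apify.main()` wrapper is an assumption based on the v2 API and is not itself part of this diff:

```javascript
// Sketch of the SDK v2 usage documented in the removed cheerio_crawler.d.ts (apify <= 2.x).
const Apify = require('apify');

Apify.main(async () => {
    // Static list of URLs to crawl (CheerioCrawlerOptions.requestList).
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.example.com/page-1' },
            { url: 'http://www.example.com/page-2' },
        ],
    });
    await requestList.initialize();

    const crawler = new Apify.CheerioCrawler({
        requestList,
        // Called for each page the crawler loads and parses.
        handlePageFunction: async ({ request, body, $ }) => {
            const data = [];
            $('.some-collection').each((index, el) => {
                data.push({ title: $(el).find('.some-title').text() });
            });
            // Save the extracted records to the default dataset.
            await Apify.pushData({ url: request.url, html: body, data });
        },
    });

    await crawler.run();
});
```

Code built on these removed APIs will need migration when upgrading to the 3.x line.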