apify 2.3.1-beta.4 → 3.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. package/README.md +6 -5
  2. package/actor.d.ts +1092 -0
  3. package/actor.d.ts.map +1 -0
  4. package/actor.js +1221 -0
  5. package/actor.js.map +1 -0
  6. package/index.d.ts +4 -0
  7. package/index.d.ts.map +1 -0
  8. package/index.js +7 -0
  9. package/index.js.map +1 -0
  10. package/index.mjs +7 -0
  11. package/package.json +54 -128
  12. package/platform_event_manager.d.ts +55 -0
  13. package/platform_event_manager.d.ts.map +1 -0
  14. package/platform_event_manager.js +116 -0
  15. package/platform_event_manager.js.map +1 -0
  16. package/proxy_configuration.d.ts +210 -0
  17. package/proxy_configuration.d.ts.map +1 -0
  18. package/proxy_configuration.js +297 -0
  19. package/proxy_configuration.js.map +1 -0
  20. package/tsconfig.build.tsbuildinfo +1 -0
  21. package/utils.d.ts +11 -0
  22. package/utils.d.ts.map +1 -0
  23. package/utils.js +40 -0
  24. package/utils.js.map +1 -0
  25. package/build/actor.d.ts +0 -113
  26. package/build/actor.d.ts.map +0 -1
  27. package/build/actor.js +0 -582
  28. package/build/actor.js.map +0 -1
  29. package/build/apify.d.ts +0 -752
  30. package/build/apify.d.ts.map +0 -1
  31. package/build/apify.js +0 -877
  32. package/build/apify.js.map +0 -1
  33. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  34. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  35. package/build/autoscaling/autoscaled_pool.js +0 -557
  36. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  37. package/build/autoscaling/snapshotter.d.ts +0 -278
  38. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  39. package/build/autoscaling/snapshotter.js +0 -447
  40. package/build/autoscaling/snapshotter.js.map +0 -1
  41. package/build/autoscaling/system_status.d.ts +0 -224
  42. package/build/autoscaling/system_status.d.ts.map +0 -1
  43. package/build/autoscaling/system_status.js +0 -228
  44. package/build/autoscaling/system_status.js.map +0 -1
  45. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  46. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  47. package/build/browser_launchers/browser_launcher.js +0 -160
  48. package/build/browser_launchers/browser_launcher.js.map +0 -1
  49. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  50. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  51. package/build/browser_launchers/browser_plugin.js +0 -25
  52. package/build/browser_launchers/browser_plugin.js.map +0 -1
  53. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  54. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  55. package/build/browser_launchers/playwright_launcher.js +0 -150
  56. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  57. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  58. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  59. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  60. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  61. package/build/cache_container.d.ts +0 -31
  62. package/build/cache_container.d.ts.map +0 -1
  63. package/build/cache_container.js +0 -48
  64. package/build/cache_container.js.map +0 -1
  65. package/build/configuration.d.ts +0 -226
  66. package/build/configuration.d.ts.map +0 -1
  67. package/build/configuration.js +0 -325
  68. package/build/configuration.js.map +0 -1
  69. package/build/constants.d.ts +0 -37
  70. package/build/constants.d.ts.map +0 -1
  71. package/build/constants.js +0 -41
  72. package/build/constants.js.map +0 -1
  73. package/build/crawlers/basic_crawler.d.ts +0 -443
  74. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  75. package/build/crawlers/basic_crawler.js +0 -664
  76. package/build/crawlers/basic_crawler.js.map +0 -1
  77. package/build/crawlers/browser_crawler.d.ts +0 -512
  78. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  79. package/build/crawlers/browser_crawler.js +0 -540
  80. package/build/crawlers/browser_crawler.js.map +0 -1
  81. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  82. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  83. package/build/crawlers/cheerio_crawler.js +0 -913
  84. package/build/crawlers/cheerio_crawler.js.map +0 -1
  85. package/build/crawlers/crawler_extension.d.ts +0 -10
  86. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  87. package/build/crawlers/crawler_extension.js +0 -19
  88. package/build/crawlers/crawler_extension.js.map +0 -1
  89. package/build/crawlers/crawler_utils.d.ts +0 -34
  90. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  91. package/build/crawlers/crawler_utils.js +0 -87
  92. package/build/crawlers/crawler_utils.js.map +0 -1
  93. package/build/crawlers/playwright_crawler.d.ts +0 -448
  94. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  95. package/build/crawlers/playwright_crawler.js +0 -299
  96. package/build/crawlers/playwright_crawler.js.map +0 -1
  97. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  98. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  99. package/build/crawlers/puppeteer_crawler.js +0 -299
  100. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  101. package/build/crawlers/statistics.d.ts +0 -185
  102. package/build/crawlers/statistics.d.ts.map +0 -1
  103. package/build/crawlers/statistics.js +0 -331
  104. package/build/crawlers/statistics.js.map +0 -1
  105. package/build/enqueue_links/click_elements.d.ts +0 -179
  106. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  107. package/build/enqueue_links/click_elements.js +0 -434
  108. package/build/enqueue_links/click_elements.js.map +0 -1
  109. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  110. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  111. package/build/enqueue_links/enqueue_links.js +0 -163
  112. package/build/enqueue_links/enqueue_links.js.map +0 -1
  113. package/build/enqueue_links/shared.d.ts +0 -42
  114. package/build/enqueue_links/shared.d.ts.map +0 -1
  115. package/build/enqueue_links/shared.js +0 -121
  116. package/build/enqueue_links/shared.js.map +0 -1
  117. package/build/errors.d.ts +0 -29
  118. package/build/errors.d.ts.map +0 -1
  119. package/build/errors.js +0 -38
  120. package/build/errors.js.map +0 -1
  121. package/build/events.d.ts +0 -11
  122. package/build/events.d.ts.map +0 -1
  123. package/build/events.js +0 -147
  124. package/build/events.js.map +0 -1
  125. package/build/index.d.ts +0 -4
  126. package/build/index.d.ts.map +0 -1
  127. package/build/index.js +0 -7
  128. package/build/index.js.map +0 -1
  129. package/build/main.d.ts +0 -179
  130. package/build/main.d.ts.map +0 -1
  131. package/build/main.js +0 -81
  132. package/build/main.js.map +0 -1
  133. package/build/playwright_utils.d.ts +0 -9
  134. package/build/playwright_utils.d.ts.map +0 -1
  135. package/build/playwright_utils.js +0 -90
  136. package/build/playwright_utils.js.map +0 -1
  137. package/build/proxy_configuration.d.ts +0 -411
  138. package/build/proxy_configuration.d.ts.map +0 -1
  139. package/build/proxy_configuration.js +0 -517
  140. package/build/proxy_configuration.js.map +0 -1
  141. package/build/pseudo_url.d.ts +0 -86
  142. package/build/pseudo_url.d.ts.map +0 -1
  143. package/build/pseudo_url.js +0 -153
  144. package/build/pseudo_url.js.map +0 -1
  145. package/build/puppeteer_request_interception.d.ts +0 -8
  146. package/build/puppeteer_request_interception.d.ts.map +0 -1
  147. package/build/puppeteer_request_interception.js +0 -235
  148. package/build/puppeteer_request_interception.js.map +0 -1
  149. package/build/puppeteer_utils.d.ts +0 -250
  150. package/build/puppeteer_utils.d.ts.map +0 -1
  151. package/build/puppeteer_utils.js +0 -551
  152. package/build/puppeteer_utils.js.map +0 -1
  153. package/build/request.d.ts +0 -180
  154. package/build/request.d.ts.map +0 -1
  155. package/build/request.js +0 -261
  156. package/build/request.js.map +0 -1
  157. package/build/request_list.d.ts +0 -581
  158. package/build/request_list.d.ts.map +0 -1
  159. package/build/request_list.js +0 -826
  160. package/build/request_list.js.map +0 -1
  161. package/build/serialization.d.ts +0 -5
  162. package/build/serialization.d.ts.map +0 -1
  163. package/build/serialization.js +0 -139
  164. package/build/serialization.js.map +0 -1
  165. package/build/session_pool/errors.d.ts +0 -11
  166. package/build/session_pool/errors.d.ts.map +0 -1
  167. package/build/session_pool/errors.js +0 -18
  168. package/build/session_pool/errors.js.map +0 -1
  169. package/build/session_pool/events.d.ts +0 -5
  170. package/build/session_pool/events.d.ts.map +0 -1
  171. package/build/session_pool/events.js +0 -6
  172. package/build/session_pool/events.js.map +0 -1
  173. package/build/session_pool/session.d.ts +0 -286
  174. package/build/session_pool/session.d.ts.map +0 -1
  175. package/build/session_pool/session.js +0 -355
  176. package/build/session_pool/session.js.map +0 -1
  177. package/build/session_pool/session_pool.d.ts +0 -280
  178. package/build/session_pool/session_pool.d.ts.map +0 -1
  179. package/build/session_pool/session_pool.js +0 -393
  180. package/build/session_pool/session_pool.js.map +0 -1
  181. package/build/session_pool/session_utils.d.ts +0 -4
  182. package/build/session_pool/session_utils.d.ts.map +0 -1
  183. package/build/session_pool/session_utils.js +0 -24
  184. package/build/session_pool/session_utils.js.map +0 -1
  185. package/build/stealth/hiding_tricks.d.ts +0 -22
  186. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  187. package/build/stealth/hiding_tricks.js +0 -308
  188. package/build/stealth/hiding_tricks.js.map +0 -1
  189. package/build/stealth/stealth.d.ts +0 -56
  190. package/build/stealth/stealth.d.ts.map +0 -1
  191. package/build/stealth/stealth.js +0 -125
  192. package/build/stealth/stealth.js.map +0 -1
  193. package/build/storages/dataset.d.ts +0 -288
  194. package/build/storages/dataset.d.ts.map +0 -1
  195. package/build/storages/dataset.js +0 -480
  196. package/build/storages/dataset.js.map +0 -1
  197. package/build/storages/key_value_store.d.ts +0 -243
  198. package/build/storages/key_value_store.d.ts.map +0 -1
  199. package/build/storages/key_value_store.js +0 -462
  200. package/build/storages/key_value_store.js.map +0 -1
  201. package/build/storages/request_queue.d.ts +0 -318
  202. package/build/storages/request_queue.d.ts.map +0 -1
  203. package/build/storages/request_queue.js +0 -636
  204. package/build/storages/request_queue.js.map +0 -1
  205. package/build/storages/storage_manager.d.ts +0 -87
  206. package/build/storages/storage_manager.d.ts.map +0 -1
  207. package/build/storages/storage_manager.js +0 -150
  208. package/build/storages/storage_manager.js.map +0 -1
  209. package/build/tsconfig.tsbuildinfo +0 -1
  210. package/build/typedefs.d.ts +0 -146
  211. package/build/typedefs.d.ts.map +0 -1
  212. package/build/typedefs.js +0 -88
  213. package/build/typedefs.js.map +0 -1
  214. package/build/utils.d.ts +0 -175
  215. package/build/utils.d.ts.map +0 -1
  216. package/build/utils.js +0 -731
  217. package/build/utils.js.map +0 -1
  218. package/build/utils_log.d.ts +0 -41
  219. package/build/utils_log.d.ts.map +0 -1
  220. package/build/utils_log.js +0 -192
  221. package/build/utils_log.js.map +0 -1
  222. package/build/utils_request.d.ts +0 -77
  223. package/build/utils_request.d.ts.map +0 -1
  224. package/build/utils_request.js +0 -385
  225. package/build/utils_request.js.map +0 -1
  226. package/build/utils_social.d.ts +0 -210
  227. package/build/utils_social.d.ts.map +0 -1
  228. package/build/utils_social.js +0 -787
  229. package/build/utils_social.js.map +0 -1
  230. package/build/validators.d.ts +0 -23
  231. package/build/validators.d.ts.map +0 -1
  232. package/build/validators.js +0 -29
  233. package/build/validators.js.map +0 -1
@@ -1,931 +0,0 @@
1
- /// <reference types="node" />
2
- export default CheerioCrawler;
3
- export type CheerioCrawlerOptions = {
4
- /**
5
- * User-provided function that performs the logic of the crawler. It is called for each page
6
- * loaded and parsed by the crawler.
7
- *
8
- * The function receives the following object as an argument:
9
- * ```
10
- * {
11
- * // The Cheerio object's function with the parsed HTML.
12
- * $: Cheerio,
13
- *
14
- * // The request body of the web page, whose type depends on the content type.
15
- * body: String|Buffer,
16
- *
17
- * // The parsed object from JSON for responses with the "application/json" content types.
18
- * // For other content types it's null.
19
- * json: Object,
20
- *
21
- * // Apify.Request object with details of the requested web page
22
- * request: Request,
23
- *
24
- * // Parsed Content-Type HTTP header: { type, encoding }
25
- * contentType: Object,
26
- *
27
- * // An instance of Node's http.IncomingMessage object,
28
- * response: Object,
29
- *
30
- * // Session object, useful to work around anti-scraping protections
31
- * session: Session
32
- *
33
- * // ProxyInfo object with information about currently used proxy
34
- * proxyInfo: ProxyInfo
35
- *
36
- * // The running cheerio crawler instance.
37
- * crawler: CheerioCrawler
38
- * }
39
- * ```
40
- *
41
- * Type of `body` depends on the `Content-Type` header of the web page:
42
- * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
43
- * - Buffer for others MIME content types
44
- *
45
- * Parsed `Content-Type` header using
46
- * [content-type package](https://www.npmjs.com/package/content-type)
47
- * is stored in `contentType`.
48
- *
49
- * Cheerio is available only for HTML and XML content types.
50
- *
51
- * With the {@link Request } object representing the URL to crawl.
52
- *
53
- * If the function returns, the returned promise is awaited by the crawler.
54
- *
55
- * If the function throws an exception, the crawler will try to re-crawl the
56
- * request later, up to `option.maxRequestRetries` times.
57
- * If all the retries fail, the crawler calls the function
58
- * provided to the `handleFailedRequestFunction` parameter.
59
- * To make this work, you should **always**
60
- * let your function throw exceptions rather than catch them.
61
- * The exceptions are logged to the request using the
62
- * {@link RequestpushErrorMessage } function.
63
- */
64
- handlePageFunction: CheerioHandlePage;
65
- /**
66
- * Static list of URLs to be processed.
67
- * Either `requestList` or `requestQueue` option must be provided (or both).
68
- */
69
- requestList?: RequestList | undefined;
70
- /**
71
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
72
- * Either `requestList` or `requestQueue` option must be provided (or both).
73
- */
74
- requestQueue?: RequestQueue | undefined;
75
- /**
76
- * > This option is deprecated, use `preNavigationHooks` instead.
77
- *
78
- * A function that executes before the HTTP request is made to the target resource.
79
- * This function is suitable for setting dynamic properties such as cookies to the {@link Request }.
80
- *
81
- * The function receives the following object as an argument:
82
- * ```
83
- * {
84
- * request: Request,
85
- * session: Session,
86
- * proxyInfo: ProxyInfo,
87
- * crawler: CheerioCrawler,
88
- * }
89
- * ```
90
- * where the {@link Request } instance corresponds to the initialized request
91
- * and the {@link Session } instance corresponds to used session.
92
- *
93
- * The function should modify the properties of the passed {@link Request } instance
94
- * in place because there are already earlier references to it. Making a copy and returning it from
95
- * this function is therefore not supported, because it would create inconsistencies where
96
- * different parts of SDK would have access to a different {@link Request } instance.
97
- */
98
- prepareRequestFunction?: PrepareRequest | undefined;
99
- /**
100
- * > This option is deprecated, use `postNavigationHooks` instead.
101
- *
102
- * A function that executes right after the HTTP request is made to the target resource and response is returned.
103
- * This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
104
- *
105
- * **Example usage:**
106
- *
107
- * ```javascript
108
- * const cheerioCrawlerOptions = {
109
- * // ...
110
- * postResponseFunction: ({ request, response }) => {
111
- * if (request.userData.parseAsJSON) {
112
- * response.headers['content-type'] = 'application/json; charset=utf-8';
113
- * }
114
- * }
115
- * }
116
- * ```
117
- * The function receives the following object as an argument:
118
- * ```
119
- * {
120
- * response: Object,
121
- * request: Request,
122
- * session: Session,
123
- * proxyInfo: ProxyInfo,
124
- * crawler: CheerioCrawler,
125
- * }
126
- * ```
127
- * The response is an instance of Node's http.IncomingMessage object.
128
- */
129
- postResponseFunction?: PostResponse | undefined;
130
- /**
131
- * Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
132
- */
133
- handlePageTimeoutSecs?: number | undefined;
134
- /**
135
- * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
136
- */
137
- requestTimeoutSecs?: number | undefined;
138
- /**
139
- * If set to true, SSL certificate errors will be ignored.
140
- */
141
- ignoreSslErrors?: boolean | undefined;
142
- /**
143
- * If set, `CheerioCrawler` will be configured for all connections to use
144
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
145
- * For more information, see the [documentation](https://docs.apify.com/proxy).
146
- */
147
- proxyConfiguration?: ProxyConfiguration | undefined;
148
- /**
149
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
150
- * The function receives the following object as an argument:
151
- * ```
152
- * {
153
- * error: Error,
154
- * request: Request,
155
- * session: Session,
156
- * $: Cheerio,
157
- * body: String|Buffer,
158
- * json: Object,
159
- * contentType: Object,
160
- * response: Object,
161
- * proxyInfo: ProxyInfo,
162
- * crawler: CheerioCrawler,
163
- * }
164
- * ```
165
- * where the {@link Request } instance corresponds to the failed request, and the `Error` instance
166
- * represents the last error thrown during processing of the request.
167
- *
168
- * See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
169
- * for the default implementation of this function.
170
- */
171
- handleFailedRequestFunction?: HandleFailedRequest | undefined;
172
- /**
173
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
174
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
175
- * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
176
- * Example:
177
- * ```
178
- * preNavigationHooks: [
179
- * async (crawlingContext, requestAsBrowserOptions) => {
180
- * requestAsBrowserOptions.forceUrlEncoding = true;
181
- * },
182
- * ]
183
- * ```
184
- */
185
- preNavigationHooks?: Hook[] | undefined;
186
- /**
187
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
188
- * The function accepts `crawlingContext` as the only parameter.
189
- * Example:
190
- * ```
191
- * postNavigationHooks: [
192
- * async (crawlingContext) => {
193
- * // ...
194
- * },
195
- * ]
196
- * ```
197
- */
198
- postNavigationHooks?: Hook[] | undefined;
199
- /**
200
- * An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
201
- * target="_blank">MIME types</a> you want the crawler to load and process.
202
- * By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
203
- */
204
- additionalMimeTypes?: string[] | undefined;
205
- /**
206
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
207
- * Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
208
- * If those sites actually use a different encoding, the response will be corrupted. You can use
209
- * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
210
- * To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding }```
211
- * // Will fall back to windows-1250 encoding if none found
212
- * suggestResponseEncoding: 'windows-1250'
213
- * ```
214
- */
215
- suggestResponseEncoding?: string | undefined;
216
- /**
217
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
218
- * to force a certain encoding, disregarding the response headers.
219
- * To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding }```
220
- * // Will force windows-1250 encoding even if headers say otherwise
221
- * forceResponseEncoding: 'windows-1250'
222
- * ```
223
- */
224
- forceResponseEncoding?: string | undefined;
225
- /**
226
- * Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
227
- */
228
- maxRequestRetries?: number | undefined;
229
- /**
230
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
231
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
232
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
233
- */
234
- maxRequestsPerCrawl?: number | undefined;
235
- /**
236
- * Custom options passed to the underlying {@link AutoscaledPool } constructor.
237
- * Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
238
- * are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter }
239
- * and {@link SystemStatus } defaults are provided to account for the fact that `cheerio`
240
- * parses HTML synchronously and therefore blocks the event loop.
241
- */
242
- autoscaledPoolOptions?: AutoscaledPoolOptions | undefined;
243
- /**
244
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
245
- *
246
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
247
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
248
- */
249
- minConcurrency?: number | undefined;
250
- /**
251
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool } option.
252
- */
253
- maxConcurrency?: number | undefined;
254
- /**
255
- * If set to true Crawler will automatically use Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
256
- * It also marks Session as bad after a request timeout.
257
- */
258
- useSessionPool?: boolean | undefined;
259
- /**
260
- * Custom options passed to the underlying {@link SessionPool } constructor.
261
- */
262
- sessionPoolOptions?: SessionPoolOptions | undefined;
263
- /**
264
- * Automatically saves cookies to Session. Works only if Session Pool is used.
265
- *
266
- * It parses cookie from response "set-cookie" header saves or updates cookies for session and once the session is used for next request.
267
- * It passes the "Cookie" header to the request with the session cookies.
268
- */
269
- persistCookiesPerSession?: boolean | undefined;
270
- };
271
- export type PrepareRequestInputs = {
272
- /**
273
- * Original instance fo the {Request} object. Must be modified in-place.
274
- */
275
- request: Request;
276
- /**
277
- * The current session
278
- */
279
- session?: Session | undefined;
280
- /**
281
- * An object with information about currently used proxy by the crawler
282
- * and configured by the {@link ProxyConfiguration } class.
283
- */
284
- proxyInfo?: ProxyInfo | undefined;
285
- crawler?: CheerioCrawler | undefined;
286
- };
287
- export type PrepareRequest = (inputs: PrepareRequestInputs) => (void | Promise<void>);
288
- export type PostResponseInputs = {
289
- /**
290
- * stream
291
- */
292
- response: (IncomingMessage | Readable);
293
- /**
294
- * Original instance fo the {Request} object. Must be modified in-place.
295
- */
296
- request: Request;
297
- /**
298
- * The current session
299
- */
300
- session?: Session | undefined;
301
- /**
302
- * An object with information about currently used proxy by the crawler
303
- * and configured by the {@link ProxyConfiguration } class.
304
- */
305
- proxyInfo?: ProxyInfo | undefined;
306
- crawler: CheerioCrawler;
307
- };
308
- export type PostResponse = (inputs: PostResponseInputs) => (void | Promise<void>);
309
- export type CheerioHandlePageInputs = {
310
- /**
311
- * The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
312
- */
313
- $: CheerioAPI;
314
- /**
315
- * The request body of the web page.
316
- */
317
- body: (string | Buffer);
318
- /**
319
- * The parsed object from JSON string if the response contains the content type application/json.
320
- */
321
- json: any;
322
- /**
323
- * The original {@link Request } object.
324
- */
325
- request: Request;
326
- /**
327
- * Parsed `Content-Type header: { type, encoding }`.
328
- */
329
- contentType: {
330
- type: string;
331
- encoding: string;
332
- };
333
- /**
334
- * An instance of Node's [http.IncomingMessage](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object,
335
- */
336
- response: IncomingMessage;
337
- session: Session;
338
- /**
339
- * An object with information about currently used proxy by the crawler
340
- * and configured by the {@link ProxyConfiguration } class.
341
- */
342
- proxyInfo: ProxyInfo;
343
- crawler: CheerioCrawler;
344
- };
345
- export type CheerioHandlePage = (inputs: CheerioHandlePageInputs) => Promise<void>;
346
- /**
347
- * @typedef CheerioCrawlerOptions
348
- * @property {CheerioHandlePage} handlePageFunction
349
- * User-provided function that performs the logic of the crawler. It is called for each page
350
- * loaded and parsed by the crawler.
351
- *
352
- * The function receives the following object as an argument:
353
- * ```
354
- * {
355
- * // The Cheerio object's function with the parsed HTML.
356
- * $: Cheerio,
357
- *
358
- * // The request body of the web page, whose type depends on the content type.
359
- * body: String|Buffer,
360
- *
361
- * // The parsed object from JSON for responses with the "application/json" content types.
362
- * // For other content types it's null.
363
- * json: Object,
364
- *
365
- * // Apify.Request object with details of the requested web page
366
- * request: Request,
367
- *
368
- * // Parsed Content-Type HTTP header: { type, encoding }
369
- * contentType: Object,
370
- *
371
- * // An instance of Node's http.IncomingMessage object,
372
- * response: Object,
373
- *
374
- * // Session object, useful to work around anti-scraping protections
375
- * session: Session
376
- *
377
- * // ProxyInfo object with information about currently used proxy
378
- * proxyInfo: ProxyInfo
379
- *
380
- * // The running cheerio crawler instance.
381
- * crawler: CheerioCrawler
382
- * }
383
- * ```
384
- *
385
- * Type of `body` depends on the `Content-Type` header of the web page:
386
- * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
387
- * - Buffer for others MIME content types
388
- *
389
- * Parsed `Content-Type` header using
390
- * [content-type package](https://www.npmjs.com/package/content-type)
391
- * is stored in `contentType`.
392
- *
393
- * Cheerio is available only for HTML and XML content types.
394
- *
395
- * With the {@link Request} object representing the URL to crawl.
396
- *
397
- * If the function returns, the returned promise is awaited by the crawler.
398
- *
399
- * If the function throws an exception, the crawler will try to re-crawl the
400
- * request later, up to `option.maxRequestRetries` times.
401
- * If all the retries fail, the crawler calls the function
402
- * provided to the `handleFailedRequestFunction` parameter.
403
- * To make this work, you should **always**
404
- * let your function throw exceptions rather than catch them.
405
- * The exceptions are logged to the request using the
406
- * {@link Request#pushErrorMessage} function.
407
- * @property {RequestList} [requestList]
408
- * Static list of URLs to be processed.
409
- * Either `requestList` or `requestQueue` option must be provided (or both).
410
- * @property {RequestQueue} [requestQueue]
411
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
412
- * Either `requestList` or `requestQueue` option must be provided (or both).
413
- * @property {PrepareRequest} [prepareRequestFunction]
414
- * > This option is deprecated, use `preNavigationHooks` instead.
415
- *
416
- * A function that executes before the HTTP request is made to the target resource.
417
- * This function is suitable for setting dynamic properties such as cookies to the {@link Request}.
418
- *
419
- * The function receives the following object as an argument:
420
- * ```
421
- * {
422
- * request: Request,
423
- * session: Session,
424
- * proxyInfo: ProxyInfo,
425
- * crawler: CheerioCrawler,
426
- * }
427
- * ```
428
- * where the {@link Request} instance corresponds to the initialized request
429
- * and the {@link Session} instance corresponds to used session.
430
- *
431
- * The function should modify the properties of the passed {@link Request} instance
432
- * in place because there are already earlier references to it. Making a copy and returning it from
433
- * this function is therefore not supported, because it would create inconsistencies where
434
- * different parts of SDK would have access to a different {@link Request} instance.
435
- *
436
- * @property {PostResponse} [postResponseFunction]
437
- * > This option is deprecated, use `postNavigationHooks` instead.
438
- *
439
- * A function that executes right after the HTTP request is made to the target resource and response is returned.
440
- * This function is suitable for overriding custom properties of response e.g. setting headers because of response parsing.
441
- *
442
- * **Example usage:**
443
- *
444
- * ```javascript
445
- * const cheerioCrawlerOptions = {
446
- * // ...
447
- * postResponseFunction: ({ request, response }) => {
448
- * if (request.userData.parseAsJSON) {
449
- * response.headers['content-type'] = 'application/json; charset=utf-8';
450
- * }
451
- * }
452
- * }
453
- * ```
454
- * The function receives the following object as an argument:
455
- * ```
456
- * {
457
- * response: Object,
458
- * request: Request,
459
- * session: Session,
460
- * proxyInfo: ProxyInfo,
461
- * crawler: CheerioCrawler,
462
- * }
463
- * ```
464
- * The response is an instance of Node's http.IncomingMessage object.
465
- *
466
- * @property {number} [handlePageTimeoutSecs=60]
467
- * Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds.
468
- * @property {number} [requestTimeoutSecs=30]
469
- * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
470
- * @property {boolean} [ignoreSslErrors=true]
471
- * If set to true, SSL certificate errors will be ignored.
472
- * @property {ProxyConfiguration} [proxyConfiguration]
473
- * If set, `CheerioCrawler` will be configured for all connections to use
474
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
475
- * For more information, see the [documentation](https://docs.apify.com/proxy).
476
- * @property {HandleFailedRequest} [handleFailedRequestFunction]
477
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
478
- * The function receives the following object as an argument:
479
- * ```
480
- * {
481
- * error: Error,
482
- * request: Request,
483
- * session: Session,
484
- * $: Cheerio,
485
- * body: String|Buffer,
486
- * json: Object,
487
- * contentType: Object,
488
- * response: Object,
489
- * proxyInfo: ProxyInfo,
490
- * crawler: CheerioCrawler,
491
- * }
492
- * ```
493
- * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
494
- * represents the last error thrown during processing of the request.
495
- *
496
- * See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13)
497
- * for the default implementation of this function.
498
- * @property {Array<Hook>} [preNavigationHooks]
499
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
500
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `requestAsBrowserOptions`,
501
- * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
502
- * Example:
503
- * ```
504
- * preNavigationHooks: [
505
- * async (crawlingContext, requestAsBrowserOptions) => {
506
- * requestAsBrowserOptions.forceUrlEncoding = true;
507
- * },
508
- * ]
509
- * ```
510
- * @property {Array<Hook>} [postNavigationHooks]
511
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
512
- * The function accepts `crawlingContext` as the only parameter.
513
- * Example:
514
- * ```
515
- * postNavigationHooks: [
516
- * async (crawlingContext) => {
517
- * // ...
518
- * },
519
- * ]
520
- * ```
521
- * @property {string[]} [additionalMimeTypes]
522
- * An array of <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types"
523
- * target="_blank">MIME types</a> you want the crawler to load and process.
524
- * By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
525
- * @property {string} [suggestResponseEncoding]
526
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers.
527
- * Sadly, there are some websites which use invalid headers. Those are encoded using the UTF-8 encoding.
528
- * If those sites actually use a different encoding, the response will be corrupted. You can use
529
- * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
530
- * To force a certain encoding, disregarding the response headers, use {@link CheerioCrawlerOptions.forceResponseEncoding}
531
- * ```
532
- * // Will fall back to windows-1250 encoding if none found
533
- * suggestResponseEncoding: 'windows-1250'
534
- * ```
535
- * @property {string} [forceResponseEncoding]
536
- * By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
537
- * to force a certain encoding, disregarding the response headers.
538
- * To only provide a default for missing encodings, use {@link CheerioCrawlerOptions.suggestResponseEncoding}
539
- * ```
540
- * // Will force windows-1250 encoding even if headers say otherwise
541
- * forceResponseEncoding: 'windows-1250'
542
- * ```
543
- * @property {number} [maxRequestRetries=3]
544
- * Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails.
545
- * @property {number} [maxRequestsPerCrawl]
546
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
547
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
548
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
549
- * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
550
- * Custom options passed to the underlying {@link AutoscaledPool} constructor.
551
- * Note that the `runTaskFunction`, `isTaskReadyFunction` and `isFinishedFunction` options
552
- * are provided by `CheerioCrawler` and cannot be overridden. Reasonable {@link Snapshotter}
553
- * and {@link SystemStatus} defaults are provided to account for the fact that `cheerio`
554
- * parses HTML synchronously and therefore blocks the event loop.
555
- * @property {number} [minConcurrency=1]
556
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
557
- *
558
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
559
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
560
- * @property {number} [maxConcurrency=1000]
561
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
562
- * @property {boolean} [useSessionPool=true]
563
- * If set to true Crawler will automatically use Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes.
564
- * It also marks Session as bad after a request timeout.
565
- * @property {SessionPoolOptions} [sessionPoolOptions]
566
- * Custom options passed to the underlying {@link SessionPool} constructor.
567
- * @property {boolean} [persistCookiesPerSession]
568
- * Automatically saves cookies to Session. Works only if Session Pool is used.
569
- *
570
- * It parses cookie from response "set-cookie" header saves or updates cookies for session and once the session is used for next request.
571
- * It passes the "Cookie" header to the request with the session cookies.
572
- */
573
- /**
574
- * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
575
- * [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
576
- * The URLs to crawl are fed either from a static list of URLs
577
- * or from a dynamic queue of URLs enabling recursive crawling of websites.
578
- *
579
- * Since `CheerioCrawler` uses raw HTTP requests to download web pages,
580
- * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
581
- * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
582
- * because it loads the pages using full-featured headless Chrome browser.
583
- *
584
- * `CheerioCrawler` downloads each URL using a plain HTTP request,
585
- * parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
586
- * and then invokes the user-provided {@link CheerioCrawlerOptions.handlePageFunction} to extract page data
587
- * using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
588
- *
589
- * The source URLs are represented using {@link Request} objects that are fed from
590
- * {@link RequestList} or {@link RequestQueue} instances provided by the {@link CheerioCrawlerOptions.requestList}
591
- * or {@link CheerioCrawlerOptions.requestQueue} constructor options, respectively.
592
- *
593
- * If both {@link CheerioCrawlerOptions.requestList} and {@link CheerioCrawlerOptions.requestQueue} are used,
594
- * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
595
- * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
596
- *
597
- * The crawler finishes when there are no more {@link Request} objects to crawl.
598
- *
599
- * `CheerioCrawler` downloads the web pages using the `{@link utils#requestAsBrowser}` utility function.
600
- * As opposed to the browser based crawlers that are automatically encoding the URLs, the
601
- * `{@link utils#requestAsBrowser}` function will not do so. We either need to manually encode the URLs
602
- * via `encodeURI()` function, or set `forceUrlEncoding: true` in the `requestAsBrowserOptions`,
603
- * which will automatically encode all the URLs before accessing them.
604
- *
605
- * > We can either use `forceUrlEncoding` or encode manually, but not both - it would
606
- * > result in double encoding and therefore lead to invalid URLs.
607
- *
608
- * We can use the `preNavigationHooks` to adjust `requestAsBrowserOptions`:
609
- *
610
- * ```
611
- * preNavigationHooks: [
612
- * (crawlingContext, requestAsBrowserOptions) => {
613
- * requestAsBrowserOptions.forceUrlEncoding = true;
614
- * },
615
- * ]
616
- * ```
617
- *
618
- * By default, `CheerioCrawler` only processes web pages with the `text/html`
619
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
620
- * and skips pages with other content types. If you want the crawler to process other content types,
621
- * use the {@link CheerioCrawlerOptions.additionalMimeTypes} constructor option.
622
- * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
623
- * For details, see {@link CheerioCrawlerOptions.handlePageFunction}.
624
- *
625
- * New requests are only dispatched when there is enough free CPU and memory available,
626
- * using the functionality provided by the {@link AutoscaledPool} class.
627
- * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
628
- * parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
629
- * {@link AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
630
- *
631
- * **Example usage:**
632
- *
633
- * ```javascript
634
- * // Prepare a list of URLs to crawl
635
- * const requestList = new Apify.RequestList({
636
- * sources: [
637
- * { url: 'http://www.example.com/page-1' },
638
- * { url: 'http://www.example.com/page-2' },
639
- * ],
640
- * });
641
- * await requestList.initialize();
642
- *
643
- * // Crawl the URLs
644
- * const crawler = new Apify.CheerioCrawler({
645
- * requestList,
646
- * handlePageFunction: async ({ request, response, body, contentType, $ }) => {
647
- * const data = [];
648
- *
649
- * // Do some data extraction from the page with Cheerio.
650
- * $('.some-collection').each((index, el) => {
651
- * data.push({ title: $(el).find('.some-title').text() });
652
- * });
653
- *
654
- * // Save the data to dataset.
655
- * await Apify.pushData({
656
- * url: request.url,
657
- * html: body,
658
- * data,
659
- * })
660
- * },
661
- * });
662
- *
663
- * await crawler.run();
664
- * ```
665
- * @property {Statistics} stats
666
- * Contains statistics about the current run.
667
- * @property {?RequestList} requestList
668
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
669
- * Only available if used by the crawler.
670
- * @property {?RequestQueue} requestQueue
671
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
672
- * Only available if used by the crawler.
673
- * @property {?SessionPool} sessionPool
674
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
675
- * Only available if used by the crawler.
676
- * @property {?ProxyConfiguration} proxyConfiguration
677
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
678
- * Only available if used by the crawler.
679
- * @property {AutoscaledPool} autoscaledPool
680
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
681
- * Note that this property is only initialized after calling the {@link CheerioCrawler#run} function.
682
- * You can use it to change the concurrency settings on the fly,
683
- * to pause the crawler by calling {@link AutoscaledPool#pause}
684
- * or to abort it by calling {@link AutoscaledPool#abort}.
685
- */
686
- declare class CheerioCrawler extends BasicCrawler {
687
- /**
688
- * @param {CheerioCrawlerOptions} options
689
- * All `CheerioCrawler` parameters are passed via an options object.
690
- */
691
- constructor(options: CheerioCrawlerOptions);
692
- supportedMimeTypes: Set<string>;
693
- handlePageTimeoutMillis: number;
694
- requestTimeoutMillis: number;
695
- ignoreSslErrors: boolean;
696
- suggestResponseEncoding: string | undefined;
697
- forceResponseEncoding: string | undefined;
698
- prepareRequestFunction: PrepareRequest | undefined;
699
- postResponseFunction: PostResponse | undefined;
700
- proxyConfiguration: ProxyConfiguration | undefined;
701
- /**
702
- * @type {Array<any>}
703
- * @ignore
704
- * */
705
- preNavigationHooks: Array<any>;
706
- /**
707
- * @type {Array<any>}
708
- * @ignore
709
- * */
710
- postNavigationHooks: Array<any>;
711
- persistCookiesPerSession: boolean;
712
- /**
713
- * **EXPERIMENTAL**
714
- * Function for attaching CrawlerExtensions such as the Unblockers.
715
- * @param {CrawlerExtension} extension - Crawler extension that overrides the crawler configuration.
716
- */
717
- use(extension: CrawlerExtension): void;
718
- /**
719
- * @param {CrawlingContext} crawlingContext
720
- * @ignore
721
- * @protected
722
- * @internal
723
- */
724
- protected _handleNavigation(crawlingContext: CrawlingContext): Promise<void>;
725
- /**
726
- * When users change `request.headers.cookie` inside preNavigationHook, the change would be ignored,
727
- * as `request.headers` are already merged into the `requestAsBrowserOptions`. This method is using
728
- * old `request.headers` snapshot (before hooks are executed), makes a diff with the cookie value
729
- * after hooks are executed, and merges any new cookies back to `requestAsBrowserOptions`.
730
- *
731
- * This way we can still use both `requestAsBrowserOptions` and `context.request` in the hooks (not both).
732
- *
733
- * @param {Request} request
734
- * @param {string} cookieSnapshot
735
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
736
- * @private
737
- * @ignore
738
- * @internal
739
- */
740
- private _mergeRequestCookieDiff;
741
- /**
742
- * Function to make the HTTP request. It performs optimizations
743
- * on the request such as only downloading the request body if the
744
- * received content type matches text/html, application/xml, application/xhtml+xml.
745
- *
746
- * @param {object} options
747
- * @param {Request} options.request
748
- * @param {Session} options.session
749
- * @param {string} options.proxyUrl
750
- * @param {RequestAsBrowserOptions} options.requestAsBrowserOptions
751
- * @returns {Promise<IncomingMessage|Readable>}
752
- * @ignore
753
- * @protected
754
- * @internal
755
- */
756
- protected _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }: {
757
- request: Request;
758
- session: Session;
759
- proxyUrl: string;
760
- requestAsBrowserOptions: RequestAsBrowserOptions;
761
- }): Promise<IncomingMessage | Readable>;
762
- /**
763
- * Sets the cookie header to `requestAsBrowserOptions` based on provided session and request. If some cookies were already set,
764
- * the session cookie will be merged with them. User provided cookies on `request` object have precedence.
765
- *
766
- * @param {CrawlingContext} crawlingContext
767
- * @param {RequestAsBrowserOptions} requestAsBrowserOptions
768
- * @return {void}
769
- * @ignore
770
- * @private
771
- * @internal
772
- */
773
- private _applySessionCookie;
774
- /**
775
- * Encodes and parses response according to the provided content type
776
- * @param {Request} request
777
- * @param {IncomingMessage|Readable} responseStream
778
- * @returns {Promise<object>}
779
- * @ignore
780
- * @protected
781
- * @internal
782
- */
783
- protected _parseResponse(request: Request, responseStream: IncomingMessage | Readable): Promise<object>;
784
- /**
785
- * Combines the provided `requestOptions` with mandatory (non-overridable) values.
786
- * @param {Request} request
787
- * @param {Session} [session]
788
- * @param {string} [proxyUrl]
789
- * @param {RequestAsBrowserOptions} [requestAsBrowserOptions]
790
- * @ignore
791
- * @protected
792
- * @internal
793
- */
794
- protected _getRequestOptions(request: Request, session?: Session | undefined, proxyUrl?: string | undefined, requestAsBrowserOptions?: RequestAsBrowserOptions | undefined): {
795
- headers: {
796
- [x: string]: string;
797
- };
798
- https: any;
799
- isStream: boolean;
800
- /**
801
- * URL of the target endpoint. Supports both HTTP and HTTPS schemes.
802
- */
803
- url: string;
804
- /**
805
- * HTTP method.
806
- */
807
- method: string;
808
- /**
809
- * An HTTP proxy to be passed down to the HTTP request. Supports proxy authentication with Basic Auth.
810
- */
811
- proxyUrl: string | undefined;
812
- /**
813
- * Configuration to be used for generating correct browser headers.
814
- * See the [`header-generator`](https://github.com/apify/header-generator) library.
815
- */
816
- headerGeneratorOptions?: object | undefined;
817
- /**
818
- * Two-letter ISO 639 language code.
819
- */
820
- languageCode?: string | undefined;
821
- /**
822
- * Two-letter ISO 3166 country code.
823
- */
824
- countryCode?: string | undefined;
825
- /**
826
- * If `true`, the function uses User-Agent of a mobile browser.
827
- */
828
- useMobileVersion?: boolean | undefined;
829
- /**
830
- * If set to true, SSL/TLS certificate errors will be ignored.
831
- */
832
- ignoreSslErrors?: boolean | undefined;
833
- /**
834
- * Node.js' HTTP parser is stricter than parsers used by web browsers, which prevents scraping of websites
835
- * whose servers do not comply with HTTP specs, either by accident or due to some anti-scraping protections,
836
- * causing e.g. the `invalid header value char` error. The `useInsecureHttpParser` option forces
837
- * the HTTP parser to ignore certain errors which lets you scrape such websites.
838
- * However, it will also open your application to some security vulnerabilities,
839
- * although the risk should be negligible as these vulnerabilities mainly relate to server applications, not clients.
840
- * Learn more in this [blog post](https://snyk.io/blog/node-js-release-fixes-a-critical-http-security-vulnerability/).
841
- */
842
- useInsecureHttpParser?: boolean | undefined;
843
- /**
844
- * Function accepts `response` object as a single parameter and should return `true` or `false`.
845
- * If function returns true, request gets aborted.
846
- */
847
- abortFunction?: import("../utils_request").AbortFunction | undefined;
848
- /**
849
- * If set to false, it will prevent use of HTTP2 requests. This is strongly discouraged. Websites
850
- * expect HTTP2 connections, because browsers use HTTP2 by default. It will automatically downgrade
851
- * to HTTP/1.1 for websites that do not support HTTP2.
852
- */
853
- useHttp2?: boolean | undefined;
854
- /**
855
- * A unique object used to generate browser headers. By default, new headers are generated on every call.
856
- * Set this option to make these headers persistent.
857
- */
858
- sessionToken: object | Session | undefined;
859
- timeout: {
860
- request: number;
861
- };
862
- };
863
- /**
864
- * @param {*} request
865
- * @param {*} response
866
- * @param {*} encoding
867
- * @ignore
868
- * @protected
869
- * @internal
870
- */
871
- protected _encodeResponse(request: any, response: any, encoding: any): {
872
- response: any;
873
- encoding: string;
874
- };
875
- /**
876
- * @param {*} response
877
- * @ignore
878
- * @protected
879
- * @internal
880
- */
881
- protected _parseHtmlToDom(response: any): Promise<any>;
882
- /**
883
- * Checks and extends supported mime types
884
- * @param {Array<(string|Object)>} additionalMimeTypes
885
- * @ignore
886
- * @protected
887
- * @internal
888
- */
889
- protected _extendSupportedMimeTypes(additionalMimeTypes: Array<(string | Object)>): void;
890
- /**
891
- * Handles blocked request
892
- * @param {Session} session
893
- * @param {number} statusCode
894
- * @ignore
895
- * @protected
896
- * @internal
897
- */
898
- protected _throwOnBlockedRequest(session: Session, statusCode: number): void;
899
- /**
900
- * Handles timeout request
901
- * @param {Session} session
902
- * @ignore
903
- * @protected
904
- * @internal
905
- */
906
- protected _handleRequestTimeout(session: Session): void;
907
- /**
908
- * @param {Request} request
909
- * @param {IncomingMessage|Readable} response
910
- * @private
911
- */
912
- private _abortDownloadOfBody;
913
- }
914
- import { RequestList } from "../request_list";
915
- import { RequestQueue } from "../storages/request_queue";
916
- import { ProxyConfiguration } from "../proxy_configuration";
917
- import { HandleFailedRequest } from "./basic_crawler";
918
- import { Hook } from "./browser_crawler";
919
- import { AutoscaledPoolOptions } from "../autoscaling/autoscaled_pool";
920
- import { SessionPoolOptions } from "../session_pool/session_pool";
921
- import Request from "../request";
922
- import { Session } from "../session_pool/session";
923
- import { ProxyInfo } from "../proxy_configuration";
924
- import { IncomingMessage } from "http";
925
- import { Readable } from "stream";
926
- import { CheerioAPI } from "cheerio/lib/load";
927
- import { BasicCrawler } from "./basic_crawler";
928
- import CrawlerExtension from "./crawler_extension";
929
- import { CrawlingContext } from "./basic_crawler";
930
- import { RequestAsBrowserOptions } from "../utils_request";
931
- //# sourceMappingURL=cheerio_crawler.d.ts.map