apify 2.3.1-beta.4 → 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/README.md +6 -5
  2. package/package.json +69 -128
  3. package/build/actor.d.ts +0 -113
  4. package/build/actor.d.ts.map +0 -1
  5. package/build/actor.js +0 -582
  6. package/build/actor.js.map +0 -1
  7. package/build/apify.d.ts +0 -752
  8. package/build/apify.d.ts.map +0 -1
  9. package/build/apify.js +0 -877
  10. package/build/apify.js.map +0 -1
  11. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  12. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  13. package/build/autoscaling/autoscaled_pool.js +0 -557
  14. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  15. package/build/autoscaling/snapshotter.d.ts +0 -278
  16. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  17. package/build/autoscaling/snapshotter.js +0 -447
  18. package/build/autoscaling/snapshotter.js.map +0 -1
  19. package/build/autoscaling/system_status.d.ts +0 -224
  20. package/build/autoscaling/system_status.d.ts.map +0 -1
  21. package/build/autoscaling/system_status.js +0 -228
  22. package/build/autoscaling/system_status.js.map +0 -1
  23. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  24. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  25. package/build/browser_launchers/browser_launcher.js +0 -160
  26. package/build/browser_launchers/browser_launcher.js.map +0 -1
  27. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  28. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  29. package/build/browser_launchers/browser_plugin.js +0 -25
  30. package/build/browser_launchers/browser_plugin.js.map +0 -1
  31. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  32. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  33. package/build/browser_launchers/playwright_launcher.js +0 -150
  34. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  35. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  36. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  37. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  38. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  39. package/build/cache_container.d.ts +0 -31
  40. package/build/cache_container.d.ts.map +0 -1
  41. package/build/cache_container.js +0 -48
  42. package/build/cache_container.js.map +0 -1
  43. package/build/configuration.d.ts +0 -226
  44. package/build/configuration.d.ts.map +0 -1
  45. package/build/configuration.js +0 -325
  46. package/build/configuration.js.map +0 -1
  47. package/build/constants.d.ts +0 -37
  48. package/build/constants.d.ts.map +0 -1
  49. package/build/constants.js +0 -41
  50. package/build/constants.js.map +0 -1
  51. package/build/crawlers/basic_crawler.d.ts +0 -443
  52. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  53. package/build/crawlers/basic_crawler.js +0 -664
  54. package/build/crawlers/basic_crawler.js.map +0 -1
  55. package/build/crawlers/browser_crawler.d.ts +0 -512
  56. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  57. package/build/crawlers/browser_crawler.js +0 -540
  58. package/build/crawlers/browser_crawler.js.map +0 -1
  59. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  60. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  61. package/build/crawlers/cheerio_crawler.js +0 -913
  62. package/build/crawlers/cheerio_crawler.js.map +0 -1
  63. package/build/crawlers/crawler_extension.d.ts +0 -10
  64. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  65. package/build/crawlers/crawler_extension.js +0 -19
  66. package/build/crawlers/crawler_extension.js.map +0 -1
  67. package/build/crawlers/crawler_utils.d.ts +0 -34
  68. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  69. package/build/crawlers/crawler_utils.js +0 -87
  70. package/build/crawlers/crawler_utils.js.map +0 -1
  71. package/build/crawlers/playwright_crawler.d.ts +0 -448
  72. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  73. package/build/crawlers/playwright_crawler.js +0 -299
  74. package/build/crawlers/playwright_crawler.js.map +0 -1
  75. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  76. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  77. package/build/crawlers/puppeteer_crawler.js +0 -299
  78. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  79. package/build/crawlers/statistics.d.ts +0 -185
  80. package/build/crawlers/statistics.d.ts.map +0 -1
  81. package/build/crawlers/statistics.js +0 -331
  82. package/build/crawlers/statistics.js.map +0 -1
  83. package/build/enqueue_links/click_elements.d.ts +0 -179
  84. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  85. package/build/enqueue_links/click_elements.js +0 -434
  86. package/build/enqueue_links/click_elements.js.map +0 -1
  87. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  88. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  89. package/build/enqueue_links/enqueue_links.js +0 -163
  90. package/build/enqueue_links/enqueue_links.js.map +0 -1
  91. package/build/enqueue_links/shared.d.ts +0 -42
  92. package/build/enqueue_links/shared.d.ts.map +0 -1
  93. package/build/enqueue_links/shared.js +0 -121
  94. package/build/enqueue_links/shared.js.map +0 -1
  95. package/build/errors.d.ts +0 -29
  96. package/build/errors.d.ts.map +0 -1
  97. package/build/errors.js +0 -38
  98. package/build/errors.js.map +0 -1
  99. package/build/events.d.ts +0 -11
  100. package/build/events.d.ts.map +0 -1
  101. package/build/events.js +0 -147
  102. package/build/events.js.map +0 -1
  103. package/build/index.d.ts +0 -4
  104. package/build/index.d.ts.map +0 -1
  105. package/build/index.js +0 -7
  106. package/build/index.js.map +0 -1
  107. package/build/main.d.ts +0 -179
  108. package/build/main.d.ts.map +0 -1
  109. package/build/main.js +0 -81
  110. package/build/main.js.map +0 -1
  111. package/build/playwright_utils.d.ts +0 -9
  112. package/build/playwright_utils.d.ts.map +0 -1
  113. package/build/playwright_utils.js +0 -90
  114. package/build/playwright_utils.js.map +0 -1
  115. package/build/proxy_configuration.d.ts +0 -411
  116. package/build/proxy_configuration.d.ts.map +0 -1
  117. package/build/proxy_configuration.js +0 -517
  118. package/build/proxy_configuration.js.map +0 -1
  119. package/build/pseudo_url.d.ts +0 -86
  120. package/build/pseudo_url.d.ts.map +0 -1
  121. package/build/pseudo_url.js +0 -153
  122. package/build/pseudo_url.js.map +0 -1
  123. package/build/puppeteer_request_interception.d.ts +0 -8
  124. package/build/puppeteer_request_interception.d.ts.map +0 -1
  125. package/build/puppeteer_request_interception.js +0 -235
  126. package/build/puppeteer_request_interception.js.map +0 -1
  127. package/build/puppeteer_utils.d.ts +0 -250
  128. package/build/puppeteer_utils.d.ts.map +0 -1
  129. package/build/puppeteer_utils.js +0 -551
  130. package/build/puppeteer_utils.js.map +0 -1
  131. package/build/request.d.ts +0 -180
  132. package/build/request.d.ts.map +0 -1
  133. package/build/request.js +0 -261
  134. package/build/request.js.map +0 -1
  135. package/build/request_list.d.ts +0 -581
  136. package/build/request_list.d.ts.map +0 -1
  137. package/build/request_list.js +0 -826
  138. package/build/request_list.js.map +0 -1
  139. package/build/serialization.d.ts +0 -5
  140. package/build/serialization.d.ts.map +0 -1
  141. package/build/serialization.js +0 -139
  142. package/build/serialization.js.map +0 -1
  143. package/build/session_pool/errors.d.ts +0 -11
  144. package/build/session_pool/errors.d.ts.map +0 -1
  145. package/build/session_pool/errors.js +0 -18
  146. package/build/session_pool/errors.js.map +0 -1
  147. package/build/session_pool/events.d.ts +0 -5
  148. package/build/session_pool/events.d.ts.map +0 -1
  149. package/build/session_pool/events.js +0 -6
  150. package/build/session_pool/events.js.map +0 -1
  151. package/build/session_pool/session.d.ts +0 -286
  152. package/build/session_pool/session.d.ts.map +0 -1
  153. package/build/session_pool/session.js +0 -355
  154. package/build/session_pool/session.js.map +0 -1
  155. package/build/session_pool/session_pool.d.ts +0 -280
  156. package/build/session_pool/session_pool.d.ts.map +0 -1
  157. package/build/session_pool/session_pool.js +0 -393
  158. package/build/session_pool/session_pool.js.map +0 -1
  159. package/build/session_pool/session_utils.d.ts +0 -4
  160. package/build/session_pool/session_utils.d.ts.map +0 -1
  161. package/build/session_pool/session_utils.js +0 -24
  162. package/build/session_pool/session_utils.js.map +0 -1
  163. package/build/stealth/hiding_tricks.d.ts +0 -22
  164. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  165. package/build/stealth/hiding_tricks.js +0 -308
  166. package/build/stealth/hiding_tricks.js.map +0 -1
  167. package/build/stealth/stealth.d.ts +0 -56
  168. package/build/stealth/stealth.d.ts.map +0 -1
  169. package/build/stealth/stealth.js +0 -125
  170. package/build/stealth/stealth.js.map +0 -1
  171. package/build/storages/dataset.d.ts +0 -288
  172. package/build/storages/dataset.d.ts.map +0 -1
  173. package/build/storages/dataset.js +0 -480
  174. package/build/storages/dataset.js.map +0 -1
  175. package/build/storages/key_value_store.d.ts +0 -243
  176. package/build/storages/key_value_store.d.ts.map +0 -1
  177. package/build/storages/key_value_store.js +0 -462
  178. package/build/storages/key_value_store.js.map +0 -1
  179. package/build/storages/request_queue.d.ts +0 -318
  180. package/build/storages/request_queue.d.ts.map +0 -1
  181. package/build/storages/request_queue.js +0 -636
  182. package/build/storages/request_queue.js.map +0 -1
  183. package/build/storages/storage_manager.d.ts +0 -87
  184. package/build/storages/storage_manager.d.ts.map +0 -1
  185. package/build/storages/storage_manager.js +0 -150
  186. package/build/storages/storage_manager.js.map +0 -1
  187. package/build/tsconfig.tsbuildinfo +0 -1
  188. package/build/typedefs.d.ts +0 -146
  189. package/build/typedefs.d.ts.map +0 -1
  190. package/build/typedefs.js +0 -88
  191. package/build/typedefs.js.map +0 -1
  192. package/build/utils.d.ts +0 -175
  193. package/build/utils.d.ts.map +0 -1
  194. package/build/utils.js +0 -731
  195. package/build/utils.js.map +0 -1
  196. package/build/utils_log.d.ts +0 -41
  197. package/build/utils_log.d.ts.map +0 -1
  198. package/build/utils_log.js +0 -192
  199. package/build/utils_log.js.map +0 -1
  200. package/build/utils_request.d.ts +0 -77
  201. package/build/utils_request.d.ts.map +0 -1
  202. package/build/utils_request.js +0 -385
  203. package/build/utils_request.js.map +0 -1
  204. package/build/utils_social.d.ts +0 -210
  205. package/build/utils_social.d.ts.map +0 -1
  206. package/build/utils_social.js +0 -787
  207. package/build/utils_social.js.map +0 -1
  208. package/build/validators.d.ts +0 -23
  209. package/build/validators.d.ts.map +0 -1
  210. package/build/validators.js +0 -29
  211. package/build/validators.js.map +0 -1
package/build/crawlers/basic_crawler.js
@@ -1,664 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.BasicCrawler = void 0;
- const tslib_1 = require("tslib");
- const consts_1 = require("@apify/consts");
- const utilities_1 = require("@apify/utilities");
- const ow_1 = (0, tslib_1.__importStar)(require("ow"));
- const underscore_1 = (0, tslib_1.__importDefault)(require("underscore"));
- const timeout_1 = require("@apify/timeout");
- const autoscaled_pool_1 = (0, tslib_1.__importDefault)(require("../autoscaling/autoscaled_pool")); // eslint-disable-line import/no-duplicates
- const events_1 = (0, tslib_1.__importDefault)(require("../events"));
- const session_pool_1 = require("../session_pool/session_pool"); // eslint-disable-line import/no-duplicates
- const statistics_1 = (0, tslib_1.__importDefault)(require("./statistics"));
- const utils_log_1 = (0, tslib_1.__importDefault)(require("../utils_log")); // eslint-disable-line import/no-duplicates
- const validators_1 = require("../validators");
- /* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
- /**
-  * @typedef {object} CrawlingContext
-  * @property {string} id
-  * @property {Request} request
-  * @property {Session} session
-  * @property {ProxyInfo} proxyInfo
-  * @property {*} response
-  */
- /**
-  * Since there's no set number of seconds before the container is terminated after
-  * a migration event, we need some reasonable number to use for RequestList persistence.
-  * Once a migration event is received, the Crawler will be paused and it will wait for
-  * this long before persisting the RequestList state. This should allow most healthy
-  * requests to finish and be marked as handled, thus lowering the amount of duplicate
-  * results after migration.
-  *
-  * @type {number}
-  * @ignore
-  */
- const SAFE_MIGRATION_WAIT_MILLIS = 20000;
- /**
-  * @typedef BasicCrawlerOptions
-  * @property {HandleRequest} handleRequestFunction
-  *   User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
-  *
-  *   The function receives the following object as an argument:
-  * ```
-  * {
-  *     request: Request,
-  *     session: Session,
-  *     crawler: BasicCrawler,
-  * }
-  * ```
-  *   where the {@link Request} instance represents the URL to crawl.
-  *
-  *   The function must return a promise, which is then awaited by the crawler.
-  *
-  *   If the function throws an exception, the crawler will try to re-crawl the
-  *   request later, up to `option.maxRequestRetries` times.
-  *   If all the retries fail, the crawler calls the function
-  *   provided to the `handleFailedRequestFunction` parameter.
-  *   To make this work, you should **always**
-  *   let your function throw exceptions rather than catch them.
-  *   The exceptions are logged to the request using the
-  *   {@link Request#pushErrorMessage} function.
-  * @property {RequestList} [requestList]
-  *   Static list of URLs to be processed.
-  *   Either `requestList` or `requestQueue` option must be provided (or both).
-  * @property {RequestQueue} [requestQueue]
-  *   Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-  *   Either `requestList` or `requestQueue` option must be provided (or both).
-  * @property {number} [handleRequestTimeoutSecs=60]
-  *   Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.
-  * @property {HandleFailedRequest} [handleFailedRequestFunction]
-  *   A function to handle requests that failed more than `option.maxRequestRetries` times.
-  *
-  *   The function receives the following object as an argument:
-  * ```
-  * {
-  *     request: Request,
-  *     error: Error,
-  *     session: Session,
-  *     crawler: BasicCrawler,
-  * }
-  * ```
-  *   where the {@link Request} instance corresponds to the failed request, and the `Error` instance
-  *   represents the last error thrown during processing of the request.
-  *
-  *   See
-  *   [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/basic_crawler.js#L11)
-  *   for the default implementation of this function.
-  * @property {number} [maxRequestRetries=3]
-  *   Indicates how many times the request is retried if {@link BasicCrawlerOptions.handleRequestFunction} fails.
-  * @property {number} [maxRequestsPerCrawl]
-  *   Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
-  *   Always set this value in order to prevent infinite loops in misconfigured crawlers.
-  *   Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
-  * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
-  *   Custom options passed to the underlying {@link AutoscaledPool} constructor.
-  *   Note that the `runTaskFunction` and `isTaskReadyFunction` options
-  *   are provided by `BasicCrawler` and cannot be overridden.
-  *   However, you can provide a custom implementation of `isFinishedFunction`.
-  * @property {number} [minConcurrency=1]
-  *   Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-  *
-  *   *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
-  *   If you're not sure, just keep the default value and the concurrency will scale up automatically.
-  * @property {number} [maxConcurrency=1000]
-  *   Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-  * @property {boolean} [useSessionPool=true]
-  *   Basic crawler will initialize the {@link SessionPool} with the corresponding `sessionPoolOptions`.
-  *   The session instance will be than available in the `handleRequestFunction`.
-  * @property {SessionPoolOptions} [sessionPoolOptions] The configuration options for {@link SessionPool} to use.
-  */
- /**
-  * Provides a simple framework for parallel crawling of web pages.
-  * The URLs to crawl are fed either from a static list of URLs
-  * or from a dynamic queue of URLs enabling recursive crawling of websites.
-  *
-  * `BasicCrawler` is a low-level tool that requires the user to implement the page
-  * download and data extraction functionality themselves.
-  * If you want a crawler that already facilitates this functionality,
-  * please consider using {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
-  *
-  * `BasicCrawler` invokes the user-provided {@link BasicCrawlerOptions.handleRequestFunction}
-  * for each {@link Request} object, which represents a single URL to crawl.
-  * The {@link Request} objects are fed from the {@link RequestList} or the {@link RequestQueue}
-  * instances provided by the {@link BasicCrawlerOptions.requestList} or {@link BasicCrawlerOptions.requestQueue}
-  * constructor options, respectively.
-  *
-  * If both {@link BasicCrawlerOptions.requestList} and {@link BasicCrawlerOptions.requestQueue} options are used,
-  * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
-  * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
-  *
-  * The crawler finishes if there are no more {@link Request} objects to crawl.
-  *
-  * New requests are only dispatched when there is enough free CPU and memory available,
-  * using the functionality provided by the {@link AutoscaledPool} class.
-  * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
-  * parameter of the `BasicCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
-  * {@link AutoscaledPool} options are available directly in the `BasicCrawler` constructor.
-  *
-  * **Example usage:**
-  *
-  * ```javascript
-  * // Prepare a list of URLs to crawl
-  * const requestList = new Apify.RequestList({
-  *     sources: [
-  *         { url: 'http://www.example.com/page-1' },
-  *         { url: 'http://www.example.com/page-2' },
-  *     ],
-  * });
-  * await requestList.initialize();
-  *
-  * // Crawl the URLs
-  * const crawler = new Apify.BasicCrawler({
-  *     requestList,
-  *     handleRequestFunction: async ({ request }) => {
-  *         // 'request' contains an instance of the Request class
-  *         // Here we simply fetch the HTML of the page and store it to a dataset
-  *         const { body } = await Apify.utils.requestAsBrowser(request);
-  *         await Apify.pushData({
-  *             url: request.url,
-  *             html: body,
-  *         })
-  *     },
-  * });
-  *
-  * await crawler.run();
-  * ```
-  * @property {Statistics} stats
-  *   Contains statistics about the current run.
-  * @property {RequestList} [requestList]
-  *   A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
-  *   Only available if used by the crawler.
-  * @property {RequestQueue} [requestQueue]
-  *   A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
-  *   Only available if used by the crawler.
-  * @property {SessionPool} [sessionPool]
-  *   A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
-  *   Only available if used by the crawler.
-  * @property {AutoscaledPool} autoscaledPool
-  *   A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
-  *   Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
-  *   You can use it to change the concurrency settings on the fly,
-  *   to pause the crawler by calling {@link AutoscaledPool#pause}
-  *   or to abort it by calling {@link AutoscaledPool#abort}.
-  */
- class BasicCrawler {
-     /**
-      * @param {BasicCrawlerOptions} options
-      *   All `BasicCrawler` parameters are passed via an options object.
-      */
-     constructor(options) {
-         (0, ow_1.default)(options, 'BasicCrawlerOptions', ow_1.default.object.exactShape(BasicCrawler.optionsShape));
-         const { requestList, requestQueue, handleRequestFunction, handleRequestTimeoutSecs = 60, handleFailedRequestFunction, maxRequestRetries = 3, maxRequestsPerCrawl, autoscaledPoolOptions = {}, sessionPoolOptions = {}, useSessionPool = true,
-         // AutoscaledPool shorthands
-         minConcurrency, maxConcurrency,
-         // internal
-         log = utils_log_1.default.child({ prefix: this.constructor.name }), } = options;
-         if (!requestList && !requestQueue) {
-             const msg = 'At least one of the parameters "options.requestList" and "options.requestQueue" must be provided!';
-             throw new ow_1.ArgumentError(msg, this.constructor);
-         }
-         // assigning {} to the options as default break proper typing
-         /** @type {Log} */
-         this.log = log;
-         this.requestList = requestList;
-         this.requestQueue = requestQueue;
-         this.userProvidedHandler = handleRequestFunction;
-         this.failedContextHandler = handleFailedRequestFunction;
-         this.handleRequestTimeoutMillis = handleRequestTimeoutSecs * 1000;
-         const tryEnv = (val) => (val == null ? val : +val);
-         // allow at least 5min for internal timeouts
-         this.internalTimeoutMillis = tryEnv(process.env.APIFY_INTERNAL_TIMEOUT) ?? Math.max(this.handleRequestTimeoutMillis * 2, 300e3);
-         // override the default internal timeout of request queue to respect `handleRequestTimeoutMillis`
-         if (this.requestQueue) {
-             this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
-         }
-         this.handleFailedRequestFunction = handleFailedRequestFunction;
-         this.maxRequestRetries = maxRequestRetries;
-         this.handledRequestsCount = 0;
-         this.stats = new statistics_1.default({ logMessage: `${log.getOptions().prefix} request statistics:` });
-         /** @type {SessionPoolOptions} */
-         this.sessionPoolOptions = {
-             ...sessionPoolOptions,
-             log,
-         };
-         this.useSessionPool = useSessionPool;
-         this.crawlingContexts = new Map();
-         const maxSignedInteger = 2 ** 31 - 1;
-         if (this.handleRequestTimeoutMillis > maxSignedInteger) {
-             log.warning(`handleRequestTimeoutMillis ${this.handleRequestTimeoutMillis}`
-                 + `does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
-             this.handleRequestTimeoutMillis = maxSignedInteger;
-         }
-         let shouldLogMaxPagesExceeded = true;
-         const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
-         const { isFinishedFunction } = autoscaledPoolOptions;
-         const basicCrawlerAutoscaledPoolConfiguration = {
-             minConcurrency,
-             maxConcurrency,
-             runTaskFunction: this._runTaskFunction.bind(this),
-             isTaskReadyFunction: async () => {
-                 if (isMaxPagesExceeded()) {
-                     if (shouldLogMaxPagesExceeded) {
-                         log.info('Crawler reached the maxRequestsPerCrawl limit of '
-                             + `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-                         shouldLogMaxPagesExceeded = false;
-                     }
-                     return false;
-                 }
-                 return this._isTaskReadyFunction();
-             },
-             isFinishedFunction: async () => {
-                 if (isMaxPagesExceeded()) {
-                     log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests `
-                         + 'and all requests that were in progress at that time have now finished. '
-                         + `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
-                     return true;
-                 }
-                 const isFinished = isFinishedFunction
-                     ? await isFinishedFunction()
-                     : await this._defaultIsFinishedFunction();
-                 if (isFinished) {
-                     const reason = isFinishedFunction
-                         ? 'Crawler\'s custom isFinishedFunction() returned true, the crawler will shut down.'
-                         : 'All the requests from request list and/or request queue have been processed, the crawler will shut down.';
-                     log.info(reason);
-                 }
-                 return isFinished;
-             },
-             log,
-         };
-         this.autoscaledPoolOptions = underscore_1.default.defaults({}, basicCrawlerAutoscaledPoolConfiguration, autoscaledPoolOptions);
-         this.isRunningPromise = null;
-         // Attach a listener to handle migration and aborting events gracefully.
-         events_1.default.on(consts_1.ACTOR_EVENT_NAMES.MIGRATING, this._pauseOnMigration.bind(this));
-         events_1.default.on(consts_1.ACTOR_EVENT_NAMES.ABORTING, this._pauseOnMigration.bind(this));
-     }
-     /**
-      * Runs the crawler. Returns a promise that gets resolved once all the requests are processed.
-      *
-      * @return {Promise<void>}
-      */
-     async run() {
-         if (this.isRunningPromise)
-             return this.isRunningPromise;
-         await this._init();
-         this.isRunningPromise = this.autoscaledPool.run();
-         await this.stats.startCapturing();
-         try {
-             await this.isRunningPromise;
-         }
-         finally {
-             await this.teardown();
-             await this.stats.stopCapturing();
-             const finalStats = this.stats.calculate();
-             const { requestsFailed, requestsFinished } = this.stats.state;
-             this.log.info('Final request statistics:', {
-                 requestsFinished,
-                 requestsFailed,
-                 retryHistogram: this.stats.requestRetryHistogram,
-                 ...finalStats,
-             });
-         }
-     }
-     /**
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _init() {
-         // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
-         // so that the caller can get a reference to it before awaiting the promise returned from run()
-         // (otherwise there would be no way)
-         this.autoscaledPool = new autoscaled_pool_1.default(this.autoscaledPoolOptions);
-         if (this.useSessionPool) {
-             this.sessionPool = await (0, session_pool_1.openSessionPool)(this.sessionPoolOptions);
-             // Assuming there are not more than 20 browsers running at once;
-             this.sessionPool.setMaxListeners(20);
-         }
-         await this._loadHandledRequestCount();
-     }
-     /**
-      * @param {CrawlingContext} crawlingContext
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _handleRequestFunction(crawlingContext) {
-         await this.userProvidedHandler(crawlingContext);
-     }
-     /**
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _pauseOnMigration() {
-         if (this.autoscaledPool) {
-             // if run wasn't called, this is going to crash
-             await this.autoscaledPool.pause(SAFE_MIGRATION_WAIT_MILLIS)
-                 .catch((err) => {
-                     if (err.message.includes('running tasks did not finish')) {
-                         this.log.error('The crawler was paused due to migration to another host, '
-                             + 'but some requests did not finish in time. Those requests\' results may be duplicated.');
-                     }
-                     else {
-                         throw err;
-                     }
-                 });
-         }
-         const requestListPersistPromise = (async () => {
-             if (this.requestList) {
-                 if (await this.requestList.isFinished())
-                     return;
-                 await this.requestList.persistState()
-                     .catch((err) => {
-                         if (err.message.includes('Cannot persist state.')) {
-                             this.log.error('The crawler attempted to persist its request list\'s state and failed due to missing or '
-                                 + 'invalid config. Make sure to use either Apify.openRequestList() or the "stateKeyPrefix" option of RequestList '
-                                 + 'constructor to ensure your crawling state is persisted through host migrations and restarts.');
-                         }
-                         else {
-                             this.log.exception(err, 'An unexpected error occured when the crawler '
-                                 + 'attempted to persist its request list\'s state.');
-                         }
-                     });
-             }
-         })();
-         await Promise.all([
-             requestListPersistPromise,
-             this.stats.persistState(),
-         ]);
-     }
-     /**
-      * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
-      * and RequestQueue is present then enqueues it to the queue first.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _fetchNextRequest() {
-         if (!this.requestList)
-             return this.requestQueue.fetchNextRequest();
-         const request = await this.requestList.fetchNextRequest();
-         if (!this.requestQueue)
-             return request;
-         if (!request)
-             return this.requestQueue.fetchNextRequest();
-         try {
-             await this.requestQueue.addRequest(request, { forefront: true });
-         }
-         catch (err) {
-             // If requestQueue.addRequest() fails here then we must reclaim it back to
-             // the RequestList because probably it's not yet in the queue!
-             this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
-             await this.requestList.reclaimRequest(request);
-             return null;
-         }
-         await this.requestList.markRequestHandled(request);
-         return this.requestQueue.fetchNextRequest();
-     }
-     /**
-      * Wrapper around handleRequestFunction that fetches requests from RequestList/RequestQueue
-      * then retries them in a case of an error, etc.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _runTaskFunction() {
-         const source = this.requestQueue || this.requestList;
-         let request;
-         let session;
-         await this._timeoutAndRetry(async () => {
-             request = await this._fetchNextRequest();
-         }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-         (0, timeout_1.tryCancel)();
-         if (this.useSessionPool) {
-             await this._timeoutAndRetry(async () => {
-                 session = await this.sessionPool.getSession();
-             }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-         }
-         (0, timeout_1.tryCancel)();
-         if (!request)
-             return;
-         // Reset loadedUrl so an old one is not carried over to retries.
-         request.loadedUrl = undefined;
-         const statisticsId = request.id || request.uniqueKey;
-         this.stats.startJob(statisticsId);
-         // Shared crawling context
-         const crawlingContext = {
-             id: (0, utilities_1.cryptoRandomObjectId)(10),
-             crawler: this,
-             request,
-             session,
-         };
-         this.crawlingContexts.set(crawlingContext.id, crawlingContext);
-         try {
-             await (0, timeout_1.addTimeoutToPromise)(() => this._handleRequestFunction(crawlingContext), this.handleRequestTimeoutMillis, `handleRequestFunction timed out after ${this.handleRequestTimeoutMillis / 1000} seconds (${request.id}).`);
-             await this._timeoutAndRetry(() => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-             this.stats.finishJob(statisticsId);
-             this.handledRequestsCount++;
-             // reclaim session if request finishes successfully
-             if (session)
-                 session.markGood();
-         }
-         catch (err) {
-             try {
-                 await this._timeoutAndRetry(() => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-             }
-             catch (secondaryError) {
-                 this.log.exception(secondaryError, 'runTaskFunction error handler threw an exception. '
-                     + 'This places the crawler and its underlying storages into an unknown state and crawling will be terminated. '
-                     + 'This may have happened due to an internal error of Apify\'s API or due to a misconfigured crawler. '
-                     + 'If you are sure that there is no error in your code, selecting "Restart on error" in the actor\'s settings'
-                     + 'will make sure that the run continues where it left off, if programmed to handle restarts correctly.');
-                 throw secondaryError;
-             }
-         }
-         finally {
-             this.crawlingContexts.delete(crawlingContext.id);
-         }
-     }
-     /**
-      * Run async callback with given timeout and retry.
-      * @ignore
-      */
-     async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
-         try {
-             await (0, timeout_1.addTimeoutToPromise)(handler, timeout, error);
-         }
-         catch (e) {
-             if (retried <= maxRetries) { // we retry on any error, not just timeout
-                 this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-                 return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
-             }
-             throw e;
-         }
-     }
-     /**
-      * Returns true if either RequestList or RequestQueue have a request ready for processing.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _isTaskReadyFunction() {
-         // First check RequestList, since it's only in memory.
-         const isRequestListEmpty = this.requestList ? (await this.requestList.isEmpty()) : true;
-         // If RequestList is not empty, task is ready, no reason to check RequestQueue.
-         if (!isRequestListEmpty)
-             return true;
-         // If RequestQueue is not empty, task is ready, return true, otherwise false.
-         return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
-     }
-     /**
-      * Returns true if both RequestList and RequestQueue have all requests finished.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _defaultIsFinishedFunction() {
-         const [isRequestListFinished, isRequestQueueFinished,] = await Promise.all([
-             this.requestList ? this.requestList.isFinished() : true,
-             this.requestQueue ? this.requestQueue.isFinished() : true,
-         ]);
-         // If both are finished, return true, otherwise return false.
-         return isRequestListFinished && isRequestQueueFinished;
-     }
-     /**
-      * Handles errors thrown by user provided handleRequestFunction()
-      * @param {Error} error
-      * @param {object} crawlingContext
-      * @param {Request} crawlingContext.request
-      * @param {(RequestList|RequestQueue)} source
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _requestFunctionErrorHandler(error, crawlingContext, source) {
-         const { request } = crawlingContext;
-         request.pushErrorMessage(error);
-         const shouldRetryRequest = !request.noRetry && request.retryCount < this.maxRequestRetries;
-         if (shouldRetryRequest) {
-             request.retryCount++;
-             this.log.exception(error, 'handleRequestFunction failed, reclaiming failed request back to the list or queue', underscore_1.default.pick(request, 'url', 'retryCount', 'id'));
-             await source.reclaimRequest(request);
-         }
-         else {
-             // If we get here, the request is either not retryable
-             // or failed more than retryCount times and will not be retried anymore.
-             // Mark the request as failed and do not retry.
-             this.handledRequestsCount++;
-             await source.markRequestHandled(request);
-             this.stats.failJob(request.id || request.url);
-             crawlingContext.error = error;
-             await this._handleFailedRequestFunction(crawlingContext); // This function prints an error message.
-         }
-     }
-     /**
-      * @param {object} crawlingContext
-      * @param {Error} crawlingContext.error
-      * @param {Request} crawlingContext.request
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _handleFailedRequestFunction(crawlingContext) {
-         if (this.failedContextHandler) {
-             await this.failedContextHandler(crawlingContext);
-         }
-         else {
-             const { id, url, method, uniqueKey } = crawlingContext.request;
-             this.log.exception(crawlingContext.error, 'Request failed and reached maximum retries', { id, url, method, uniqueKey });
-         }
-     }
-     /**
-      * Updates handledRequestsCount from possibly stored counts,
-      * usually after worker migration. Since one of the stores
-      * needs to have priority when both are present,
-      * it is the request queue, because generally, the request
-      * list will first be dumped into the queue and then left
-      * empty.
-      *
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _loadHandledRequestCount() {
-         if (this.requestQueue) {
-             this.handledRequestsCount = await this.requestQueue.handledCount();
-         }
-         else if (this.requestList) {
-             this.handledRequestsCount = this.requestList.handledCount();
-         }
-     }
-     /**
-      * @param {Array<any>} hooks
-      * @param {*} args
-      * @ignore
-      * @protected
-      * @internal
-      */
-     async _executeHooks(hooks, ...args) {
-         if (Array.isArray(hooks) && hooks.length) {
-             for (const hook of hooks) {
-                 await hook(...args);
-             }
-         }
-     }
-     /**
-      * Function for cleaning up after all request are processed.
-      * @ignore
-      */
-     async teardown() {
-         if (this.useSessionPool) {
-             await this.sessionPool.teardown();
-         }
-     }
- }
- exports.BasicCrawler = BasicCrawler;
- /**
-  * @internal
-  * @type any
-  */
- Object.defineProperty(BasicCrawler, "optionsShape", {
-     enumerable: true,
-     configurable: true,
-     writable: true,
-     value: {
-         requestList: ow_1.default.optional.object.validate(validators_1.validators.requestList),
-         requestQueue: ow_1.default.optional.object.validate(validators_1.validators.requestQueue),
-         // Subclasses override this function instead of passing it
-         // in constructor, so this validation needs to apply only
-         // if the user creates an instance of BasicCrawler directly.
-         handleRequestFunction: ow_1.default.function,
-         handleRequestTimeoutSecs: ow_1.default.optional.number,
-         handleFailedRequestFunction: ow_1.default.optional.function,
-         maxRequestRetries: ow_1.default.optional.number,
-         maxRequestsPerCrawl: ow_1.default.optional.number,
-         autoscaledPoolOptions: ow_1.default.optional.object,
-         sessionPoolOptions: ow_1.default.optional.object,
-         useSessionPool: ow_1.default.optional.boolean,
-         // AutoscaledPool shorthands
-         minConcurrency: ow_1.default.optional.number,
-         maxConcurrency: ow_1.default.optional.number,
-         // internal
-         log: ow_1.default.optional.object,
-     }
- });
- /**
-  * @callback HandleRequest
-  * @param {HandleRequestInputs} inputs Arguments passed to this callback.
-  * @returns {Promise<void>}
-  */
- /**
-  * @typedef HandleRequestInputs
-  * @property {Request} request The original {Request} object.
-  *   A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
-  *   Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
-  *   You can use it to change the concurrency settings on the fly,
-  *   to pause the crawler by calling {@link AutoscaledPool#pause}
-  *   or to abort it by calling {@link AutoscaledPool#abort}.
-  * @property {Session} [session]
-  * @property {BasicCrawler} [crawler]
-  */
- /**
-  * @callback HandleFailedRequest
-  * @param {HandleFailedRequestInput} inputs Arguments passed to this callback.
-  * @returns {Promise<void>}
-  */
- /**
-  * @typedef HandleFailedRequestInput
-  * @property {Error} error The Error thrown by `handleRequestFunction`.
-  * @property {Request} request The original {Request} object.
-  * @property {Session} session
-  * @property {ProxyInfo} proxyInfo
-  */
- //# sourceMappingURL=basic_crawler.js.map
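
For readers mapping this removal to the new major version: in the 3.x line the crawler classes (including `BasicCrawler`) moved out of the `apify` package into the separate `crawlee` package, and the `apify` package was reduced to the Actor API, which is why the compiled `package/build/*` files above disappear entirely from `3.0.0-alpha.0`. Below is a minimal sketch, not taken from this diff, of the `BasicCrawler` example from the deleted JSDoc rewritten against the stable v3 `crawlee` and `apify` APIs; since `3.0.0-alpha.0` is an early prerelease, the option names it shipped with may differ from these final ones.

```javascript
// v3 sketch: crawlers live in `crawlee`, the Actor lifecycle in `apify`.
import { BasicCrawler } from 'crawlee';
import { Actor } from 'apify';
import { gotScraping } from 'got-scraping';

await Actor.init();

const crawler = new BasicCrawler({
    // v3 name for `handleRequestFunction`.
    requestHandler: async ({ request }) => {
        // `Apify.utils.requestAsBrowser()` is gone in v3; `got-scraping`
        // is its documented successor for plain HTTP requests.
        const { body } = await gotScraping({ url: request.url });
        await Actor.pushData({ url: request.url, html: body });
    },
    // v3 name for `handleFailedRequestFunction`.
    failedRequestHandler: async ({ request }) => {
        console.error(`Request ${request.url} failed too many times.`);
    },
});

// No manual RequestList setup needed for a static list of sources:
// `run()` accepts the URLs directly.
await crawler.run([
    'http://www.example.com/page-1',
    'http://www.example.com/page-2',
]);

await Actor.exit();
```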