apify 2.3.1-beta.4 → 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/README.md +6 -5
  2. package/package.json +69 -128
  3. package/build/actor.d.ts +0 -113
  4. package/build/actor.d.ts.map +0 -1
  5. package/build/actor.js +0 -582
  6. package/build/actor.js.map +0 -1
  7. package/build/apify.d.ts +0 -752
  8. package/build/apify.d.ts.map +0 -1
  9. package/build/apify.js +0 -877
  10. package/build/apify.js.map +0 -1
  11. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  12. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  13. package/build/autoscaling/autoscaled_pool.js +0 -557
  14. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  15. package/build/autoscaling/snapshotter.d.ts +0 -278
  16. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  17. package/build/autoscaling/snapshotter.js +0 -447
  18. package/build/autoscaling/snapshotter.js.map +0 -1
  19. package/build/autoscaling/system_status.d.ts +0 -224
  20. package/build/autoscaling/system_status.d.ts.map +0 -1
  21. package/build/autoscaling/system_status.js +0 -228
  22. package/build/autoscaling/system_status.js.map +0 -1
  23. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  24. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  25. package/build/browser_launchers/browser_launcher.js +0 -160
  26. package/build/browser_launchers/browser_launcher.js.map +0 -1
  27. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  28. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  29. package/build/browser_launchers/browser_plugin.js +0 -25
  30. package/build/browser_launchers/browser_plugin.js.map +0 -1
  31. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  32. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  33. package/build/browser_launchers/playwright_launcher.js +0 -150
  34. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  35. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  36. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  37. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  38. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  39. package/build/cache_container.d.ts +0 -31
  40. package/build/cache_container.d.ts.map +0 -1
  41. package/build/cache_container.js +0 -48
  42. package/build/cache_container.js.map +0 -1
  43. package/build/configuration.d.ts +0 -226
  44. package/build/configuration.d.ts.map +0 -1
  45. package/build/configuration.js +0 -325
  46. package/build/configuration.js.map +0 -1
  47. package/build/constants.d.ts +0 -37
  48. package/build/constants.d.ts.map +0 -1
  49. package/build/constants.js +0 -41
  50. package/build/constants.js.map +0 -1
  51. package/build/crawlers/basic_crawler.d.ts +0 -443
  52. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  53. package/build/crawlers/basic_crawler.js +0 -664
  54. package/build/crawlers/basic_crawler.js.map +0 -1
  55. package/build/crawlers/browser_crawler.d.ts +0 -512
  56. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  57. package/build/crawlers/browser_crawler.js +0 -540
  58. package/build/crawlers/browser_crawler.js.map +0 -1
  59. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  60. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  61. package/build/crawlers/cheerio_crawler.js +0 -913
  62. package/build/crawlers/cheerio_crawler.js.map +0 -1
  63. package/build/crawlers/crawler_extension.d.ts +0 -10
  64. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  65. package/build/crawlers/crawler_extension.js +0 -19
  66. package/build/crawlers/crawler_extension.js.map +0 -1
  67. package/build/crawlers/crawler_utils.d.ts +0 -34
  68. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  69. package/build/crawlers/crawler_utils.js +0 -87
  70. package/build/crawlers/crawler_utils.js.map +0 -1
  71. package/build/crawlers/playwright_crawler.d.ts +0 -448
  72. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  73. package/build/crawlers/playwright_crawler.js +0 -299
  74. package/build/crawlers/playwright_crawler.js.map +0 -1
  75. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  76. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  77. package/build/crawlers/puppeteer_crawler.js +0 -299
  78. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  79. package/build/crawlers/statistics.d.ts +0 -185
  80. package/build/crawlers/statistics.d.ts.map +0 -1
  81. package/build/crawlers/statistics.js +0 -331
  82. package/build/crawlers/statistics.js.map +0 -1
  83. package/build/enqueue_links/click_elements.d.ts +0 -179
  84. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  85. package/build/enqueue_links/click_elements.js +0 -434
  86. package/build/enqueue_links/click_elements.js.map +0 -1
  87. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  88. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  89. package/build/enqueue_links/enqueue_links.js +0 -163
  90. package/build/enqueue_links/enqueue_links.js.map +0 -1
  91. package/build/enqueue_links/shared.d.ts +0 -42
  92. package/build/enqueue_links/shared.d.ts.map +0 -1
  93. package/build/enqueue_links/shared.js +0 -121
  94. package/build/enqueue_links/shared.js.map +0 -1
  95. package/build/errors.d.ts +0 -29
  96. package/build/errors.d.ts.map +0 -1
  97. package/build/errors.js +0 -38
  98. package/build/errors.js.map +0 -1
  99. package/build/events.d.ts +0 -11
  100. package/build/events.d.ts.map +0 -1
  101. package/build/events.js +0 -147
  102. package/build/events.js.map +0 -1
  103. package/build/index.d.ts +0 -4
  104. package/build/index.d.ts.map +0 -1
  105. package/build/index.js +0 -7
  106. package/build/index.js.map +0 -1
  107. package/build/main.d.ts +0 -179
  108. package/build/main.d.ts.map +0 -1
  109. package/build/main.js +0 -81
  110. package/build/main.js.map +0 -1
  111. package/build/playwright_utils.d.ts +0 -9
  112. package/build/playwright_utils.d.ts.map +0 -1
  113. package/build/playwright_utils.js +0 -90
  114. package/build/playwright_utils.js.map +0 -1
  115. package/build/proxy_configuration.d.ts +0 -411
  116. package/build/proxy_configuration.d.ts.map +0 -1
  117. package/build/proxy_configuration.js +0 -517
  118. package/build/proxy_configuration.js.map +0 -1
  119. package/build/pseudo_url.d.ts +0 -86
  120. package/build/pseudo_url.d.ts.map +0 -1
  121. package/build/pseudo_url.js +0 -153
  122. package/build/pseudo_url.js.map +0 -1
  123. package/build/puppeteer_request_interception.d.ts +0 -8
  124. package/build/puppeteer_request_interception.d.ts.map +0 -1
  125. package/build/puppeteer_request_interception.js +0 -235
  126. package/build/puppeteer_request_interception.js.map +0 -1
  127. package/build/puppeteer_utils.d.ts +0 -250
  128. package/build/puppeteer_utils.d.ts.map +0 -1
  129. package/build/puppeteer_utils.js +0 -551
  130. package/build/puppeteer_utils.js.map +0 -1
  131. package/build/request.d.ts +0 -180
  132. package/build/request.d.ts.map +0 -1
  133. package/build/request.js +0 -261
  134. package/build/request.js.map +0 -1
  135. package/build/request_list.d.ts +0 -581
  136. package/build/request_list.d.ts.map +0 -1
  137. package/build/request_list.js +0 -826
  138. package/build/request_list.js.map +0 -1
  139. package/build/serialization.d.ts +0 -5
  140. package/build/serialization.d.ts.map +0 -1
  141. package/build/serialization.js +0 -139
  142. package/build/serialization.js.map +0 -1
  143. package/build/session_pool/errors.d.ts +0 -11
  144. package/build/session_pool/errors.d.ts.map +0 -1
  145. package/build/session_pool/errors.js +0 -18
  146. package/build/session_pool/errors.js.map +0 -1
  147. package/build/session_pool/events.d.ts +0 -5
  148. package/build/session_pool/events.d.ts.map +0 -1
  149. package/build/session_pool/events.js +0 -6
  150. package/build/session_pool/events.js.map +0 -1
  151. package/build/session_pool/session.d.ts +0 -286
  152. package/build/session_pool/session.d.ts.map +0 -1
  153. package/build/session_pool/session.js +0 -355
  154. package/build/session_pool/session.js.map +0 -1
  155. package/build/session_pool/session_pool.d.ts +0 -280
  156. package/build/session_pool/session_pool.d.ts.map +0 -1
  157. package/build/session_pool/session_pool.js +0 -393
  158. package/build/session_pool/session_pool.js.map +0 -1
  159. package/build/session_pool/session_utils.d.ts +0 -4
  160. package/build/session_pool/session_utils.d.ts.map +0 -1
  161. package/build/session_pool/session_utils.js +0 -24
  162. package/build/session_pool/session_utils.js.map +0 -1
  163. package/build/stealth/hiding_tricks.d.ts +0 -22
  164. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  165. package/build/stealth/hiding_tricks.js +0 -308
  166. package/build/stealth/hiding_tricks.js.map +0 -1
  167. package/build/stealth/stealth.d.ts +0 -56
  168. package/build/stealth/stealth.d.ts.map +0 -1
  169. package/build/stealth/stealth.js +0 -125
  170. package/build/stealth/stealth.js.map +0 -1
  171. package/build/storages/dataset.d.ts +0 -288
  172. package/build/storages/dataset.d.ts.map +0 -1
  173. package/build/storages/dataset.js +0 -480
  174. package/build/storages/dataset.js.map +0 -1
  175. package/build/storages/key_value_store.d.ts +0 -243
  176. package/build/storages/key_value_store.d.ts.map +0 -1
  177. package/build/storages/key_value_store.js +0 -462
  178. package/build/storages/key_value_store.js.map +0 -1
  179. package/build/storages/request_queue.d.ts +0 -318
  180. package/build/storages/request_queue.d.ts.map +0 -1
  181. package/build/storages/request_queue.js +0 -636
  182. package/build/storages/request_queue.js.map +0 -1
  183. package/build/storages/storage_manager.d.ts +0 -87
  184. package/build/storages/storage_manager.d.ts.map +0 -1
  185. package/build/storages/storage_manager.js +0 -150
  186. package/build/storages/storage_manager.js.map +0 -1
  187. package/build/tsconfig.tsbuildinfo +0 -1
  188. package/build/typedefs.d.ts +0 -146
  189. package/build/typedefs.d.ts.map +0 -1
  190. package/build/typedefs.js +0 -88
  191. package/build/typedefs.js.map +0 -1
  192. package/build/utils.d.ts +0 -175
  193. package/build/utils.d.ts.map +0 -1
  194. package/build/utils.js +0 -731
  195. package/build/utils.js.map +0 -1
  196. package/build/utils_log.d.ts +0 -41
  197. package/build/utils_log.d.ts.map +0 -1
  198. package/build/utils_log.js +0 -192
  199. package/build/utils_log.js.map +0 -1
  200. package/build/utils_request.d.ts +0 -77
  201. package/build/utils_request.d.ts.map +0 -1
  202. package/build/utils_request.js +0 -385
  203. package/build/utils_request.js.map +0 -1
  204. package/build/utils_social.d.ts +0 -210
  205. package/build/utils_social.d.ts.map +0 -1
  206. package/build/utils_social.js +0 -787
  207. package/build/utils_social.js.map +0 -1
  208. package/build/validators.d.ts +0 -23
  209. package/build/validators.d.ts.map +0 -1
  210. package/build/validators.js +0 -29
  211. package/build/validators.js.map +0 -1
package/build/constants.d.ts
@@ -1,37 +0,0 @@
- /**
-  * The default user agent used by `Apify.launchPuppeteer`.
-  * Last updated on 2020-05-22.
-  */
- export const DEFAULT_USER_AGENT: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36";
- export namespace EXIT_CODES {
-     const SUCCESS: number;
-     const ERROR_USER_FUNCTION_THREW: number;
-     const ERROR_UNKNOWN: number;
- }
- /**
-  * These events are just internal for the Apify package, so we don't need them in the apify-shared package.
-  *
-  * @type {{CPU_INFO: string, SYSTEM_INFO: string, MIGRATING: string, PERSIST_STATE: string, ABORTING: string}}
-  */
- export const ACTOR_EVENT_NAMES_EX: {
-     CPU_INFO: string;
-     SYSTEM_INFO: string;
-     MIGRATING: string;
-     PERSIST_STATE: string;
-     ABORTING: string;
- };
- /**
-  * Base URL of Apify's API endpoints.
-  * @type {string}
-  */
- export const APIFY_API_BASE_URL: string;
- /**
-  * Additional number of seconds used in CheerioCrawler and BrowserCrawler to set a reasonable
-  * handleRequestTimeoutSecs for BasicCrawler that would not impair functionality (not timeout before crawlers).
-  *
-  * @type {number}
-  */
- export const BASIC_CRAWLER_TIMEOUT_BUFFER_SECS: number;
- export const COUNTRY_CODE_REGEX: RegExp;
- export const STATUS_CODES_BLOCKED: number[];
- //# sourceMappingURL=constants.d.ts.map
package/build/constants.d.ts.map
@@ -1 +0,0 @@
- {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../src/constants.js"],"names":[],"mappings":"AAEA;;;GAGG;AAEH,4JAA6J;;;;;;AAa7J;;;;GAIG;AACH,mCAFU;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,WAAW,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,MAAM,CAAA;CAAC,CAEjB;AAE5F;;;GAGG;AACH,iCAFU,MAAM,CAE6C;AAE7D;;;;;GAKG;AACH,gDAFU,MAAM,CAEoC;AAEpD,wCAA+C;AAE/C,4CAAoD"}
package/build/constants.js
@@ -1,41 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.STATUS_CODES_BLOCKED = exports.COUNTRY_CODE_REGEX = exports.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS = exports.APIFY_API_BASE_URL = exports.ACTOR_EVENT_NAMES_EX = exports.EXIT_CODES = exports.DEFAULT_USER_AGENT = void 0;
- const consts_1 = require("@apify/consts");
- /**
-  * The default user agent used by `Apify.launchPuppeteer`.
-  * Last updated on 2020-05-22.
-  */
- // eslint-disable-next-line max-len
- exports.DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36';
- /**
-  * Exit codes for the actor process.
-  * The error codes must be in the range 1-128, to avoid collision with signal exits
-  * and to ensure Docker will handle them correctly!
-  */
- exports.EXIT_CODES = {
-     SUCCESS: 0,
-     ERROR_USER_FUNCTION_THREW: 91,
-     ERROR_UNKNOWN: 92,
- };
- /**
-  * These events are just internal for the Apify package, so we don't need them in the apify-shared package.
-  *
-  * @type {{CPU_INFO: string, SYSTEM_INFO: string, MIGRATING: string, PERSIST_STATE: string, ABORTING: string}}
-  */
- exports.ACTOR_EVENT_NAMES_EX = { ...consts_1.ACTOR_EVENT_NAMES, PERSIST_STATE: 'persistState' };
- /**
-  * Base URL of Apify's API endpoints.
-  * @type {string}
-  */
- exports.APIFY_API_BASE_URL = 'https://api.apify.com/v2';
- /**
-  * Additional number of seconds used in CheerioCrawler and BrowserCrawler to set a reasonable
-  * handleRequestTimeoutSecs for BasicCrawler that would not impair functionality (not timeout before crawlers).
-  *
-  * @type {number}
-  */
- exports.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS = 10;
- exports.COUNTRY_CODE_REGEX = /^[A-Z]{2}$/;
- exports.STATUS_CODES_BLOCKED = [401, 403, 429];
- //# sourceMappingURL=constants.js.map
package/build/constants.js.map
@@ -1 +0,0 @@
- {"version":3,"file":"constants.js","sourceRoot":"","sources":["../src/constants.js"],"names":[],"mappings":";;;AAAA,0CAAkD;AAElD;;;GAGG;AACH,mCAAmC;AACtB,QAAA,kBAAkB,GAAG,0HAA0H,CAAC;AAE7J;;;;GAIG;AACU,QAAA,UAAU,GAAG;IACtB,OAAO,EAAE,CAAC;IACV,yBAAyB,EAAE,EAAE;IAC7B,aAAa,EAAE,EAAE;CACpB,CAAC;AAEF;;;;GAIG;AACU,QAAA,oBAAoB,GAAG,EAAE,GAAG,0BAAiB,EAAE,aAAa,EAAE,cAAc,EAAE,CAAC;AAE5F;;;GAGG;AACU,QAAA,kBAAkB,GAAG,0BAA0B,CAAC;AAE7D;;;;;GAKG;AACU,QAAA,iCAAiC,GAAG,EAAE,CAAC;AAEvC,QAAA,kBAAkB,GAAG,YAAY,CAAC;AAElC,QAAA,oBAAoB,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC"}
package/build/crawlers/basic_crawler.d.ts
@@ -1,443 +0,0 @@
- /**
-  * @typedef BasicCrawlerOptions
-  * @property {HandleRequest} handleRequestFunction
-  * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
-  *
-  * The function receives the following object as an argument:
-  * ```
-  * {
-  *     request: Request,
-  *     session: Session,
-  *     crawler: BasicCrawler,
-  * }
-  * ```
-  * where the {@link Request} instance represents the URL to crawl.
-  *
-  * The function must return a promise, which is then awaited by the crawler.
-  *
-  * If the function throws an exception, the crawler will try to re-crawl the
-  * request later, up to `option.maxRequestRetries` times.
-  * If all the retries fail, the crawler calls the function
-  * provided to the `handleFailedRequestFunction` parameter.
-  * To make this work, you should **always**
-  * let your function throw exceptions rather than catch them.
-  * The exceptions are logged to the request using the
-  * {@link Request#pushErrorMessage} function.
-  * @property {RequestList} [requestList]
-  * Static list of URLs to be processed.
-  * Either `requestList` or `requestQueue` option must be provided (or both).
-  * @property {RequestQueue} [requestQueue]
-  * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-  * Either `requestList` or `requestQueue` option must be provided (or both).
-  * @property {number} [handleRequestTimeoutSecs=60]
-  * Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.
-  * @property {HandleFailedRequest} [handleFailedRequestFunction]
-  * A function to handle requests that failed more than `option.maxRequestRetries` times.
-  *
-  * The function receives the following object as an argument:
-  * ```
-  * {
-  *     request: Request,
-  *     error: Error,
-  *     session: Session,
-  *     crawler: BasicCrawler,
-  * }
-  * ```
-  * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
-  * represents the last error thrown during processing of the request.
-  *
-  * See
-  * [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/basic_crawler.js#L11)
-  * for the default implementation of this function.
-  * @property {number} [maxRequestRetries=3]
-  * Indicates how many times the request is retried if {@link BasicCrawlerOptions.handleRequestFunction} fails.
-  * @property {number} [maxRequestsPerCrawl]
-  * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
-  * Always set this value in order to prevent infinite loops in misconfigured crawlers.
-  * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
-  * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
-  * Custom options passed to the underlying {@link AutoscaledPool} constructor.
-  * Note that the `runTaskFunction` and `isTaskReadyFunction` options
-  * are provided by `BasicCrawler` and cannot be overridden.
-  * However, you can provide a custom implementation of `isFinishedFunction`.
-  * @property {number} [minConcurrency=1]
-  * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-  *
-  * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
-  * If you're not sure, just keep the default value and the concurrency will scale up automatically.
-  * @property {number} [maxConcurrency=1000]
-  * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-  * @property {boolean} [useSessionPool=true]
-  * Basic crawler will initialize the {@link SessionPool} with the corresponding `sessionPoolOptions`.
-  * The session instance will then be available in the `handleRequestFunction`.
-  * @property {SessionPoolOptions} [sessionPoolOptions] The configuration options for {@link SessionPool} to use.
-  */
- /**
-  * Provides a simple framework for parallel crawling of web pages.
-  * The URLs to crawl are fed either from a static list of URLs
-  * or from a dynamic queue of URLs enabling recursive crawling of websites.
-  *
-  * `BasicCrawler` is a low-level tool that requires the user to implement the page
-  * download and data extraction functionality themselves.
-  * If you want a crawler that already facilitates this functionality,
-  * please consider using {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
-  *
-  * `BasicCrawler` invokes the user-provided {@link BasicCrawlerOptions.handleRequestFunction}
-  * for each {@link Request} object, which represents a single URL to crawl.
-  * The {@link Request} objects are fed from the {@link RequestList} or the {@link RequestQueue}
-  * instances provided by the {@link BasicCrawlerOptions.requestList} or {@link BasicCrawlerOptions.requestQueue}
-  * constructor options, respectively.
-  *
-  * If both {@link BasicCrawlerOptions.requestList} and {@link BasicCrawlerOptions.requestQueue} options are used,
-  * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
-  * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
-  *
-  * The crawler finishes if there are no more {@link Request} objects to crawl.
-  *
-  * New requests are only dispatched when there is enough free CPU and memory available,
-  * using the functionality provided by the {@link AutoscaledPool} class.
-  * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
-  * parameter of the `BasicCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
-  * {@link AutoscaledPool} options are available directly in the `BasicCrawler` constructor.
-  *
-  * **Example usage:**
-  *
-  * ```javascript
-  * // Prepare a list of URLs to crawl
-  * const requestList = new Apify.RequestList({
-  *     sources: [
-  *         { url: 'http://www.example.com/page-1' },
-  *         { url: 'http://www.example.com/page-2' },
-  *     ],
-  * });
-  * await requestList.initialize();
-  *
-  * // Crawl the URLs
-  * const crawler = new Apify.BasicCrawler({
-  *     requestList,
-  *     handleRequestFunction: async ({ request }) => {
-  *         // 'request' contains an instance of the Request class
-  *         // Here we simply fetch the HTML of the page and store it to a dataset
-  *         const { body } = await Apify.utils.requestAsBrowser(request);
-  *         await Apify.pushData({
-  *             url: request.url,
-  *             html: body,
-  *         })
-  *     },
-  * });
-  *
-  * await crawler.run();
-  * ```
-  * @property {Statistics} stats
-  * Contains statistics about the current run.
-  * @property {RequestList} [requestList]
-  * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
-  * Only available if used by the crawler.
-  * @property {RequestQueue} [requestQueue]
-  * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
-  * Only available if used by the crawler.
-  * @property {SessionPool} [sessionPool]
-  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
-  * Only available if used by the crawler.
-  * @property {AutoscaledPool} autoscaledPool
-  * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
-  * Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
-  * You can use it to change the concurrency settings on the fly,
-  * to pause the crawler by calling {@link AutoscaledPool#pause}
-  * or to abort it by calling {@link AutoscaledPool#abort}.
-  */
- export class BasicCrawler {
-     /**
-      * @internal
-      * @type any
-      */
-     static optionsShape: any;
-     /**
-      * @param {BasicCrawlerOptions} options
-      * All `BasicCrawler` parameters are passed via an options object.
-      */
-     constructor(options: BasicCrawlerOptions);
-     /** @type {Log} */
-     log: Log;
-     requestList: RequestList | undefined;
-     requestQueue: RequestQueue | undefined;
-     userProvidedHandler: HandleRequest;
-     failedContextHandler: HandleFailedRequest | undefined;
-     handleRequestTimeoutMillis: number;
-     internalTimeoutMillis: any;
-     handleFailedRequestFunction: HandleFailedRequest | undefined;
-     maxRequestRetries: number;
-     handledRequestsCount: number;
-     stats: Statistics;
-     /** @type {SessionPoolOptions} */
-     sessionPoolOptions: SessionPoolOptions;
-     useSessionPool: boolean;
-     crawlingContexts: Map<any, any>;
-     autoscaledPoolOptions: any;
-     isRunningPromise: Promise<void> | null;
-     /**
-      * Runs the crawler. Returns a promise that gets resolved once all the requests are processed.
-      *
-      * @return {Promise<void>}
-      */
-     run(): Promise<void>;
-     /**
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _init(): Promise<void>;
-     autoscaledPool: AutoscaledPool | undefined;
-     sessionPool: import("../session_pool/session_pool").SessionPool | undefined;
-     /**
-      * @param {CrawlingContext} crawlingContext
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _handleRequestFunction(crawlingContext: CrawlingContext): Promise<void>;
-     /**
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _pauseOnMigration(): Promise<void>;
-     /**
-      * Fetches a request from either RequestList or RequestQueue. If the request comes from a RequestList
-      * and a RequestQueue is present, it enqueues the request to the queue first.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _fetchNextRequest(): Promise<Request | null>;
-     /**
-      * Wrapper around handleRequestFunction that fetches requests from RequestList/RequestQueue
-      * and then retries them in case of an error, etc.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _runTaskFunction(): Promise<void>;
-     /**
-      * Runs an async callback with the given timeout and retry.
-      * @ignore
-      */
-     _timeoutAndRetry(handler: any, timeout: any, error: any, maxRetries?: number, retried?: number): any;
-     /**
-      * Returns true if either RequestList or RequestQueue has a request ready for processing.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _isTaskReadyFunction(): Promise<boolean>;
-     /**
-      * Returns true if both RequestList and RequestQueue have all requests finished.
-      *
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _defaultIsFinishedFunction(): Promise<boolean>;
-     /**
-      * Handles errors thrown by the user-provided handleRequestFunction().
-      * @param {Error} error
-      * @param {object} crawlingContext
-      * @param {Request} crawlingContext.request
-      * @param {(RequestList|RequestQueue)} source
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _requestFunctionErrorHandler(error: Error, crawlingContext: {
-         request: Request;
-     }, source: (RequestList | RequestQueue)): Promise<void>;
-     /**
-      * @param {object} crawlingContext
-      * @param {Error} crawlingContext.error
-      * @param {Request} crawlingContext.request
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _handleFailedRequestFunction(crawlingContext: {
-         error: Error;
-         request: Request;
-     }): Promise<void>;
-     /**
-      * Updates handledRequestsCount from possibly stored counts,
-      * usually after worker migration. Since one of the stores
-      * needs to have priority when both are present,
-      * it is the request queue, because generally, the request
-      * list will first be dumped into the queue and then left
-      * empty.
-      *
-      * @return {Promise<void>}
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _loadHandledRequestCount(): Promise<void>;
-     /**
-      * @param {Array<any>} hooks
-      * @param {*} args
-      * @ignore
-      * @protected
-      * @internal
-      */
-     protected _executeHooks(hooks: Array<any>, ...args: any): Promise<void>;
-     /**
-      * Function for cleaning up after all requests are processed.
-      * @ignore
-      */
-     teardown(): Promise<void>;
- }
- export type CrawlingContext = {
-     id: string;
-     request: Request;
-     session: Session;
-     proxyInfo: ProxyInfo;
-     response: any;
- };
- export type BasicCrawlerOptions = {
-     /**
-      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
-      *
-      * The function receives the following object as an argument:
-      * ```
-      * {
-      *     request: Request,
-      *     session: Session,
-      *     crawler: BasicCrawler,
-      * }
-      * ```
-      * where the {@link Request} instance represents the URL to crawl.
-      *
-      * The function must return a promise, which is then awaited by the crawler.
-      *
-      * If the function throws an exception, the crawler will try to re-crawl the
-      * request later, up to `option.maxRequestRetries` times.
-      * If all the retries fail, the crawler calls the function
-      * provided to the `handleFailedRequestFunction` parameter.
-      * To make this work, you should **always**
-      * let your function throw exceptions rather than catch them.
-      * The exceptions are logged to the request using the
-      * {@link Request#pushErrorMessage} function.
-      */
-     handleRequestFunction: HandleRequest;
-     /**
-      * Static list of URLs to be processed.
-      * Either `requestList` or `requestQueue` option must be provided (or both).
-      */
-     requestList?: RequestList | undefined;
-     /**
-      * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-      * Either `requestList` or `requestQueue` option must be provided (or both).
-      */
-     requestQueue?: RequestQueue | undefined;
-     /**
-      * Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.
-      */
-     handleRequestTimeoutSecs?: number | undefined;
-     /**
-      * A function to handle requests that failed more than `option.maxRequestRetries` times.
-      *
-      * The function receives the following object as an argument:
-      * ```
-      * {
-      *     request: Request,
-      *     error: Error,
-      *     session: Session,
-      *     crawler: BasicCrawler,
-      * }
-      * ```
-      * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
-      * represents the last error thrown during processing of the request.
-      *
-      * See
-      * [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/basic_crawler.js#L11)
-      * for the default implementation of this function.
-      */
-     handleFailedRequestFunction?: HandleFailedRequest | undefined;
-     /**
-      * Indicates how many times the request is retried if {@link BasicCrawlerOptions.handleRequestFunction} fails.
-      */
-     maxRequestRetries?: number | undefined;
-     /**
-      * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
-      * Always set this value in order to prevent infinite loops in misconfigured crawlers.
-      * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
-      */
-     maxRequestsPerCrawl?: number | undefined;
-     /**
-      * Custom options passed to the underlying {@link AutoscaledPool} constructor.
-      * Note that the `runTaskFunction` and `isTaskReadyFunction` options
-      * are provided by `BasicCrawler` and cannot be overridden.
-      * However, you can provide a custom implementation of `isFinishedFunction`.
-      */
-     autoscaledPoolOptions?: AutoscaledPoolOptions | undefined;
-     /**
-      * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-      *
-      * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slowly or crash.
-      * If you're not sure, just keep the default value and the concurrency will scale up automatically.
-      */
-     minConcurrency?: number | undefined;
-     /**
-      * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
-      */
-     maxConcurrency?: number | undefined;
-     /**
-      * Basic crawler will initialize the {@link SessionPool} with the corresponding `sessionPoolOptions`.
-      * The session instance will then be available in the `handleRequestFunction`.
-      */
-     useSessionPool?: boolean | undefined;
-     /**
-      * The configuration options for {@link SessionPool} to use.
-      */
-     sessionPoolOptions?: SessionPoolOptions | undefined;
- };
- export type HandleRequest = (inputs: HandleRequestInputs) => Promise<void>;
- export type HandleRequestInputs = {
-     /**
-      * The original {@link Request} object.
-      * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
-      * Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
-      * You can use it to change the concurrency settings on the fly,
-      * to pause the crawler by calling {@link AutoscaledPool#pause}
-      * or to abort it by calling {@link AutoscaledPool#abort}.
-      */
-     request: Request;
-     session?: Session | undefined;
-     crawler?: BasicCrawler | undefined;
- };
- export type HandleFailedRequest = (inputs: HandleFailedRequestInput) => Promise<void>;
- export type HandleFailedRequestInput = {
-     /**
-      * The Error thrown by `handleRequestFunction`.
-      */
-     error: Error;
-     /**
-      * The original {@link Request} object.
-      */
-     request: Request;
-     session: Session;
-     proxyInfo: ProxyInfo;
- };
- import { Log } from "../utils_log";
- import { RequestList } from "../request_list";
- import { RequestQueue } from "../storages/request_queue";
- import Statistics from "./statistics";
- import { SessionPoolOptions } from "../session_pool/session_pool";
- import AutoscaledPool from "../autoscaling/autoscaled_pool";
- import Request from "../request";
- import { Session } from "../session_pool/session";
- import { ProxyInfo } from "../proxy_configuration";
- import { AutoscaledPoolOptions } from "../autoscaling/autoscaled_pool";
- //# sourceMappingURL=basic_crawler.d.ts.map
package/build/crawlers/basic_crawler.d.ts.map
@@ -1 +0,0 @@
- {"version":3,"file":"basic_crawler.d.ts","sourceRoot":"","sources":["../../src/crawlers/basic_crawler.js"],"names":[],"mappings":"AA+CA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyEG;AAEH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyEG;AACH;IACI;;;OAGG;IACH,qBAFS,GAAG,CAuBV;IAEF;;;OAGG;IACH,qBAHW,mBAAmB,EAsH7B;IAtFG,kBAAkB;IAClB,KADW,GAAG,CACA;IACd,qCAA8B;IAC9B,uCAAgC;IAChC,mCAAgD;IAChD,sDAAuD;IACvD,mCAAiE;IAGjE,2BAA+H;IAK/H,6DAA8D;IAC9D,0BAA0C;IAC1C,6BAA6B;IAC7B,kBAA6F;IAC7F,iCAAiC;IACjC,oBADW,kBAAkB,CAI5B;IACD,wBAAoC;IACpC,gCAAiC;IAuDjC,2BAA2G;IAE3G,uCAA4B;IAOhC;;;;OAIG;IACH,OAFY,QAAQ,IAAI,CAAC,CAuBxB;IAED;;;;;OAKG;IACH,mBALY,QAAQ,IAAI,CAAC,CAkBxB;IATG,2CAAoE;IAGhE,4EAAiE;IAQzE;;;;;;OAMG;IACH,kDANW,eAAe,GACd,QAAQ,IAAI,CAAC,CAOxB;IAED;;;;OAIG;IACH,6CAmCC;IAED;;;;;;;OAOG;IACH,uDAiBC;IAED;;;;;;;OAOG;IACH,4CAiFC;IAED;;;OAGG;IACH,qGAWC;IAED;;;;;;OAMG;IACH,mDAOC;IAED;;;;;;OAMG;IACH,yDAUC;IAED;;;;;;;;;;OAUG;IACH,8CATW,KAAK;QAEoB,OAAO,EAAhC,OAAO;eACP,CAAC,WAAW,GAAC,YAAY,CAAC,GACzB,QAAQ,IAAI,CAAC,CA4BxB;IAED;;;;;;;;OAQG;IACH;QAPkC,KAAK,EAA5B,KAAK;QACoB,OAAO,EAAhC,OAAO;QACN,QAAQ,IAAI,CAAC,CAgBxB;IAED;;;;;;;;;;;;OAYG;IACH,sCALY,QAAQ,IAAI,CAAC,CAWxB;IAED;;;;;;OAMG;IACH,+BANW,MAAM,GAAG,CAAC,+BAYpB;IAED;;;OAGG;IACH,0BAIC;CACJ;;QArrBa,MAAM;aACN,OAAO;aACP,OAAO;eACP,SAAS;;;;;;;;;;;;;;;;;;;;;;;;;;;;2BAmBT,aAAa;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qCAmqBhB,mBAAmB,KACjB,QAAQ,IAAI,CAAC;;;;;;;;;;aAIZ,OAAO;;;;2CAYV,wBAAwB,KACtB,QAAQ,IAAI,CAAC;;;;;WAKZ,KAAK;;;;aACL,OAAO;aACP,OAAO;eACP,SAAS"}