apify 2.3.1-beta.4 → 3.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. package/README.md +6 -5
  2. package/package.json +69 -128
  3. package/build/actor.d.ts +0 -113
  4. package/build/actor.d.ts.map +0 -1
  5. package/build/actor.js +0 -582
  6. package/build/actor.js.map +0 -1
  7. package/build/apify.d.ts +0 -752
  8. package/build/apify.d.ts.map +0 -1
  9. package/build/apify.js +0 -877
  10. package/build/apify.js.map +0 -1
  11. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  12. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  13. package/build/autoscaling/autoscaled_pool.js +0 -557
  14. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  15. package/build/autoscaling/snapshotter.d.ts +0 -278
  16. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  17. package/build/autoscaling/snapshotter.js +0 -447
  18. package/build/autoscaling/snapshotter.js.map +0 -1
  19. package/build/autoscaling/system_status.d.ts +0 -224
  20. package/build/autoscaling/system_status.d.ts.map +0 -1
  21. package/build/autoscaling/system_status.js +0 -228
  22. package/build/autoscaling/system_status.js.map +0 -1
  23. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  24. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  25. package/build/browser_launchers/browser_launcher.js +0 -160
  26. package/build/browser_launchers/browser_launcher.js.map +0 -1
  27. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  28. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  29. package/build/browser_launchers/browser_plugin.js +0 -25
  30. package/build/browser_launchers/browser_plugin.js.map +0 -1
  31. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  32. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  33. package/build/browser_launchers/playwright_launcher.js +0 -150
  34. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  35. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  36. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  37. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  38. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  39. package/build/cache_container.d.ts +0 -31
  40. package/build/cache_container.d.ts.map +0 -1
  41. package/build/cache_container.js +0 -48
  42. package/build/cache_container.js.map +0 -1
  43. package/build/configuration.d.ts +0 -226
  44. package/build/configuration.d.ts.map +0 -1
  45. package/build/configuration.js +0 -325
  46. package/build/configuration.js.map +0 -1
  47. package/build/constants.d.ts +0 -37
  48. package/build/constants.d.ts.map +0 -1
  49. package/build/constants.js +0 -41
  50. package/build/constants.js.map +0 -1
  51. package/build/crawlers/basic_crawler.d.ts +0 -443
  52. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  53. package/build/crawlers/basic_crawler.js +0 -664
  54. package/build/crawlers/basic_crawler.js.map +0 -1
  55. package/build/crawlers/browser_crawler.d.ts +0 -512
  56. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  57. package/build/crawlers/browser_crawler.js +0 -540
  58. package/build/crawlers/browser_crawler.js.map +0 -1
  59. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  60. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  61. package/build/crawlers/cheerio_crawler.js +0 -913
  62. package/build/crawlers/cheerio_crawler.js.map +0 -1
  63. package/build/crawlers/crawler_extension.d.ts +0 -10
  64. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  65. package/build/crawlers/crawler_extension.js +0 -19
  66. package/build/crawlers/crawler_extension.js.map +0 -1
  67. package/build/crawlers/crawler_utils.d.ts +0 -34
  68. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  69. package/build/crawlers/crawler_utils.js +0 -87
  70. package/build/crawlers/crawler_utils.js.map +0 -1
  71. package/build/crawlers/playwright_crawler.d.ts +0 -448
  72. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  73. package/build/crawlers/playwright_crawler.js +0 -299
  74. package/build/crawlers/playwright_crawler.js.map +0 -1
  75. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  76. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  77. package/build/crawlers/puppeteer_crawler.js +0 -299
  78. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  79. package/build/crawlers/statistics.d.ts +0 -185
  80. package/build/crawlers/statistics.d.ts.map +0 -1
  81. package/build/crawlers/statistics.js +0 -331
  82. package/build/crawlers/statistics.js.map +0 -1
  83. package/build/enqueue_links/click_elements.d.ts +0 -179
  84. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  85. package/build/enqueue_links/click_elements.js +0 -434
  86. package/build/enqueue_links/click_elements.js.map +0 -1
  87. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  88. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  89. package/build/enqueue_links/enqueue_links.js +0 -163
  90. package/build/enqueue_links/enqueue_links.js.map +0 -1
  91. package/build/enqueue_links/shared.d.ts +0 -42
  92. package/build/enqueue_links/shared.d.ts.map +0 -1
  93. package/build/enqueue_links/shared.js +0 -121
  94. package/build/enqueue_links/shared.js.map +0 -1
  95. package/build/errors.d.ts +0 -29
  96. package/build/errors.d.ts.map +0 -1
  97. package/build/errors.js +0 -38
  98. package/build/errors.js.map +0 -1
  99. package/build/events.d.ts +0 -11
  100. package/build/events.d.ts.map +0 -1
  101. package/build/events.js +0 -147
  102. package/build/events.js.map +0 -1
  103. package/build/index.d.ts +0 -4
  104. package/build/index.d.ts.map +0 -1
  105. package/build/index.js +0 -7
  106. package/build/index.js.map +0 -1
  107. package/build/main.d.ts +0 -179
  108. package/build/main.d.ts.map +0 -1
  109. package/build/main.js +0 -81
  110. package/build/main.js.map +0 -1
  111. package/build/playwright_utils.d.ts +0 -9
  112. package/build/playwright_utils.d.ts.map +0 -1
  113. package/build/playwright_utils.js +0 -90
  114. package/build/playwright_utils.js.map +0 -1
  115. package/build/proxy_configuration.d.ts +0 -411
  116. package/build/proxy_configuration.d.ts.map +0 -1
  117. package/build/proxy_configuration.js +0 -517
  118. package/build/proxy_configuration.js.map +0 -1
  119. package/build/pseudo_url.d.ts +0 -86
  120. package/build/pseudo_url.d.ts.map +0 -1
  121. package/build/pseudo_url.js +0 -153
  122. package/build/pseudo_url.js.map +0 -1
  123. package/build/puppeteer_request_interception.d.ts +0 -8
  124. package/build/puppeteer_request_interception.d.ts.map +0 -1
  125. package/build/puppeteer_request_interception.js +0 -235
  126. package/build/puppeteer_request_interception.js.map +0 -1
  127. package/build/puppeteer_utils.d.ts +0 -250
  128. package/build/puppeteer_utils.d.ts.map +0 -1
  129. package/build/puppeteer_utils.js +0 -551
  130. package/build/puppeteer_utils.js.map +0 -1
  131. package/build/request.d.ts +0 -180
  132. package/build/request.d.ts.map +0 -1
  133. package/build/request.js +0 -261
  134. package/build/request.js.map +0 -1
  135. package/build/request_list.d.ts +0 -581
  136. package/build/request_list.d.ts.map +0 -1
  137. package/build/request_list.js +0 -826
  138. package/build/request_list.js.map +0 -1
  139. package/build/serialization.d.ts +0 -5
  140. package/build/serialization.d.ts.map +0 -1
  141. package/build/serialization.js +0 -139
  142. package/build/serialization.js.map +0 -1
  143. package/build/session_pool/errors.d.ts +0 -11
  144. package/build/session_pool/errors.d.ts.map +0 -1
  145. package/build/session_pool/errors.js +0 -18
  146. package/build/session_pool/errors.js.map +0 -1
  147. package/build/session_pool/events.d.ts +0 -5
  148. package/build/session_pool/events.d.ts.map +0 -1
  149. package/build/session_pool/events.js +0 -6
  150. package/build/session_pool/events.js.map +0 -1
  151. package/build/session_pool/session.d.ts +0 -286
  152. package/build/session_pool/session.d.ts.map +0 -1
  153. package/build/session_pool/session.js +0 -355
  154. package/build/session_pool/session.js.map +0 -1
  155. package/build/session_pool/session_pool.d.ts +0 -280
  156. package/build/session_pool/session_pool.d.ts.map +0 -1
  157. package/build/session_pool/session_pool.js +0 -393
  158. package/build/session_pool/session_pool.js.map +0 -1
  159. package/build/session_pool/session_utils.d.ts +0 -4
  160. package/build/session_pool/session_utils.d.ts.map +0 -1
  161. package/build/session_pool/session_utils.js +0 -24
  162. package/build/session_pool/session_utils.js.map +0 -1
  163. package/build/stealth/hiding_tricks.d.ts +0 -22
  164. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  165. package/build/stealth/hiding_tricks.js +0 -308
  166. package/build/stealth/hiding_tricks.js.map +0 -1
  167. package/build/stealth/stealth.d.ts +0 -56
  168. package/build/stealth/stealth.d.ts.map +0 -1
  169. package/build/stealth/stealth.js +0 -125
  170. package/build/stealth/stealth.js.map +0 -1
  171. package/build/storages/dataset.d.ts +0 -288
  172. package/build/storages/dataset.d.ts.map +0 -1
  173. package/build/storages/dataset.js +0 -480
  174. package/build/storages/dataset.js.map +0 -1
  175. package/build/storages/key_value_store.d.ts +0 -243
  176. package/build/storages/key_value_store.d.ts.map +0 -1
  177. package/build/storages/key_value_store.js +0 -462
  178. package/build/storages/key_value_store.js.map +0 -1
  179. package/build/storages/request_queue.d.ts +0 -318
  180. package/build/storages/request_queue.d.ts.map +0 -1
  181. package/build/storages/request_queue.js +0 -636
  182. package/build/storages/request_queue.js.map +0 -1
  183. package/build/storages/storage_manager.d.ts +0 -87
  184. package/build/storages/storage_manager.d.ts.map +0 -1
  185. package/build/storages/storage_manager.js +0 -150
  186. package/build/storages/storage_manager.js.map +0 -1
  187. package/build/tsconfig.tsbuildinfo +0 -1
  188. package/build/typedefs.d.ts +0 -146
  189. package/build/typedefs.d.ts.map +0 -1
  190. package/build/typedefs.js +0 -88
  191. package/build/typedefs.js.map +0 -1
  192. package/build/utils.d.ts +0 -175
  193. package/build/utils.d.ts.map +0 -1
  194. package/build/utils.js +0 -731
  195. package/build/utils.js.map +0 -1
  196. package/build/utils_log.d.ts +0 -41
  197. package/build/utils_log.d.ts.map +0 -1
  198. package/build/utils_log.js +0 -192
  199. package/build/utils_log.js.map +0 -1
  200. package/build/utils_request.d.ts +0 -77
  201. package/build/utils_request.d.ts.map +0 -1
  202. package/build/utils_request.js +0 -385
  203. package/build/utils_request.js.map +0 -1
  204. package/build/utils_social.d.ts +0 -210
  205. package/build/utils_social.d.ts.map +0 -1
  206. package/build/utils_social.js +0 -787
  207. package/build/utils_social.js.map +0 -1
  208. package/build/validators.d.ts +0 -23
  209. package/build/validators.d.ts.map +0 -1
  210. package/build/validators.js +0 -29
  211. package/build/validators.js.map +0 -1
@@ -1,299 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- const tslib_1 = require("tslib");
4
- const ow_1 = (0, tslib_1.__importDefault)(require("ow"));
5
- const puppeteer_utils_1 = require("../puppeteer_utils");
6
- const stealth_1 = (0, tslib_1.__importDefault)(require("../stealth/stealth"));
7
- /* eslint-disable no-unused-vars,import/named,import/no-duplicates,import/order */
8
- const puppeteer_launcher_1 = require("../browser_launchers/puppeteer_launcher");
9
- const browser_crawler_1 = (0, tslib_1.__importDefault)(require("./browser_crawler"));
10
- /* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
11
- /**
12
- * @callback PuppeteerHook
13
- * @param {{ page: Page, crawler: PuppeteerCrawler } & BrowserCrawlingContext & CrawlingContext} crawlingContext
14
- * @param {DirectNavigationOptions} gotoOptions
15
- * @returns {Promise<void>}
16
- */
17
- /**
18
- * @typedef PuppeteerHandlePageFunctionParam
19
- * @property {Page} page
20
- * @property {PuppeteerCrawler} crawler
21
- */
22
- /**
23
- * @callback PuppeteerHandlePage
24
- * @param {CrawlingContext & BrowserCrawlingContext & { page: Page, crawler: PuppeteerCrawler }} context
25
- * @returns {Promise<void>}
26
- */
27
- /**
28
- * @typedef PuppeteerCrawlerOptions
29
- * @property {PuppeteerHandlePage} handlePageFunction
30
- * Function that is called to process each request.
31
- * It is passed an object with the following fields:
32
- *
33
- * ```
34
- * {
35
- * request: Request,
36
- * response: Response,
37
- * page: Page,
38
- * session: Session,
39
- * browserController: BrowserController,
40
- * proxyInfo: ProxyInfo,
41
- * crawler: PuppeteerCrawler,
42
- * }
43
- * ```
44
- *
45
- * `request` is an instance of the {@link Request} object with details about the URL to open, HTTP method etc.
46
- * `page` is an instance of the `Puppeteer`
47
- * [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page)
48
- * `browserPool` is an instance of the
49
- * [`BrowserPool`](https://github.com/apify/browser-pool#BrowserPool),
50
- * `browserController` is an instance of the
51
- * [`BrowserController`](https://github.com/apify/browser-pool#browsercontroller),
52
- * `response` is an instance of the `Puppeteer`
53
- * [`Response`](https://pptr.dev/#?product=Puppeteer&show=api-class-response),
54
- * which is the main resource response as returned by `page.goto(request.url)`.
55
- * The function must return a promise, which is then awaited by the crawler.
56
- *
57
- * If the function throws an exception, the crawler will try to re-crawl the
58
- * request later, up to `option.maxRequestRetries` times.
59
- * If all the retries fail, the crawler calls the function
60
- * provided to the `handleFailedRequestFunction` parameter.
61
- * To make this work, you should **always**
62
- * let your function throw exceptions rather than catch them.
63
- * The exceptions are logged to the request using the
64
- * {@link Request#pushErrorMessage} function.
65
- * @property {number} [navigationTimeoutSecs=60]
66
- * Timeout in which page navigation needs to finish, in seconds.
67
- * @property {HandleFailedRequest} [handleFailedRequestFunction]
68
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
69
- *
70
- * The function receives the following object as an argument:
71
- * ```
72
- * {
73
- * request: Request,
74
- * response: Response,
75
- * page: Page,
76
- * session: Session,
77
- * browserController: BrowserController,
78
- * proxyInfo: ProxyInfo,
79
- * crawler: PuppeteerCrawler,
80
- * }
81
- * ```
82
- * Where the {@link Request} instance corresponds to the failed request, and the `Error` instance
83
- * represents the last error thrown during processing of the request.
84
- * @property {PuppeteerLaunchContext} [launchContext]
85
- * Options used by {@link Apify#launchPuppeteer} to start new Puppeteer instances.
86
- * @property {number} [handlePageTimeoutSecs=60]
87
- * Timeout in which the function passed as `handlePageFunction` needs to finish, in seconds.
88
- * @property {BrowserPoolOptions} [browserPoolOptions]
89
- * Custom options passed to the underlying [`BrowserPool`](https://github.com/apify/browser-pool#BrowserPool) constructor.
90
- * You can tweak those to fine-tune browser management.
91
- * @property {boolean} [persistCookiesPerSession=true]
92
- * Automatically saves cookies to Session. Works only if Session Pool is used.
93
- * @property {ProxyConfiguration} [proxyConfiguration]
94
- * If set, `PuppeteerCrawler` will be configured for all connections to use
95
- * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
96
- * For more information, see the [documentation](https://docs.apify.com/proxy).
97
- * @property {Array<PuppeteerHook>} [preNavigationHooks]
98
- * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
99
- * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
100
- * which are passed to the `page.goto()` function the crawler calls to navigate.
101
- * Example:
102
- * ```
103
- * preNavigationHooks: [
104
- * async (crawlingContext, gotoOptions) => {
105
- * const { page } = crawlingContext;
106
- * await page.evaluate((attr) => { window.foo = attr; }, 'bar');
107
- * },
108
- * ]
109
- * ```
110
- * @property {Array<PuppeteerHook>} [postNavigationHooks]
111
- * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
112
- * The function accepts `crawlingContext` as the only parameter.
113
- * Example:
114
- * ```
115
- * postNavigationHooks: [
116
- * async (crawlingContext) => {
117
- * const { page } = crawlingContext;
118
- * if (hasCaptcha(page)) {
119
- * await solveCaptcha(page);
120
- * }
121
- * },
122
- * ]
123
- * ```
124
- * @property {RequestList} [requestList]
125
- * Static list of URLs to be processed.
126
- * Either `requestList` or `requestQueue` option must be provided (or both).
127
- * @property {RequestQueue} [requestQueue]
128
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
129
- * Either `requestList` or `requestQueue` option must be provided (or both).
130
- * @property {number} [maxRequestRetries=3]
131
- * Indicates how many times the request is retried if {@link PuppeteerCrawlerOptions.handlePageFunction} fails.
132
- * @property {number} [maxRequestsPerCrawl]
133
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
134
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
135
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
136
- * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
137
- * Custom options passed to the underlying {@link AutoscaledPool} constructor.
138
- * Note that the `runTaskFunction` and `isTaskReadyFunction` options
139
- * are provided by the crawler and cannot be overridden.
140
- * However, you can provide a custom implementation of `isFinishedFunction`.
141
- * @property {number} [minConcurrency=1]
142
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
143
- *
144
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
145
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
146
- * @property {number} [maxConcurrency=1000]
147
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
148
- * @property {boolean} [useSessionPool=true]
149
- * Puppeteer crawler will initialize the {@link SessionPool} with the corresponding `sessionPoolOptions`.
150
- * The session instance will then be available in the `handlePageFunction`.
151
- * @property {SessionPoolOptions} [sessionPoolOptions] The configuration options for {@link SessionPool} to use.
152
- */
153
- /**
154
- * Provides a simple framework for parallel crawling of web pages
155
- * using headless Chrome with [Puppeteer](https://github.com/puppeteer/puppeteer).
156
- * The URLs to crawl are fed either from a static list of URLs
157
- * or from a dynamic queue of URLs enabling recursive crawling of websites.
158
- *
159
- * Since `PuppeteerCrawler` uses headless Chrome to download web pages and extract data,
160
- * it is useful for crawling websites that require JavaScript execution.
161
- * If the target website doesn't need JavaScript, consider using {@link CheerioCrawler},
162
- * which downloads the pages using raw HTTP requests and is about 10x faster.
163
- *
164
- * The source URLs are represented using {@link Request} objects that are fed from
165
- * {@link RequestList} or {@link RequestQueue} instances provided by the {@link PuppeteerCrawlerOptions.requestList}
166
- * or {@link PuppeteerCrawlerOptions.requestQueue} constructor options, respectively.
167
- *
168
- * If both {@link PuppeteerCrawlerOptions.requestList} and {@link PuppeteerCrawlerOptions.requestQueue} are used,
169
- * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
170
- * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
171
- *
172
- * The crawler finishes when there are no more {@link Request} objects to crawl.
173
- *
174
- * `PuppeteerCrawler` opens a new Chrome page (i.e. tab) for each {@link Request} object to crawl
175
- * and then calls the function provided by user as the {@link PuppeteerCrawlerOptions.handlePageFunction} option.
176
- *
177
- * New pages are only opened when there is enough free CPU and memory available,
178
- * using the functionality provided by the {@link AutoscaledPool} class.
179
- * All {@link AutoscaledPool} configuration options can be passed to the {@link PuppeteerCrawlerOptions.autoscaledPoolOptions}
180
- * parameter of the `PuppeteerCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
181
- * {@link AutoscaledPoolOptions} are available directly in the `PuppeteerCrawler` constructor.
182
- *
183
- * Note that the pool of Puppeteer instances is internally managed by the {@link BrowserPool} class.
184
- *
185
- * **Example usage:**
186
- *
187
- * ```javascript
188
- * const crawler = new Apify.PuppeteerCrawler({
189
- * requestList,
190
- * handlePageFunction: async ({ page, request }) => {
191
- * // This function is called to extract data from a single web page
192
- * // 'page' is an instance of Puppeteer.Page with page.goto(request.url) already called
193
- * // 'request' is an instance of Request class with information about the page to load
194
- * await Apify.pushData({
195
- * title: await page.title(),
196
- * url: request.url,
197
- * succeeded: true,
198
- * })
199
- * },
200
- * handleFailedRequestFunction: async ({ request }) => {
201
- * // This function is called when the crawling of a request failed too many times
202
- * await Apify.pushData({
203
- * url: request.url,
204
- * succeeded: false,
205
- * errors: request.errorMessages,
206
- * })
207
- * },
208
- * });
209
- *
210
- * await crawler.run();
211
- * ```
212
- * @property {Statistics} stats
213
- * Contains statistics about the current run.
214
- * @property {RequestList} [requestList]
215
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
216
- * Only available if used by the crawler.
217
- * @property {RequestQueue} [requestQueue]
218
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
219
- * Only available if used by the crawler.
220
- * @property {SessionPool} [sessionPool]
221
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
222
- * Only available if used by the crawler.
223
- * @property {ProxyConfiguration} proxyConfiguration
224
- * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
225
- * Only available if used by the crawler.
226
- * @property {BrowserPool} browserPool
227
- * A reference to the underlying `BrowserPool` class that manages the crawler's browsers.
228
- * For more information about it, see the [`browser-pool` module](https://github.com/apify/browser-pool).
229
- * @property {AutoscaledPool} autoscaledPool
230
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
231
- * Note that this property is only initialized after calling the {@link PuppeteerCrawler#run} function.
232
- * You can use it to change the concurrency settings on the fly,
233
- * to pause the crawler by calling {@link AutoscaledPool#pause}
234
- * or to abort it by calling {@link AutoscaledPool#abort}.
235
- */
236
- class PuppeteerCrawler extends browser_crawler_1.default {
237
- /**
238
- * @param {PuppeteerCrawlerOptions} options
239
- * All `PuppeteerCrawler` parameters are passed via an options object.
240
- */
241
- constructor(options = {}) {
242
- (0, ow_1.default)(options, 'PuppeteerCrawlerOptions', ow_1.default.object.exactShape(PuppeteerCrawler.optionsShape));
243
- const { launchContext = {}, // @TODO: should not launcher be inside launchContext
244
- browserPoolOptions = {}, proxyConfiguration, ...browserCrawlerOptions } = options;
245
- const { stealth = false, } = launchContext;
246
- if (launchContext.proxyUrl) {
247
- throw new Error('PuppeteerCrawlerOptions.launchContext.proxyUrl is not allowed in PuppeteerCrawler.'
248
- + 'Use PuppeteerCrawlerOptions.proxyConfiguration');
249
- }
250
- const puppeteerLauncher = new puppeteer_launcher_1.PuppeteerLauncher(launchContext);
251
- browserPoolOptions.browserPlugins = [
252
- puppeteerLauncher.createBrowserPlugin(),
253
- ];
254
- super({
255
- ...browserCrawlerOptions,
256
- proxyConfiguration,
257
- browserPoolOptions,
258
- });
259
- if (stealth) {
260
- this.browserPool.postLaunchHooks.push(async (pageId, browserController) => {
261
- // @TODO: We can do this better now. It is not necessary to override the page.
262
- // we can modify the page in the postPageCreateHook
263
- const { hideWebDriver, ...newStealthOptions } = puppeteerLauncher.stealthOptions;
264
- await (0, stealth_1.default)(browserController.browser, newStealthOptions);
265
- });
266
- }
267
- this.launchContext = launchContext;
268
- }
269
- /**
270
- * @param {*} crawlingContext
271
- * @param {*} gotoOptions
272
- * @ignore
273
- * @protected
274
- * @internal
275
- */
276
- async _navigationHandler(crawlingContext, gotoOptions) {
277
- if (this.gotoFunction) {
278
- this.log.deprecated('PuppeteerCrawlerOptions.gotoFunction is deprecated. Use "preNavigationHooks" and "postNavigationHooks" instead.');
279
- return this.gotoFunction(crawlingContext, gotoOptions);
280
- }
281
- return (0, puppeteer_utils_1.gotoExtended)(crawlingContext.page, crawlingContext.request, gotoOptions);
282
- }
283
- }
284
- /**
285
- * @internal
286
- * @type any
287
- */
288
- Object.defineProperty(PuppeteerCrawler, "optionsShape", {
289
- enumerable: true,
290
- configurable: true,
291
- writable: true,
292
- value: {
293
- ...browser_crawler_1.default.optionsShape,
294
- browserPoolOptions: ow_1.default.optional.object,
295
- launchContext: ow_1.default.optional.object,
296
- }
297
- });
298
- exports.default = PuppeteerCrawler;
299
- //# sourceMappingURL=puppeteer_crawler.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"puppeteer_crawler.js","sourceRoot":"","sources":["../../src/crawlers/puppeteer_crawler.js"],"names":[],"mappings":";;;AAAA,yDAAoB;AAEpB,wDAAkD;AAClD,8EAAuD;AAEvD,kFAAkF;AAClF,gFAAoG;AAEpG,qFAA2E;AAU3E,iFAAiF;AAEjF;;;;;GAKG;AACH;;;;GAIG;AACH;;;;GAIG;AACH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6HG;AAEH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkFG;AACH,MAAM,gBAAiB,SAAQ,yBAAc;IAWzC;;;MAGE;IACF,YAAY,OAAO,GAAG,EAAE;QACpB,IAAA,YAAE,EAAC,OAAO,EAAE,yBAAyB,EAAE,YAAE,CAAC,MAAM,CAAC,UAAU,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,CAAC;QAE5F,MAAM,EACF,aAAa,GAAG,EAAE,EAAE,qDAAqD;QACzE,kBAAkB,GAAG,EAAE,EACvB,kBAAkB,EAClB,GAAG,qBAAqB,EAC3B,GAAG,OAAO,CAAC;QAEZ,MAAM,EACF,OAAO,GAAG,KAAK,GAClB,GAAG,aAAa,CAAC;QAElB,IAAI,aAAa,CAAC,QAAQ,EAAE;YACxB,MAAM,IAAI,KAAK,CAAC,oFAAoF;kBAC9F,gDAAgD,CAAC,CAAC;SAC3D;QACD,MAAM,iBAAiB,GAAG,IAAI,sCAAiB,CAAC,aAAa,CAAC,CAAC;QAE/D,kBAAkB,CAAC,cAAc,GAAG;YAChC,iBAAiB,CAAC,mBAAmB,EAAE;SAC1C,CAAC;QAEF,KAAK,CAAC;YACF,GAAG,qBAAqB;YACxB,kBAAkB;YAClB,kBAAkB;SACrB,CAAC,CAAC;QAEH,IAAI,OAAO,EAAE;YACT,IAAI,CAAC,WAAW,CAAC,eAAe,CAAC,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,iBAAiB,EAAE,EAAE;gBACtE,8EAA8E;gBAC9E,mDAAmD;gBACnD,MAAM,EAAE,aAAa,EAAE,GAAG,iBAAiB,EAAE,GAAG,iBAAiB,CAAC,cAAc,CAAC;gBACjF,MAAM,IAAA,iBAAqB,EAAC,iBAAiB,CAAC,OAAO,EAAE,iBAAiB,CAAC,CAAC;YAC9E,CAAC,CAAC,CAAC;SACN;QAED,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;IACvC,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,kBAAkB,CAAC,eAAe,EAAE,WAAW;QACjD,IAAI,IAAI,CAAC,YAAY,EAAE;YACnB,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,iHAAiH,CAAC,CAAC;YAEvI,OAAO,IAAI,CAAC,YAAY,CAAC,eAAe,EAAE,WAAW,CAAC,CAAC;SAC1D;QACD,OAAO,IAAA,8BAAY,EAAC,eAAe,CAAC,IAAI,EAAE,eAAe,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;IACpF,CAAC;;AAtED;;;GAGG;AACI;;;;WAAe;QAClB,GAAG,yBAAc,CAAC,YAAY;QAC9B,kBAAkB,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;QACtC,aAAa,EAAE,YAAE,CAAC,QAAQ,CAAC,MAAM;KACpC;GAAC;AAiEN,kBAAe,gBAAgB,CAAC"}
@@ -1,185 +0,0 @@
1
- export default Statistics;
2
- export type StatisticsOptions = {
3
- logIntervalSecs?: number | undefined;
4
- logMessage?: string | undefined;
5
- };
6
- /**
7
- * Format of the persisted stats
8
- */
9
- export type StatisticPersistedState = {
10
- requestRetryHistogram: number[];
11
- statsId: number;
12
- requestAvgFailedDurationMillis: number;
13
- requestAvgFinishedDurationMillis: number;
14
- requestsFinishedPerMinute: number;
15
- requestsFailedPerMinute: number;
16
- requestTotalDurationMillis: number;
17
- requestsTotal: number;
18
- crawlerRuntimeMillis: number;
19
- crawlerLastStartTimestamp: number;
20
- statsPersistedAt: string;
21
- };
22
- /**
23
- * Contains the statistics state
24
- */
25
- export type StatisticState = {
26
- requestsFinished: number;
27
- requestsFailed: number;
28
- requestsRetries: number;
29
- requestsFailedPerMinute: number;
30
- requestsFinishedPerMinute: number;
31
- requestMinDurationMillis: number;
32
- requestMaxDurationMillis: number;
33
- requestTotalFailedDurationMillis: number;
34
- requestTotalFinishedDurationMillis: number;
35
- crawlerStartedAt: Date | string | null;
36
- crawlerFinishedAt: Date | string | null;
37
- crawlerRuntimeMillis: number;
38
- statsPersistedAt: Date | string | null;
39
- };
40
- /**
41
- * The statistics class provides an interface to collecting and logging run
42
- * statistics for requests.
43
- *
44
- * All statistic information is saved on key value store
45
- * under the key SDK_CRAWLER_STATISTICS_*, persists between
46
- * migrations and abort/resurrect
47
- *
48
- * @property {StatisticState} state
49
- * Current statistic state used for doing calculations on {@link Statistics#calculate} calls
50
- * @property {number} id
51
- * Statistic instance id
52
- * @property {number[]} requestRetryHistogram
53
- * Contains the current retries histogram.
54
- * Index 0 means 0 retries, index 2, 2 retries,
55
- * and so on
56
- */
57
- declare class Statistics {
58
- /**
59
- * @param {StatisticsOptions} [options]
60
- * @hideconstructor
61
- */
62
- constructor(options?: StatisticsOptions | undefined);
63
- log: import("@apify/log/log").Log;
64
- logIntervalMillis: number;
65
- logMessage: string;
66
- keyValueStore: import("../storages/key_value_store").KeyValueStore | null;
67
- id: number;
68
- persistStateKey: string;
69
- listener: () => Promise<void>;
70
- requestRetryHistogram: any[];
71
- /**
72
- * @private
73
- * @type {Object<string|number, Job>}
74
- */
75
- private requestsInProgress;
76
- /**
77
- * Set the current statistic instance to pristine values
78
- */
79
- reset(): void;
80
- state: {
81
- requestsFinished: number;
82
- requestsFailed: number;
83
- requestsRetries: number;
84
- requestsFailedPerMinute: number;
85
- requestsFinishedPerMinute: number;
86
- requestMinDurationMillis: number;
87
- requestMaxDurationMillis: number;
88
- requestTotalFailedDurationMillis: number;
89
- requestTotalFinishedDurationMillis: number;
90
- crawlerStartedAt: null;
91
- crawlerFinishedAt: null;
92
- statsPersistedAt: null;
93
- crawlerRuntimeMillis: number;
94
- } | undefined;
95
- instanceStart: number | undefined;
96
- /**
97
- * Starts a job
98
- *
99
- * @param {number|string} id
100
- * @ignore
101
- */
102
- startJob(id: number | string): void;
103
- /**
104
- * Mark job as finished and sets the state
105
- *
106
- * @param {number|string} id
107
- * @ignore
108
- */
109
- finishJob(id: number | string): void;
110
- /**
111
- * Mark job as failed and sets the state
112
- *
113
- * @param {number|string} id
114
- * @ignore
115
- */
116
- failJob(id: number | string): void;
117
- /**
118
- * Calculate the current statistics
119
- */
120
- calculate(): {
121
- requestAvgFailedDurationMillis: number;
122
- requestAvgFinishedDurationMillis: number;
123
- requestsFinishedPerMinute: number;
124
- requestsFailedPerMinute: number;
125
- requestTotalDurationMillis: any;
126
- requestsTotal: any;
127
- crawlerRuntimeMillis: number;
128
- };
129
- /**
130
- * Initializes the key value store for persisting the statistics,
131
- * displaying the current state in predefined intervals
132
- */
133
- startCapturing(): Promise<void>;
134
- logInterval: NodeJS.Timer | null | undefined;
135
- /**
136
- * Stops logging and remove event listeners, then persist
137
- */
138
- stopCapturing(): Promise<void>;
139
- /**
140
- * @param {Job} job
141
- * @ignore
142
- * @protected
143
- * @internal
144
- */
145
- protected _saveRetryCountForJob(job: Job): void;
146
- /**
147
- * Persist internal state to the key value store
148
- */
149
- persistState(): Promise<void>;
150
- /**
151
- * Loads the current statistic from the key value store if any
152
- * @ignore
153
- * @protected
154
- * @internal
155
- */
156
- protected _maybeLoadStatistics(): Promise<void>;
157
- /**
158
- * @ignore
159
- * @protected
160
- * @internal
161
- */
162
- protected _teardown(): void;
163
- /**
164
- * Make this class serializable when called with `JSON.stringify(statsInstance)` directly
165
- * or through `keyValueStore.setValue('KEY', statsInstance)`
166
- *
167
- * @returns {StatisticPersistedState & StatisticState}
168
- */
169
- toJSON(): StatisticPersistedState & StatisticState;
170
- }
171
- declare namespace Statistics {
172
- const id: number;
173
- }
174
- /**
175
- * @ignore
176
- */
177
- declare class Job {
178
- lastRunAt: number | null;
179
- runs: number;
180
- run(): number;
181
- finish(): number;
182
- durationMillis: number | undefined;
183
- retryCount(): number;
184
- }
185
- //# sourceMappingURL=statistics.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"statistics.d.ts","sourceRoot":"","sources":["../../src/crawlers/statistics.js"],"names":[],"mappings":";;;;;;;;;2BAsVc,MAAM,EAAE;aACR,MAAM;oCACN,MAAM;sCACN,MAAM;+BACN,MAAM;6BACN,MAAM;gCACN,MAAM;mBACN,MAAM;0BACN,MAAM;+BACN,MAAM;sBACN,MAAM;;;;;;sBAON,MAAM;oBACN,MAAM;qBACN,MAAM;6BACN,MAAM;+BACN,MAAM;8BACN,MAAM;8BACN,MAAM;sCACN,MAAM;wCACN,MAAM;sBACN,IAAI,GAAC,MAAM,GAAC,IAAI;uBAChB,IAAI,GAAC,MAAM,GAAC,IAAI;0BAChB,MAAM;sBACN,IAAI,GAAC,MAAM,GAAC,IAAI;;AApV9B;;;;;;;;;;;;;;;;GAgBG;AACH;IACI;;;OAGG;IACH,qDA6BC;IAlBG,kCAAqD;IACrD,0BAA+C;IAC/C,mBAA4B;IAC5B,0EAAyB;IAEzB,WAAyB;IACzB,wBAA0D;IAC1D,8BAA4C;IAC5C,6BAA+B;IAE/B;;;OAGG;IACH,2BAAmC;IAMvC;;OAEG;IACH,cAsBC;IArBG;;;;;;;;;;;;;;kBAcC;IAID,kCAA+B;IAKnC;;;;;OAKG;IACH,aAHW,MAAM,GAAC,MAAM,QAQvB;IAED;;;;;OAKG;IACH,cAHW,MAAM,GAAC,MAAM,QAavB;IAED;;;;;OAKG;IACH,YAHW,MAAM,GAAC,MAAM,QAUvB;IAED;;OAEG;IACH;;;;;;;;MAmBC;IAED;;;OAGG;IACH,gCAiBC;IANG,6CAK0B;IAG9B;;OAEG;IACH,+BAMC;IAED;;;;;OAKG;IACH,qCALW,GAAG,QAWb;IAED;;OAEG;IACH,8BASC;IAED;;;;;OAKG;IACH,gDAsCC;IAED;;;;OAIG;IACH,4BAQC;IAED;;;;;OAKG;IACH,UAFa,uBAAuB,GAAG,cAAc,CAgBpD;CACJ;;;;AA9TD;;GAEG;AACH;IAEQ,yBAAqB;IACrB,aAAa;IAGjB,cAGC;IAED,iBAGC;IAFG,mCAAiD;IAIrD,qBAEC;CACJ"}