apify 2.3.1-beta.4 → 3.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. package/README.md +6 -5
  2. package/actor.d.ts +1092 -0
  3. package/actor.d.ts.map +1 -0
  4. package/actor.js +1221 -0
  5. package/actor.js.map +1 -0
  6. package/index.d.ts +4 -0
  7. package/index.d.ts.map +1 -0
  8. package/index.js +7 -0
  9. package/index.js.map +1 -0
  10. package/index.mjs +7 -0
  11. package/package.json +54 -128
  12. package/platform_event_manager.d.ts +55 -0
  13. package/platform_event_manager.d.ts.map +1 -0
  14. package/platform_event_manager.js +116 -0
  15. package/platform_event_manager.js.map +1 -0
  16. package/proxy_configuration.d.ts +210 -0
  17. package/proxy_configuration.d.ts.map +1 -0
  18. package/proxy_configuration.js +297 -0
  19. package/proxy_configuration.js.map +1 -0
  20. package/tsconfig.build.tsbuildinfo +1 -0
  21. package/utils.d.ts +11 -0
  22. package/utils.d.ts.map +1 -0
  23. package/utils.js +40 -0
  24. package/utils.js.map +1 -0
  25. package/build/actor.d.ts +0 -113
  26. package/build/actor.d.ts.map +0 -1
  27. package/build/actor.js +0 -582
  28. package/build/actor.js.map +0 -1
  29. package/build/apify.d.ts +0 -752
  30. package/build/apify.d.ts.map +0 -1
  31. package/build/apify.js +0 -877
  32. package/build/apify.js.map +0 -1
  33. package/build/autoscaling/autoscaled_pool.d.ts +0 -384
  34. package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
  35. package/build/autoscaling/autoscaled_pool.js +0 -557
  36. package/build/autoscaling/autoscaled_pool.js.map +0 -1
  37. package/build/autoscaling/snapshotter.d.ts +0 -278
  38. package/build/autoscaling/snapshotter.d.ts.map +0 -1
  39. package/build/autoscaling/snapshotter.js +0 -447
  40. package/build/autoscaling/snapshotter.js.map +0 -1
  41. package/build/autoscaling/system_status.d.ts +0 -224
  42. package/build/autoscaling/system_status.d.ts.map +0 -1
  43. package/build/autoscaling/system_status.js +0 -228
  44. package/build/autoscaling/system_status.js.map +0 -1
  45. package/build/browser_launchers/browser_launcher.d.ts +0 -154
  46. package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
  47. package/build/browser_launchers/browser_launcher.js +0 -160
  48. package/build/browser_launchers/browser_launcher.js.map +0 -1
  49. package/build/browser_launchers/browser_plugin.d.ts +0 -23
  50. package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
  51. package/build/browser_launchers/browser_plugin.js +0 -25
  52. package/build/browser_launchers/browser_plugin.js.map +0 -1
  53. package/build/browser_launchers/playwright_launcher.d.ts +0 -131
  54. package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
  55. package/build/browser_launchers/playwright_launcher.js +0 -150
  56. package/build/browser_launchers/playwright_launcher.js.map +0 -1
  57. package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
  58. package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
  59. package/build/browser_launchers/puppeteer_launcher.js +0 -197
  60. package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
  61. package/build/cache_container.d.ts +0 -31
  62. package/build/cache_container.d.ts.map +0 -1
  63. package/build/cache_container.js +0 -48
  64. package/build/cache_container.js.map +0 -1
  65. package/build/configuration.d.ts +0 -226
  66. package/build/configuration.d.ts.map +0 -1
  67. package/build/configuration.js +0 -325
  68. package/build/configuration.js.map +0 -1
  69. package/build/constants.d.ts +0 -37
  70. package/build/constants.d.ts.map +0 -1
  71. package/build/constants.js +0 -41
  72. package/build/constants.js.map +0 -1
  73. package/build/crawlers/basic_crawler.d.ts +0 -443
  74. package/build/crawlers/basic_crawler.d.ts.map +0 -1
  75. package/build/crawlers/basic_crawler.js +0 -664
  76. package/build/crawlers/basic_crawler.js.map +0 -1
  77. package/build/crawlers/browser_crawler.d.ts +0 -512
  78. package/build/crawlers/browser_crawler.d.ts.map +0 -1
  79. package/build/crawlers/browser_crawler.js +0 -540
  80. package/build/crawlers/browser_crawler.js.map +0 -1
  81. package/build/crawlers/cheerio_crawler.d.ts +0 -931
  82. package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
  83. package/build/crawlers/cheerio_crawler.js +0 -913
  84. package/build/crawlers/cheerio_crawler.js.map +0 -1
  85. package/build/crawlers/crawler_extension.d.ts +0 -10
  86. package/build/crawlers/crawler_extension.d.ts.map +0 -1
  87. package/build/crawlers/crawler_extension.js +0 -19
  88. package/build/crawlers/crawler_extension.js.map +0 -1
  89. package/build/crawlers/crawler_utils.d.ts +0 -34
  90. package/build/crawlers/crawler_utils.d.ts.map +0 -1
  91. package/build/crawlers/crawler_utils.js +0 -87
  92. package/build/crawlers/crawler_utils.js.map +0 -1
  93. package/build/crawlers/playwright_crawler.d.ts +0 -448
  94. package/build/crawlers/playwright_crawler.d.ts.map +0 -1
  95. package/build/crawlers/playwright_crawler.js +0 -299
  96. package/build/crawlers/playwright_crawler.js.map +0 -1
  97. package/build/crawlers/puppeteer_crawler.d.ts +0 -425
  98. package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
  99. package/build/crawlers/puppeteer_crawler.js +0 -299
  100. package/build/crawlers/puppeteer_crawler.js.map +0 -1
  101. package/build/crawlers/statistics.d.ts +0 -185
  102. package/build/crawlers/statistics.d.ts.map +0 -1
  103. package/build/crawlers/statistics.js +0 -331
  104. package/build/crawlers/statistics.js.map +0 -1
  105. package/build/enqueue_links/click_elements.d.ts +0 -179
  106. package/build/enqueue_links/click_elements.d.ts.map +0 -1
  107. package/build/enqueue_links/click_elements.js +0 -434
  108. package/build/enqueue_links/click_elements.js.map +0 -1
  109. package/build/enqueue_links/enqueue_links.d.ts +0 -117
  110. package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
  111. package/build/enqueue_links/enqueue_links.js +0 -163
  112. package/build/enqueue_links/enqueue_links.js.map +0 -1
  113. package/build/enqueue_links/shared.d.ts +0 -42
  114. package/build/enqueue_links/shared.d.ts.map +0 -1
  115. package/build/enqueue_links/shared.js +0 -121
  116. package/build/enqueue_links/shared.js.map +0 -1
  117. package/build/errors.d.ts +0 -29
  118. package/build/errors.d.ts.map +0 -1
  119. package/build/errors.js +0 -38
  120. package/build/errors.js.map +0 -1
  121. package/build/events.d.ts +0 -11
  122. package/build/events.d.ts.map +0 -1
  123. package/build/events.js +0 -147
  124. package/build/events.js.map +0 -1
  125. package/build/index.d.ts +0 -4
  126. package/build/index.d.ts.map +0 -1
  127. package/build/index.js +0 -7
  128. package/build/index.js.map +0 -1
  129. package/build/main.d.ts +0 -179
  130. package/build/main.d.ts.map +0 -1
  131. package/build/main.js +0 -81
  132. package/build/main.js.map +0 -1
  133. package/build/playwright_utils.d.ts +0 -9
  134. package/build/playwright_utils.d.ts.map +0 -1
  135. package/build/playwright_utils.js +0 -90
  136. package/build/playwright_utils.js.map +0 -1
  137. package/build/proxy_configuration.d.ts +0 -411
  138. package/build/proxy_configuration.d.ts.map +0 -1
  139. package/build/proxy_configuration.js +0 -517
  140. package/build/proxy_configuration.js.map +0 -1
  141. package/build/pseudo_url.d.ts +0 -86
  142. package/build/pseudo_url.d.ts.map +0 -1
  143. package/build/pseudo_url.js +0 -153
  144. package/build/pseudo_url.js.map +0 -1
  145. package/build/puppeteer_request_interception.d.ts +0 -8
  146. package/build/puppeteer_request_interception.d.ts.map +0 -1
  147. package/build/puppeteer_request_interception.js +0 -235
  148. package/build/puppeteer_request_interception.js.map +0 -1
  149. package/build/puppeteer_utils.d.ts +0 -250
  150. package/build/puppeteer_utils.d.ts.map +0 -1
  151. package/build/puppeteer_utils.js +0 -551
  152. package/build/puppeteer_utils.js.map +0 -1
  153. package/build/request.d.ts +0 -180
  154. package/build/request.d.ts.map +0 -1
  155. package/build/request.js +0 -261
  156. package/build/request.js.map +0 -1
  157. package/build/request_list.d.ts +0 -581
  158. package/build/request_list.d.ts.map +0 -1
  159. package/build/request_list.js +0 -826
  160. package/build/request_list.js.map +0 -1
  161. package/build/serialization.d.ts +0 -5
  162. package/build/serialization.d.ts.map +0 -1
  163. package/build/serialization.js +0 -139
  164. package/build/serialization.js.map +0 -1
  165. package/build/session_pool/errors.d.ts +0 -11
  166. package/build/session_pool/errors.d.ts.map +0 -1
  167. package/build/session_pool/errors.js +0 -18
  168. package/build/session_pool/errors.js.map +0 -1
  169. package/build/session_pool/events.d.ts +0 -5
  170. package/build/session_pool/events.d.ts.map +0 -1
  171. package/build/session_pool/events.js +0 -6
  172. package/build/session_pool/events.js.map +0 -1
  173. package/build/session_pool/session.d.ts +0 -286
  174. package/build/session_pool/session.d.ts.map +0 -1
  175. package/build/session_pool/session.js +0 -355
  176. package/build/session_pool/session.js.map +0 -1
  177. package/build/session_pool/session_pool.d.ts +0 -280
  178. package/build/session_pool/session_pool.d.ts.map +0 -1
  179. package/build/session_pool/session_pool.js +0 -393
  180. package/build/session_pool/session_pool.js.map +0 -1
  181. package/build/session_pool/session_utils.d.ts +0 -4
  182. package/build/session_pool/session_utils.d.ts.map +0 -1
  183. package/build/session_pool/session_utils.js +0 -24
  184. package/build/session_pool/session_utils.js.map +0 -1
  185. package/build/stealth/hiding_tricks.d.ts +0 -22
  186. package/build/stealth/hiding_tricks.d.ts.map +0 -1
  187. package/build/stealth/hiding_tricks.js +0 -308
  188. package/build/stealth/hiding_tricks.js.map +0 -1
  189. package/build/stealth/stealth.d.ts +0 -56
  190. package/build/stealth/stealth.d.ts.map +0 -1
  191. package/build/stealth/stealth.js +0 -125
  192. package/build/stealth/stealth.js.map +0 -1
  193. package/build/storages/dataset.d.ts +0 -288
  194. package/build/storages/dataset.d.ts.map +0 -1
  195. package/build/storages/dataset.js +0 -480
  196. package/build/storages/dataset.js.map +0 -1
  197. package/build/storages/key_value_store.d.ts +0 -243
  198. package/build/storages/key_value_store.d.ts.map +0 -1
  199. package/build/storages/key_value_store.js +0 -462
  200. package/build/storages/key_value_store.js.map +0 -1
  201. package/build/storages/request_queue.d.ts +0 -318
  202. package/build/storages/request_queue.d.ts.map +0 -1
  203. package/build/storages/request_queue.js +0 -636
  204. package/build/storages/request_queue.js.map +0 -1
  205. package/build/storages/storage_manager.d.ts +0 -87
  206. package/build/storages/storage_manager.d.ts.map +0 -1
  207. package/build/storages/storage_manager.js +0 -150
  208. package/build/storages/storage_manager.js.map +0 -1
  209. package/build/tsconfig.tsbuildinfo +0 -1
  210. package/build/typedefs.d.ts +0 -146
  211. package/build/typedefs.d.ts.map +0 -1
  212. package/build/typedefs.js +0 -88
  213. package/build/typedefs.js.map +0 -1
  214. package/build/utils.d.ts +0 -175
  215. package/build/utils.d.ts.map +0 -1
  216. package/build/utils.js +0 -731
  217. package/build/utils.js.map +0 -1
  218. package/build/utils_log.d.ts +0 -41
  219. package/build/utils_log.d.ts.map +0 -1
  220. package/build/utils_log.js +0 -192
  221. package/build/utils_log.js.map +0 -1
  222. package/build/utils_request.d.ts +0 -77
  223. package/build/utils_request.d.ts.map +0 -1
  224. package/build/utils_request.js +0 -385
  225. package/build/utils_request.js.map +0 -1
  226. package/build/utils_social.d.ts +0 -210
  227. package/build/utils_social.d.ts.map +0 -1
  228. package/build/utils_social.js +0 -787
  229. package/build/utils_social.js.map +0 -1
  230. package/build/validators.d.ts +0 -23
  231. package/build/validators.d.ts.map +0 -1
  232. package/build/validators.js +0 -29
  233. package/build/validators.js.map +0 -1
@@ -1,581 +0,0 @@
1
- export const STATE_PERSISTENCE_KEY: "REQUEST_LIST_STATE";
2
- export const REQUESTS_PERSISTENCE_KEY: "REQUEST_LIST_REQUESTS";
3
- /**
4
- * @typedef RequestListOptions
5
- * @property {Array<RequestOptions | Request | { requestsFromUrl: string, regex?: RegExp } | string>} [sources]
6
- * An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
7
- * plain objects that define at least the `url` property, or an array of {@link Request} instances.
8
- *
9
- * **IMPORTANT:** The `sources` array will be consumed (left empty) after `RequestList` initializes.
10
- * This is a measure to prevent memory leaks in situations when millions of sources are
11
- * added.
12
- *
13
- * Additionally, the `requestsFromUrl` property may be used instead of `url`,
14
- * which will instruct `RequestList` to download the source URLs from a given remote location.
15
- * The URLs will be parsed from the received response.
16
- *
17
- * ```
18
- * [
19
- * // A single URL
20
- * 'http://example.com/a/b',
21
- *
22
- * // Modify Request options
23
- * { method: 'PUT', url: 'https://example.com/put', payload: { foo: 'bar' }},
24
- *
25
- * // Batch import of URLs from a file hosted on the web,
26
- * // where the URLs should be requested using the HTTP POST request
27
- * { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt' },
28
- *
29
- * // Batch import from remote file, using a specific regular expression to extract the URLs.
30
- * { requestsFromUrl: 'http://example.com/urls.txt', regex: /https:\/\/example.com\/.+/ },
31
- *
32
- * // Get list of URLs from a Google Sheets document. Just add "/gviz/tq?tqx=out:csv" to the Google Sheet URL.
33
- * // For details, see https://help.apify.com/en/articles/2906022-scraping-a-list-of-urls-from-a-google-sheets-document
34
- * { requestsFromUrl: 'https://docs.google.com/spreadsheets/d/1GA5sSQhQjB_REes8I5IKg31S-TuRcznWOPjcpNqtxmU/gviz/tq?tqx=out:csv' }
35
- * ]
36
- * ```
37
- * @property {RequestListSourcesFunction} [sourcesFunction]
38
- * A function that will be called to get the sources for the `RequestList`, but only if `RequestList`
39
- * was not able to fetch their persisted version (see {@link RequestListOptions.persistRequestsKey}).
40
- * It must return an `Array` of {@link Request} or {@link RequestOptions}.
41
- *
42
- * This is very useful in a scenario when getting the sources is a resource intensive or time consuming
43
- * task, such as fetching URLs from multiple sitemaps or parsing URLs from large datasets. Using the
44
- * `sourcesFunction` in combination with `persistStateKey` and `persistRequestsKey` will allow you to
45
- * fetch and parse those URLs only once, saving valuable time when your actor migrates or restarts.
46
- *
47
- * If both {@link RequestListOptions.sources} and {@link RequestListOptions.sourcesFunction} are provided,
48
- * the sources returned by the function will be added after the `sources`.
49
- *
50
- * **Example:**
51
- * ```javascript
52
- * // Let's say we want to scrape URLs extracted from sitemaps.
53
- *
54
- * const sourcesFunction = async () => {
55
- * // With super large sitemaps, this operation could take very long
56
- * // and big websites typically have multiple sitemaps.
57
- * const sitemaps = await downloadHugeSitemaps();
58
- * return parseUrlsFromSitemaps(sitemaps);
59
- * }
60
- *
61
- * // Sitemaps can change in real-time, so it's important to persist
62
- * // the URLs we collected. Otherwise we might lose our scraping
63
- * // state in case of an actor migration / failure / time-out.
64
- * const requestList = new RequestList({
65
- * sourcesFunction,
66
- * persistStateKey: 'state-key',
67
- * persistRequestsKey: 'requests-key',
68
- * })
69
- *
70
- * // The sourcesFunction is called now and the Requests are persisted.
71
- * // If something goes wrong and we need to start again, RequestList
72
- * // will load the persisted Requests from storage and will NOT
73
- * // call the sourcesFunction again, saving time and resources.
74
- * await requestList.initialize();
75
- * ```
76
- * @property {ProxyConfiguration} [proxyConfiguration]
77
- * Used to pass the proxy configuration for the `requestsFromUrl` objects.
78
- * Takes advantage of the internal address rotation and authentication process.
79
- * If undefined, the `requestsFromUrl` requests will be made without proxy.
80
- * @property {string} [persistStateKey]
81
- * Identifies the key in the default key-value store under which `RequestList` periodically stores its
82
- * state (i.e. which URLs were crawled and which not).
83
- * If the actor is restarted, `RequestList` will read the state
84
- * and continue where it left off.
85
- *
86
- * If `persistStateKey` is not set, `RequestList` will always start from the beginning,
87
- * and all the source URLs will be crawled again.
88
- * @property {string} [persistRequestsKey]
89
- * Identifies the key in the default key-value store under which the `RequestList` persists its
90
- * Requests during the {@link RequestList#initialize} call.
91
- * This is necessary if `persistStateKey` is set and the source URLs might potentially change,
92
- * to ensure consistency of the source URLs and state object. However, it comes with some
93
- * storage and performance overheads.
94
- *
95
- * If `persistRequestsKey` is not set, {@link RequestList#initialize} will always fetch the sources
96
- * from their origin, check that they are consistent with the restored state (if any)
97
- * and throw an error if they are not.
98
- * @property {RequestListState} [state]
99
- * The state object that the `RequestList` will be initialized from.
100
- * It is in the form as returned by `RequestList.getState()`, such as follows:
101
- *
102
- * ```
103
- * {
104
- * nextIndex: 5,
105
- * nextUniqueKey: 'unique-key-5',
106
- * inProgress: {
107
- * 'unique-key-1': true,
108
- * 'unique-key-4': true,
109
- * },
110
- * }
111
- * ```
112
- *
113
- * Note that the preferred (and simpler) way to persist the state of crawling of the `RequestList`
114
- * is to use the `persistStateKey` option instead.
115
- * @property {boolean} [keepDuplicateUrls=false]
116
- * By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
117
- * on the `uniqueKey` property of passed source {@link Request} objects.
118
- *
119
- * If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
120
- * In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
121
- * of duplicate URLs / unique keys.
122
- *
123
- * Setting `keepDuplicateUrls` to `true` will append an additional identifier to the `uniqueKey`
124
- * of each request that does not already include a `uniqueKey`. Therefore, duplicate
125
- * URLs will be kept in the list. It does not protect the user from having duplicates in user set
126
- * `uniqueKey`s however. It is the user's responsibility to ensure uniqueness of their unique keys
127
- * if they wish to keep more than just a single copy in the `RequestList`.
128
- */
129
- /**
130
- * Represents a static list of URLs to crawl.
131
- * The URLs can be provided either in code or parsed from a text file hosted on the web.
132
- * `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
133
- * and {@link PlaywrightCrawler} as a source of URLs to crawl.
134
- *
135
- * Each URL is represented using an instance of the {@link Request} class.
136
- * The list can only contain unique URLs. More precisely, it can only contain `Request` instances
137
- * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
138
- * To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
139
- * `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
140
- * `RequestList` from sources.
141
- *
142
- * Once you create an instance of `RequestList`, you need to call the {@link RequestList#initialize} function
143
- * before the instance can be used. After that, no more URLs can be added to the list.
144
- * Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
145
- * > Note that `RequestList` can be used together with `RequestQueue` by the same crawler.
146
- * > In such cases, each request from `RequestList` is enqueued into `RequestQueue` first and then consumed from the latter.
147
- * > This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue).
148
- * > In practical terms, such a combination can be useful when there is a large number of initial URLs,
149
- * > but more URLs would be added dynamically by the crawler.
150
- *
151
- * `RequestList` has an internal state where it stores information about which requests were already handled,
152
- * which are in progress and which were reclaimed. The state may be automatically persisted to the default
153
- * {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
154
- * the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
155
- * event that is periodically emitted by {@link events|Apify.events}.
156
- *
157
- * The internal state is closely tied to the provided sources (URLs). If the sources change on actor restart, the state will become corrupted and
158
- * `RequestList` will raise an exception. This typically happens when the sources are a list of URLs downloaded from the web.
159
- * In such case, use the `persistRequestsKey` option in conjunction with `persistStateKey`,
160
- * to make the `RequestList` store the initial sources to the default key-value store and load them after restart,
161
- * which will prevent any issues that a live list of URLs might cause.
162
- *
163
- * **Basic usage:**
164
- * ```javascript
165
- * // Use a helper function to simplify request list initialization.
166
- * // State and sources are automatically persisted. This is a preferred usage.
167
- * const requestList = await Apify.openRequestList('my-request-list', [
168
- * 'http://www.example.com/page-1',
169
- * { url: 'http://www.example.com/page-2', method: 'POST', userData: { foo: 'bar' }},
170
- * { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
171
- * ]);
172
- * ```
173
- *
174
- * **Advanced usage:**
175
- * ```javascript
176
- * // Use the constructor to get more control over the initialization.
177
- * const requestList = new Apify.RequestList({
178
- * sources: [
179
- * // Separate requests
180
- * { url: 'http://www.example.com/page-1', method: 'GET', headers: { ... } },
181
- * { url: 'http://www.example.com/page-2', userData: { foo: 'bar' }},
182
- *
183
- * // Bulk load of URLs from file `http://www.example.com/my-url-list.txt`
184
- * // Note that all URLs must start with http:// or https://
185
- * { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
186
- * ],
187
- *
188
- * // Persist the state to avoid re-crawling which can lead to data duplications.
189
- * // Keep in mind that the sources have to be immutable or this will throw an error.
190
- * persistStateKey: 'my-state',
191
- * });
192
- *
193
- * await requestList.initialize();
194
- * ```
195
- */
196
- export class RequestList {
197
- /**
198
- * @param {RequestListOptions} options All `RequestList` configuration options
199
- */
200
- constructor(options?: RequestListOptions);
201
- log: import("@apify/log/log").Log;
202
- /** @type {Array<Request>} */
203
- requests: Array<Request>;
204
- nextIndex: number;
205
- uniqueKeyToIndex: {};
206
- inProgress: {};
207
- reclaimed: {};
208
- persistStateKey: string | undefined;
209
- persistRequestsKey: string | undefined;
210
- initialState: RequestListState | undefined;
211
- keepDuplicateUrls: boolean;
212
- isStatePersisted: boolean;
213
- areRequestsPersisted: boolean;
214
- isLoading: boolean;
215
- isInitialized: boolean;
216
- sources: (string | Request | RequestOptions | {
217
- requestsFromUrl: string;
218
- regex?: RegExp | undefined;
219
- })[];
220
- sourcesFunction: RequestListSourcesFunction | undefined;
221
- proxyConfiguration: any;
222
- /**
223
- * Loads all remote sources of URLs and potentially starts periodic state persistence.
224
- * This function must be called before you can start using the instance in a meaningful way.
225
- *
226
- * @returns {Promise<void>}
227
- */
228
- initialize(): Promise<void>;
229
- /**
230
- * Adds previously persisted Requests, as retrieved from the key-value store.
231
- * This needs to be done in a memory efficient way. We should update the input
232
- * to a Stream once apify-client supports streams.
233
- * @param {Buffer} persistedRequests
234
- * @ignore
235
- * @protected
236
- * @internal
237
- */
238
- protected _addPersistedRequests(persistedRequests: Buffer): Promise<void>;
239
- /**
240
- * Add Requests from both options.sources and options.sourcesFunction.
241
- * This function is called only when persisted sources were not loaded.
242
- * We need to avoid keeping both sources and requests in memory
243
- * to reduce memory footprint with very large sources.
244
- * @returns {Promise<void>}
245
- * @ignore
246
- * @protected
247
- * @internal
248
- */
249
- protected _addRequestsFromSources(): Promise<void>;
250
- /**
251
- * Persists the current state of the `RequestList` into the default {@link KeyValueStore}.
252
- * The state is persisted automatically in regular intervals, but calling this method manually
253
- * is useful in cases where you want to have the most current state available after you pause
254
- * or stop fetching its requests. For example after you pause or abort a crawl. Or just before
255
- * a server migration.
256
- *
257
- * @return {Promise<void>}
258
- */
259
- persistState(): Promise<void>;
260
- /**
261
- * Unlike persistState(), this is used only internally, since the sources
262
- * are automatically persisted at RequestList initialization (if the persistRequestsKey is set),
263
- * but there's no reason to persist it again afterwards, because RequestList is immutable.
264
- *
265
- * @return {Promise<void>}
266
- * @ignore
267
- * @protected
268
- * @internal
269
- */
270
- protected _persistRequests(): Promise<void>;
271
- /**
272
- * Restores RequestList state from a state object.
273
- *
274
- * @param {RequestListState} state
275
- * @ignore
276
- * @protected
277
- * @internal
278
- */
279
- protected _restoreState(state: RequestListState): void;
280
- /**
281
- * Attempts to load state and requests using the `RequestList` configuration
282
- * and returns a tuple of [state, requests] where each may be null if not loaded.
283
- *
284
- * @return {Promise<Array<(RequestListState|null)>>}
285
- * @ignore
286
- * @protected
287
- * @internal
288
- */
289
- protected _loadStateAndPersistedRequests(): Promise<Array<(RequestListState | null)>>;
290
- /**
291
- * Returns an object representing the internal state of the `RequestList` instance.
292
- * Note that the object's fields can change in future releases.
293
- *
294
- * @returns {RequestListState}
295
- */
296
- getState(): RequestListState;
297
- /**
298
- * Resolves to `true` if the next call to {@link RequestList#fetchNextRequest} function
299
- * would return `null`, otherwise it resolves to `false`.
300
- * Note that even if the list is empty, there might be some pending requests currently being processed.
301
- *
302
- * @returns {Promise<boolean>}
303
- */
304
- isEmpty(): Promise<boolean>;
305
- /**
306
- * Resolves to `true` if all requests were already handled and there are no more left.
307
- *
308
- * @returns {Promise<boolean>}
309
- */
310
- isFinished(): Promise<boolean>;
311
- /**
312
- * Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
313
- * using the {@link RequestList#reclaimRequest} function, if there is any.
314
- * Otherwise it gets the next request from sources.
315
- *
316
- * The function's `Promise` resolves to `null` if there are no more
317
- * requests to process.
318
- *
319
- * @returns {Promise<(Request|null)>}
320
- */
321
- fetchNextRequest(): Promise<(Request | null)>;
322
- /**
323
- * Marks request as handled after successful processing.
324
- *
325
- * @param {Request} request
326
- * @returns {Promise<void>}
327
- */
328
- markRequestHandled(request: Request): Promise<void>;
329
- /**
330
- * Reclaims request to the list if its processing failed.
331
- * The request will become available in the next `this.fetchNextRequest()`.
332
- *
333
- * @param {Request} request
334
- * @returns {Promise<void>}
335
- */
336
- reclaimRequest(request: Request): Promise<void>;
337
- /**
338
- * Adds all fetched requests from a URL from a remote resource.
339
- *
340
- * @ignore
341
- * @protected
342
- * @internal
343
- */
344
- protected _addFetchedRequests(source: any, fetchedRequests: any): Promise<void>;
345
- /**
346
- * Fetches URLs from requestsFromUrl and returns them in format of list of requests
347
- * @param {*} source
348
- * @return {Promise<Array<RequestOptions>>}
349
- * @ignore
350
- * @protected
351
- * @internal
352
- */
353
- protected _fetchRequestsFromUrl(source: any): Promise<Array<RequestOptions>>;
354
- /**
355
- * Adds given request.
356
- * If the `source` parameter is a string or plain object and not an instance
357
- * of a `Request`, then the function creates a `Request` instance.
358
- *
359
- * @param {(string|Request|object)} source
360
- * @ignore
361
- * @protected
362
- * @internal
363
- */
364
- protected _addRequest(source: (string | Request | object)): void;
365
- /**
366
- * Helper function that validates unique key.
367
- * Throws an error if uniqueKey is not a non-empty string.
368
- *
369
- * @ignore
370
- * @protected
371
- * @internal
372
- */
373
- protected _ensureUniqueKeyValid(uniqueKey: any): void;
374
- /**
375
- * Checks that request is not reclaimed and throws an error if so.
376
- *
377
- * @ignore
378
- * @protected
379
- * @internal
380
- */
381
- protected _ensureInProgressAndNotReclaimed(uniqueKey: any): void;
382
- /**
383
- * Throws an error if request list wasn't initialized.
384
- *
385
- * @ignore
386
- * @protected
387
- * @internal
388
- */
389
- protected _ensureIsInitialized(): void;
390
- /**
391
- * Returns the total number of unique requests present in the `RequestList`.
392
- *
393
- * @returns {number}
394
- */
395
- length(): number;
396
- /**
397
- * Returns number of handled requests.
398
- *
399
- * @returns {number}
400
- */
401
- handledCount(): number;
402
- }
403
- export function openRequestList(listName: string | null, sources: RequestListOptions['sources'], options?: RequestListOptions | undefined): Promise<RequestList>;
404
- export type RequestListOptions = {
405
- /**
406
- * An array of sources of URLs for the {@link RequestList }. It can be either an array of strings,
407
- * plain objects that define at least the `url` property, or an array of {@link Request } instances.
408
- *
409
- * **IMPORTANT:** The `sources` array will be consumed (left empty) after `RequestList` initializes.
410
- * This is a measure to prevent memory leaks in situations when millions of sources are
411
- * added.
412
- *
413
- * Additionally, the `requestsFromUrl` property may be used instead of `url`,
414
- * which will instruct `RequestList` to download the source URLs from a given remote location.
415
- * The URLs will be parsed from the received response.
416
- *
417
- * ```
418
- * [
419
- * // A single URL
420
- * 'http://example.com/a/b',
421
- *
422
- * // Modify Request options
423
- * { method: 'PUT', url: 'https://example.com/put', payload: { foo: 'bar' } },
424
- *
425
- * // Batch import of URLs from a file hosted on the web,
426
- * // where the URLs should be requested using the HTTP POST request
427
- * { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt' },
428
- *
429
- * // Batch import from remote file, using a specific regular expression to extract the URLs.
430
- * { requestsFromUrl: 'http://example.com/urls.txt', regex: /https:\/\/example.com\/.+/ },
431
- *
432
- * // Get list of URLs from a Google Sheets document. Just add "/gviz/tq?tqx=out:csv" to the Google Sheet URL.
433
- * // For details, see https://help.apify.com/en/articles/2906022-scraping-a-list-of-urls-from-a-google-sheets-document
434
- * { requestsFromUrl: 'https://docs.google.com/spreadsheets/d/1GA5sSQhQjB_REes8I5IKg31S-TuRcznWOPjcpNqtxmU/gviz/tq?tqx=out:csv' }
435
- * ]
436
- * ```
437
- */
438
- sources?: (string | Request | RequestOptions | {
439
- requestsFromUrl: string;
440
- regex?: RegExp | undefined;
441
- })[] | undefined;
442
- /**
443
- * A function that will be called to get the sources for the `RequestList`, but only if `RequestList`
444
- * was not able to fetch their persisted version (see {@link RequestListOptions.persistRequestsKey }).
445
- * It must return an `Array` of {@link Request } or {@link RequestOptions }.
446
- *
447
- * This is very useful in a scenario when getting the sources is a resource intensive or time consuming
448
- * task, such as fetching URLs from multiple sitemaps or parsing URLs from large datasets. Using the
449
- * `sourcesFunction` in combination with `persistStateKey` and `persistRequestsKey` will allow you to
450
- * fetch and parse those URLs only once, saving valuable time when your actor migrates or restarts.
451
- *
452
- * If both {@link RequestListOptions.sources } and {@link RequestListOptions.sourcesFunction } are provided,
453
- * the sources returned by the function will be added after the `sources`.
454
- *
455
- * **Example:**
456
- * ```javascript
457
- * // Let's say we want to scrape URLs extracted from sitemaps.
458
- *
459
- * const sourcesFunction = async () => {
460
- * // With super large sitemaps, this operation could take very long
461
- * // and big websites typically have multiple sitemaps.
462
- * const sitemaps = await downloadHugeSitemaps();
463
- * return parseUrlsFromSitemaps(sitemaps);
464
- * }
465
- *
466
- * // Sitemaps can change in real-time, so it's important to persist
467
- * // the URLs we collected. Otherwise we might lose our scraping
468
- * // state in case of an actor migration / failure / time-out.
469
- * const requestList = new RequestList({
470
- * sourcesFunction,
471
- * persistStateKey: 'state-key',
472
- * persistRequestsKey: 'requests-key',
473
- * })
474
- *
475
- * // The sourcesFunction is called now and the Requests are persisted.
476
- * // If something goes wrong and we need to start again, RequestList
477
- * // will load the persisted Requests from storage and will NOT
478
- * // call the sourcesFunction again, saving time and resources.
479
- * await requestList.initialize();
480
- * ```
481
- */
482
- sourcesFunction?: RequestListSourcesFunction | undefined;
483
- /**
484
- * Used to pass the proxy configuration for the `requestsFromUrl` objects.
485
- * Takes advantage of the internal address rotation and authentication process.
486
- * If undefined, the `requestsFromUrl` requests will be made without proxy.
487
- */
488
- proxyConfiguration?: any;
489
- /**
490
- * Identifies the key in the default key-value store under which `RequestList` periodically stores its
491
- * state (i.e. which URLs were crawled and which not).
492
- * If the actor is restarted, `RequestList` will read the state
493
- * and continue where it left off.
494
- *
495
- * If `persistStateKey` is not set, `RequestList` will always start from the beginning,
496
- * and all the source URLs will be crawled again.
497
- */
498
- persistStateKey?: string | undefined;
499
- /**
500
- * Identifies the key in the default key-value store under which the `RequestList` persists its
501
- * Requests during the {@link RequestList.initialize } call.
502
- * This is necessary if `persistStateKey` is set and the source URLs might potentially change,
503
- * to ensure consistency of the source URLs and state object. However, it comes with some
504
- * storage and performance overheads.
505
- *
506
- * If `persistRequestsKey` is not set, {@link RequestList.initialize } will always fetch the sources
507
- * from their origin, check that they are consistent with the restored state (if any)
508
- * and throw an error if they are not.
509
- */
510
- persistRequestsKey?: string | undefined;
511
- /**
512
- * The state object that the `RequestList` will be initialized from.
513
- * It is in the form as returned by `RequestList.getState()`, such as follows:
514
- *
515
- * ```
516
- * {
517
- * nextIndex: 5,
518
- * nextUniqueKey: 'unique-key-5',
519
- * inProgress: {
520
- * 'unique-key-1': true,
521
- * 'unique-key-4': true,
522
- * },
523
- * }
524
- * ```
525
- *
526
- * Note that the preferred (and simpler) way to persist the state of crawling of the `RequestList`
527
- * is to use the `persistStateKey` parameter instead.
528
- */
529
- state?: RequestListState | undefined;
530
- /**
531
- * By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
532
- * on the `uniqueKey` property of passed source {@link Request } objects.
533
- *
534
- * If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
535
- * In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
536
- * of duplicate URLs / unique keys.
537
- *
538
- * Setting `keepDuplicateUrls` to `true` will append an additional identifier to the `uniqueKey`
539
- * of each request that does not already include a `uniqueKey`. Therefore, duplicate
540
- * URLs will be kept in the list. It does not protect the user from having duplicates in user set
541
- * `uniqueKey`s however. It is the user's responsibility to ensure uniqueness of their unique keys
542
- * if they wish to keep more than just a single copy in the `RequestList`.
543
- */
544
- keepDuplicateUrls?: boolean | undefined;
545
- };
546
- /**
547
- * Represents state of a {@link RequestList }. It can be used to resume a {@link RequestList } which has been previously processed.
548
- * You can obtain the state by calling {@link RequestList.getState } and receive an object with
549
- * the following structure:
550
- *
551
- * ```
552
- * {
553
- * nextIndex: 5,
554
- * nextUniqueKey: 'unique-key-5',
555
- * inProgress: {
556
- * 'unique-key-1': true,
557
- * 'unique-key-4': true
558
- * },
559
- * }
560
- * ```
561
- */
562
- export type RequestListState = {
563
- /**
564
- * Position of the next request to be processed.
565
- */
566
- nextIndex: number;
567
- /**
568
- * Key of the next request to be processed.
569
- */
570
- nextUniqueKey: string;
571
- /**
572
- * An object mapping request keys to a boolean value representing whether they are being processed at the moment.
573
- */
574
- inProgress: {
575
- [x: string]: boolean;
576
- };
577
- };
578
- export type RequestListSourcesFunction = () => Promise<Array<(RequestOptions | Request | string)>>;
579
- import Request from "./request";
580
- import { RequestOptions } from "./request";
581
- //# sourceMappingURL=request_list.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"request_list.d.ts","sourceRoot":"","sources":["../src/request_list.js"],"names":[],"mappings":"AAeA,yDAA0D;AAC1D,+DAAgE;AAIhE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6HG;AAEH;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkEG;AACH;IACI;;OAEG;IACH,sBAFW,kBAAkB,EAyE5B;IA3CG,kCAA+C;IAI/C,6BAA6B;IAC7B,UADW,MAAM,OAAO,CAAC,CACP;IAGlB,kBAAkB;IAGlB,qBAA0B;IAK1B,eAAoB;IAKpB,cAAmB;IAEnB,oCAAmF;IACnF,uCAA+F;IAE/F,2CAAyB;IAGzB,2BAA0C;IAG1C,0BAA4B;IAE5B,8BAAiC;IACjC,mBAAsB;IACtB,uBAA0B;IAE1B;yBAtQ0D,MAAM;;SAsQpC;IAC5B,wDAAsC;IAGtC,wBAA4C;IAGhD;;;;;OAKG;IACH,cAFa,QAAQ,IAAI,CAAC,CAwBzB;IAED;;;;;;;;OAQG;IACH,mDALW,MAAM,iBAkBhB;IAED;;;;;;;;;OASG;IACH,qCALa,QAAQ,IAAI,CAAC,CAqCzB;IAED;;;;;;;;OAQG;IACH,gBAFY,QAAQ,IAAI,CAAC,CAaxB;IAED;;;;;;;;;OASG;IACH,8BALY,QAAQ,IAAI,CAAC,CASxB;IAED;;;;;;;OAOG;IACH,+BALW,gBAAgB,QA0D1B;IAED;;;;;;;;OAQG;IACH,4CALY,QAAQ,MAAM,CAAC,gBAAgB,GAAC,IAAI,CAAC,CAAC,CAAC,CAqBlD;IAED;;;;;OAKG;IACH,YAFa,gBAAgB,CAY5B;IAED;;;;;;OAMG;IACH,WAFa,QAAQ,OAAO,CAAC,CAM5B;IAED;;;;OAIG;IACH,cAFa,QAAQ,OAAO,CAAC,CAM5B;IAED;;;;;;;;;OASG;IACH,oBAFa,QAAQ,CAAC,OAAO,GAAC,IAAI,CAAC,CAAC,CAuBnC;IAED;;;;;OAKG;IACH,4BAHW,OAAO,GACL,QAAQ,IAAI,CAAC,CAWzB;IAED;;;;;;OAMG;IACH,wBAHW,OAAO,GACL,QAAQ,IAAI,CAAC,CAUzB;IAED;;;;;;OAMG;IACH,gFAiBC;IAED;;;;;;;OAOG;IACH,8CALY,QAAQ,MAAM,cAAc,CAAC,CAAC,CAwBzC;IAED;;;;;;;;;OASG;IACH,8BALW,CAAC,MAAM,GAAC,OAAO,GAAC,MAAM,CAAC,QAmCjC;IAED;;;;;;;OAOG;IACH,sDAIC;IAED;;;;;;OAMG;IACH,iEAOC;IAED;;;;;;OAMG;IACH,uCAIC;IAED;;;;OAIG;IACH,UAFa,MAAM,CAMlB;IAED;;;;OAIG;IACH,gBAFa,MAAM,CAMlB;CACJ;AA2DM,0CAlCI,MAAM,GAAC,IAAI,WAWX,kBAAkB,CAAC,SAAS,CAAC,6CAkB3B,QAAQ,WAAW,CAAC,CAkBhC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;yBAr0BiE,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;eAw1B1D,MAAM;;;;mBAEN,MAAM;;;;;YAEC,MAAM,GAAC,OAAO;;;+CAMvB,QAAQ,MAAM,CAAC,cAAc,GAAC,OAAO,GAAC,MAA
M,CAAC,CAAC,CAAC"}