apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
package/build/request_list.js
DELETED
|
@@ -1,826 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.openRequestList = exports.RequestList = exports.REQUESTS_PERSISTENCE_KEY = exports.STATE_PERSISTENCE_KEY = void 0;
|
|
4
|
-
const tslib_1 = require("tslib");
|
|
5
|
-
const ow_1 = (0, tslib_1.__importStar)(require("ow"));
|
|
6
|
-
const underscore_1 = (0, tslib_1.__importDefault)(require("underscore"));
|
|
7
|
-
const constants_1 = require("./constants");
|
|
8
|
-
const request_1 = (0, tslib_1.__importDefault)(require("./request")); // eslint-disable-line import/no-duplicates
|
|
9
|
-
const events_1 = (0, tslib_1.__importDefault)(require("./events"));
|
|
10
|
-
const utils_log_1 = (0, tslib_1.__importDefault)(require("./utils_log"));
|
|
11
|
-
const utils_1 = require("./utils");
|
|
12
|
-
const key_value_store_1 = require("./storages/key_value_store");
|
|
13
|
-
const serialization_1 = require("./serialization");
|
|
14
|
-
/* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
|
|
15
|
-
exports.STATE_PERSISTENCE_KEY = 'REQUEST_LIST_STATE';
|
|
16
|
-
exports.REQUESTS_PERSISTENCE_KEY = 'REQUEST_LIST_REQUESTS';
|
|
17
|
-
const CONTENT_TYPE_BINARY = 'application/octet-stream';
|
|
18
|
-
/**
|
|
19
|
-
* @typedef RequestListOptions
|
|
20
|
-
* @property {Array<RequestOptions | Request | { requestsFromUrl: string, regex?: RegExp } | string>} [sources]
|
|
21
|
-
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
|
|
22
|
-
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
|
|
23
|
-
*
|
|
24
|
-
* **IMPORTANT:** The `sources` array will be consumed (left empty) after `RequestList` initializes.
|
|
25
|
-
* This is a measure to prevent memory leaks in situations when millions of sources are
|
|
26
|
-
* added.
|
|
27
|
-
*
|
|
28
|
-
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
|
|
29
|
-
* which will instruct `RequestList` to download the source URLs from a given remote location.
|
|
30
|
-
* The URLs will be parsed from the received response.
|
|
31
|
-
*
|
|
32
|
-
* ```
|
|
33
|
-
* [
|
|
34
|
-
* // A single URL
|
|
35
|
-
* 'http://example.com/a/b',
|
|
36
|
-
*
|
|
37
|
-
* // Modify Request options
|
|
38
|
-
* { method: 'PUT', url: 'https://example.com/put', payload: { foo: 'bar' } },
|
|
39
|
-
*
|
|
40
|
-
* // Batch import of URLs from a file hosted on the web,
|
|
41
|
-
* // where the URLs should be requested using the HTTP POST request
|
|
42
|
-
* { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt' },
|
|
43
|
-
*
|
|
44
|
-
* // Batch import from remote file, using a specific regular expression to extract the URLs.
|
|
45
|
-
* { requestsFromUrl: 'http://example.com/urls.txt', regex: /https:\/\/example.com\/.+/ },
|
|
46
|
-
*
|
|
47
|
-
* // Get list of URLs from a Google Sheets document. Just add "/gviz/tq?tqx=out:csv" to the Google Sheet URL.
|
|
48
|
-
* // For details, see https://help.apify.com/en/articles/2906022-scraping-a-list-of-urls-from-a-google-sheets-document
|
|
49
|
-
* { requestsFromUrl: 'https://docs.google.com/spreadsheets/d/1GA5sSQhQjB_REes8I5IKg31S-TuRcznWOPjcpNqtxmU/gviz/tq?tqx=out:csv' }
|
|
50
|
-
* ]
|
|
51
|
-
* ```
|
|
52
|
-
* @property {RequestListSourcesFunction} [sourcesFunction]
|
|
53
|
-
* A function that will be called to get the sources for the `RequestList`, but only if `RequestList`
|
|
54
|
-
* was not able to fetch their persisted version (see {@link RequestListOptions.persistRequestsKey}).
|
|
55
|
-
* It must return an `Array` of {@link Request} or {@link RequestOptions}.
|
|
56
|
-
*
|
|
57
|
-
* This is very useful in a scenario when getting the sources is a resource intensive or time consuming
|
|
58
|
-
* task, such as fetching URLs from multiple sitemaps or parsing URLs from large datasets. Using the
|
|
59
|
-
* `sourcesFunction` in combination with `persistStateKey` and `persistRequestsKey` will allow you to
|
|
60
|
-
* fetch and parse those URLs only once, saving valuable time when your actor migrates or restarts.
|
|
61
|
-
*
|
|
62
|
-
* If both {@link RequestListOptions.sources} and {@link RequestListOptions.sourcesFunction} are provided,
|
|
63
|
-
* the sources returned by the function will be added after the `sources`.
|
|
64
|
-
*
|
|
65
|
-
* **Example:**
|
|
66
|
-
* ```javascript
|
|
67
|
-
* // Let's say we want to scrape URLs extracted from sitemaps.
|
|
68
|
-
*
|
|
69
|
-
* const sourcesFunction = async () => {
|
|
70
|
-
* // With super large sitemaps, this operation could take very long
|
|
71
|
-
* // and big websites typically have multiple sitemaps.
|
|
72
|
-
* const sitemaps = await downloadHugeSitemaps();
|
|
73
|
-
* return parseUrlsFromSitemaps(sitemaps);
|
|
74
|
-
* }
|
|
75
|
-
*
|
|
76
|
-
* // Sitemaps can change in real-time, so it's important to persist
|
|
77
|
-
* // the URLs we collected. Otherwise we might lose our scraping
|
|
78
|
-
* // state in case of an actor migration / failure / time-out.
|
|
79
|
-
* const requestList = new RequestList({
|
|
80
|
-
* sourcesFunction,
|
|
81
|
-
* persistStateKey: 'state-key',
|
|
82
|
-
* persistRequestsKey: 'requests-key',
|
|
83
|
-
* })
|
|
84
|
-
*
|
|
85
|
-
* // The sourcesFunction is called now and the Requests are persisted.
|
|
86
|
-
* // If something goes wrong and we need to start again, RequestList
|
|
87
|
-
* // will load the persisted Requests from storage and will NOT
|
|
88
|
-
* // call the sourcesFunction again, saving time and resources.
|
|
89
|
-
* await requestList.initialize();
|
|
90
|
-
* ```
|
|
91
|
-
* @property {ProxyConfiguration} [proxyConfiguration]
|
|
92
|
-
* Used to pass the proxy configuration for the `requestsFromUrls` objects.
|
|
93
|
-
* Takes advantage of the internal address rotation and authentication process.
|
|
94
|
-
* If undefined, the `requestsFromUrls` requests will be made without proxy.
|
|
95
|
-
* @property {string} [persistStateKey]
|
|
96
|
-
* Identifies the key in the default key-value store under which `RequestList` periodically stores its
|
|
97
|
-
* state (i.e. which URLs were crawled and which not).
|
|
98
|
-
* If the actor is restarted, `RequestList` will read the state
|
|
99
|
-
* and continue where it left off.
|
|
100
|
-
*
|
|
101
|
-
* If `persistStateKey` is not set, `RequestList` will always start from the beginning,
|
|
102
|
-
* and all the source URLs will be crawled again.
|
|
103
|
-
* @property {string} [persistRequestsKey]
|
|
104
|
-
* Identifies the key in the default key-value store under which the `RequestList` persists its
|
|
105
|
-
* Requests during the {@link RequestList#initialize} call.
|
|
106
|
-
* This is necessary if `persistStateKey` is set and the source URLs might potentially change,
|
|
107
|
-
* to ensure consistency of the source URLs and state object. However, it comes with some
|
|
108
|
-
* storage and performance overheads.
|
|
109
|
-
*
|
|
110
|
-
* If `persistRequestsKey` is not set, {@link RequestList#initialize} will always fetch the sources
|
|
111
|
-
* from their origin, check that they are consistent with the restored state (if any)
|
|
112
|
-
* and throw an error if they are not.
|
|
113
|
-
* @property {RequestListState} [state]
|
|
114
|
-
* The state object that the `RequestList` will be initialized from.
|
|
115
|
-
* It is in the form as returned by `RequestList.getState()`, such as follows:
|
|
116
|
-
*
|
|
117
|
-
* ```
|
|
118
|
-
* {
|
|
119
|
-
* nextIndex: 5,
|
|
120
|
-
* nextUniqueKey: 'unique-key-5',
|
|
121
|
-
* inProgress: {
|
|
122
|
-
* 'unique-key-1': true,
|
|
123
|
-
* 'unique-key-4': true,
|
|
124
|
-
* },
|
|
125
|
-
* }
|
|
126
|
-
* ```
|
|
127
|
-
*
|
|
128
|
-
* Note that the preferred (and simpler) way to persist the state of crawling of the `RequestList`
|
|
129
|
-
* is to use the `persistStateKey` option instead.
|
|
130
|
-
* @property {boolean} [keepDuplicateUrls=false]
|
|
131
|
-
* By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
|
|
132
|
-
* on the `uniqueKey` property of passed source {@link Request} objects.
|
|
133
|
-
*
|
|
134
|
-
* If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
|
|
135
|
-
* In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
|
|
136
|
-
* of duplicate URLs / unique keys.
|
|
137
|
-
*
|
|
138
|
-
* Setting `keepDuplicateUrls` to `true` will append an additional identifier to the `uniqueKey`
|
|
139
|
-
* of each request that does not already include a `uniqueKey`. Therefore, duplicate
|
|
140
|
-
* URLs will be kept in the list. It does not protect the user from having duplicates in user set
|
|
141
|
-
* `uniqueKey`s however. It is the user's responsibility to ensure uniqueness of their unique keys
|
|
142
|
-
* if they wish to keep more than just a single copy in the `RequestList`.
|
|
143
|
-
*/
|
|
144
|
-
/**
|
|
145
|
-
* Represents a static list of URLs to crawl.
|
|
146
|
-
* The URLs can be provided either in code or parsed from a text file hosted on the web.
|
|
147
|
-
* `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
148
|
-
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
149
|
-
*
|
|
150
|
-
* Each URL is represented using an instance of the {@link Request} class.
|
|
151
|
-
* The list can only contain unique URLs. More precisely, it can only contain `Request` instances
|
|
152
|
-
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
153
|
-
* To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
|
|
154
|
-
* `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
|
|
155
|
-
* `RequestList` from sources.
|
|
156
|
-
*
|
|
157
|
-
* Once you create an instance of `RequestList`, you need to call the {@link RequestList#initialize} function
|
|
158
|
-
* before the instance can be used. After that, no more URLs can be added to the list.
|
|
159
|
-
* Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
|
|
160
|
-
* > Note that `RequestList` can be used together with `RequestQueue` by the same crawler.
|
|
161
|
-
* > In such cases, each request from `RequestList` is enqueued into `RequestQueue` first and then consumed from the latter.
|
|
162
|
-
* > This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue).
|
|
163
|
-
* > In practical terms, such a combination can be useful when there is a large number of initial URLs,
|
|
164
|
-
* > but more URLs would be added dynamically by the crawler.
|
|
165
|
-
*
|
|
166
|
-
* `RequestList` has an internal state where it stores information about which requests were already handled,
|
|
167
|
-
* which are in progress and which were reclaimed. The state may be automatically persisted to the default
|
|
168
|
-
* {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
|
|
169
|
-
* the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
|
|
170
|
-
* event that is periodically emitted by {@link events|Apify.events}.
|
|
171
|
-
*
|
|
172
|
-
* The internal state is closely tied to the provided sources (URLs). If the sources change on actor restart, the state will become corrupted and
|
|
173
|
-
* `RequestList` will raise an exception. This typically happens when the sources are a list of URLs downloaded from the web.
|
|
174
|
-
* In such case, use the `persistRequestsKey` option in conjunction with `persistStateKey`,
|
|
175
|
-
* to make the `RequestList` store the initial sources to the default key-value store and load them after restart,
|
|
176
|
-
* which will prevent any issues that a live list of URLs might cause.
|
|
177
|
-
*
|
|
178
|
-
* **Basic usage:**
|
|
179
|
-
* ```javascript
|
|
180
|
-
* // Use a helper function to simplify request list initialization.
|
|
181
|
-
* // State and sources are automatically persisted. This is a preferred usage.
|
|
182
|
-
* const requestList = await Apify.openRequestList('my-request-list', [
|
|
183
|
-
* 'http://www.example.com/page-1',
|
|
184
|
-
* { url: 'http://www.example.com/page-2', method: 'POST', userData: { foo: 'bar' }},
|
|
185
|
-
* { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
|
|
186
|
-
* ]);
|
|
187
|
-
* ```
|
|
188
|
-
*
|
|
189
|
-
* **Advanced usage:**
|
|
190
|
-
* ```javascript
|
|
191
|
-
* // Use the constructor to get more control over the initialization.
|
|
192
|
-
* const requestList = new Apify.RequestList({
|
|
193
|
-
* sources: [
|
|
194
|
-
* // Separate requests
|
|
195
|
-
* { url: 'http://www.example.com/page-1', method: 'GET', headers: { ... } },
|
|
196
|
-
* { url: 'http://www.example.com/page-2', userData: { foo: 'bar' }},
|
|
197
|
-
*
|
|
198
|
-
* // Bulk load of URLs from file `http://www.example.com/my-url-list.txt`
|
|
199
|
-
* // Note that all URLs must start with http:// or https://
|
|
200
|
-
* { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
|
|
201
|
-
* ],
|
|
202
|
-
*
|
|
203
|
-
* // Persist the state to avoid re-crawling which can lead to data duplications.
|
|
204
|
-
* // Keep in mind that the sources have to be immutable or this will throw an error.
|
|
205
|
-
* persistStateKey: 'my-state',
|
|
206
|
-
* });
|
|
207
|
-
*
|
|
208
|
-
* await requestList.initialize();
|
|
209
|
-
* ```
|
|
210
|
-
*/
|
|
211
|
-
class RequestList {
|
|
212
|
-
/**
|
|
213
|
-
* @param {RequestListOptions} options All `RequestList` configuration options
|
|
214
|
-
*/
|
|
215
|
-
constructor(options = {}) {
|
|
216
|
-
const { sources, sourcesFunction, persistStateKey, persistRequestsKey, state, proxyConfiguration, keepDuplicateUrls = false, } = options;
|
|
217
|
-
if (!(sources || sourcesFunction)) {
|
|
218
|
-
throw new ow_1.ArgumentError('At least one of "sources" or "sourcesFunction" must be provided.', this.constructor);
|
|
219
|
-
}
|
|
220
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
221
|
-
sources: ow_1.default.optional.array,
|
|
222
|
-
sourcesFunction: ow_1.default.optional.function,
|
|
223
|
-
persistStateKey: ow_1.default.optional.string,
|
|
224
|
-
persistRequestsKey: ow_1.default.optional.string,
|
|
225
|
-
state: ow_1.default.optional.object.exactShape({
|
|
226
|
-
nextIndex: ow_1.default.number,
|
|
227
|
-
nextUniqueKey: ow_1.default.string,
|
|
228
|
-
inProgress: ow_1.default.object,
|
|
229
|
-
}),
|
|
230
|
-
keepDuplicateUrls: ow_1.default.optional.boolean,
|
|
231
|
-
proxyConfiguration: ow_1.default.optional.object,
|
|
232
|
-
}));
|
|
233
|
-
this.log = utils_log_1.default.child({ prefix: 'RequestList' });
|
|
234
|
-
// Array of all requests from all sources, in the order as they appeared in sources.
|
|
235
|
-
// All requests in the array have distinct uniqueKey!
|
|
236
|
-
/** @type {Array<Request>} */
|
|
237
|
-
this.requests = [];
|
|
238
|
-
// Index to the next item in requests array to fetch. All previous requests are either handled or in progress.
|
|
239
|
-
this.nextIndex = 0;
|
|
240
|
-
// Dictionary, key is Request.uniqueKey, value is corresponding index in the requests array.
|
|
241
|
-
this.uniqueKeyToIndex = {};
|
|
242
|
-
// Dictionary of requests that were returned by fetchNextRequest().
|
|
243
|
-
// The key is uniqueKey, value is true.
|
|
244
|
-
// TODO: Change this to Set
|
|
245
|
-
this.inProgress = {};
|
|
246
|
-
// Dictionary of requests for which reclaimRequest() was called.
|
|
247
|
-
// The key is uniqueKey, value is true. TODO: Change this to Set
|
|
248
|
-
// Note that reclaimedRequests is always a subset of inProgress!
|
|
249
|
-
this.reclaimed = {};
|
|
250
|
-
this.persistStateKey = persistStateKey ? `SDK_${persistStateKey}` : persistStateKey;
|
|
251
|
-
this.persistRequestsKey = persistRequestsKey ? `SDK_${persistRequestsKey}` : persistRequestsKey;
|
|
252
|
-
this.initialState = state;
|
|
253
|
-
// If this option is set then all requests will get a pre-generated unique ID and duplicate URLs will be kept in the list.
|
|
254
|
-
this.keepDuplicateUrls = keepDuplicateUrls;
|
|
255
|
-
// Starts as true because until we handle the first request, the list is effectively persisted by doing nothing.
|
|
256
|
-
this.isStatePersisted = true;
|
|
257
|
-
// Starts as false because we don't know yet and sources might change in the meantime (eg. download from live list).
|
|
258
|
-
this.areRequestsPersisted = false;
|
|
259
|
-
this.isLoading = false;
|
|
260
|
-
this.isInitialized = false;
|
|
261
|
-
// Will be empty after initialization to save memory.
|
|
262
|
-
this.sources = sources || [];
|
|
263
|
-
this.sourcesFunction = sourcesFunction;
|
|
264
|
-
// The proxy configuration used for `requestsFromUrls` requests.
|
|
265
|
-
this.proxyConfiguration = proxyConfiguration;
|
|
266
|
-
}
|
|
267
|
-
/**
|
|
268
|
-
* Loads all remote sources of URLs and potentially starts periodic state persistence.
|
|
269
|
-
* This function must be called before you can start using the instance in a meaningful way.
|
|
270
|
-
*
|
|
271
|
-
* @returns {Promise<void>}
|
|
272
|
-
*/
|
|
273
|
-
async initialize() {
|
|
274
|
-
if (this.isLoading) {
|
|
275
|
-
throw new Error('RequestList sources are already loading or were loaded.');
|
|
276
|
-
}
|
|
277
|
-
this.isLoading = true;
|
|
278
|
-
const [state, persistedRequests] = await this._loadStateAndPersistedRequests();
|
|
279
|
-
// Add persisted requests / new sources in a memory efficient way because with very
|
|
280
|
-
// large lists, we were running out of memory.
|
|
281
|
-
if (persistedRequests) {
|
|
282
|
-
await this._addPersistedRequests(persistedRequests);
|
|
283
|
-
}
|
|
284
|
-
else {
|
|
285
|
-
await this._addRequestsFromSources();
|
|
286
|
-
}
|
|
287
|
-
this._restoreState(state);
|
|
288
|
-
this.isInitialized = true;
|
|
289
|
-
if (this.persistRequestsKey && !this.areRequestsPersisted)
|
|
290
|
-
await this._persistRequests();
|
|
291
|
-
if (this.persistStateKey) {
|
|
292
|
-
events_1.default.on(constants_1.ACTOR_EVENT_NAMES_EX.PERSIST_STATE, this.persistState.bind(this));
|
|
293
|
-
}
|
|
294
|
-
}
|
|
295
|
-
/**
|
|
296
|
-
* Adds previously persisted Requests, as retrieved from the key-value store.
|
|
297
|
-
* This needs to be done in a memory efficient way. We should update the input
|
|
298
|
-
* to a Stream once apify-client supports streams.
|
|
299
|
-
* @param {Buffer} persistedRequests
|
|
300
|
-
* @ignore
|
|
301
|
-
* @protected
|
|
302
|
-
* @internal
|
|
303
|
-
*/
|
|
304
|
-
async _addPersistedRequests(persistedRequests) {
|
|
305
|
-
// We don't need the sources so we purge them to
|
|
306
|
-
// prevent them from hanging in memory.
|
|
307
|
-
for (let i = 0; i < this.sources.length; i++) {
|
|
308
|
-
delete this.sources[i];
|
|
309
|
-
}
|
|
310
|
-
this.sources = [];
|
|
311
|
-
this.areRequestsPersisted = true;
|
|
312
|
-
const requestStream = (0, serialization_1.createDeserialize)(persistedRequests);
|
|
313
|
-
for await (const request of requestStream) {
|
|
314
|
-
this._addRequest(request);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
/**
|
|
318
|
-
* Add Requests from both options.sources and options.sourcesFunction.
|
|
319
|
-
* This function is called only when persisted sources were not loaded.
|
|
320
|
-
* We need to avoid keeping both sources and requests in memory
|
|
321
|
-
* to reduce memory footprint with very large sources.
|
|
322
|
-
* @returns {Promise<void>}
|
|
323
|
-
* @ignore
|
|
324
|
-
* @protected
|
|
325
|
-
* @internal
|
|
326
|
-
*/
|
|
327
|
-
async _addRequestsFromSources() {
|
|
328
|
-
// We'll load all sources in sequence to ensure that they get loaded in the right order.
|
|
329
|
-
const sourcesCount = this.sources.length;
|
|
330
|
-
for (let i = 0; i < sourcesCount; i++) {
|
|
331
|
-
const source = this.sources[i];
|
|
332
|
-
// Using delete here to drop the original object ASAP to free memory
|
|
333
|
-
// .pop would reverse the array and .shift is SLOW.
|
|
334
|
-
delete this.sources[i];
|
|
335
|
-
if (source.requestsFromUrl) {
|
|
336
|
-
const fetchedRequests = await this._fetchRequestsFromUrl(source);
|
|
337
|
-
await this._addFetchedRequests(source, fetchedRequests);
|
|
338
|
-
}
|
|
339
|
-
else {
|
|
340
|
-
this._addRequest(source);
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
// Drop the original array full of empty indexes.
|
|
344
|
-
this.sources = [];
|
|
345
|
-
if (this.sourcesFunction) {
|
|
346
|
-
try {
|
|
347
|
-
const sourcesFromFunction = await this.sourcesFunction();
|
|
348
|
-
const sourcesFromFunctionCount = sourcesFromFunction.length;
|
|
349
|
-
for (let i = 0; i < sourcesFromFunctionCount; i++) {
|
|
350
|
-
const source = sourcesFromFunction.shift();
|
|
351
|
-
this._addRequest(source);
|
|
352
|
-
}
|
|
353
|
-
}
|
|
354
|
-
catch (err) {
|
|
355
|
-
throw new Error(`Loading requests with sourcesFunction failed.\nCause: ${err.message}`);
|
|
356
|
-
}
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
/**
|
|
360
|
-
* Persists the current state of the `RequestList` into the default {@link KeyValueStore}.
|
|
361
|
-
* The state is persisted automatically in regular intervals, but calling this method manually
|
|
362
|
-
* is useful in cases where you want to have the most current state available after you pause
|
|
363
|
-
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
|
|
364
|
-
* a server migration.
|
|
365
|
-
*
|
|
366
|
-
* @return {Promise<void>}
|
|
367
|
-
*/
|
|
368
|
-
async persistState() {
|
|
369
|
-
if (!this.persistStateKey) {
|
|
370
|
-
throw new Error('Cannot persist state. options.persistStateKey is not set.');
|
|
371
|
-
}
|
|
372
|
-
if (this.isStatePersisted)
|
|
373
|
-
return;
|
|
374
|
-
try {
|
|
375
|
-
await (0, key_value_store_1.setValue)(this.persistStateKey, this.getState());
|
|
376
|
-
this.isStatePersisted = true;
|
|
377
|
-
}
|
|
378
|
-
catch (err) {
|
|
379
|
-
this.log.exception(err, 'Attempted to persist state, but failed.');
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
/**
|
|
383
|
-
* Unlike persistState(), this is used only internally, since the sources
|
|
384
|
-
* are automatically persisted at RequestList initialization (if the persistRequestsKey is set),
|
|
385
|
-
* but there's no reason to persist it again afterwards, because RequestList is immutable.
|
|
386
|
-
*
|
|
387
|
-
* @return {Promise<void>}
|
|
388
|
-
* @ignore
|
|
389
|
-
* @protected
|
|
390
|
-
* @internal
|
|
391
|
-
*/
|
|
392
|
-
async _persistRequests() {
|
|
393
|
-
const serializedRequests = await (0, serialization_1.serializeArray)(this.requests);
|
|
394
|
-
await (0, key_value_store_1.setValue)(this.persistRequestsKey, serializedRequests, { contentType: CONTENT_TYPE_BINARY });
|
|
395
|
-
this.areRequestsPersisted = true;
|
|
396
|
-
}
|
|
397
|
-
/**
|
|
398
|
-
* Restores RequestList state from a state object.
|
|
399
|
-
*
|
|
400
|
-
* @param {RequestListState} state
|
|
401
|
-
* @ignore
|
|
402
|
-
* @protected
|
|
403
|
-
* @internal
|
|
404
|
-
*/
|
|
405
|
-
_restoreState(state) {
|
|
406
|
-
// If there's no state it means we've not persisted any (yet).
|
|
407
|
-
if (!state)
|
|
408
|
-
return;
|
|
409
|
-
// Restore previous state.
|
|
410
|
-
if (typeof state.nextIndex !== 'number' || state.nextIndex < 0) {
|
|
411
|
-
throw new Error('The state object is invalid: nextIndex must be a non-negative number.');
|
|
412
|
-
}
|
|
413
|
-
if (state.nextIndex > this.requests.length) {
|
|
414
|
-
throw new Error('The state object is not consistent with RequestList too few requests loaded.');
|
|
415
|
-
}
|
|
416
|
-
if (state.nextIndex < this.requests.length
|
|
417
|
-
&& this.requests[state.nextIndex].uniqueKey !== state.nextUniqueKey) {
|
|
418
|
-
throw new Error('The state object is not consistent with RequestList the order of URLs seems to have changed.');
|
|
419
|
-
}
|
|
420
|
-
const deleteFromInProgress = [];
|
|
421
|
-
underscore_1.default.keys(state.inProgress).forEach((uniqueKey) => {
|
|
422
|
-
const index = this.uniqueKeyToIndex[uniqueKey];
|
|
423
|
-
if (typeof index !== 'number') {
|
|
424
|
-
throw new Error('The state object is not consistent with RequestList. Unknown uniqueKey is present in the state.');
|
|
425
|
-
}
|
|
426
|
-
if (index >= state.nextIndex) {
|
|
427
|
-
deleteFromInProgress.push(uniqueKey);
|
|
428
|
-
}
|
|
429
|
-
});
|
|
430
|
-
// WORKAROUND:
|
|
431
|
-
// It happened to some users that state object contained something like:
|
|
432
|
-
// {
|
|
433
|
-
// "nextIndex": 11308,
|
|
434
|
-
// "nextUniqueKey": "https://www.anychart.com",
|
|
435
|
-
// "inProgress": {
|
|
436
|
-
// "https://www.ams360.com": true,
|
|
437
|
-
// ...
|
|
438
|
-
// "https://www.anychart.com": true,
|
|
439
|
-
// }
|
|
440
|
-
// Which then caused error "The request is not being processed (uniqueKey: https://www.anychart.com)"
|
|
441
|
-
// As a workaround, we just remove all inProgress requests whose index >= nextIndex,
|
|
442
|
-
// since they will be crawled again.
|
|
443
|
-
if (deleteFromInProgress.length) {
|
|
444
|
-
this.log.warning('RequestList\'s in-progress field is not consistent, skipping invalid in-progress entries', {
|
|
445
|
-
deleteFromInProgress,
|
|
446
|
-
});
|
|
447
|
-
underscore_1.default.each(deleteFromInProgress, (uniqueKey) => {
|
|
448
|
-
delete state.inProgress[uniqueKey];
|
|
449
|
-
});
|
|
450
|
-
}
|
|
451
|
-
this.nextIndex = state.nextIndex;
|
|
452
|
-
this.inProgress = state.inProgress;
|
|
453
|
-
// All in-progress requests need to be recrawled
|
|
454
|
-
this.reclaimed = underscore_1.default.clone(this.inProgress);
|
|
455
|
-
}
|
|
456
|
-
/**
|
|
457
|
-
* Attempts to load state and requests using the `RequestList` configuration
|
|
458
|
-
* and returns a tuple of [state, requests] where each may be null if not loaded.
|
|
459
|
-
*
|
|
460
|
-
* @return {Promise<Array<(RequestListState|null)>>}
|
|
461
|
-
* @ignore
|
|
462
|
-
* @protected
|
|
463
|
-
* @internal
|
|
464
|
-
*/
|
|
465
|
-
async _loadStateAndPersistedRequests() {
|
|
466
|
-
let state;
|
|
467
|
-
let persistedRequests;
|
|
468
|
-
if (this.initialState) {
|
|
469
|
-
state = this.initialState;
|
|
470
|
-
this.log.debug('Loaded state from options.state argument.');
|
|
471
|
-
}
|
|
472
|
-
else if (this.persistStateKey) {
|
|
473
|
-
state = (0, key_value_store_1.getValue)(this.persistStateKey);
|
|
474
|
-
if (state)
|
|
475
|
-
this.log.debug('Loaded state from key value store using the persistStateKey.');
|
|
476
|
-
}
|
|
477
|
-
if (this.persistRequestsKey) {
|
|
478
|
-
persistedRequests = await (0, key_value_store_1.getValue)(this.persistRequestsKey);
|
|
479
|
-
if (persistedRequests)
|
|
480
|
-
this.log.debug('Loaded requests from key value store using the persistRequestsKey.');
|
|
481
|
-
}
|
|
482
|
-
// Unwraps "state" promise if needed, otherwise no-op.
|
|
483
|
-
return Promise.all([state, persistedRequests]);
|
|
484
|
-
}
|
|
485
|
-
/**
|
|
486
|
-
* Returns an object representing the internal state of the `RequestList` instance.
|
|
487
|
-
* Note that the object's fields can change in future releases.
|
|
488
|
-
*
|
|
489
|
-
* @returns {RequestListState}
|
|
490
|
-
*/
|
|
491
|
-
getState() {
|
|
492
|
-
this._ensureIsInitialized();
|
|
493
|
-
return {
|
|
494
|
-
nextIndex: this.nextIndex,
|
|
495
|
-
nextUniqueKey: this.nextIndex < this.requests.length
|
|
496
|
-
? this.requests[this.nextIndex].uniqueKey
|
|
497
|
-
: null,
|
|
498
|
-
inProgress: this.inProgress,
|
|
499
|
-
};
|
|
500
|
-
}
|
|
501
|
-
/**
|
|
502
|
-
* Resolves to `true` if the next call to {@link RequestList#fetchNextRequest} function
|
|
503
|
-
* would return `null`, otherwise it resolves to `false`.
|
|
504
|
-
* Note that even if the list is empty, there might be some pending requests currently being processed.
|
|
505
|
-
*
|
|
506
|
-
* @returns {Promise<boolean>}
|
|
507
|
-
*/
|
|
508
|
-
async isEmpty() {
|
|
509
|
-
this._ensureIsInitialized();
|
|
510
|
-
return !(0, utils_1.getFirstKey)(this.reclaimed) && this.nextIndex >= this.requests.length;
|
|
511
|
-
}
|
|
512
|
-
/**
|
|
513
|
-
* Returns `true` if all requests were already handled and there are no more left.
|
|
514
|
-
*
|
|
515
|
-
* @returns {Promise<boolean>}
|
|
516
|
-
*/
|
|
517
|
-
async isFinished() {
|
|
518
|
-
this._ensureIsInitialized();
|
|
519
|
-
return !(0, utils_1.getFirstKey)(this.inProgress) && this.nextIndex >= this.requests.length;
|
|
520
|
-
}
|
|
521
|
-
/**
|
|
522
|
-
* Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
|
|
523
|
-
* using the {@link RequestList#reclaimRequest} function, if there is any.
|
|
524
|
-
* Otherwise it gets the next request from sources.
|
|
525
|
-
*
|
|
526
|
-
* The function's `Promise` resolves to `null` if there are no more
|
|
527
|
-
* requests to process.
|
|
528
|
-
*
|
|
529
|
-
* @returns {Promise<(Request|null)>}
|
|
530
|
-
*/
|
|
531
|
-
async fetchNextRequest() {
|
|
532
|
-
this._ensureIsInitialized();
|
|
533
|
-
// First return reclaimed requests if any.
|
|
534
|
-
const uniqueKey = (0, utils_1.getFirstKey)(this.reclaimed);
|
|
535
|
-
if (uniqueKey) {
|
|
536
|
-
delete this.reclaimed[uniqueKey];
|
|
537
|
-
const index = this.uniqueKeyToIndex[uniqueKey];
|
|
538
|
-
return this.requests[index];
|
|
539
|
-
}
|
|
540
|
-
// Otherwise return next request.
|
|
541
|
-
if (this.nextIndex < this.requests.length) {
|
|
542
|
-
const request = this.requests[this.nextIndex];
|
|
543
|
-
this.inProgress[request.uniqueKey] = true;
|
|
544
|
-
this.nextIndex++;
|
|
545
|
-
this.isStatePersisted = false;
|
|
546
|
-
return request;
|
|
547
|
-
}
|
|
548
|
-
return null;
|
|
549
|
-
}
|
|
550
|
-
/**
|
|
551
|
-
* Marks request as handled after successful processing.
|
|
552
|
-
*
|
|
553
|
-
* @param {Request} request
|
|
554
|
-
* @returns {Promise<void>}
|
|
555
|
-
*/
|
|
556
|
-
async markRequestHandled(request) {
|
|
557
|
-
const { uniqueKey } = request;
|
|
558
|
-
this._ensureUniqueKeyValid(uniqueKey);
|
|
559
|
-
this._ensureInProgressAndNotReclaimed(uniqueKey);
|
|
560
|
-
this._ensureIsInitialized();
|
|
561
|
-
delete this.inProgress[uniqueKey];
|
|
562
|
-
this.isStatePersisted = false;
|
|
563
|
-
}
|
|
564
|
-
/**
|
|
565
|
-
* Reclaims request to the list if its processing failed.
|
|
566
|
-
* The request will become available in the next `this.fetchNextRequest()`.
|
|
567
|
-
*
|
|
568
|
-
* @param {Request} request
|
|
569
|
-
* @returns {Promise<void>}
|
|
570
|
-
*/
|
|
571
|
-
async reclaimRequest(request) {
|
|
572
|
-
const { uniqueKey } = request;
|
|
573
|
-
this._ensureUniqueKeyValid(uniqueKey);
|
|
574
|
-
this._ensureInProgressAndNotReclaimed(uniqueKey);
|
|
575
|
-
this._ensureIsInitialized();
|
|
576
|
-
this.reclaimed[uniqueKey] = true;
|
|
577
|
-
}
|
|
578
|
-
/**
|
|
579
|
-
* Adds all fetched requests from a URL from a remote resource.
|
|
580
|
-
*
|
|
581
|
-
* @ignore
|
|
582
|
-
* @protected
|
|
583
|
-
* @internal
|
|
584
|
-
*/
|
|
585
|
-
async _addFetchedRequests(source, fetchedRequests) {
|
|
586
|
-
const { requestsFromUrl, regex } = source;
|
|
587
|
-
const originalLength = this.requests.length;
|
|
588
|
-
fetchedRequests.forEach((request) => this._addRequest(request));
|
|
589
|
-
const fetchedCount = fetchedRequests.length;
|
|
590
|
-
const importedCount = this.requests.length - originalLength;
|
|
591
|
-
this.log.info('Fetched and loaded Requests from a remote resource.', {
|
|
592
|
-
requestsFromUrl,
|
|
593
|
-
regex,
|
|
594
|
-
fetchedCount,
|
|
595
|
-
importedCount,
|
|
596
|
-
duplicateCount: fetchedCount - importedCount,
|
|
597
|
-
sample: JSON.stringify(fetchedRequests.slice(0, 5)),
|
|
598
|
-
});
|
|
599
|
-
}
|
|
600
|
-
/**
|
|
601
|
-
* Fetches URLs from requestsFromUrl and returns them in format of list of requests
|
|
602
|
-
* @param {*} source
|
|
603
|
-
* @return {Promise<Array<RequestOptions>>}
|
|
604
|
-
* @ignore
|
|
605
|
-
* @protected
|
|
606
|
-
* @internal
|
|
607
|
-
*/
|
|
608
|
-
async _fetchRequestsFromUrl(source) {
|
|
609
|
-
const { requestsFromUrl, regex, ...sharedOpts } = source;
|
|
610
|
-
const { downloadListOfUrls } = utils_1.publicUtils;
|
|
611
|
-
// Download remote resource and parse URLs.
|
|
612
|
-
let urlsArr;
|
|
613
|
-
try {
|
|
614
|
-
urlsArr = await downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex, proxyUrl: this.proxyConfiguration?.newUrl() });
|
|
615
|
-
}
|
|
616
|
-
catch (err) {
|
|
617
|
-
throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
|
|
618
|
-
}
|
|
619
|
-
// Skip if resource contained no URLs.
|
|
620
|
-
if (!urlsArr.length) {
|
|
621
|
-
this.log.warning('list fetched, but it is empty.', { requestsFromUrl, regex });
|
|
622
|
-
return [];
|
|
623
|
-
}
|
|
624
|
-
return urlsArr.map((url) => ({ url, ...sharedOpts }));
|
|
625
|
-
}
|
|
626
|
-
/**
|
|
627
|
-
* Adds given request.
|
|
628
|
-
* If the `source` parameter is a string or plain object and not an instance
|
|
629
|
-
* of a `Request`, then the function creates a `Request` instance.
|
|
630
|
-
*
|
|
631
|
-
* @param {(string|Request|object)} source
|
|
632
|
-
* @ignore
|
|
633
|
-
* @protected
|
|
634
|
-
* @internal
|
|
635
|
-
*/
|
|
636
|
-
_addRequest(source) {
|
|
637
|
-
let request;
|
|
638
|
-
const type = typeof source;
|
|
639
|
-
if (type === 'string') {
|
|
640
|
-
request = new request_1.default({ url: source });
|
|
641
|
-
}
|
|
642
|
-
else if (source instanceof request_1.default) {
|
|
643
|
-
request = source;
|
|
644
|
-
}
|
|
645
|
-
else if (source && type === 'object') {
|
|
646
|
-
request = new request_1.default(source);
|
|
647
|
-
}
|
|
648
|
-
else {
|
|
649
|
-
throw new Error(`Cannot create Request from type: ${type}`);
|
|
650
|
-
}
|
|
651
|
-
const hasUniqueKey = !!source.uniqueKey;
|
|
652
|
-
// Add index to uniqueKey if duplicates are to be kept
|
|
653
|
-
if (this.keepDuplicateUrls && !hasUniqueKey) {
|
|
654
|
-
request.uniqueKey += `-${this.requests.length}`;
|
|
655
|
-
}
|
|
656
|
-
const { uniqueKey } = request;
|
|
657
|
-
this._ensureUniqueKeyValid(uniqueKey);
|
|
658
|
-
// Skip requests with duplicate uniqueKey
|
|
659
|
-
if (!this.uniqueKeyToIndex.hasOwnProperty(uniqueKey)) { // eslint-disable-line no-prototype-builtins
|
|
660
|
-
this.uniqueKeyToIndex[uniqueKey] = this.requests.length;
|
|
661
|
-
this.requests.push(request);
|
|
662
|
-
}
|
|
663
|
-
else if (this.keepDuplicateUrls) {
|
|
664
|
-
this.log.warning(`Duplicate uniqueKey: ${uniqueKey} found while the keepDuplicateUrls option was set. Check your sources' unique keys.`); // eslint-disable-line max-len
|
|
665
|
-
}
|
|
666
|
-
}
|
|
667
|
-
/**
|
|
668
|
-
* Helper function that validates unique key.
|
|
669
|
-
* Throws an error if uniqueKey is not a non-empty string.
|
|
670
|
-
*
|
|
671
|
-
* @ignore
|
|
672
|
-
* @protected
|
|
673
|
-
* @internal
|
|
674
|
-
*/
|
|
675
|
-
_ensureUniqueKeyValid(uniqueKey) {
|
|
676
|
-
if (typeof uniqueKey !== 'string' || !uniqueKey) {
|
|
677
|
-
throw new Error('Request object\'s uniqueKey must be a non-empty string');
|
|
678
|
-
}
|
|
679
|
-
}
|
|
680
|
-
/**
|
|
681
|
-
* Checks that request is not reclaimed and throws an error if so.
|
|
682
|
-
*
|
|
683
|
-
* @ignore
|
|
684
|
-
* @protected
|
|
685
|
-
* @internal
|
|
686
|
-
*/
|
|
687
|
-
_ensureInProgressAndNotReclaimed(uniqueKey) {
|
|
688
|
-
if (!this.inProgress[uniqueKey]) {
|
|
689
|
-
throw new Error(`The request is not being processed (uniqueKey: ${uniqueKey})`);
|
|
690
|
-
}
|
|
691
|
-
if (this.reclaimed[uniqueKey]) {
|
|
692
|
-
throw new Error(`The request was already reclaimed (uniqueKey: ${uniqueKey})`);
|
|
693
|
-
}
|
|
694
|
-
}
|
|
695
|
-
/**
|
|
696
|
-
* Throws an error if request list wasn't initialized.
|
|
697
|
-
*
|
|
698
|
-
* @ignore
|
|
699
|
-
* @protected
|
|
700
|
-
* @internal
|
|
701
|
-
*/
|
|
702
|
-
_ensureIsInitialized() {
|
|
703
|
-
if (!this.isInitialized) {
|
|
704
|
-
throw new Error('RequestList is not initialized; you must call "await requestList.initialize()" before using it!');
|
|
705
|
-
}
|
|
706
|
-
}
|
|
707
|
-
/**
|
|
708
|
-
* Returns the total number of unique requests present in the `RequestList`.
|
|
709
|
-
*
|
|
710
|
-
* @returns {number}
|
|
711
|
-
*/
|
|
712
|
-
length() {
|
|
713
|
-
this._ensureIsInitialized();
|
|
714
|
-
return this.requests.length;
|
|
715
|
-
}
|
|
716
|
-
/**
|
|
717
|
-
* Returns number of handled requests.
|
|
718
|
-
*
|
|
719
|
-
* @returns {number}
|
|
720
|
-
*/
|
|
721
|
-
handledCount() {
|
|
722
|
-
this._ensureIsInitialized();
|
|
723
|
-
return this.nextIndex - underscore_1.default.size(this.inProgress);
|
|
724
|
-
}
|
|
725
|
-
}
|
|
726
|
-
exports.RequestList = RequestList;
|
|
727
|
-
/**
 * Opens a request list and returns a promise resolving to an instance
 * of the {@link RequestList} class that is already initialized.
 *
 * {@link RequestList} represents a list of URLs to crawl, which is always stored in memory.
 * To enable picking up where left off after a process restart, the request list sources
 * are persisted to the key-value store at initialization of the list. Then, while crawling,
 * a small state object is regularly persisted to keep track of the crawling status.
 *
 * For more details and code examples, see the {@link RequestList} class.
 *
 * **Example usage:**
 *
 * ```javascript
 * const sources = [
 *     'https://www.example.com',
 *     'https://www.google.com',
 *     'https://www.bing.com'
 * ];
 *
 * const requestList = await Apify.openRequestList('my-name', sources);
 * ```
 *
 * @param {string|null} listName
 *   Name of the request list to be opened. Setting a name enables the `RequestList`'s state to be persisted
 *   in the key-value store. This is useful in case of a restart or migration. Since `RequestList` is only
 *   stored in memory, a restart or migration wipes it clean. Setting a name will enable the `RequestList`'s
 *   state to survive those situations and continue where it left off.
 *
 *   The name will be used as a prefix in key-value store, producing keys such as `NAME-REQUEST_LIST_STATE`
 *   and `NAME-REQUEST_LIST_SOURCES`.
 *
 *   If `null`, the list will not be persisted and will only be stored in memory. Process restart
 *   will then cause the list to be crawled again from the beginning. We suggest always using a name.
 * @param {RequestListOptions['sources']} sources
 *   An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
 *   plain objects that define at least the `url` property, or an array of {@link Request} instances.
 *
 *   **IMPORTANT:** The `sources` array will be consumed (left empty) after {@link RequestList} initializes.
 *   This is a measure to prevent memory leaks in situations when millions of sources are
 *   added.
 *
 *   Additionally, the `requestsFromUrl` property may be used instead of `url`,
 *   which will instruct {@link RequestList} to download the source URLs from a given remote location.
 *   The URLs will be parsed from the received response. In this case you can limit the URLs
 *   using `regex` parameter containing regular expression pattern for URLs to be included.
 *
 *   For details, see the {@link RequestListOptions.sources}
 * @param {RequestListOptions} [options]
 *   The {@link RequestList} options. Note that the `listName` parameter supersedes
 *   the {@link RequestListOptions.persistStateKey} and {@link RequestListOptions.persistRequestsKey}
 *   options and the `sources` parameter supersedes the {@link RequestListOptions.sources} option.
 * @returns {Promise<RequestList>}
 * @memberof module:Apify
 * @name openRequestList
 * @function
 */
const openRequestList = async (listName, sources, options = {}) => {
    (0, ow_1.default)(listName, ow_1.default.any(ow_1.default.string, ow_1.default.null));
    (0, ow_1.default)(sources, ow_1.default.array);
    (0, ow_1.default)(options, ow_1.default.object.is((v) => !Array.isArray(v)));
    // Persistence keys are derived from the list name; without a name nothing is persisted.
    const prefixedKey = (suffix) => (listName ? `${listName}-${suffix}` : undefined);
    const requestListOptions = {
        ...options,
        persistStateKey: prefixedKey(exports.STATE_PERSISTENCE_KEY),
        persistRequestsKey: prefixedKey(exports.REQUESTS_PERSISTENCE_KEY),
        sources,
    };
    const requestList = new RequestList(requestListOptions);
    await requestList.initialize();
    return requestList;
};
exports.openRequestList = openRequestList;
|
|
798
|
-
/**
|
|
799
|
-
* Represents state of a {@link RequestList}. It can be used to resume a {@link RequestList} which has been previously processed.
|
|
800
|
-
* You can obtain the state by calling {@link RequestList#getState} and receive an object with
|
|
801
|
-
* the following structure:
|
|
802
|
-
*
|
|
803
|
-
* ```
|
|
804
|
-
* {
|
|
805
|
-
* nextIndex: 5,
|
|
806
|
-
* nextUniqueKey: 'unique-key-5',
|
|
807
|
-
* inProgress: {
|
|
808
|
-
* 'unique-key-1': true,
|
|
809
|
-
* 'unique-key-4': true
|
|
810
|
-
* },
|
|
811
|
-
* }
|
|
812
|
-
* ```
|
|
813
|
-
*
|
|
814
|
-
* @typedef RequestListState
|
|
815
|
-
* @property {number} nextIndex
|
|
816
|
-
* Position of the next request to be processed.
|
|
817
|
-
* @property {string} nextUniqueKey
|
|
818
|
-
* Key of the next request to be processed.
|
|
819
|
-
* @property {Object<string,boolean>} inProgress
|
|
820
|
-
* An object mapping request keys to a boolean value representing whether they are being processed at the moment.
|
|
821
|
-
*/
|
|
822
|
-
/**
|
|
823
|
-
* @callback RequestListSourcesFunction
|
|
824
|
-
* @return {Promise<Array<(RequestOptions|Request|string)>>}
|
|
825
|
-
*/
|
|
826
|
-
//# sourceMappingURL=request_list.js.map
|