apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
--- package/build/crawlers/basic_crawler.js
+++ /dev/null
@@ -1,664 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.BasicCrawler = void 0;
-const tslib_1 = require("tslib");
-const consts_1 = require("@apify/consts");
-const utilities_1 = require("@apify/utilities");
-const ow_1 = (0, tslib_1.__importStar)(require("ow"));
-const underscore_1 = (0, tslib_1.__importDefault)(require("underscore"));
-const timeout_1 = require("@apify/timeout");
-const autoscaled_pool_1 = (0, tslib_1.__importDefault)(require("../autoscaling/autoscaled_pool")); // eslint-disable-line import/no-duplicates
-const events_1 = (0, tslib_1.__importDefault)(require("../events"));
-const session_pool_1 = require("../session_pool/session_pool"); // eslint-disable-line import/no-duplicates
-const statistics_1 = (0, tslib_1.__importDefault)(require("./statistics"));
-const utils_log_1 = (0, tslib_1.__importDefault)(require("../utils_log")); // eslint-disable-line import/no-duplicates
-const validators_1 = require("../validators");
-/* eslint-enable no-unused-vars,import/named,import/no-duplicates,import/order */
-/**
- * @typedef {object} CrawlingContext
- * @property {string} id
- * @property {Request} request
- * @property {Session} session
- * @property {ProxyInfo} proxyInfo
- * @property {*} response
- */
-/**
- * Since there's no set number of seconds before the container is terminated after
- * a migration event, we need some reasonable number to use for RequestList persistence.
- * Once a migration event is received, the Crawler will be paused and it will wait for
- * this long before persisting the RequestList state. This should allow most healthy
- * requests to finish and be marked as handled, thus lowering the amount of duplicate
- * results after migration.
- *
- * @type {number}
- * @ignore
- */
-const SAFE_MIGRATION_WAIT_MILLIS = 20000;
-/**
- * @typedef BasicCrawlerOptions
- * @property {HandleRequest} handleRequestFunction
- * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- *     request: Request,
- *     session: Session,
- *     crawler: BasicCrawler,
- * }
- * ```
- * where the {@link Request} instance represents the URL to crawl.
- *
- * The function must return a promise, which is then awaited by the crawler.
- *
- * If the function throws an exception, the crawler will try to re-crawl the
- * request later, up to `option.maxRequestRetries` times.
- * If all the retries fail, the crawler calls the function
- * provided to the `handleFailedRequestFunction` parameter.
- * To make this work, you should **always**
- * let your function throw exceptions rather than catch them.
- * The exceptions are logged to the request using the
- * {@link Request#pushErrorMessage} function.
- * @property {RequestList} [requestList]
- * Static list of URLs to be processed.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {RequestQueue} [requestQueue]
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
- * Either `requestList` or `requestQueue` option must be provided (or both).
- * @property {number} [handleRequestTimeoutSecs=60]
- * Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.
- * @property {HandleFailedRequest} [handleFailedRequestFunction]
- * A function to handle requests that failed more than `option.maxRequestRetries` times.
- *
- * The function receives the following object as an argument:
- * ```
- * {
- *     request: Request,
- *     error: Error,
- *     session: Session,
- *     crawler: BasicCrawler,
- * }
- * ```
- * where the {@link Request} instance corresponds to the failed request, and the `Error` instance
- * represents the last error thrown during processing of the request.
- *
- * See
- * [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/basic_crawler.js#L11)
- * for the default implementation of this function.
- * @property {number} [maxRequestRetries=3]
- * Indicates how many times the request is retried if {@link BasicCrawlerOptions.handleRequestFunction} fails.
- * @property {number} [maxRequestsPerCrawl]
- * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
- * Always set this value in order to prevent infinite loops in misconfigured crawlers.
- * Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
- * @property {AutoscaledPoolOptions} [autoscaledPoolOptions]
- * Custom options passed to the underlying {@link AutoscaledPool} constructor.
- * Note that the `runTaskFunction` and `isTaskReadyFunction` options
- * are provided by `BasicCrawler` and cannot be overridden.
- * However, you can provide a custom implementation of `isFinishedFunction`.
- * @property {number} [minConcurrency=1]
- * Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- *
- * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash.
- * If you're not sure, just keep the default value and the concurrency will scale up automatically.
- * @property {number} [maxConcurrency=1000]
- * Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding {@link AutoscaledPool} option.
- * @property {boolean} [useSessionPool=true]
- * Basic crawler will initialize the {@link SessionPool} with the corresponding `sessionPoolOptions`.
- * The session instance will be then available in the `handleRequestFunction`.
- * @property {SessionPoolOptions} [sessionPoolOptions] The configuration options for {@link SessionPool} to use.
- */
-/**
- * Provides a simple framework for parallel crawling of web pages.
- * The URLs to crawl are fed either from a static list of URLs
- * or from a dynamic queue of URLs enabling recursive crawling of websites.
- *
- * `BasicCrawler` is a low-level tool that requires the user to implement the page
- * download and data extraction functionality themselves.
- * If you want a crawler that already facilitates this functionality,
- * please consider using {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
- *
- * `BasicCrawler` invokes the user-provided {@link BasicCrawlerOptions.handleRequestFunction}
- * for each {@link Request} object, which represents a single URL to crawl.
- * The {@link Request} objects are fed from the {@link RequestList} or the {@link RequestQueue}
- * instances provided by the {@link BasicCrawlerOptions.requestList} or {@link BasicCrawlerOptions.requestQueue}
- * constructor options, respectively.
- *
- * If both {@link BasicCrawlerOptions.requestList} and {@link BasicCrawlerOptions.requestQueue} options are used,
- * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
- * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
- *
- * The crawler finishes if there are no more {@link Request} objects to crawl.
- *
- * New requests are only dispatched when there is enough free CPU and memory available,
- * using the functionality provided by the {@link AutoscaledPool} class.
- * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
- * parameter of the `BasicCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
- * {@link AutoscaledPool} options are available directly in the `BasicCrawler` constructor.
- *
- * **Example usage:**
- *
- * ```javascript
- * // Prepare a list of URLs to crawl
- * const requestList = new Apify.RequestList({
- *     sources: [
- *         { url: 'http://www.example.com/page-1' },
- *         { url: 'http://www.example.com/page-2' },
- *     ],
- * });
- * await requestList.initialize();
- *
- * // Crawl the URLs
- * const crawler = new Apify.BasicCrawler({
- *     requestList,
- *     handleRequestFunction: async ({ request }) => {
- *         // 'request' contains an instance of the Request class
- *         // Here we simply fetch the HTML of the page and store it to a dataset
- *         const { body } = await Apify.utils.requestAsBrowser(request);
- *         await Apify.pushData({
- *             url: request.url,
- *             html: body,
- *         })
- *     },
- * });
- *
- * await crawler.run();
- * ```
- * @property {Statistics} stats
- * Contains statistics about the current run.
- * @property {RequestList} [requestList]
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {RequestQueue} [requestQueue]
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request}s.
- * Only available if used by the crawler.
- * @property {SessionPool} [sessionPool]
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session}s.
- * Only available if used by the crawler.
- * @property {AutoscaledPool} autoscaledPool
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
- * Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
- * You can use it to change the concurrency settings on the fly,
- * to pause the crawler by calling {@link AutoscaledPool#pause}
- * or to abort it by calling {@link AutoscaledPool#abort}.
- */
-class BasicCrawler {
-    /**
-     * @param {BasicCrawlerOptions} options
-     * All `BasicCrawler` parameters are passed via an options object.
-     */
-    constructor(options) {
-        (0, ow_1.default)(options, 'BasicCrawlerOptions', ow_1.default.object.exactShape(BasicCrawler.optionsShape));
-        const { requestList, requestQueue, handleRequestFunction, handleRequestTimeoutSecs = 60, handleFailedRequestFunction, maxRequestRetries = 3, maxRequestsPerCrawl, autoscaledPoolOptions = {}, sessionPoolOptions = {}, useSessionPool = true,
-        // AutoscaledPool shorthands
-        minConcurrency, maxConcurrency,
-        // internal
-        log = utils_log_1.default.child({ prefix: this.constructor.name }), } = options;
-        if (!requestList && !requestQueue) {
-            const msg = 'At least one of the parameters "options.requestList" and "options.requestQueue" must be provided!';
-            throw new ow_1.ArgumentError(msg, this.constructor);
-        }
-        // assigning {} to the options as default breaks proper typing
-        /** @type {Log} */
-        this.log = log;
-        this.requestList = requestList;
-        this.requestQueue = requestQueue;
-        this.userProvidedHandler = handleRequestFunction;
-        this.failedContextHandler = handleFailedRequestFunction;
-        this.handleRequestTimeoutMillis = handleRequestTimeoutSecs * 1000;
-        const tryEnv = (val) => (val == null ? val : +val);
-        // allow at least 5min for internal timeouts
-        this.internalTimeoutMillis = tryEnv(process.env.APIFY_INTERNAL_TIMEOUT) ?? Math.max(this.handleRequestTimeoutMillis * 2, 300e3);
-        // override the default internal timeout of request queue to respect `handleRequestTimeoutMillis`
-        if (this.requestQueue) {
-            this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
-        }
-        this.handleFailedRequestFunction = handleFailedRequestFunction;
-        this.maxRequestRetries = maxRequestRetries;
-        this.handledRequestsCount = 0;
-        this.stats = new statistics_1.default({ logMessage: `${log.getOptions().prefix} request statistics:` });
-        /** @type {SessionPoolOptions} */
-        this.sessionPoolOptions = {
-            ...sessionPoolOptions,
-            log,
-        };
-        this.useSessionPool = useSessionPool;
-        this.crawlingContexts = new Map();
-        const maxSignedInteger = 2 ** 31 - 1;
-        if (this.handleRequestTimeoutMillis > maxSignedInteger) {
-            log.warning(`handleRequestTimeoutMillis ${this.handleRequestTimeoutMillis}`
-                + ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
-            this.handleRequestTimeoutMillis = maxSignedInteger;
-        }
-        let shouldLogMaxPagesExceeded = true;
-        const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
-        const { isFinishedFunction } = autoscaledPoolOptions;
-        const basicCrawlerAutoscaledPoolConfiguration = {
-            minConcurrency,
-            maxConcurrency,
-            runTaskFunction: this._runTaskFunction.bind(this),
-            isTaskReadyFunction: async () => {
-                if (isMaxPagesExceeded()) {
-                    if (shouldLogMaxPagesExceeded) {
-                        log.info('Crawler reached the maxRequestsPerCrawl limit of '
-                            + `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-                        shouldLogMaxPagesExceeded = false;
-                    }
-                    return false;
-                }
-                return this._isTaskReadyFunction();
-            },
-            isFinishedFunction: async () => {
-                if (isMaxPagesExceeded()) {
-                    log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests `
-                        + 'and all requests that were in progress at that time have now finished. '
-                        + `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
-                    return true;
-                }
-                const isFinished = isFinishedFunction
-                    ? await isFinishedFunction()
-                    : await this._defaultIsFinishedFunction();
-                if (isFinished) {
-                    const reason = isFinishedFunction
-                        ? 'Crawler\'s custom isFinishedFunction() returned true, the crawler will shut down.'
-                        : 'All the requests from request list and/or request queue have been processed, the crawler will shut down.';
-                    log.info(reason);
-                }
-                return isFinished;
-            },
-            log,
-        };
-        this.autoscaledPoolOptions = underscore_1.default.defaults({}, basicCrawlerAutoscaledPoolConfiguration, autoscaledPoolOptions);
-        this.isRunningPromise = null;
-        // Attach a listener to handle migration and aborting events gracefully.
-        events_1.default.on(consts_1.ACTOR_EVENT_NAMES.MIGRATING, this._pauseOnMigration.bind(this));
-        events_1.default.on(consts_1.ACTOR_EVENT_NAMES.ABORTING, this._pauseOnMigration.bind(this));
-    }
-    /**
-     * Runs the crawler. Returns a promise that gets resolved once all the requests are processed.
-     *
-     * @return {Promise<void>}
-     */
-    async run() {
-        if (this.isRunningPromise)
-            return this.isRunningPromise;
-        await this._init();
-        this.isRunningPromise = this.autoscaledPool.run();
-        await this.stats.startCapturing();
-        try {
-            await this.isRunningPromise;
-        }
-        finally {
-            await this.teardown();
-            await this.stats.stopCapturing();
-            const finalStats = this.stats.calculate();
-            const { requestsFailed, requestsFinished } = this.stats.state;
-            this.log.info('Final request statistics:', {
-                requestsFinished,
-                requestsFailed,
-                retryHistogram: this.stats.requestRetryHistogram,
-                ...finalStats,
-            });
-        }
-    }
-    /**
-     * @return {Promise<void>}
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _init() {
-        // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
-        // so that the caller can get a reference to it before awaiting the promise returned from run()
-        // (otherwise there would be no way)
-        this.autoscaledPool = new autoscaled_pool_1.default(this.autoscaledPoolOptions);
-        if (this.useSessionPool) {
-            this.sessionPool = await (0, session_pool_1.openSessionPool)(this.sessionPoolOptions);
-            // Assuming there are not more than 20 browsers running at once;
-            this.sessionPool.setMaxListeners(20);
-        }
-        await this._loadHandledRequestCount();
-    }
-    /**
-     * @param {CrawlingContext} crawlingContext
-     * @return {Promise<void>}
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _handleRequestFunction(crawlingContext) {
-        await this.userProvidedHandler(crawlingContext);
-    }
-    /**
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _pauseOnMigration() {
-        if (this.autoscaledPool) {
-            // if run wasn't called, this is going to crash
-            await this.autoscaledPool.pause(SAFE_MIGRATION_WAIT_MILLIS)
-                .catch((err) => {
-                    if (err.message.includes('running tasks did not finish')) {
-                        this.log.error('The crawler was paused due to migration to another host, '
-                            + 'but some requests did not finish in time. Those requests\' results may be duplicated.');
-                    }
-                    else {
-                        throw err;
-                    }
-                });
-        }
-        const requestListPersistPromise = (async () => {
-            if (this.requestList) {
-                if (await this.requestList.isFinished())
-                    return;
-                await this.requestList.persistState()
-                    .catch((err) => {
-                        if (err.message.includes('Cannot persist state.')) {
-                            this.log.error('The crawler attempted to persist its request list\'s state and failed due to missing or '
-                                + 'invalid config. Make sure to use either Apify.openRequestList() or the "stateKeyPrefix" option of RequestList '
-                                + 'constructor to ensure your crawling state is persisted through host migrations and restarts.');
-                        }
-                        else {
-                            this.log.exception(err, 'An unexpected error occurred when the crawler '
-                                + 'attempted to persist its request list\'s state.');
-                        }
-                    });
-            }
-        })();
-        await Promise.all([
-            requestListPersistPromise,
-            this.stats.persistState(),
-        ]);
-    }
-    /**
-     * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
-     * and RequestQueue is present then enqueues it to the queue first.
-     *
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _fetchNextRequest() {
-        if (!this.requestList)
-            return this.requestQueue.fetchNextRequest();
-        const request = await this.requestList.fetchNextRequest();
-        if (!this.requestQueue)
-            return request;
-        if (!request)
-            return this.requestQueue.fetchNextRequest();
-        try {
-            await this.requestQueue.addRequest(request, { forefront: true });
-        }
-        catch (err) {
-            // If requestQueue.addRequest() fails here then we must reclaim it back to
-            // the RequestList because probably it's not yet in the queue!
-            this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
-            await this.requestList.reclaimRequest(request);
-            return null;
-        }
-        await this.requestList.markRequestHandled(request);
-        return this.requestQueue.fetchNextRequest();
-    }
-    /**
-     * Wrapper around handleRequestFunction that fetches requests from RequestList/RequestQueue
-     * then retries them in case of an error, etc.
-     *
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _runTaskFunction() {
-        const source = this.requestQueue || this.requestList;
-        let request;
-        let session;
-        await this._timeoutAndRetry(async () => {
-            request = await this._fetchNextRequest();
-        }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        (0, timeout_1.tryCancel)();
-        if (this.useSessionPool) {
-            await this._timeoutAndRetry(async () => {
-                session = await this.sessionPool.getSession();
-            }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        }
-        (0, timeout_1.tryCancel)();
-        if (!request)
-            return;
-        // Reset loadedUrl so an old one is not carried over to retries.
-        request.loadedUrl = undefined;
-        const statisticsId = request.id || request.uniqueKey;
-        this.stats.startJob(statisticsId);
-        // Shared crawling context
-        const crawlingContext = {
-            id: (0, utilities_1.cryptoRandomObjectId)(10),
-            crawler: this,
-            request,
-            session,
-        };
-        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
-        try {
-            await (0, timeout_1.addTimeoutToPromise)(() => this._handleRequestFunction(crawlingContext), this.handleRequestTimeoutMillis, `handleRequestFunction timed out after ${this.handleRequestTimeoutMillis / 1000} seconds (${request.id}).`);
-            await this._timeoutAndRetry(() => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-            this.stats.finishJob(statisticsId);
-            this.handledRequestsCount++;
-            // reclaim session if request finishes successfully
-            if (session)
-                session.markGood();
-        }
-        catch (err) {
-            try {
-                await this._timeoutAndRetry(() => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-            }
-            catch (secondaryError) {
-                this.log.exception(secondaryError, 'runTaskFunction error handler threw an exception. '
-                    + 'This places the crawler and its underlying storages into an unknown state and crawling will be terminated. '
-                    + 'This may have happened due to an internal error of Apify\'s API or due to a misconfigured crawler. '
-                    + 'If you are sure that there is no error in your code, selecting "Restart on error" in the actor\'s settings '
-                    + 'will make sure that the run continues where it left off, if programmed to handle restarts correctly.');
-                throw secondaryError;
-            }
-        }
-        finally {
-            this.crawlingContexts.delete(crawlingContext.id);
-        }
-    }
-    /**
-     * Run async callback with given timeout and retry.
-     * @ignore
-     */
-    async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
-        try {
-            await (0, timeout_1.addTimeoutToPromise)(handler, timeout, error);
-        }
-        catch (e) {
-            if (retried <= maxRetries) { // we retry on any error, not just timeout
-                this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-                return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
-            }
-            throw e;
-        }
-    }
-    /**
-     * Returns true if either RequestList or RequestQueue have a request ready for processing.
-     *
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _isTaskReadyFunction() {
-        // First check RequestList, since it's only in memory.
-        const isRequestListEmpty = this.requestList ? (await this.requestList.isEmpty()) : true;
-        // If RequestList is not empty, task is ready, no reason to check RequestQueue.
-        if (!isRequestListEmpty)
-            return true;
-        // If RequestQueue is not empty, task is ready, return true, otherwise false.
-        return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
-    }
-    /**
-     * Returns true if both RequestList and RequestQueue have all requests finished.
-     *
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _defaultIsFinishedFunction() {
-        const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
-            this.requestList ? this.requestList.isFinished() : true,
-            this.requestQueue ? this.requestQueue.isFinished() : true,
-        ]);
-        // If both are finished, return true, otherwise return false.
-        return isRequestListFinished && isRequestQueueFinished;
-    }
-    /**
-     * Handles errors thrown by user provided handleRequestFunction()
-     * @param {Error} error
-     * @param {object} crawlingContext
-     * @param {Request} crawlingContext.request
-     * @param {(RequestList|RequestQueue)} source
-     * @return {Promise<void>}
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _requestFunctionErrorHandler(error, crawlingContext, source) {
-        const { request } = crawlingContext;
-        request.pushErrorMessage(error);
-        const shouldRetryRequest = !request.noRetry && request.retryCount < this.maxRequestRetries;
-        if (shouldRetryRequest) {
-            request.retryCount++;
-            this.log.exception(error, 'handleRequestFunction failed, reclaiming failed request back to the list or queue', underscore_1.default.pick(request, 'url', 'retryCount', 'id'));
-            await source.reclaimRequest(request);
-        }
-        else {
-            // If we get here, the request is either not retryable
-            // or failed more than retryCount times and will not be retried anymore.
-            // Mark the request as failed and do not retry.
-            this.handledRequestsCount++;
-            await source.markRequestHandled(request);
-            this.stats.failJob(request.id || request.url);
-            crawlingContext.error = error;
-            await this._handleFailedRequestFunction(crawlingContext); // This function prints an error message.
-        }
-    }
-    /**
-     * @param {object} crawlingContext
-     * @param {Error} crawlingContext.error
-     * @param {Request} crawlingContext.request
-     * @return {Promise<void>}
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _handleFailedRequestFunction(crawlingContext) {
-        if (this.failedContextHandler) {
-            await this.failedContextHandler(crawlingContext);
-        }
-        else {
-            const { id, url, method, uniqueKey } = crawlingContext.request;
-            this.log.exception(crawlingContext.error, 'Request failed and reached maximum retries', { id, url, method, uniqueKey });
-        }
-    }
-    /**
-     * Updates handledRequestsCount from possibly stored counts,
-     * usually after worker migration. Since one of the stores
-     * needs to have priority when both are present,
-     * it is the request queue, because generally, the request
-     * list will first be dumped into the queue and then left
-     * empty.
-     *
-     * @return {Promise<void>}
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _loadHandledRequestCount() {
-        if (this.requestQueue) {
-            this.handledRequestsCount = await this.requestQueue.handledCount();
-        }
-        else if (this.requestList) {
-            this.handledRequestsCount = this.requestList.handledCount();
-        }
-    }
-    /**
-     * @param {Array<any>} hooks
-     * @param {*} args
-     * @ignore
-     * @protected
-     * @internal
-     */
-    async _executeHooks(hooks, ...args) {
-        if (Array.isArray(hooks) && hooks.length) {
-            for (const hook of hooks) {
-                await hook(...args);
-            }
-        }
-    }
-    /**
-     * Function for cleaning up after all requests are processed.
-     * @ignore
-     */
-    async teardown() {
-        if (this.useSessionPool) {
-            await this.sessionPool.teardown();
-        }
-    }
-}
-exports.BasicCrawler = BasicCrawler;
-/**
- * @internal
- * @type any
- */
-Object.defineProperty(BasicCrawler, "optionsShape", {
-    enumerable: true,
-    configurable: true,
-    writable: true,
-    value: {
-        requestList: ow_1.default.optional.object.validate(validators_1.validators.requestList),
-        requestQueue: ow_1.default.optional.object.validate(validators_1.validators.requestQueue),
-        // Subclasses override this function instead of passing it
-        // in constructor, so this validation needs to apply only
-        // if the user creates an instance of BasicCrawler directly.
-        handleRequestFunction: ow_1.default.function,
-        handleRequestTimeoutSecs: ow_1.default.optional.number,
-        handleFailedRequestFunction: ow_1.default.optional.function,
-        maxRequestRetries: ow_1.default.optional.number,
-        maxRequestsPerCrawl: ow_1.default.optional.number,
-        autoscaledPoolOptions: ow_1.default.optional.object,
-        sessionPoolOptions: ow_1.default.optional.object,
-        useSessionPool: ow_1.default.optional.boolean,
-        // AutoscaledPool shorthands
-        minConcurrency: ow_1.default.optional.number,
-        maxConcurrency: ow_1.default.optional.number,
-        // internal
-        log: ow_1.default.optional.object,
-    }
-});
-/**
- * @callback HandleRequest
- * @param {HandleRequestInputs} inputs Arguments passed to this callback.
- * @returns {Promise<void>}
- */
-/**
- * @typedef HandleRequestInputs
- * @property {Request} request The original {Request} object.
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
- * Note that this property is only initialized after calling the {@link BasicCrawler#run} function.
- * You can use it to change the concurrency settings on the fly,
- * to pause the crawler by calling {@link AutoscaledPool#pause}
- * or to abort it by calling {@link AutoscaledPool#abort}.
- * @property {Session} [session]
- * @property {BasicCrawler} [crawler]
- */
-/**
- * @callback HandleFailedRequest
- * @param {HandleFailedRequestInput} inputs Arguments passed to this callback.
- * @returns {Promise<void>}
- */
-/**
- * @typedef HandleFailedRequestInput
- * @property {Error} error The Error thrown by `handleRequestFunction`.
- * @property {Request} request The original {Request} object.
- * @property {Session} session
- * @property {ProxyInfo} proxyInfo
- */
-//# sourceMappingURL=basic_crawler.js.map
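For context on what this deletion removes: the JSDoc above documents the 2.x `BasicCrawler` API, including an embedded usage example. Below is a minimal, self-contained sketch of that example, assuming `apify@2.x` is installed; the `Apify.main()` wrapper is an addition here to make the script runnable on its own and is not part of the original comment.

```javascript
// Minimal runnable sketch of the BasicCrawler usage documented above (assumes apify@2.x).
const Apify = require('apify');

Apify.main(async () => {
    // Prepare a static list of URLs to crawl.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.example.com/page-1' },
            { url: 'http://www.example.com/page-2' },
        ],
    });
    await requestList.initialize();

    // BasicCrawler is low-level: page download and data extraction
    // are left entirely to the user-provided handler.
    const crawler = new Apify.BasicCrawler({
        requestList,
        handleRequestFunction: async ({ request }) => {
            // Fetch the raw HTML and store it to the default dataset.
            const { body } = await Apify.utils.requestAsBrowser(request);
            await Apify.pushData({ url: request.url, html: body });
        },
    });

    await crawler.run();
});
```

This module is removed in 3.0.0-alpha.0 along with the rest of `package/build`, so the sketch applies only to the 2.x line shown on the deletion side of the diff.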