@crawlee/basic 3.13.3-beta.9 → 3.13.4-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/internals/basic-crawler.d.ts +76 -76
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +39 -38
- package/internals/basic-crawler.js.map +1 -1
- package/internals/constants.d.ts +2 -2
- package/internals/constants.js +2 -2
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +1 -1
package/internals/basic-crawler.d.ts

@@ -7,11 +7,11 @@ import type { Log } from '@apify/log';
 import { TimeoutError } from '@apify/timeout';
 export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<BasicCrawler, UserData> {
     /**
-     * This function automatically finds and enqueues links from the current page, adding them to the {@
+     * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
      * currently used by the crawler.
      *
      * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
-     * and override settings of the enqueued {@
+     * and override settings of the enqueued {@link Request} objects.
      *
      * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
      * for more details regarding its usage.
@@ -27,7 +27,7 @@ export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary>
      * ```
      *
      * @param [options] All `enqueueLinks()` parameters are passed via an options object.
-     * @returns Promise that resolves to {@
+     * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
      */
     enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
 }
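Since `BasicCrawler` does not parse pages itself, the `urls` option is mandatory here; that is what the `SetRequired<EnqueueLinksOptions, 'urls'>` signature above enforces. A minimal usage sketch (the URLs and the `DETAIL` label are placeholders):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Processing ${request.url}`);
        // Unlike the browser/HTTP crawlers, BasicCrawler cannot discover
        // links on its own, so `urls` must be passed explicitly.
        await enqueueLinks({
            urls: ['https://crawlee.dev/js/docs/examples'],
            label: 'DETAIL',
        });
    },
});

await crawler.run(['https://crawlee.dev']);
```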
@@ -44,37 +44,37 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     /**
      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
      *
-     * The function receives the {@
-     * where the {@
+     * The function receives the {@link BasicCrawlingContext} as an argument,
+     * where the {@link BasicCrawlingContext.request|`request`} represents the URL to crawl.
      *
      * The function must return a promise, which is then awaited by the crawler.
      *
      * If the function throws an exception, the crawler will try to re-crawl the
-     * request later, up to the {@
+     * request later, up to the {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      * If all the retries fail, the crawler calls the function
-     * provided to the {@
+     * provided to the {@link BasicCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
      * To make this work, we should **always**
      * let our function throw exceptions rather than catch them.
      * The exceptions are logged to the request using the
-     * {@
+     * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
     requestHandler?: RequestHandler<LoadedContext<Context>>;
     /**
      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
      *
-     * The function receives the {@
-     * where the {@
+     * The function receives the {@link BasicCrawlingContext} as an argument,
+     * where the {@link BasicCrawlingContext.request|`request`} represents the URL to crawl.
      *
      * The function must return a promise, which is then awaited by the crawler.
      *
      * If the function throws an exception, the crawler will try to re-crawl the
-     * request later, up to the {@
+     * request later, up to the {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      * If all the retries fail, the crawler calls the function
-     * provided to the {@
+     * provided to the {@link BasicCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
      * To make this work, we should **always**
      * let our function throw exceptions rather than catch them.
      * The exceptions are logged to the request using the
-     * {@
+     * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      *
      * @deprecated `handleRequestFunction` has been renamed to `requestHandler` and will be removed in a future version.
      * @ignore
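A sketch of the contract described above: the handler rethrows instead of catching, so the crawler can retry the request and record the error via `Request.pushErrorMessage()`. The status check is illustrative:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, sendRequest }) {
        const response = await sendRequest();
        // Do not swallow this error: throwing lets the crawler re-crawl the
        // request up to `maxRequestRetries` times.
        if (response.statusCode !== 200) {
            throw new Error(`Unexpected status ${response.statusCode} for ${request.url}`);
        }
    },
});
```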
@@ -82,25 +82,25 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     handleRequestFunction?: RequestHandler<Context>;
     /**
      * Static list of URLs to be processed.
-     * If not provided, the crawler will open the default request queue when the {@
-     * > Alternatively, `requests` parameter of {@
+     * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
+     * > Alternatively, `requests` parameter of {@link BasicCrawler.run|`crawler.run()`} could be used to enqueue the initial requests -
      * it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
      */
     requestList?: IRequestList;
     /**
      * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-     * If not provided, the crawler will open the default request queue when the {@
-     * > Alternatively, `requests` parameter of {@
+     * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
+     * > Alternatively, `requests` parameter of {@link BasicCrawler.run|`crawler.run()`} could be used to enqueue the initial requests -
      * it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
      */
     requestQueue?: RequestProvider;
     /**
-     * Timeout in which the function passed as {@
+     * Timeout in which the function passed as {@link BasicCrawlerOptions.requestHandler|`requestHandler`} needs to finish, in seconds.
      * @default 60
      */
     requestHandlerTimeoutSecs?: number;
     /**
-     * Timeout in which the function passed as {@
+     * Timeout in which the function passed as {@link BasicCrawlerOptions.requestHandler|`requestHandler`} needs to finish, in seconds.
      * @default 60
      * @deprecated `handleRequestTimeoutSecs` has been renamed to `requestHandlerTimeoutSecs` and will be removed in a future version.
      * @ignore
@@ -108,28 +108,28 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     handleRequestTimeoutSecs?: number;
     /**
      * User-provided function that allows modifying the request object before it gets retried by the crawler.
-     * It's executed before each retry for the requests that failed less than {@
+     * It's executed before each retry for the requests that failed less than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      *
-     * The function receives the {@
-     * where the {@
+     * The function receives the {@link BasicCrawlingContext} as the first argument,
+     * where the {@link BasicCrawlingContext.request|`request`} corresponds to the request to be retried.
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
     errorHandler?: ErrorHandler<Context>;
     /**
-     * A function to handle requests that failed more than {@
+     * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      *
-     * The function receives the {@
-     * where the {@
+     * The function receives the {@link BasicCrawlingContext} as the first argument,
+     * where the {@link BasicCrawlingContext.request|`request`} corresponds to the failed request.
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
     failedRequestHandler?: ErrorHandler<Context>;
     /**
-     * A function to handle requests that failed more than {@
+     * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      *
-     * The function receives the {@
-     * where the {@
+     * The function receives the {@link BasicCrawlingContext} as the first argument,
+     * where the {@link BasicCrawlingContext.request|`request`} corresponds to the failed request.
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      *
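A sketch wiring the two hooks together; mutating the request in `errorHandler` before the next retry is a made-up illustration, not a required pattern:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    maxRequestRetries: 2,
    async requestHandler({ request }) {
        throw new Error(`Simulated failure for ${request.url}`);
    },
    // Runs before each retry, while retryCount < maxRequestRetries.
    async errorHandler({ request }) {
        request.headers = { ...request.headers, 'x-attempt': String(request.retryCount + 1) };
    },
    // Runs once the request has failed more than maxRequestRetries times.
    async failedRequestHandler({ request, log }, error) {
        log.error(`Gave up on ${request.url}: ${error.message}`);
    },
});
```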
@@ -138,7 +138,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     handleFailedRequestFunction?: ErrorHandler<Context>;
     /**
-     * Indicates how many times the request is retried if {@
+     * Indicates how many times the request is retried if {@link BasicCrawlerOptions.requestHandler|`requestHandler`} fails.
      * @default 3
      */
     maxRequestRetries?: number;
@@ -151,7 +151,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Maximum number of session rotations per request.
      * The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by the website.
      *
-     * The session rotations are not counted towards the {@
+     * The session rotations are not counted towards the {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} limit.
      * @default 10
      */
     maxSessionRotations?: number;
@@ -162,44 +162,44 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     maxRequestsPerCrawl?: number;
     /**
-     * Custom options passed to the underlying {@
-     * > *NOTE:* The {@
-     * and {@
+     * Custom options passed to the underlying {@link AutoscaledPool} constructor.
+     * > *NOTE:* The {@link AutoscaledPoolOptions.runTaskFunction|`runTaskFunction`}
+     * and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`} options
      * are provided by the crawler and cannot be overridden.
-     * However, we can provide a custom implementation of {@
+     * However, we can provide a custom implementation of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}.
      */
     autoscaledPoolOptions?: AutoscaledPoolOptions;
     /**
      * Sets the minimum concurrency (parallelism) for the crawl. Shortcut for the
-     * AutoscaledPool {@
+     * AutoscaledPool {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} option.
      * > *WARNING:* If we set this value too high with respect to the available system memory and CPU, our crawler will run extremely slow or crash.
      * If not sure, it's better to keep the default value and the concurrency will scale up automatically.
      */
     minConcurrency?: number;
     /**
      * Sets the maximum concurrency (parallelism) for the crawl. Shortcut for the
-     * AutoscaledPool {@
+     * AutoscaledPool {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} option.
      */
     maxConcurrency?: number;
     /**
      * The maximum number of requests per minute the crawler should run.
      * By default, this is set to `Infinity`, but we can pass any positive, non-zero integer.
-     * Shortcut for the AutoscaledPool {@
+     * Shortcut for the AutoscaledPool {@link AutoscaledPoolOptions.maxTasksPerMinute|`maxTasksPerMinute`} option.
      */
     maxRequestsPerMinute?: number;
     /**
-     * Allows to keep the crawler alive even if the {@
+     * Allows to keep the crawler alive even if the {@link RequestQueue} gets empty.
      * By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
      * waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
      */
     keepAlive?: boolean;
     /**
-     * Basic crawler will initialize the {@
-     * The session instance will be than available in the {@
+     * Basic crawler will initialize the {@link SessionPool} with the corresponding {@link SessionPoolOptions|`sessionPoolOptions`}.
+     * The session instance will be than available in the {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      */
     useSessionPool?: boolean;
     /**
-     * The configuration options for {@
+     * The configuration options for {@link SessionPool} to use.
      */
     sessionPoolOptions?: SessionPoolOptions;
     /**
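The shortcuts above map straight onto `AutoscaledPool` settings. A configuration sketch with arbitrary numbers, including the `keepAlive` shutdown pattern mentioned in the docs:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    minConcurrency: 5,         // -> AutoscaledPoolOptions.minConcurrency
    maxConcurrency: 50,        // -> AutoscaledPoolOptions.maxConcurrency
    maxRequestsPerMinute: 120, // -> AutoscaledPoolOptions.maxTasksPerMinute
    keepAlive: true,           // run() keeps waiting after the queue drains
    useSessionPool: true,
    sessionPoolOptions: { maxPoolSize: 20 },
    async requestHandler({ request, log }) {
        log.info(`Crawled ${request.url}`);
    },
});

const finished = crawler.run();
await crawler.addRequests(['https://crawlee.dev']);
// With keepAlive, run() resolves only after an explicit stop.
crawler.stop();
await finished;
```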
@@ -255,7 +255,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     statisticsOptions?: StatisticsOptions;
     /**
      * HTTP client implementation for the `sendRequest` context helper and for plain HTTP crawling.
-     * Defaults to a new instance of {@
+     * Defaults to a new instance of {@link GotScrapingHttpClient}
      */
     httpClient?: BaseHttpClient;
 }
@@ -282,29 +282,29 @@ export interface CrawlerExperiments {
  * `BasicCrawler` is a low-level tool that requires the user to implement the page
  * download and data extraction functionality themselves.
  * If we want a crawler that already facilitates this functionality,
- * we should consider using {@
+ * we should consider using {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
  *
- * `BasicCrawler` invokes the user-provided {@
- * for each {@
- * The {@
- * instances provided by the {@
+ * `BasicCrawler` invokes the user-provided {@link BasicCrawlerOptions.requestHandler|`requestHandler`}
+ * for each {@link Request} object, which represents a single URL to crawl.
+ * The {@link Request} objects are fed from the {@link RequestList} or {@link RequestQueue}
+ * instances provided by the {@link BasicCrawlerOptions.requestList|`requestList`} or {@link BasicCrawlerOptions.requestQueue|`requestQueue`}
  * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided,
- * the crawler will open the default request queue either when the {@
- * or if `requests` parameter (representing the initial requests) of the {@
+ * the crawler will open the default request queue either when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called,
+ * or if `requests` parameter (representing the initial requests) of the {@link BasicCrawler.run|`crawler.run()`} function is provided.
  *
- * If both {@
- * the instance first processes URLs from the {@
- * to the {@
+ * If both {@link BasicCrawlerOptions.requestList|`requestList`} and {@link BasicCrawlerOptions.requestQueue|`requestQueue`} options are used,
+ * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
+ * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
  *
- * The crawler finishes if there are no more {@
+ * The crawler finishes if there are no more {@link Request} objects to crawl.
  *
  * New requests are only dispatched when there is enough free CPU and memory available,
- * using the functionality provided by the {@
- * All {@
+ * using the functionality provided by the {@link AutoscaledPool} class.
+ * All {@link AutoscaledPool} configuration options can be passed to the {@link BasicCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`}
  * parameter of the `BasicCrawler` constructor.
- * For user convenience, the {@
- * {@
- * underlying {@
+ * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and
+ * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the
+ * underlying {@link AutoscaledPool} constructor are available directly in the `BasicCrawler` constructor.
  *
  * **Example usage:**
  *
@@ -342,36 +342,36 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     readonly config: Configuration;
     protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
     /**
-     * A reference to the underlying {@
+     * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
      */
     readonly stats: Statistics;
     /**
-     * A reference to the underlying {@
+     * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
      * Only available if used by the crawler.
      */
     requestList?: IRequestList;
     /**
      * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-     * A reference to the underlying {@
+     * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
      * Only available if used by the crawler.
      */
     requestQueue?: RequestProvider;
     /**
-     * A reference to the underlying {@
+     * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
      * Only available if used by the crawler.
      */
     sessionPool?: SessionPool;
     /**
-     * A reference to the underlying {@
-     * > *NOTE:* This property is only initialized after calling the {@
+     * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
+     * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
      * We can use it to change the concurrency settings on the fly,
-     * to pause the crawler by calling {@
-     * or to abort it by calling {@
+     * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
+     * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool?: AutoscaledPool;
     /**
-     * Default {@
-     * See {@
+     * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
+     * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
     readonly router: RouterHandler<LoadedContext<Context>>;
     running: boolean;
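A sketch of the on-the-fly control described for `autoscaledPool` above; the property exists only once `crawler.run()` is underway, hence the optional chaining, and the timings are arbitrary:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(request.url);
    },
});

const finished = crawler.run(['https://crawlee.dev']);

// Hypothetical maintenance hook: pause after 10 s, resume a minute later.
setTimeout(async () => {
    await crawler.autoscaledPool?.pause();
    setTimeout(() => crawler.autoscaledPool?.resume(), 60_000);
}, 10_000);

await finished;
```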
@@ -489,7 +489,7 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * and `autoscaledPool.isFinished` returns `true`.
      *
      * We can use the `requests` parameter to enqueue the initial requests — it is a shortcut for
-     * running {@
+     * running {@link BasicCrawler.addRequests|`crawler.addRequests()`} before {@link BasicCrawler.run|`crawler.run()`}.
      *
      * @param [requests] The requests to add.
      * @param [options] Options for the request queue.
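The `requests` shortcut in practice; the two forms below are equivalent (URLs are placeholders):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Visiting ${request.url}`);
    },
});

// Shortcut form:
await crawler.run(['https://crawlee.dev', 'https://apify.com']);

// Equivalent explicit form:
// await crawler.addRequests(['https://crawlee.dev', 'https://apify.com']);
// await crawler.run();
```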
@@ -516,19 +516,19 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     addRequests(requests: (string | Source)[], options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
     /**
-     * Pushes data to the specified {@
+     * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
      */
     pushData(data: Parameters<Dataset['pushData']>[0], datasetIdOrName?: string): Promise<void>;
     /**
-     * Retrieves the specified {@
+     * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
      */
     getDataset(idOrName?: string): Promise<Dataset>;
     /**
-     * Retrieves data from the default crawler {@
+     * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
      */
     getData(...args: Parameters<Dataset['getData']>): ReturnType<Dataset['getData']>;
     /**
-     * Retrieves all the data from the default crawler {@
+     * Retrieves all the data from the default crawler {@link Dataset} and exports them to the specified format.
      * Supported formats are currently 'json' and 'csv', and will be inferred from the `path` automatically.
      */
     exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
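A sketch of the `Dataset` helpers declared above: one record per handled request, exported at the end (the output path is arbitrary; the `.csv` extension selects the format):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        // Goes to the default crawler Dataset via Dataset.pushData().
        await pushData({ url: request.url, handledAt: new Date().toISOString() });
    },
});

await crawler.run(['https://crawlee.dev']);

// Format is inferred from the path, so this writes CSV.
const records = await crawler.exportData('./results.csv');
console.log(`Exported ${records.length} records`);
```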
@@ -636,9 +636,9 @@ interface HandlePropertyNameChangeData<New, Old> {
     allowUndefined?: boolean;
 }
 /**
- * Creates new {@
- * This instance can then serve as a {@
- * Defaults to the {@
+ * Creates new {@link Router} instance that works based on request labels.
+ * This instance can then serve as a {@link BasicCrawlerOptions.requestHandler|`requestHandler`} of our {@link BasicCrawler}.
+ * Defaults to the {@link BasicCrawlingContext}.
  *
  * > Serves as a shortcut for using `Router.create<BasicCrawlingContext>()`.
  *
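A usage sketch for the label-based router; `DETAIL` is a made-up label that requests would carry via `enqueueLinks({ label })` or the `label` property of an added request:

```ts
import { BasicCrawler, createBasicRouter } from '@crawlee/basic';

const router = createBasicRouter();

router.addHandler('DETAIL', async ({ request, log }) => {
    log.info(`Detail page: ${request.url}`);
});

router.addDefaultHandler(async ({ request, log }) => {
    log.info(`Unlabelled request: ${request.url}`);
});

// The router is itself a valid requestHandler.
const crawler = new BasicCrawler({ requestHandler: router });
await crawler.run(['https://crawlee.dev']);
```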
package/internals/basic-crawler.d.ts.map

@@ -1 +1 @@
(regenerated source map: the single-line machine-generated `mappings` payload was replaced)
package/internals/basic-crawler.js

@@ -33,29 +33,29 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
  * `BasicCrawler` is a low-level tool that requires the user to implement the page
  * download and data extraction functionality themselves.
  * If we want a crawler that already facilitates this functionality,
- * we should consider using {@
+ * we should consider using {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
  *
- * `BasicCrawler` invokes the user-provided {@
- * for each {@
- * The {@
- * instances provided by the {@
+ * `BasicCrawler` invokes the user-provided {@link BasicCrawlerOptions.requestHandler|`requestHandler`}
+ * for each {@link Request} object, which represents a single URL to crawl.
+ * The {@link Request} objects are fed from the {@link RequestList} or {@link RequestQueue}
+ * instances provided by the {@link BasicCrawlerOptions.requestList|`requestList`} or {@link BasicCrawlerOptions.requestQueue|`requestQueue`}
  * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided,
- * the crawler will open the default request queue either when the {@
- * or if `requests` parameter (representing the initial requests) of the {@
+ * the crawler will open the default request queue either when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called,
+ * or if `requests` parameter (representing the initial requests) of the {@link BasicCrawler.run|`crawler.run()`} function is provided.
  *
- * If both {@
- * the instance first processes URLs from the {@
- * to the {@
+ * If both {@link BasicCrawlerOptions.requestList|`requestList`} and {@link BasicCrawlerOptions.requestQueue|`requestQueue`} options are used,
+ * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
+ * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
  *
- * The crawler finishes if there are no more {@
+ * The crawler finishes if there are no more {@link Request} objects to crawl.
  *
  * New requests are only dispatched when there is enough free CPU and memory available,
- * using the functionality provided by the {@
- * All {@
+ * using the functionality provided by the {@link AutoscaledPool} class.
+ * All {@link AutoscaledPool} configuration options can be passed to the {@link BasicCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`}
  * parameter of the `BasicCrawler` constructor.
- * For user convenience, the {@
- * {@
- * underlying {@
+ * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and
+ * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the
+ * underlying {@link AutoscaledPool} constructor are available directly in the `BasicCrawler` constructor.
  *
  * **Example usage:**
  *
@@ -101,7 +101,7 @@ class BasicCrawler {
             value: config
         });
         /**
-         * A reference to the underlying {@
+         * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
          */
         Object.defineProperty(this, "stats", {
             enumerable: true,
@@ -110,7 +110,7 @@ class BasicCrawler {
             value: void 0
         });
         /**
-         * A reference to the underlying {@
+         * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
          * Only available if used by the crawler.
          */
         Object.defineProperty(this, "requestList", {
@@ -121,7 +121,7 @@ class BasicCrawler {
         });
         /**
          * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-         * A reference to the underlying {@
+         * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
          * Only available if used by the crawler.
          */
         Object.defineProperty(this, "requestQueue", {
@@ -131,7 +131,7 @@ class BasicCrawler {
             value: void 0
         });
         /**
-         * A reference to the underlying {@
+         * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
          * Only available if used by the crawler.
          */
         Object.defineProperty(this, "sessionPool", {
@@ -141,11 +141,11 @@ class BasicCrawler {
             value: void 0
         });
         /**
-         * A reference to the underlying {@
-         * > *NOTE:* This property is only initialized after calling the {@
+         * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
+         * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
          * We can use it to change the concurrency settings on the fly,
-         * to pause the crawler by calling {@
-         * or to abort it by calling {@
+         * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
+         * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
          */
         Object.defineProperty(this, "autoscaledPool", {
             enumerable: true,
@@ -154,8 +154,8 @@ class BasicCrawler {
             value: void 0
         });
         /**
-         * Default {@
-         * See {@
+         * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
+         * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
          */
         Object.defineProperty(this, "router", {
             enumerable: true,
@@ -433,7 +433,8 @@ class BasicCrawler {
         this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
         let shouldLogMaxPagesExceeded = true;
         const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
-
+        // eslint-disable-next-line prefer-const
+        let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
         // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
         if (keepAlive) {
             isFinishedFunction = async () => false;
@@ -452,7 +453,7 @@
                     }
                     return false;
                 }
-                return this._isTaskReadyFunction();
+                return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
             },
             isFinishedFunction: async () => {
                 if (isMaxPagesExceeded()) {
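These two hunks are the substantive change in this release: a user-supplied `isTaskReadyFunction` in `autoscaledPoolOptions` is now consulted, where the old code always fell through to the crawler's internal check. A sketch, assuming the hour-based gate is a stand-in for real readiness logic:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(request.url);
    },
    autoscaledPoolOptions: {
        // As of this version, the crawler delegates to this callback
        // (still subject to the maxRequestsPerCrawl guard above it).
        async isTaskReadyFunction() {
            // Placeholder gate: stop picking up new tasks during an
            // imaginary maintenance window at 03:00 UTC.
            return new Date().getUTCHours() !== 3;
        },
    },
});
```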
@@ -546,7 +547,7 @@
      * and `autoscaledPool.isFinished` returns `true`.
      *
      * We can use the `requests` parameter to enqueue the initial requests — it is a shortcut for
-     * running {@
+     * running {@link BasicCrawler.addRequests|`crawler.addRequests()`} before {@link BasicCrawler.run|`crawler.run()`}.
      *
      * @param [requests] The requests to add.
      * @param [options] Options for the request queue.
@@ -700,27 +701,27 @@
         return requestQueue.addRequestsBatched(allowedRequests, options);
     }
     /**
-     * Pushes data to the specified {@
+     * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
      */
     async pushData(data, datasetIdOrName) {
         const dataset = await this.getDataset(datasetIdOrName);
         return dataset.pushData(data);
     }
     /**
-     * Retrieves the specified {@
+     * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
      */
     async getDataset(idOrName) {
         return core_1.Dataset.open(idOrName, { config: this.config });
     }
     /**
-     * Retrieves data from the default crawler {@
+     * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
      */
     async getData(...args) {
         const dataset = await this.getDataset();
         return dataset.getData(...args);
     }
     /**
-     * Retrieves all the data from the default crawler {@
+     * Retrieves all the data from the default crawler {@link Dataset} and exports them to the specified format.
      * Supported formats are currently 'json' and 'csv', and will be inferred from the `path` automatically.
      */
     async exportData(path, format, options) {
@@ -964,7 +965,7 @@
         request.state = core_1.RequestState.REQUEST_HANDLER;
         await (0, timeout_1.addTimeoutToPromise)(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
         await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        this.stats.finishJob(statisticsId);
+        this.stats.finishJob(statisticsId, request.retryCount);
         this.handledRequestsCount++;
         // reclaim session if request finishes successfully
         request.state = core_1.RequestState.DONE;
@@ -1099,7 +1100,7 @@
         // Mark the request as failed and do not retry.
         this.handledRequestsCount++;
         await source.markRequestHandled(request);
-        this.stats.failJob(request.id || request.uniqueKey);
+        this.stats.failJob(request.id || request.uniqueKey, request.retryCount);
         await this._handleFailedRequestHandler(crawlingContext, error); // This function prints an error message.
     }
     async _tagUserHandlerError(cb) {
@@ -1335,9 +1336,9 @@ Object.defineProperty(BasicCrawler, "optionsShape", {
     }
 });
 /**
- * Creates new {@
- * This instance can then serve as a {@
- * Defaults to the {@
+ * Creates new {@link Router} instance that works based on request labels.
+ * This instance can then serve as a {@link BasicCrawlerOptions.requestHandler|`requestHandler`} of our {@link BasicCrawler}.
+ * Defaults to the {@link BasicCrawlingContext}.
  *
  * > Serves as a shortcut for using `Router.create<BasicCrawlingContext>()`.
  *