@crawlee/basic 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +292 -125
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +721 -354
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +3 -5
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +21 -25
- package/internals/send-request.js.map +1 -1
- package/package.json +7 -8
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,38 +1,13 @@
|
|
|
1
|
-
import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions,
|
|
2
|
-
import { AutoscaledPool,
|
|
3
|
-
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
|
|
1
|
+
import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, Configuration, CrawleeLogger, CrawlingContext, DatasetExportOptions, EnqueueLinksOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, IRequestManager, ProxyConfiguration, Request, RequestsLike, RouterHandler, RouterRoutes, SkippedRequestCallback, Source, StatisticsOptions, StatisticState, StorageIdentifier } from '@crawlee/core';
|
|
2
|
+
import { AutoscaledPool, ContextPipeline, Dataset, RequestProvider, Statistics } from '@crawlee/core';
|
|
3
|
+
import type { Awaitable, BaseHttpClient, BatchAddRequestsResult, Dictionary, ISession, ISessionPool, ProxyInfo, SetStatusMessageOptions, StorageClient } from '@crawlee/types';
|
|
4
4
|
import { RobotsTxtFile } from '@crawlee/utils';
|
|
5
|
-
import type { SetRequired } from 'type-fest';
|
|
6
|
-
import type { Log } from '@apify/log';
|
|
5
|
+
import type { ReadonlyDeep, SetRequired } from 'type-fest';
|
|
7
6
|
import { TimeoutError } from '@apify/timeout';
|
|
8
|
-
export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<
|
|
9
|
-
/**
|
|
10
|
-
* This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
|
|
11
|
-
* currently used by the crawler.
|
|
12
|
-
*
|
|
13
|
-
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
|
|
14
|
-
* and override settings of the enqueued {@link Request} objects.
|
|
15
|
-
*
|
|
16
|
-
* Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
|
|
17
|
-
* for more details regarding its usage.
|
|
18
|
-
*
|
|
19
|
-
* **Example usage**
|
|
20
|
-
*
|
|
21
|
-
* ```ts
|
|
22
|
-
* async requestHandler({ enqueueLinks }) {
|
|
23
|
-
* await enqueueLinks({
|
|
24
|
-
* urls: [...],
|
|
25
|
-
* });
|
|
26
|
-
* },
|
|
27
|
-
* ```
|
|
28
|
-
*
|
|
29
|
-
* @param [options] All `enqueueLinks()` parameters are passed via an options object.
|
|
30
|
-
* @returns Promise that resolves to {@link BatchAddRequestsResult} object.
|
|
31
|
-
*/
|
|
32
|
-
enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
|
|
7
|
+
export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
|
|
33
8
|
}
|
|
34
|
-
export type RequestHandler<Context extends CrawlingContext =
|
|
35
|
-
export type ErrorHandler<Context extends CrawlingContext =
|
|
9
|
+
export type RequestHandler<Context extends CrawlingContext = CrawlingContext> = (inputs: Context) => Awaitable<void>;
|
|
10
|
+
export type ErrorHandler<Context extends CrawlingContext = CrawlingContext, ExtendedContext extends Context = Context> = (inputs: Context & Partial<ExtendedContext>, error: Error) => Awaitable<void>;
|
|
36
11
|
export interface StatusMessageCallbackParams<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> {
|
|
37
12
|
state: StatisticState;
|
|
38
13
|
crawler: Crawler;
|
|
@@ -40,7 +15,10 @@ export interface StatusMessageCallbackParams<Context extends CrawlingContext = B
|
|
|
40
15
|
message: string;
|
|
41
16
|
}
|
|
42
17
|
export type StatusMessageCallback<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
|
|
43
|
-
export
|
|
18
|
+
export type RequireContextPipeline<DefaultContextType extends CrawlingContext, FinalContextType extends DefaultContextType> = DefaultContextType extends FinalContextType ? {} : {
|
|
19
|
+
contextPipelineBuilder: () => ContextPipeline<CrawlingContext, FinalContextType>;
|
|
20
|
+
};
|
|
21
|
+
export interface BasicCrawlerOptions<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
|
|
44
22
|
/**
|
|
45
23
|
* User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
|
|
46
24
|
*
|
|
@@ -58,7 +36,35 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
58
36
|
* The exceptions are logged to the request using the
|
|
59
37
|
* {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
|
|
60
38
|
*/
|
|
61
|
-
requestHandler?: RequestHandler<
|
|
39
|
+
requestHandler?: RequestHandler<ExtendedContext>;
|
|
40
|
+
/**
|
|
41
|
+
* Allows the user to extend the crawling context passed to the request handler with custom functionality.
|
|
42
|
+
*
|
|
43
|
+
* **Example usage:**
|
|
44
|
+
*
|
|
45
|
+
* ```javascript
|
|
46
|
+
* import { BasicCrawler } from 'crawlee';
|
|
47
|
+
*
|
|
48
|
+
* // Create a crawler instance
|
|
49
|
+
* const crawler = new BasicCrawler({
|
|
50
|
+
* extendContext: (context) => ({
|
|
51
|
+
* async customHelper() {
|
|
52
|
+
* await context.pushData({ url: context.request.url })
|
|
53
|
+
* }
|
|
54
|
+
* }),
|
|
55
|
+
* async requestHandler(context) {
|
|
56
|
+
* await context.customHelper();
|
|
57
|
+
* },
|
|
58
|
+
* });
|
|
59
|
+
* ```
|
|
60
|
+
*/
|
|
61
|
+
extendContext?: (context: Context) => Awaitable<ContextExtension>;
|
|
62
|
+
/**
|
|
63
|
+
* *Intended for BasicCrawler subclasses*. Prepares a context pipeline that transforms the initial crawling context into the shape given by the `Context` type parameter.
|
|
64
|
+
*
|
|
65
|
+
* The option is not required if your crawler subclass does not extend the crawling context with custom information or helpers.
|
|
66
|
+
*/
|
|
67
|
+
contextPipelineBuilder?: () => ContextPipeline<CrawlingContext, Context>;
|
|
62
68
|
/**
|
|
63
69
|
* Static list of URLs to be processed.
|
|
64
70
|
* If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
|
|
@@ -73,6 +79,13 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
73
79
|
* it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
|
|
74
80
|
*/
|
|
75
81
|
requestQueue?: RequestProvider;
|
|
82
|
+
/**
|
|
83
|
+
* Allows explicitly configuring a request manager. Mutually exclusive with the `requestQueue` and `requestList` options.
|
|
84
|
+
*
|
|
85
|
+
* This enables explicitly configuring the crawler to use `RequestManagerTandem`, for instance.
|
|
86
|
+
* If using this, the type of `BasicCrawler.requestQueue` may not be fully compatible with the `RequestProvider` class.
|
|
87
|
+
*/
|
|
88
|
+
requestManager?: IRequestManager;
|
|
76
89
|
/**
|
|
77
90
|
* Timeout in which the function passed as {@link BasicCrawlerOptions.requestHandler|`requestHandler`} needs to finish, in seconds.
|
|
78
91
|
* @default 60
|
|
@@ -87,7 +100,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
87
100
|
* Second argument is the `Error` instance that
|
|
88
101
|
* represents the last error thrown during processing of the request.
|
|
89
102
|
*/
|
|
90
|
-
errorHandler?: ErrorHandler<
|
|
103
|
+
errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
91
104
|
/**
|
|
92
105
|
* A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
|
|
93
106
|
*
|
|
@@ -96,14 +109,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
96
109
|
* Second argument is the `Error` instance that
|
|
97
110
|
* represents the last error thrown during processing of the request.
|
|
98
111
|
*/
|
|
99
|
-
failedRequestHandler?: ErrorHandler<
|
|
112
|
+
failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
100
113
|
/**
|
|
101
114
|
* Specifies the maximum number of retries allowed for a request if its processing fails.
|
|
102
|
-
* This includes retries due to navigation errors or errors thrown from user-supplied
|
|
103
|
-
* (`requestHandler`, `preNavigationHooks`, `postNavigationHooks`).
|
|
104
|
-
*
|
|
105
|
-
* This limit does not apply to retries triggered by session rotation
|
|
106
|
-
* (see {@link BasicCrawlerOptions.maxSessionRotations|`maxSessionRotations`}).
|
|
115
|
+
* This includes retries due to navigation errors, session/proxy errors, or errors thrown from user-supplied
|
|
116
|
+
* functions (`requestHandler`, `preNavigationHooks`, `postNavigationHooks`).
|
|
107
117
|
* @default 3
|
|
108
118
|
*/
|
|
109
119
|
maxRequestRetries?: number;
|
|
@@ -112,26 +122,24 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
112
122
|
* @default 0
|
|
113
123
|
*/
|
|
114
124
|
sameDomainDelaySecs?: number;
|
|
115
|
-
/**
|
|
116
|
-
* Maximum number of session rotations per request.
|
|
117
|
-
* The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by the website.
|
|
118
|
-
*
|
|
119
|
-
* The session rotations are not counted towards the {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} limit.
|
|
120
|
-
* @default 10
|
|
121
|
-
*/
|
|
122
|
-
maxSessionRotations?: number;
|
|
123
125
|
/**
|
|
124
126
|
* Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
|
|
125
127
|
* This value should always be set in order to prevent infinite loops in misconfigured crawlers.
|
|
126
128
|
* > *NOTE:* In cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
|
|
127
129
|
*/
|
|
128
130
|
maxRequestsPerCrawl?: number;
|
|
131
|
+
/**
|
|
132
|
+
* Maximum depth of the crawl. If not set, the crawl will continue until all requests are processed.
|
|
133
|
+
* Setting this to `0` will only process the initial requests, skipping all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests`.
|
|
134
|
+
* Passing `1` will process the initial requests and all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests` in the handler for initial requests.
|
|
135
|
+
*/
|
|
136
|
+
maxCrawlDepth?: number;
|
|
129
137
|
/**
|
|
130
138
|
* Custom options passed to the underlying {@link AutoscaledPool} constructor.
|
|
131
139
|
* > *NOTE:* The {@link AutoscaledPoolOptions.runTaskFunction|`runTaskFunction`}
|
|
132
|
-
* and
|
|
133
|
-
*
|
|
134
|
-
*
|
|
140
|
+
* option is provided by the crawler and cannot be overridden.
|
|
141
|
+
* However, we can provide custom implementations of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}
|
|
142
|
+
* and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`}.
|
|
135
143
|
*/
|
|
136
144
|
autoscaledPoolOptions?: AutoscaledPoolOptions;
|
|
137
145
|
/**
|
|
@@ -159,14 +167,14 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
159
167
|
*/
|
|
160
168
|
keepAlive?: boolean;
|
|
161
169
|
/**
|
|
162
|
-
*
|
|
163
|
-
*
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
*
|
|
170
|
+
* An existing session pool instance to use. When provided, the crawler will use this pool directly instead of
|
|
171
|
+
* creating a new one, enabling session sharing across multiple crawlers. The crawler will not tear down a shared
|
|
172
|
+
* pool — the caller is responsible for its lifecycle.
|
|
173
|
+
*
|
|
174
|
+
* Accepts the built-in {@link SessionPool} or any object implementing the {@link ISessionPool} interface,
|
|
175
|
+
* so custom session-management strategies can be plugged in.
|
|
168
176
|
*/
|
|
169
|
-
|
|
177
|
+
sessionPool?: ISessionPool;
|
|
170
178
|
/**
|
|
171
179
|
* Defines the length of the interval for calling the `setStatusMessage` in seconds.
|
|
172
180
|
*/
|
|
@@ -188,6 +196,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
188
196
|
* ```
|
|
189
197
|
*/
|
|
190
198
|
statusMessageCallback?: StatusMessageCallback;
|
|
199
|
+
/**
|
|
200
|
+
* HTTP status codes that indicate the session should be retired.
|
|
201
|
+
* @default [401, 403, 429]
|
|
202
|
+
*/
|
|
203
|
+
blockedStatusCodes?: number[];
|
|
191
204
|
/**
|
|
192
205
|
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
|
|
193
206
|
*
|
|
@@ -199,15 +212,22 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
199
212
|
/**
|
|
200
213
|
* If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain,
|
|
201
214
|
* and skip those that are not allowed. This also prevents disallowed URLs from being added via `enqueueLinks`.
|
|
215
|
+
*
|
|
216
|
+
* If an object is provided, it may contain a `userAgent` property to specify which user-agent
|
|
217
|
+
* should be used when checking the robots.txt file. If not provided, the default user-agent `*` will be used.
|
|
202
218
|
*/
|
|
203
|
-
respectRobotsTxtFile?: boolean
|
|
219
|
+
respectRobotsTxtFile?: boolean | {
|
|
220
|
+
userAgent?: string;
|
|
221
|
+
};
|
|
204
222
|
/**
|
|
205
223
|
* When a request is skipped for some reason, you can use this callback to act on it.
|
|
206
|
-
* This is currently fired
|
|
224
|
+
* This is currently fired for requests skipped
|
|
225
|
+
* 1. based on robots.txt file,
|
|
226
|
+
* 2. because they don't match enqueueLinks filters,
|
|
227
|
+
* 3. because they are redirected to a URL that doesn't match the enqueueLinks strategy,
|
|
228
|
+
* 4. or because the {@link BasicCrawlerOptions.maxRequestsPerCrawl|`maxRequestsPerCrawl`} limit has been reached.
|
|
207
229
|
*/
|
|
208
230
|
onSkippedRequest?: SkippedRequestCallback;
|
|
209
|
-
/** @internal */
|
|
210
|
-
log?: Log;
|
|
211
231
|
/**
|
|
212
232
|
* Enables experimental features of Crawlee, which can alter the behavior of the crawler.
|
|
213
233
|
* WARNING: these options are not guaranteed to be stable and may change or be removed at any time.
|
|
@@ -223,6 +243,53 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
|
|
|
223
243
|
* Defaults to a new instance of {@link GotScrapingHttpClient}
|
|
224
244
|
*/
|
|
225
245
|
httpClient?: BaseHttpClient;
|
|
246
|
+
/**
|
|
247
|
+
* If set, the crawler will be configured for all connections to use
|
|
248
|
+
* the Proxy URLs provided and rotated according to the configuration.
|
|
249
|
+
*/
|
|
250
|
+
proxyConfiguration?: ProxyConfiguration;
|
|
251
|
+
/**
|
|
252
|
+
* Custom configuration to use for this crawler.
|
|
253
|
+
* If provided, the crawler will use its own ServiceLocator instance instead of the global one.
|
|
254
|
+
*/
|
|
255
|
+
configuration?: Configuration;
|
|
256
|
+
/**
|
|
257
|
+
* Custom storage client to use for this crawler.
|
|
258
|
+
* If provided, the crawler will use its own ServiceLocator instance instead of the global one.
|
|
259
|
+
*/
|
|
260
|
+
storageClient?: StorageClient;
|
|
261
|
+
/**
|
|
262
|
+
* Custom event manager to use for this crawler.
|
|
263
|
+
* If provided, the crawler will use its own ServiceLocator instance instead of the global one.
|
|
264
|
+
*/
|
|
265
|
+
eventManager?: EventManager;
|
|
266
|
+
/**
|
|
267
|
+
* Custom logger to use for this crawler.
|
|
268
|
+
* If provided, the crawler will use its own ServiceLocator instance instead of the global one.
|
|
269
|
+
*/
|
|
270
|
+
logger?: CrawleeLogger;
|
|
271
|
+
/**
|
|
272
|
+
* A unique identifier for the crawler instance. This ID is used to isolate the state returned by
|
|
273
|
+
* {@link BasicCrawler.useState|`crawler.useState()`} from other crawler instances.
|
|
274
|
+
*
|
|
275
|
+
* When multiple crawler instances use `useState()` without an explicit `id`, they will share the same
|
|
276
|
+
* state object for backward compatibility. A warning will be logged in this case.
|
|
277
|
+
*
|
|
278
|
+
* To ensure each crawler has its own isolated state that also persists across script restarts
|
|
279
|
+
* (e.g., during Apify migrations), provide a stable, unique ID for each crawler instance.
|
|
280
|
+
*
|
|
281
|
+
*/
|
|
282
|
+
id?: string;
|
|
283
|
+
/**
|
|
284
|
+
* An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
|
|
285
|
+
* By default, status codes >= 500 trigger errors.
|
|
286
|
+
*/
|
|
287
|
+
ignoreHttpErrorStatusCodes?: number[];
|
|
288
|
+
/**
|
|
289
|
+
* An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
|
|
290
|
+
* By default, status codes >= 500 trigger errors.
|
|
291
|
+
*/
|
|
292
|
+
additionalHttpErrorStatusCodes?: number[];
|
|
226
293
|
}
|
|
227
294
|
/**
|
|
228
295
|
* A set of options that you can toggle to enable experimental features in Crawlee.
|
|
@@ -303,9 +370,14 @@ export interface CrawlerExperiments {
|
|
|
303
370
|
* ```
|
|
304
371
|
* @category Crawlers
|
|
305
372
|
*/
|
|
306
|
-
export declare class BasicCrawler<Context extends CrawlingContext =
|
|
307
|
-
|
|
373
|
+
export declare class BasicCrawler<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
|
|
374
|
+
#private;
|
|
308
375
|
protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
|
|
376
|
+
/**
|
|
377
|
+
* Tracks crawler instances that accessed shared state without having an explicit id.
|
|
378
|
+
* Used to detect and warn about multiple crawlers sharing the same state.
|
|
379
|
+
*/
|
|
380
|
+
private static useStateCrawlerIds;
|
|
309
381
|
/**
|
|
310
382
|
* A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
|
|
311
383
|
*/
|
|
@@ -322,10 +394,21 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
322
394
|
*/
|
|
323
395
|
requestQueue?: RequestProvider;
|
|
324
396
|
/**
|
|
325
|
-
*
|
|
326
|
-
|
|
397
|
+
* The main request-handling component of the crawler. It's initialized during the crawler startup.
|
|
398
|
+
*/
|
|
399
|
+
protected requestManager?: IRequestManager;
|
|
400
|
+
/**
|
|
401
|
+
* A reference to the underlying session pool that manages the crawler's {@link Session|sessions}. Typed as
|
|
402
|
+
* {@link ISessionPool} so custom implementations can be plugged in via the `sessionPool` constructor option.
|
|
327
403
|
*/
|
|
328
|
-
sessionPool
|
|
404
|
+
sessionPool: ISessionPool;
|
|
405
|
+
/**
|
|
406
|
+
* Set when the crawler constructed its own {@link SessionPool} (no `sessionPool` option was provided).
|
|
407
|
+
* Holds the same instance as `sessionPool`, but typed as the concrete class so the crawler can call
|
|
408
|
+
* lifecycle methods (`resetStore`, `teardown`) that aren't part of {@link ISessionPool}. A user-supplied
|
|
409
|
+
* pool is never owned and never torn down by the crawler.
|
|
410
|
+
*/
|
|
411
|
+
private ownedSessionPool?;
|
|
329
412
|
/**
|
|
330
413
|
* A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
|
|
331
414
|
* > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
|
|
@@ -334,40 +417,70 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
334
417
|
* or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
|
|
335
418
|
*/
|
|
336
419
|
autoscaledPool?: AutoscaledPool;
|
|
420
|
+
/**
|
|
421
|
+
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
422
|
+
* Only available if used by the crawler.
|
|
423
|
+
*/
|
|
424
|
+
proxyConfiguration?: ProxyConfiguration;
|
|
337
425
|
/**
|
|
338
426
|
* Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
|
|
339
427
|
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
|
|
340
428
|
*/
|
|
341
|
-
readonly router: RouterHandler<
|
|
429
|
+
readonly router: RouterHandler<Context>;
|
|
430
|
+
private _basicContextPipeline?;
|
|
431
|
+
/**
|
|
432
|
+
* The basic part of the context pipeline. Unlike the subclass pipeline, this
|
|
433
|
+
* part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
|
|
434
|
+
* pipelines expect the basic crawler fields to already be present in the context at runtime.
|
|
435
|
+
*
|
|
436
|
+
* Context built with this pipeline can be passed into multiple crawler pipelines at once.
|
|
437
|
+
* This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
|
|
438
|
+
*/
|
|
439
|
+
get basicContextPipeline(): ContextPipeline<{
|
|
440
|
+
request: Request;
|
|
441
|
+
}, CrawlingContext>;
|
|
442
|
+
private _contextPipeline?;
|
|
443
|
+
get contextPipeline(): ContextPipeline<CrawlingContext, ExtendedContext>;
|
|
342
444
|
running: boolean;
|
|
343
445
|
hasFinishedBefore: boolean;
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
protected
|
|
347
|
-
protected
|
|
446
|
+
protected unexpectedStop: boolean;
|
|
447
|
+
get log(): CrawleeLogger;
|
|
448
|
+
protected requestHandler: RequestHandler<ExtendedContext>;
|
|
449
|
+
protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
450
|
+
protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
|
|
348
451
|
protected requestHandlerTimeoutMillis: number;
|
|
349
452
|
protected internalTimeoutMillis: number;
|
|
350
453
|
protected maxRequestRetries: number;
|
|
454
|
+
protected maxCrawlDepth?: number;
|
|
351
455
|
protected sameDomainDelayMillis: number;
|
|
352
456
|
protected domainAccessedTime: Map<string, number>;
|
|
353
|
-
protected
|
|
457
|
+
protected maxRequestsPerCrawl?: number;
|
|
354
458
|
protected handledRequestsCount: number;
|
|
355
459
|
protected statusMessageLoggingInterval: number;
|
|
356
460
|
protected statusMessageCallback?: StatusMessageCallback;
|
|
357
|
-
protected
|
|
358
|
-
protected
|
|
359
|
-
protected
|
|
461
|
+
protected blockedStatusCodes: Set<number>;
|
|
462
|
+
protected additionalHttpErrorStatusCodes: Set<number>;
|
|
463
|
+
protected ignoreHttpErrorStatusCodes: Set<number>;
|
|
360
464
|
protected autoscaledPoolOptions: AutoscaledPoolOptions;
|
|
361
|
-
protected events: EventManager;
|
|
362
465
|
protected httpClient: BaseHttpClient;
|
|
363
466
|
protected retryOnBlocked: boolean;
|
|
364
|
-
protected respectRobotsTxtFile: boolean
|
|
467
|
+
protected respectRobotsTxtFile: boolean | {
|
|
468
|
+
userAgent?: string;
|
|
469
|
+
};
|
|
365
470
|
protected onSkippedRequest?: SkippedRequestCallback;
|
|
366
471
|
private _closeEvents?;
|
|
472
|
+
private loggedPerRun;
|
|
367
473
|
private experiments;
|
|
368
474
|
private readonly robotsTxtFileCache;
|
|
369
475
|
private _experimentWarnings;
|
|
476
|
+
private readonly crawlerId;
|
|
477
|
+
private readonly hasExplicitId;
|
|
478
|
+
private readonly contextPipelineOptions;
|
|
370
479
|
protected static optionsShape: {
|
|
480
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
481
|
+
contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
482
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
483
|
+
extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
371
484
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
372
485
|
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
373
486
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -384,28 +497,42 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
384
497
|
maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
385
498
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
386
499
|
sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
387
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
388
|
-
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
389
500
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
390
501
|
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
502
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
503
|
+
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
391
504
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
392
505
|
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
393
506
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
394
|
-
|
|
507
|
+
sessionPool: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
395
508
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
396
|
-
|
|
509
|
+
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
397
510
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
398
511
|
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
399
512
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
400
513
|
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
514
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
515
|
+
additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
516
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
517
|
+
ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
|
|
518
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
519
|
+
blockedStatusCodes: import("ow").ArrayPredicate<number>;
|
|
401
520
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
402
521
|
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
403
522
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
404
|
-
respectRobotsTxtFile: import("ow").
|
|
523
|
+
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
|
|
405
524
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
406
525
|
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
|
|
407
526
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
408
527
|
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
528
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
529
|
+
configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
530
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
531
|
+
storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
532
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
533
|
+
eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
534
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
535
|
+
logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
409
536
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
410
537
|
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
411
538
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
@@ -414,17 +541,42 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
414
541
|
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
|
|
415
542
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
416
543
|
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
|
|
417
|
-
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
418
|
-
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
419
544
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
420
545
|
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
421
546
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
422
547
|
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
|
|
548
|
+
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
549
|
+
id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
|
|
423
550
|
};
|
|
424
551
|
/**
|
|
425
552
|
* All `BasicCrawler` parameters are passed via an options object.
|
|
426
553
|
*/
|
|
427
|
-
constructor(options?: BasicCrawlerOptions<Context
|
|
554
|
+
constructor(options?: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<CrawlingContext, Context>);
|
|
555
|
+
/**
|
|
556
|
+
* Determines if the given HTTP status code is an error status code given
|
|
557
|
+
* the default behaviour and user-set preferences.
|
|
558
|
+
* @param status
|
|
559
|
+
* @returns `true` if the status code is considered an error, `false` otherwise
|
|
560
|
+
*/
|
|
561
|
+
protected isErrorStatusCode(status: number): boolean;
|
|
562
|
+
/**
|
|
563
|
+
* Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
|
|
564
|
+
* This handles base context creation, session resolution, and context helpers.
|
|
565
|
+
*/
|
|
566
|
+
protected buildBasicContextPipeline(): ContextPipeline<{
|
|
567
|
+
request: Request;
|
|
568
|
+
}, CrawlingContext>;
|
|
569
|
+
private checkRobotsTxt;
|
|
570
|
+
/**
|
|
571
|
+
* Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
|
|
572
|
+
* Subclasses should override this to add their own pipeline stages.
|
|
573
|
+
*/
|
|
574
|
+
protected buildContextPipeline(): ContextPipeline<CrawlingContext, CrawlingContext>;
|
|
575
|
+
private createBaseContext;
|
|
576
|
+
private resolveRequest;
|
|
577
|
+
private resolveSession;
|
|
578
|
+
private createContextHelpers;
|
|
579
|
+
private buildFinalContextPipeline;
|
|
428
580
|
/**
|
|
429
581
|
* Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
|
|
430
582
|
* Used for retrying requests that failed due to proxy errors.
|
|
@@ -432,12 +584,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
432
584
|
* @param error The error to check.
|
|
433
585
|
*/
|
|
434
586
|
protected isProxyError(error: Error): boolean;
|
|
435
|
-
/**
|
|
436
|
-
* Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
|
|
437
|
-
* Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
|
|
438
|
-
* @param _crawlingContext The crawling context to check.
|
|
439
|
-
*/
|
|
440
|
-
protected isRequestBlocked(_crawlingContext: Context): Promise<string | false>;
|
|
441
587
|
/**
|
|
442
588
|
* This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
|
|
443
589
|
*/
|
|
@@ -453,15 +599,21 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
453
599
|
* @param [requests] The requests to add.
|
|
454
600
|
* @param [options] Options for the request queue.
|
|
455
601
|
*/
|
|
456
|
-
run(requests?:
|
|
602
|
+
run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise<FinalStatistics>;
|
|
457
603
|
/**
|
|
458
604
|
* Gracefully stops the current run of the crawler.
|
|
459
605
|
*
|
|
460
606
|
* All the tasks active at the time of calling this method will be allowed to finish.
|
|
607
|
+
*
|
|
608
|
+
* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
|
|
461
609
|
*/
|
|
462
|
-
stop(
|
|
610
|
+
stop(reason?: string): void;
|
|
463
611
|
getRequestQueue(): Promise<RequestProvider>;
|
|
464
612
|
useState<State extends Dictionary = Dictionary>(defaultValue?: State): Promise<State>;
|
|
613
|
+
protected get pendingRequestCountApproximation(): number;
|
|
614
|
+
protected calculateEnqueuedRequestLimit(explicitLimit?: number): number | undefined;
|
|
615
|
+
protected handleSkippedRequest(options: Parameters<SkippedRequestCallback>[0]): Promise<void>;
|
|
616
|
+
private logOncePerRun;
|
|
465
617
|
/**
|
|
466
618
|
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
467
619
|
* adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
@@ -473,15 +625,15 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
473
625
|
* @param requests The requests to add
|
|
474
626
|
* @param options Options for the request queue
|
|
475
627
|
*/
|
|
476
|
-
addRequests(requests:
|
|
628
|
+
addRequests(requests: ReadonlyDeep<RequestsLike>, options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
|
|
477
629
|
/**
|
|
478
630
|
* Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
|
|
479
631
|
*/
|
|
480
|
-
pushData(data: Parameters<Dataset['pushData']>[0],
|
|
632
|
+
pushData(data: Parameters<Dataset['pushData']>[0], datasetIdentifier?: string | StorageIdentifier): Promise<void>;
|
|
481
633
|
/**
|
|
482
634
|
* Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
|
|
483
635
|
*/
|
|
484
|
-
getDataset(
|
|
636
|
+
getDataset(identifier?: string | StorageIdentifier): Promise<Dataset>;
|
|
485
637
|
/**
|
|
486
638
|
* Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
|
|
487
639
|
*/
|
|
@@ -491,41 +643,52 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
491
643
|
* Supported formats are currently 'json' and 'csv', and will be inferred from the `path` automatically.
|
|
492
644
|
*/
|
|
493
645
|
exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
|
|
646
|
+
/**
|
|
647
|
+
* Initializes the crawler.
|
|
648
|
+
*/
|
|
494
649
|
protected _init(): Promise<void>;
|
|
495
|
-
protected
|
|
650
|
+
protected runRequestHandler(crawlingContext: ExtendedContext): Promise<void>;
|
|
496
651
|
/**
|
|
497
652
|
* Handles blocked request
|
|
498
653
|
*/
|
|
499
|
-
protected _throwOnBlockedRequest(
|
|
654
|
+
protected _throwOnBlockedRequest(statusCode: number): void;
|
|
500
655
|
private isAllowedBasedOnRobotsTxtFile;
|
|
501
656
|
protected getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined>;
|
|
502
657
|
protected _pauseOnMigration(): Promise<void>;
|
|
503
658
|
/**
|
|
504
|
-
*
|
|
505
|
-
* and RequestQueue is present then enqueues it to the queue first.
|
|
659
|
+
* Initializes the RequestManager based on the configured requestList and requestQueue.
|
|
506
660
|
*/
|
|
507
|
-
|
|
661
|
+
private initializeRequestManager;
|
|
508
662
|
/**
|
|
509
|
-
*
|
|
510
|
-
* Can be used to clean up orphaned browser pages.
|
|
663
|
+
* Fetches the next request to process from the underlying request provider.
|
|
511
664
|
*/
|
|
512
|
-
protected
|
|
665
|
+
protected _fetchNextRequest(): Promise<Request<Dictionary> | null>;
|
|
513
666
|
/**
|
|
514
667
|
* Delays processing of the request based on the `sameDomainDelaySecs` option,
|
|
515
668
|
* adding it back to the queue after the timeout passes. Returns `true` if the request
|
|
516
669
|
* should be ignored and will be reclaimed to the queue once ready.
|
|
517
670
|
*/
|
|
518
|
-
protected delayRequest(request: Request, source: IRequestList | RequestProvider): boolean;
|
|
671
|
+
protected delayRequest(request: Request, source: IRequestList | RequestProvider | IRequestManager): boolean;
|
|
672
|
+
/** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
|
|
673
|
+
protected handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager, request: Request): Promise<void>;
|
|
674
|
+
/**
|
|
675
|
+
* Wrapper around the crawling context's `enqueueLinks` method:
|
|
676
|
+
* - Injects `crawlDepth` to each request being added based on the crawling context request.
|
|
677
|
+
* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
|
|
678
|
+
* - These options can be overridden by the user.
|
|
679
|
+
* @internal
|
|
680
|
+
*/
|
|
681
|
+
protected enqueueLinksWithCrawlDepth(options: SetRequired<EnqueueLinksOptions, 'urls'>, request: Request<Dictionary>, requestQueue: RequestProvider): Promise<BatchAddRequestsResult>;
|
|
519
682
|
/**
|
|
520
|
-
*
|
|
521
|
-
*
|
|
683
|
+
* Generator function that yields requests injected with the given crawl depth.
|
|
684
|
+
* @internal
|
|
522
685
|
*/
|
|
523
|
-
protected
|
|
686
|
+
protected addCrawlDepthRequestGenerator(requests: RequestsLike, newRequestDepth: number): AsyncGenerator<Source, void, undefined>;
|
|
524
687
|
/**
|
|
525
|
-
* Run async callback with given timeout and retry.
|
|
688
|
+
* Run async callback with given timeout and retry. Returns the result of the callback.
|
|
526
689
|
* @ignore
|
|
527
690
|
*/
|
|
528
|
-
protected _timeoutAndRetry(handler: () => Promise<
|
|
691
|
+
protected _timeoutAndRetry<T>(handler: () => Promise<T>, timeout: number, error: Error | string, maxRetries?: number, retried?: number): Promise<T>;
|
|
529
692
|
/**
|
|
530
693
|
* Returns true if either RequestList or RequestQueue have a request ready for processing.
|
|
531
694
|
*/
|
|
@@ -534,13 +697,19 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
534
697
|
* Returns true if both RequestList and RequestQueue have all requests finished.
|
|
535
698
|
*/
|
|
536
699
|
protected _defaultIsFinishedFunction(): Promise<boolean>;
|
|
537
|
-
|
|
700
|
+
/**
|
|
701
|
+
* Unwraps errors thrown by the context pipeline to get the actual user error.
|
|
702
|
+
* RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
|
|
703
|
+
*/
|
|
704
|
+
private unwrapError;
|
|
538
705
|
/**
|
|
539
706
|
* Handles errors thrown by user provided requestHandler()
|
|
707
|
+
*
|
|
708
|
+
* @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
|
|
540
709
|
*/
|
|
541
|
-
protected _requestFunctionErrorHandler(error: Error, crawlingContext:
|
|
710
|
+
protected _requestFunctionErrorHandler(error: Error, crawlingContext: CrawlingContext, request: Request, source: IRequestList | IRequestManager): Promise<void>;
|
|
542
711
|
protected _tagUserHandlerError<T>(cb: () => unknown): Promise<T>;
|
|
543
|
-
protected _handleFailedRequestHandler(crawlingContext:
|
|
712
|
+
protected _handleFailedRequestHandler(crawlingContext: CrawlingContext, error: Error): Promise<void>;
|
|
544
713
|
/**
|
|
545
714
|
* Resolves the most verbose error message from a thrown error
|
|
546
715
|
* @param error The error received
|
|
@@ -549,27 +718,25 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
|
|
|
549
718
|
protected _getMessageFromError(error: Error, forceStack?: boolean): string | TimeoutError | undefined;
|
|
550
719
|
protected _canRequestBeRetried(request: Request, error: Error): boolean;
|
|
551
720
|
/**
|
|
552
|
-
* Updates handledRequestsCount from possibly stored counts,
|
|
553
|
-
* usually after worker migration. Since one of the stores
|
|
554
|
-
* needs to have priority when both are present,
|
|
555
|
-
* it is the request queue, because generally, the request
|
|
556
|
-
* list will first be dumped into the queue and then left
|
|
557
|
-
* empty.
|
|
721
|
+
* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
|
|
558
722
|
*/
|
|
559
723
|
protected _loadHandledRequestCount(): Promise<void>;
|
|
560
724
|
protected _executeHooks<HookLike extends (...args: any[]) => Awaitable<void>>(hooks: HookLike[], ...args: Parameters<HookLike>): Promise<void>;
|
|
561
725
|
/**
|
|
562
|
-
*
|
|
563
|
-
*
|
|
726
|
+
* Stops the crawler immediately.
|
|
727
|
+
*
|
|
728
|
+
* This method doesn't wait for currently active requests to finish.
|
|
729
|
+
*
|
|
730
|
+
* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
|
|
564
731
|
*/
|
|
565
732
|
teardown(): Promise<void>;
|
|
566
733
|
protected _getCookieHeaderFromRequest(request: Request): string;
|
|
567
734
|
private _getRequestQueue;
|
|
568
|
-
|
|
735
|
+
private requestMatchesEnqueueStrategy;
|
|
569
736
|
}
|
|
570
737
|
export interface CreateContextOptions {
|
|
571
738
|
request: Request;
|
|
572
|
-
session
|
|
739
|
+
session: ISession;
|
|
573
740
|
proxyInfo?: ProxyInfo;
|
|
574
741
|
}
|
|
575
742
|
export interface CrawlerAddRequestsOptions extends AddRequestsBatchedOptions {
|