@crawlee/basic 4.0.0-beta.10 → 4.0.0-beta.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +0 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +65 -57
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +77 -40
- package/internals/basic-crawler.js.map +1 -1
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +1 -1
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
package/index.d.ts
CHANGED
package/index.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}
package/index.js
CHANGED
package/index.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}
package/internals/basic-crawler.d.ts
CHANGED
@@ -1,38 +1,14 @@
-import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions,
-import { AutoscaledPool, Configuration, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
-import type { Awaitable,
+import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, ProxyInfo, Request, RequestOptions, RouterHandler, RouterRoutes, Session, SessionPoolOptions, SkippedRequestCallback, Source, StatisticsOptions, StatisticState } from '@crawlee/core';
+import { AutoscaledPool, Configuration, ContextPipeline, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
+import type { Awaitable, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
 import { RobotsTxtFile } from '@crawlee/utils';
-import type {
+import type { ReadonlyDeep } from 'type-fest';
 import type { Log } from '@apify/log';
 import { TimeoutError } from '@apify/timeout';
-export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<
-    /**
-     * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
-     * currently used by the crawler.
-     *
-     * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
-     * and override settings of the enqueued {@link Request} objects.
-     *
-     * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
-     * for more details regarding its usage.
-     *
-     * **Example usage**
-     *
-     * ```ts
-     * async requestHandler({ enqueueLinks }) {
-     *     await enqueueLinks({
-     *         urls: [...],
-     *     });
-     * },
-     * ```
-     *
-     * @param [options] All `enqueueLinks()` parameters are passed via an options object.
-     * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
-     */
-    enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
+export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
 }
-export type RequestHandler<Context extends CrawlingContext =
-export type ErrorHandler<Context extends CrawlingContext =
+export type RequestHandler<Context extends CrawlingContext = CrawlingContext> = (inputs: Context) => Awaitable<void>;
+export type ErrorHandler<Context extends CrawlingContext = CrawlingContext, ExtendedContext extends Context = Context> = (inputs: Context & Partial<ExtendedContext>, error: Error) => Awaitable<void>;
 export interface StatusMessageCallbackParams<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> {
     state: StatisticState;
     crawler: Crawler;
@@ -40,7 +16,10 @@ export interface StatusMessageCallbackParams<Context extends CrawlingContext = B
     message: string;
 }
 export type StatusMessageCallback<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
-export
+export type RequireContextPipeline<DefaultContextType extends CrawlingContext, FinalContextType extends DefaultContextType> = DefaultContextType extends FinalContextType ? {} : {
+    contextPipelineBuilder: () => ContextPipeline<CrawlingContext, FinalContextType>;
+};
+export interface BasicCrawlerOptions<Context extends CrawlingContext = CrawlingContext, ContextExtension = {}, ExtendedContext extends Context = Context & ContextExtension> {
     /**
      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
     *
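The new `RequireContextPipeline` helper in the hunk above is what makes `contextPipelineBuilder` mandatory only for crawlers that actually reshape the context. A type-level sketch of how the conditional resolves, assuming the type is re-exported from the package root like the other declarations (`MyContext` is a hypothetical subclass context, not part of the package):

```ts
import type { ContextPipeline, CrawlingContext } from '@crawlee/core';
import type { RequireContextPipeline } from '@crawlee/basic';

// Plain BasicCrawler: Context is CrawlingContext, so the conditional resolves
// to {} and no extra constructor option is required.
type ForPlainCrawler = RequireContextPipeline<CrawlingContext, CrawlingContext>; // {}

// Hypothetical subclass context with one extra field.
interface MyContext extends CrawlingContext {
    body: string;
}

// Here the conditional takes the other branch, so the constructor options
// gain a mandatory contextPipelineBuilder.
type ForSubclass = RequireContextPipeline<CrawlingContext, MyContext>;
// => { contextPipelineBuilder: () => ContextPipeline<CrawlingContext, MyContext> }
```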
@@ -58,7 +37,35 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * The exceptions are logged to the request using the
      * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
-    requestHandler?: RequestHandler<
+    requestHandler?: RequestHandler<ExtendedContext>;
+    /**
+     * Allows the user to extend the crawling context passed to the request handler with custom functionality.
+     *
+     * **Example usage:**
+     *
+     * ```javascript
+     * import { BasicCrawler } from 'crawlee';
+     *
+     * // Create a crawler instance
+     * const crawler = new BasicCrawler({
+     *     extendContext: (context) => ({
+     *         async customHelper() {
+     *             await context.pushData({ url: context.request.url })
+     *         }
+     *     }),
+     *     async requestHandler(context) {
+     *         await context.customHelper();
+     *     },
+     * });
+     * ```
+     */
+    extendContext?: (context: Context) => Awaitable<ContextExtension>;
+    /**
+     * *Intended for BasicCrawler subclasses*. Prepares a context pipeline that transforms the initial crawling context into the shape given by the `Context` type parameter.
+     *
+     * The option is not required if your crawler subclass does not extend the crawling context with custom information or helpers.
+     */
+    contextPipelineBuilder?: () => ContextPipeline<CrawlingContext, Context>;
     /**
      * Static list of URLs to be processed.
      * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
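The generics flow like this: the return type of `extendContext` becomes `ContextExtension`, and the request handler receives `ExtendedContext = Context & ContextExtension`. A sketch with the type parameters written out explicitly (normally they are inferred), assuming the declared defaults behave as written:

```ts
import { BasicCrawler, type BasicCrawlingContext } from '@crawlee/basic';

interface Helpers {
    saveUrl(): Promise<void>;
}

const crawler = new BasicCrawler<BasicCrawlingContext, Helpers>({
    extendContext: (context) => ({
        async saveUrl() {
            await context.pushData({ url: context.request.url });
        },
    }),
    async requestHandler(context) {
        // context: BasicCrawlingContext & Helpers
        await context.saveUrl();
    },
});
```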
@@ -87,7 +94,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    errorHandler?: ErrorHandler<
+    errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
     *
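Note the second type parameter: per the new `ErrorHandler` signature, the handler's first argument is typed `Context & Partial<ExtendedContext>`, because the context pipeline may fail before the extension fields exist. A hedged sketch of what that means in practice (`startedAt` is our own illustrative field):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    extendContext: () => ({ startedAt: Date.now() }),
    errorHandler: async (context, error) => {
        // Extension fields are optional here: the pipeline may have thrown
        // before attaching them, so guard before use.
        if (context.startedAt !== undefined) {
            context.log.warning(`Retrying after ${Date.now() - context.startedAt} ms: ${error.message}`);
        }
    },
});
```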
@@ -96,7 +103,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    failedRequestHandler?: ErrorHandler<
+    failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * Specifies the maximum number of retries allowed for a request if its processing fails.
      * This includes retries due to navigation errors or errors thrown from user-supplied functions
@@ -303,7 +310,7 @@ export interface CrawlerExperiments {
  * ```
  * @category Crawlers
  */
-export declare class BasicCrawler<Context extends CrawlingContext =
+export declare class BasicCrawler<Context extends CrawlingContext = CrawlingContext, ContextExtension = {}, ExtendedContext extends Context = Context & ContextExtension> {
     readonly config: Configuration;
     protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
     /**
@@ -338,13 +345,16 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
-    readonly router: RouterHandler<
+    readonly router: RouterHandler<Context>;
+    private contextPipelineBuilder;
+    private _contextPipeline?;
+    get contextPipeline(): ContextPipeline<CrawlingContext, ExtendedContext>;
     running: boolean;
     hasFinishedBefore: boolean;
     readonly log: Log;
-    protected requestHandler: RequestHandler<
-    protected errorHandler?: ErrorHandler<
-    protected failedRequestHandler?: ErrorHandler<
+    protected requestHandler: RequestHandler<ExtendedContext>;
+    protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
+    protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     protected requestHandlerTimeoutMillis: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
@@ -356,7 +366,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     protected statusMessageCallback?: StatusMessageCallback;
     protected sessionPoolOptions: SessionPoolOptions;
     protected useSessionPool: boolean;
-    protected crawlingContexts: Map<string, Context>;
     protected autoscaledPoolOptions: AutoscaledPoolOptions;
     protected events: EventManager;
     protected httpClient: BaseHttpClient;
@@ -368,6 +377,10 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     private readonly robotsTxtFileCache;
     private _experimentWarnings;
     protected static optionsShape: {
+        // @ts-ignore optional peer dependency or compatibility with es2022
+        contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+        // @ts-ignore optional peer dependency or compatibility with es2022
+        extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
         requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
@@ -424,7 +437,8 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options?: BasicCrawlerOptions<Context>,
+    constructor(options?: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<CrawlingContext, Context>, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+    config?: Configuration);
     /**
      * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
      * Used for retrying requests that failed due to proxy errors.
@@ -432,12 +446,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param error The error to check.
      */
     protected isProxyError(error: Error): boolean;
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    protected isRequestBlocked(_crawlingContext: Context): Promise<string | false>;
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -473,7 +481,7 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param requests The requests to add
      * @param options Options for the request queue
      */
-    addRequests(requests: (string | Source)[]
+    addRequests(requests: ReadonlyDeep<(string | Source)[]>, options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
     /**
      * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
     */
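With the parameter type widened to `ReadonlyDeep<(string | Source)[]>`, deeply readonly inputs such as `as const` arrays should now type-check. A small sketch:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    requestHandler: async ({ request, log }) => log.info(request.url),
});

const startUrls = ['https://example.com', 'https://example.org'] as const;

// The previous mutable-array parameter type rejected readonly tuples like this.
await crawler.addRequests(startUrls);
```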
@@ -492,7 +500,7 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
     protected _init(): Promise<void>;
-    protected
+    protected runRequestHandler(crawlingContext: CrawlingContext): Promise<void>;
     /**
      * Handles blocked request
      */
@@ -505,11 +513,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * and RequestQueue is present then enqueues it to the queue first.
      */
     protected _fetchNextRequest(): Promise<Request<Dictionary> | null | undefined>;
-    /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
-     */
-    protected _cleanupContext(_crawlingContext: Context): Promise<void>;
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -535,12 +538,17 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     protected _defaultIsFinishedFunction(): Promise<boolean>;
     private _rotateSession;
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    private unwrapError;
     /**
      * Handles errors thrown by user provided requestHandler()
      */
-    protected _requestFunctionErrorHandler(error: Error, crawlingContext:
+    protected _requestFunctionErrorHandler(error: Error, crawlingContext: CrawlingContext, source: IRequestList | RequestProvider): Promise<void>;
     protected _tagUserHandlerError<T>(cb: () => unknown): Promise<T>;
-    protected _handleFailedRequestHandler(crawlingContext:
+    protected _handleFailedRequestHandler(crawlingContext: CrawlingContext, error: Error): Promise<void>;
     /**
      * Resolves the most verbose error message from a thrown error
      * @param error The error received
package/internals/basic-crawler.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"basic-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/basic-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACR,yBAAyB,EACzB,wBAAwB,EACxB,qBAAqB,EACrB,cAAc,EACd,eAAe,EACf,oBAAoB,EACpB,
+{"version":3,"file":"basic-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/basic-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACR,yBAAyB,EACzB,wBAAwB,EACxB,qBAAqB,EACrB,cAAc,EACd,eAAe,EACf,oBAAoB,EACpB,YAAY,EACZ,eAAe,EACf,sBAAsB,EACtB,YAAY,EACZ,SAAS,EACT,OAAO,EACP,cAAc,EACd,aAAa,EACb,YAAY,EACZ,OAAO,EACP,kBAAkB,EAClB,sBAAsB,EACtB,MAAM,EACN,iBAAiB,EACjB,cAAc,EACjB,MAAM,eAAe,CAAC;AACvB,OAAO,EACH,cAAc,EACd,aAAa,EACb,eAAe,EAKf,OAAO,EAUP,eAAe,EAOf,WAAW,EACX,UAAU,EAEb,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAuB,MAAM,gBAAgB,CAAC;AAKpE,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAG9C,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,YAAY,CAAC;AAEtC,OAAO,EAAuB,YAAY,EAAa,MAAM,gBAAgB,CAAC;AAK9E,MAAM,WAAW,oBAAoB,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAAE,SAAQ,eAAe,CAAC,QAAQ,CAAC;CAAG;AAapH,MAAM,MAAM,cAAc,CAAC,OAAO,SAAS,eAAe,GAAG,eAAe,IAAI,CAAC,MAAM,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAErH,MAAM,MAAM,YAAY,CACpB,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,eAAe,SAAS,OAAO,GAAG,OAAO,IACzC,CAAC,MAAM,EAAE,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,KAAK,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAElF,MAAM,WAAW,2BAA2B,CACxC,OAAO,SAAS,eAAe,GAAG,oBAAoB,EACtD,OAAO,SAAS,YAAY,CAAC,GAAG,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC;IAEzD,KAAK,EAAE,cAAc,CAAC;IACtB,OAAO,EAAE,OAAO,CAAC;IACjB,aAAa,EAAE,cAAc,CAAC;IAC9B,OAAO,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,qBAAqB,CAC7B,OAAO,SAAS,eAAe,GAAG,oBAAoB,EACtD,OAAO,SAAS,YAAY,CAAC,GAAG,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC,IACzD,CAAC,MAAM,EAAE,2BAA2B,CAAC,OAAO,EAAE,OAAO,CAAC,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAE/E,MAAM,MAAM,sBAAsB,CAC9B,kBAAkB,SAAS,eAAe,EAC1C,gBAAgB,SAAS,kBAAkB,IAC3C,kBAAkB,SAAS,gBAAgB,GACzC,EAAE,GACF;IAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,gBAAgB,CAAC,CAAA;CAAE,CAAC;AAE3F,MAAM,WAAW,mBAAmB,CAChC,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,gBAAgB,GAAG,EAAE,EACrB,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB;IAE5D;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC,eAAe,CAAC,CAAC;IAEjD;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,aAAa,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,KAAK,SAAS,CAAC,gBAAgB,CAAC,CAAC;IAElE;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,OAAO,CAAC,CAAC;IAEzE;;;;;OAKG;IACH,WAAW,CAAC,EAAE,YAAY,CAAC;IAE3B;;;;;OAKG;IACH,YAAY,CAAC,EAAE,eAAe,CAAC;IAE/B;;;OAGG;IACH,yBAAyB,CAAC,EAAE,MAAM,CAAC;IAEnC;;;;;;;;OAQG;IACH,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAE9D;;;;;;;OAOG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAEtE;;;;;;;;OAQG;IACH,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAE3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;;;OAMG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAE9C;;;;;OAKG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAE9B;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IAEzB;;OAEG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IAExC;;OAEG;IACH,4BAA4B,CAAC,EAAE,MAAM,CAAC;IAEtC;;;;;;;;;;;;;;;OAeG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAE9C;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IAEzB;;;OAGG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAE/B;;;OAGG;IACH,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAE1C,gBAAgB;IAChB,GAAG,CAAC,EAAE,GAAG,CAAC;IAEV;;;OAGG;IACH,WAAW,CAAC,EAAE,kBAAkB,CAAC;IAEjC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;IAEtC;;;OAGG;IACH,UAAU,CAAC,EAAE,cAAc,CAAC;CAC/B;AAED;;;;;GAKG;AACH,MAAM,WAAW,kBAAkB;IAC/B;;;;;OAKG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+DG;AACH,qBAAa,YAAY,CACrB,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,gBAAgB,GAAG,EAAE,EACrB,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB;IAoIxD,QAAQ,CAAC,MAAM;IAlInB,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAiB,mBAAmB;IAE9D;;OAEG;IACH,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAE3B;;;OAGG;IACH,WAAW,CAAC,EAAE,YAAY,CAAC;IAE3B;;;;OAIG;IACH,YAAY,CAAC,EAAE,eAAe,CAAC;IAE/B;;;OAGG;IACH,WAAW,CAAC,EAAE,WAAW,CAAC;IAE1B;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC;IAEhC;;;OAGG;IACH,QAAQ,CAAC,MAAM,EAAE,aAAa,CAAC,OAAO,CAAC,CAA4B;IAEnE,OAAO,CAAC,sBAAsB,CAA0D;IACxF,OAAO,CAAC,gBAAgB,CAAC,CAAoD;IAE7E,IAAI,eAAe,IAAI,eAAe,CAAC,eAAe,EAAE,eAAe,CAAC,CAMvE;IAED,OAAO,UAAS;IAChB,iBAAiB,UAAS;IAE1B,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC;IAClB,SAAS,CAAC,cAAc,EAAG,cAAc,CAAC,eAAe,CAAC,CAAC;IAC3D,SAAS,CAAC,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IACxE,SAAS,CAAC,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAChF,SAAS,CAAC,2BAA2B,EAAG,MAAM,CAAC;IAC/C,SAAS,CAAC,qBAAqB,EAAE,MAAM,CAAC;IACxC,SAAS,CAAC,iBAAiB,EAAE,MAAM,CAAC;IACpC,SAAS,CAAC,qBAAqB,EAAE,MAAM,CAAC;IACxC,SAAS,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,SAAS,CAAC,mBAAmB,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,oBAAoB,EAAE,MAAM,CAAC;IACvC,SAAS,CAAC,4BAA4B,EAAE,MAAM,CAAC;IAC/C,SAAS,CAAC,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IACxD,SAAS,CAAC,kBAAkB,EAAE,kBAAkB,CAAC;IACjD,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC;IAClC,SAAS,CAAC,qBAAqB,EAAE,qBAAqB,CAAC;IACvD,SAAS,CAAC,MAAM,EAAE,YAAY,CAAC;IAC/B,SAAS,CAAC,UAAU,EAAE,cAAc,CAAC;IACrC,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC;IAClC,SAAS,CAAC,oBAAoB,EAAE,OAAO,CAAC;IACxC,SAAS,CAAC,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IACpD,OAAO,CAAC,YAAY,CAAC,CAAU;IAE/B,OAAO,CAAC,WAAW,CAAqB;IACxC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAA0B;IAC7D,OAAO,CAAC,mBAAmB,CAA0D;IAErF,SAAS,CAAC,MAAM,CAAC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAwC3B;IAEF;;OAEG;gBAEC,OAAO,GAAE,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACpE,sBAAsB,CAAC,eAAe,EAAE,OAAO,CAAa,EAAE,wGAAwG;IACjK,MAAM,gBAAkC;IA2MrD;;;;;OAKG;IACH,SAAS,CAAC,YAAY,CAAC,KAAK,EAAE,KAAK,GAAG,OAAO;IAI7C;;OAEG;IACG,gBAAgB,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,uBAA4B;IAmB7E,OAAO,CAAC,iBAAiB;IAgDzB;;;;;;;;;OASG;IACG,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,MAAM,GAAG,OAAO,GAAG,cAAc,CAAC,EAAE,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8GlH;;;;OAIG;IACH,IAAI,CAAC,OAAO,SAA6C,GAAG,IAAI;IAY1D,eAAe;IAYf,QAAQ,CAAC,KAAK,SAAS,UAAU,GAAG,UAAU,EAAE,YAAY,GAAS,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;IAKjG;;;;;;;;;;OAUG;IACG,WAAW,CACb,QAAQ,EAAE,YAAY,CAAC,CAAC,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,EAC3C,OAAO,GAAE,yBAA8B,GACxC,OAAO,CAAC,wBAAwB,CAAC;IAsCpC;;OAEG;IACG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKjG;;OAEG;IACG,UAAU,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAIrD;;OAEG;IACG,OAAO,CAAC,GAAG,IAAI,EAAE,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAKtF;;;OAGG;IACG,UAAU,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,KAAK,EAAE,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;cAoC9F,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;cAoBtB,iBAAiB,CAAC,eAAe,EAAE,eAAe,GAAG,OAAO,CAAC,IAAI,CAAC;IAUlF;;OAEG;IACH,SAAS,CAAC,sBAAsB,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM;YAQvD,6BAA6B;cAS3B,sBAAsB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,SAAS,CAAC;cAuBvE,iBAAiB;IAuCjC;;;OAGG;cACa,iBAAiB;IAyBjC;;;;OAIG;IACH,SAAS,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,YAAY,GAAG,eAAe;IAsC/E;;;OAGG;cACa,gBAAgB;IAyJhC;;;OAGG;cACa,gBAAgB,CAC5B,OAAO,EAAE,MAAM,OAAO,CAAC,OAAO,CAAC,EAC/B,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,KAAK,GAAG,MAAM,EACrB,UAAU,SAAI,EACd,OAAO,SAAI,GACZ,OAAO,CAAC,IAAI,CAAC;IAehB;;OAEG;cACa,oBAAoB;IASpC;;OAEG;cACa,0BAA0B;YAS5B,cAAc;IAQ5B;;;OAGG;IACH,OAAO,CAAC,WAAW;IAWnB;;OAEG;cACa,4BAA4B,CACxC,KAAK,EAAE,KAAK,EACZ,eAAe,EAAE,eAAe,EAChC,MAAM,EAAE,YAAY,GAAG,eAAe,GACvC,OAAO,CAAC,IAAI,CAAC;cA4DA,oBAAoB,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC;cAStD,2BAA2B,CAAC,eAAe,EAAE,eAAe,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;IAe1G;;;;OAIG;IACH,SAAS,CAAC,oBAAoB,CAAC,KAAK,EAAE,KAAK,EAAE,UAAU,UAAQ;IAmB/D,SAAS,CAAC,oBAAoB,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK;IAoB7D;;;;;;;OAOG;cACa,wBAAwB,IAAI,OAAO,CAAC,IAAI,CAAC;cAQzC,aAAa,CAAC,QAAQ,SAAS,CAAC,GAAG,IAAI,EAAE,GAAG,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,EAC9E,KAAK,EAAE,QAAQ,EAAE,EACjB,GAAG,IAAI,EAAE,UAAU,CAAC,QAAQ,CAAC;IASjC;;;OAGG;IACG,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;IAY/B,SAAS,CAAC,2BAA2B,CAAC,OAAO,EAAE,OAAO;YAWxC,gBAAgB;IAc9B,SAAS,CAAC,6BAA6B,CAAC,OAAO,EAAE,OAAO;CA6C3D;AAED,MAAM,WAAW,oBAAoB;IACjC,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,SAAS,CAAC,EAAE,SAAS,CAAC;CACzB;AAED,MAAM,WAAW,yBAA0B,SAAQ,yBAAyB;CAAG;AAE/E,MAAM,WAAW,wBAAyB,SAAQ,wBAAwB;CAAG;AAE7E,MAAM,WAAW,iBAAkB,SAAQ,yBAAyB;IAChE;;;;OAIG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC/B;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,iBAAiB,CAC7B,OAAO,SAAS,oBAAoB,GAAG,oBAAoB,EAC3D,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,0BAEzC"}
package/internals/basic-crawler.js
CHANGED
@@ -1,6 +1,6 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
 import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
@@ -121,6 +121,14 @@ export class BasicCrawler {
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
     router = Router.create();
+    contextPipelineBuilder;
+    _contextPipeline;
+    get contextPipeline() {
+        if (this._contextPipeline === undefined) {
+            this._contextPipeline = this.contextPipelineBuilder();
+        }
+        return this._contextPipeline;
+    }
     running = false;
     hasFinishedBefore = false;
     log;
@@ -138,7 +146,6 @@ export class BasicCrawler {
     statusMessageCallback;
     sessionPoolOptions;
     useSessionPool;
-    crawlingContexts = new Map();
     autoscaledPoolOptions;
     events;
     httpClient;
@@ -150,6 +157,8 @@ export class BasicCrawler {
     robotsTxtFileCache;
     _experimentWarnings = {};
     static optionsShape = {
+        contextPipelineBuilder: ow.optional.object,
+        extendContext: ow.optional.function,
         requestList: ow.optional.object.validate(validators.requestList),
         requestQueue: ow.optional.object.validate(validators.requestQueue),
         // Subclasses override this function instead of passing it
@@ -185,7 +194,8 @@ export class BasicCrawler {
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options = {},
+    constructor(options = {}, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+    config = Configuration.getGlobalConfig()) {
         this.config = config;
         ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
         const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
@@ -193,6 +203,32 @@ export class BasicCrawler {
         minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
         // internal
         log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
+        // Store the builder so that it can be run when the contextPipeline is needed.
+        // Invoking it immediately would cause problems with parent constructor call order.
+        this.contextPipelineBuilder = () => {
+            let contextPipeline = (options.contextPipelineBuilder?.() ??
+                ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
+            if (options.extendContext !== undefined) {
+                contextPipeline = contextPipeline.compose({
+                    action: async (context) => await options.extendContext(context),
+                });
+            }
+            contextPipeline = contextPipeline.compose({
+                action: async (context) => {
+                    const { request } = context;
+                    if (!this.requestMatchesEnqueueStrategy(request)) {
+                        // eslint-disable-next-line dot-notation
+                        const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+                        this.log.debug(message);
+                        request.noRetry = true;
+                        request.state = RequestState.SKIPPED;
+                        throw new ContextPipelineInterruptedError(message);
+                    }
+                    return context;
+                },
+            });
+            return contextPipeline;
+        };
         this.requestList = requestList;
         this.requestQueue = requestQueue;
         this.httpClient = httpClient ?? new GotScrapingHttpClient();
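The builder closure is stored rather than invoked so the memoizing `contextPipeline` getter (added earlier in this diff) can run it after subclass constructors finish. A standalone sketch of the same composition pattern, using only the calls visible here (`ContextPipeline.create()` and `compose({ action })`); `loadBody` is a hypothetical helper, and the merge semantics are inferred from how `extendContext` is wired above:

```ts
import { ContextPipeline } from '@crawlee/core';
import type { CrawlingContext } from '@crawlee/core';

declare function loadBody(url: string): Promise<string>; // hypothetical fetch helper

// Each compose() step receives the context built so far; judging by the
// extendContext wiring, returning an object merges new fields into the
// context, while returning the context itself passes it through unchanged.
const buildPipeline = () =>
    ContextPipeline.create().compose({
        action: async (context: CrawlingContext) => ({
            body: await loadBody(context.request.url),
        }),
    });
```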
@@ -247,7 +283,6 @@ export class BasicCrawler {
             }
         }
         this.useSessionPool = useSessionPool;
-        this.crawlingContexts = new Map();
         const maxSignedInteger = 2 ** 31 - 1;
         if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
             log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
@@ -310,14 +345,6 @@ export class BasicCrawler {
     isProxyError(error) {
         return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
     }
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    async isRequestBlocked(_crawlingContext) {
-        throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
-    }
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -590,8 +617,10 @@ export class BasicCrawler {
         }
         await this._loadHandledRequestCount();
     }
-    async
-        await this.
+    async runRequestHandler(crawlingContext) {
+        await this.contextPipeline.call(crawlingContext, async (finalContext) => {
+            await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
+        });
     }
     /**
      * Handles blocked request
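`runRequestHandler()` is now the single funnel: the pipeline produces the final context and the timeout-wrapped handler consumes it. The timeout wrapper in isolation, matching the `@apify/timeout` usage above (`doWork` is hypothetical):

```ts
import { addTimeoutToPromise } from '@apify/timeout';

declare function doWork(): Promise<void>; // hypothetical

// If the inner promise does not settle within the limit, addTimeoutToPromise
// rejects with the TimeoutError imported at the top of this file.
await addTimeoutToPromise(
    async () => doWork(),
    30_000,
    'doWork timed out after 30 seconds.',
);
```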
@@ -686,11 +715,6 @@ export class BasicCrawler {
         await this.requestList.markRequestHandled(request);
         return this.requestQueue.fetchNextRequest();
     }
-    /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
-     */
-    async _cleanupContext(_crawlingContext) { }
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -759,18 +783,14 @@ export class BasicCrawler {
         request.loadedUrl = undefined;
         const statisticsId = request.id || request.uniqueKey;
         this.stats.startJob(statisticsId);
-
-        // @ts-expect-error
-        // All missing properties (that extend CrawlingContext) are set dynamically,
-        // but TS does not know that, so otherwise it would throw when compiling.
+        const deferredCleanup = [];
         const crawlingContext = {
             id: cryptoRandomObjectId(10),
-            crawler: this,
             log: this.log,
             request,
             session,
             enqueueLinks: async (options) => {
-                return enqueueLinks({
+                return await enqueueLinks({
                     // specify the RQ first to allow overriding it
                     requestQueue: await this.getRequestQueue(),
                     robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
@@ -778,17 +798,21 @@ export class BasicCrawler {
                     ...options,
                 });
             },
-            addRequests:
+            addRequests: async (requests, options) => {
+                await this.addRequests(requests, options);
+            },
             pushData: this.pushData.bind(this),
             useState: this.useState.bind(this),
             sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
             getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
+            registerDeferredCleanup: (cleanup) => {
+                deferredCleanup.push(cleanup);
+            },
         };
-        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
         let isRequestLocked = true;
         try {
             request.state = RequestState.REQUEST_HANDLER;
-            await
+            await this.runRequestHandler(crawlingContext);
             await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
             isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
             this.stats.finishJob(statisticsId, request.retryCount);
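The per-request `deferredCleanup` array and the `registerDeferredCleanup()` context method replace the removed `_cleanupContext()` hook: whichever code opens a resource registers its own teardown, and the `finally` block below runs them all. A hedged sketch, assuming `registerDeferredCleanup` is declared on the context type in `@crawlee/core` as the object literal above suggests (`openPage` is hypothetical):

```ts
import { ContextPipeline } from '@crawlee/core';
import type { CrawlingContext } from '@crawlee/core';

declare function openPage(url: string): Promise<{ close(): Promise<void> }>; // hypothetical

ContextPipeline.create().compose({
    action: async (context: CrawlingContext) => {
        const page = await openPage(context.request.url);
        // Runs once the request (including error handling) is finished,
        // covering what an overridden _cleanupContext() used to do.
        context.registerDeferredCleanup(async () => page.close());
        return { page };
    },
});
```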
@@ -797,7 +821,8 @@ export class BasicCrawler {
             request.state = RequestState.DONE;
             crawlingContext.session?.markGood();
         }
-        catch (
+        catch (rawError) {
+            const err = this.unwrapError(rawError);
             try {
                 request.state = RequestState.ERROR_HANDLER;
                 await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
@@ -807,24 +832,24 @@ export class BasicCrawler {
                 request.state = RequestState.DONE;
             }
             catch (secondaryError) {
-
+                const unwrappedSecondaryError = this.unwrapError(secondaryError);
+                if (!unwrappedSecondaryError.triggeredFromUserHandler &&
                     // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
-                    !(
+                    !(unwrappedSecondaryError instanceof CriticalError)) {
                     const apifySpecific = process.env.APIFY_IS_AT_HOME
                         ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
                         : '';
-                    this.log.exception(
+                    this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
                         `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
                 }
                 request.state = RequestState.ERROR;
-                throw
+                throw unwrappedSecondaryError;
             }
             // decrease the session score if the request fails (but the error handler did not throw)
             crawlingContext.session?.markBad();
         }
         finally {
-            await
-            this.crawlingContexts.delete(crawlingContext.id);
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
             // Safety net - release the lock if nobody managed to do it before
             if (isRequestLocked && source instanceof RequestProvider) {
                 try {
@@ -883,6 +908,18 @@ export class BasicCrawler {
         request.sessionRotationCount++;
         crawlingContext.session?.retire();
     }
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    unwrapError(error) {
+        if (error instanceof RequestHandlerError ||
+            error instanceof ContextPipelineInitializationError ||
+            error instanceof ContextPipelineCleanupError) {
+            return this.unwrapError(error.cause);
+        }
+        return error;
+    }
     /**
      * Handles errors thrown by user provided requestHandler()
      */
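A standalone illustration of the `cause` chaining that `unwrapError()` walks; `WrapperError` stands in for the real `RequestHandlerError` and `ContextPipeline*Error` classes from `@crawlee/core`:

```ts
class WrapperError extends Error {} // stand-in for RequestHandlerError and friends

const original = new TypeError('boom');
const wrapped = new WrapperError('request handler threw', { cause: original });

// unwrapError() follows `cause` links until it reaches a non-wrapper error, so
// retry accounting and the user's error handlers see `original`, not `wrapped`.
console.log(wrapped.cause === original); // true
```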
@@ -895,7 +932,8 @@ export class BasicCrawler {
         const shouldRetryRequest = this._canRequestBeRetried(request, error);
         if (shouldRetryRequest) {
             await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
-            await this.errorHandler?.(crawlingContext,
+            await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
             if (error instanceof SessionError) {
                 await this._rotateSession(crawlingContext);
             }
@@ -947,7 +985,8 @@ export class BasicCrawler {
         const message = this._getMessageFromError(error, true);
         this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
         if (this.failedRequestHandler) {
-            await this.failedRequestHandler?.(crawlingContext,
+            await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
         }
     }
     /**
@@ -1013,9 +1052,7 @@ export class BasicCrawler {
      */
     async teardown() {
         this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
-
-            await this.sessionPool.teardown();
-        }
+        await this.sessionPool?.teardown();
         if (this._closeEvents) {
             await this.events.close();
         }