@crawlee/basic 4.0.0-beta.10 → 4.0.0-beta.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +0 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +77 -57
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +93 -43
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +1 -1
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +2 -2
- package/internals/send-request.js.map +1 -1
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +1 -1
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
package/index.d.ts
CHANGED
package/index.d.ts.map
CHANGED

@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}

package/index.js
CHANGED
package/index.js.map
CHANGED

@@ -1 +1 @@
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,eAAe,CAAC;AAC9B,cAAc,8BAA8B,CAAC"}


package/internals/basic-crawler.d.ts
CHANGED

@@ -1,38 +1,14 @@
-import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions,
-import { AutoscaledPool, Configuration, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
-import type { Awaitable,
+import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, ProxyConfiguration, ProxyInfo, Request, RequestOptions, RouterHandler, RouterRoutes, Session, SessionPoolOptions, SkippedRequestCallback, Source, StatisticsOptions, StatisticState } from '@crawlee/core';
+import { AutoscaledPool, Configuration, ContextPipeline, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
+import type { Awaitable, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
 import { RobotsTxtFile } from '@crawlee/utils';
-import type {
+import type { ReadonlyDeep } from 'type-fest';
 import type { Log } from '@apify/log';
 import { TimeoutError } from '@apify/timeout';
-export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<
-    /**
-     * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
-     * currently used by the crawler.
-     *
-     * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
-     * and override settings of the enqueued {@link Request} objects.
-     *
-     * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
-     * for more details regarding its usage.
-     *
-     * **Example usage**
-     *
-     * ```ts
-     * async requestHandler({ enqueueLinks }) {
-     *     await enqueueLinks({
-     *         urls: [...],
-     *     });
-     * },
-     * ```
-     *
-     * @param [options] All `enqueueLinks()` parameters are passed via an options object.
-     * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
-     */
-    enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
+export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
 }
-export type RequestHandler<Context extends CrawlingContext =
-export type ErrorHandler<Context extends CrawlingContext =
+export type RequestHandler<Context extends CrawlingContext = CrawlingContext> = (inputs: Context) => Awaitable<void>;
+export type ErrorHandler<Context extends CrawlingContext = CrawlingContext, ExtendedContext extends Context = Context> = (inputs: Context & Partial<ExtendedContext>, error: Error) => Awaitable<void>;
 export interface StatusMessageCallbackParams<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> {
     state: StatisticState;
     crawler: Crawler;
@@ -40,7 +16,10 @@ export interface StatusMessageCallbackParams<Context extends CrawlingContext = B
     message: string;
 }
 export type StatusMessageCallback<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
-export
+export type RequireContextPipeline<DefaultContextType extends CrawlingContext, FinalContextType extends DefaultContextType> = DefaultContextType extends FinalContextType ? {} : {
+    contextPipelineBuilder: () => ContextPipeline<CrawlingContext, FinalContextType>;
+};
+export interface BasicCrawlerOptions<Context extends CrawlingContext = CrawlingContext, ContextExtension = {}, ExtendedContext extends Context = Context & ContextExtension> {
     /**
      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
      *
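
The `RequireContextPipeline` helper above is what turns `contextPipelineBuilder` into a conditionally required option: it resolves to an empty object when the crawler's final context is just `CrawlingContext`, and to an object with a mandatory builder when the context is narrowed. A type-level sketch, assuming the type is re-exported from the package root like the other crawler types (`TitleContext` is a hypothetical stand-in, not part of the package):

import type { CrawlingContext } from '@crawlee/core';
import type { RequireContextPipeline } from '@crawlee/basic';

// Hypothetical narrowed context, for illustration only.
interface TitleContext extends CrawlingContext {
    title: string;
}

// No narrowing: the helper collapses to {} and nothing extra is required.
type NoExtra = RequireContextPipeline<CrawlingContext, CrawlingContext>;

// Narrowed context: the helper adds a required `contextPipelineBuilder`,
// so the compiler forces subclasses/callers to say how TitleContext is built.
type NeedsBuilder = RequireContextPipeline<CrawlingContext, TitleContext>;
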
@@ -58,7 +37,35 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * The exceptions are logged to the request using the
      * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
-    requestHandler?: RequestHandler<
+    requestHandler?: RequestHandler<ExtendedContext>;
+    /**
+     * Allows the user to extend the crawling context passed to the request handler with custom functionality.
+     *
+     * **Example usage:**
+     *
+     * ```javascript
+     * import { BasicCrawler } from 'crawlee';
+     *
+     * // Create a crawler instance
+     * const crawler = new BasicCrawler({
+     *     extendContext: (context) => ({
+     *         async customHelper() {
+     *             await context.pushData({ url: context.request.url })
+     *         }
+     *     }),
+     *     async requestHandler(context) {
+     *         await context.customHelper();
+     *     },
+     * });
+     * ```
+     */
+    extendContext?: (context: Context) => Awaitable<ContextExtension>;
+    /**
+     * *Intended for BasicCrawler subclasses*. Prepares a context pipeline that transforms the initial crawling context into the shape given by the `Context` type parameter.
+     *
+     * The option is not required if your crawler subclass does not extend the crawling context with custom information or helpers.
+     */
+    contextPipelineBuilder?: () => ContextPipeline<CrawlingContext, Context>;
     /**
      * Static list of URLs to be processed.
      * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
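
The JSDoc example above, as a self-contained sketch (the `customHelper` name is illustrative, and the type inference flowing from `extendContext` into `requestHandler` is assumed from the new generic parameters):

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    // The returned object is merged into the context passed to requestHandler.
    extendContext: (context) => ({
        async customHelper() {
            await context.pushData({ url: context.request.url });
        },
    }),
    async requestHandler(context) {
        // The helper added above is available on the extended context.
        await context.customHelper();
    },
});

await crawler.run(['https://example.com']);
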
@@ -87,7 +94,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    errorHandler?: ErrorHandler<
+    errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      *
@@ -96,7 +103,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    failedRequestHandler?: ErrorHandler<
+    failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * Specifies the maximum number of retries allowed for a request if its processing fails.
      * This includes retries due to navigation errors or errors thrown from user-supplied functions
@@ -223,6 +230,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Defaults to a new instance of {@link GotScrapingHttpClient}
      */
     httpClient?: BaseHttpClient;
+    /**
+     * If set, the crawler will be configured for all connections to use
+     * the Proxy URLs provided and rotated according to the configuration.
+     */
+    proxyConfiguration?: ProxyConfiguration;
 }
 /**
  * A set of options that you can toggle to enable experimental features in Crawlee.
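
`proxyConfiguration` is now accepted by `BasicCrawler` itself rather than only by the HTTP/browser subclasses. A usage sketch (the proxy URL is a placeholder):

import { BasicCrawler } from '@crawlee/basic';
import { ProxyConfiguration } from '@crawlee/core';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://user:pass@proxy.example.com:8000'],
});

const crawler = new BasicCrawler({
    proxyConfiguration,
    async requestHandler({ request, proxyInfo, log }) {
        // `proxyInfo` is populated from the session created for this request.
        log.info(`${request.url} via ${proxyInfo?.url}`);
    },
});
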
@@ -303,7 +315,7 @@ export interface CrawlerExperiments {
  * ```
  * @category Crawlers
  */
-export declare class BasicCrawler<Context extends CrawlingContext =
+export declare class BasicCrawler<Context extends CrawlingContext = CrawlingContext, ContextExtension = {}, ExtendedContext extends Context = Context & ContextExtension> {
     readonly config: Configuration;
     protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
     /**
@@ -334,17 +346,25 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool?: AutoscaledPool;
+    /**
+     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+     * Only available if used by the crawler.
+     */
+    proxyConfiguration?: ProxyConfiguration;
     /**
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
-    readonly router: RouterHandler<
+    readonly router: RouterHandler<Context>;
+    private contextPipelineBuilder;
+    private _contextPipeline?;
+    get contextPipeline(): ContextPipeline<CrawlingContext, ExtendedContext>;
     running: boolean;
     hasFinishedBefore: boolean;
     readonly log: Log;
-    protected requestHandler: RequestHandler<
-    protected errorHandler?: ErrorHandler<
-    protected failedRequestHandler?: ErrorHandler<
+    protected requestHandler: RequestHandler<ExtendedContext>;
+    protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
+    protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     protected requestHandlerTimeoutMillis: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
@@ -356,7 +376,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     protected statusMessageCallback?: StatusMessageCallback;
     protected sessionPoolOptions: SessionPoolOptions;
     protected useSessionPool: boolean;
-    protected crawlingContexts: Map<string, Context>;
     protected autoscaledPoolOptions: AutoscaledPoolOptions;
     protected events: EventManager;
     protected httpClient: BaseHttpClient;
@@ -368,6 +387,10 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     private readonly robotsTxtFileCache;
     private _experimentWarnings;
     protected static optionsShape: {
+        // @ts-ignore optional peer dependency or compatibility with es2022
+        contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+        // @ts-ignore optional peer dependency or compatibility with es2022
+        extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
         requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
@@ -394,6 +417,8 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
         sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
         useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+        // @ts-ignore optional peer dependency or compatibility with es2022
+        proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
         statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
         // @ts-ignore optional peer dependency or compatibility with es2022
@@ -424,7 +449,8 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options?: BasicCrawlerOptions<Context>,
+    constructor(options?: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<CrawlingContext, Context>, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+    config?: Configuration);
     /**
      * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
      * Used for retrying requests that failed due to proxy errors.
@@ -432,12 +458,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param error The error to check.
      */
     protected isProxyError(error: Error): boolean;
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    protected isRequestBlocked(_crawlingContext: Context): Promise<string | false>;
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -473,7 +493,7 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param requests The requests to add
      * @param options Options for the request queue
      */
-    addRequests(requests: (string | Source)[]
+    addRequests(requests: ReadonlyDeep<(string | Source)[]>, options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
     /**
      * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
      */
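
`addRequests()` now accepts `ReadonlyDeep` input from `type-fest`, so frozen or `as const` arrays type-check without casts. A sketch (URLs are placeholders):

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

// A readonly tuple is now assignable to the requests parameter.
const seeds = ['https://example.com', 'https://example.org'] as const;
await crawler.addRequests(seeds);
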
@@ -492,7 +512,7 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
     protected _init(): Promise<void>;
-    protected
+    protected runRequestHandler(crawlingContext: CrawlingContext): Promise<void>;
     /**
      * Handles blocked request
      */
@@ -505,11 +525,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * and RequestQueue is present then enqueues it to the queue first.
      */
     protected _fetchNextRequest(): Promise<Request<Dictionary> | null | undefined>;
-    /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
-     */
-    protected _cleanupContext(_crawlingContext: Context): Promise<void>;
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -535,12 +550,17 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     protected _defaultIsFinishedFunction(): Promise<boolean>;
     private _rotateSession;
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    private unwrapError;
     /**
      * Handles errors thrown by user provided requestHandler()
      */
-    protected _requestFunctionErrorHandler(error: Error, crawlingContext:
+    protected _requestFunctionErrorHandler(error: Error, crawlingContext: CrawlingContext, source: IRequestList | RequestProvider): Promise<void>;
     protected _tagUserHandlerError<T>(cb: () => unknown): Promise<T>;
-    protected _handleFailedRequestHandler(crawlingContext:
+    protected _handleFailedRequestHandler(crawlingContext: CrawlingContext, error: Error): Promise<void>;
     /**
      * Resolves the most verbose error message from a thrown error
      * @param error The error received

package/internals/basic-crawler.d.ts.map
CHANGED

@@ -1 +1 @@
-{"version":3,"file":"basic-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/basic-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACR,yBAAyB,EACzB,wBAAwB,EACxB,qBAAqB,EACrB,cAAc,EACd,eAAe,EACf,oBAAoB,EACpB,
+
{"version":3,"file":"basic-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/basic-crawler.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACR,yBAAyB,EACzB,wBAAwB,EACxB,qBAAqB,EACrB,cAAc,EACd,eAAe,EACf,oBAAoB,EACpB,YAAY,EACZ,eAAe,EACf,sBAAsB,EACtB,YAAY,EACZ,kBAAkB,EAClB,SAAS,EACT,OAAO,EACP,cAAc,EACd,aAAa,EACb,YAAY,EACZ,OAAO,EACP,kBAAkB,EAClB,sBAAsB,EACtB,MAAM,EACN,iBAAiB,EACjB,cAAc,EACjB,MAAM,eAAe,CAAC;AACvB,OAAO,EACH,cAAc,EACd,aAAa,EACb,eAAe,EAKf,OAAO,EAUP,eAAe,EAOf,WAAW,EACX,UAAU,EAEb,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAuB,MAAM,gBAAgB,CAAC;AAKpE,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAG9C,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,YAAY,CAAC;AAEtC,OAAO,EAAuB,YAAY,EAAa,MAAM,gBAAgB,CAAC;AAK9E,MAAM,WAAW,oBAAoB,CAAC,QAAQ,SAAS,UAAU,GAAG,UAAU,CAAE,SAAQ,eAAe,CAAC,QAAQ,CAAC;CAAG;AAapH,MAAM,MAAM,cAAc,CAAC,OAAO,SAAS,eAAe,GAAG,eAAe,IAAI,CAAC,MAAM,EAAE,OAAO,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAErH,MAAM,MAAM,YAAY,CACpB,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,eAAe,SAAS,OAAO,GAAG,OAAO,IACzC,CAAC,MAAM,EAAE,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,KAAK,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAElF,MAAM,WAAW,2BAA2B,CACxC,OAAO,SAAS,eAAe,GAAG,oBAAoB,EACtD,OAAO,SAAS,YAAY,CAAC,GAAG,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC;IAEzD,KAAK,EAAE,cAAc,CAAC;IACtB,OAAO,EAAE,OAAO,CAAC;IACjB,aAAa,EAAE,cAAc,CAAC;IAC9B,OAAO,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,qBAAqB,CAC7B,OAAO,SAAS,eAAe,GAAG,oBAAoB,EACtD,OAAO,SAAS,YAAY,CAAC,GAAG,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC,IACzD,CAAC,MAAM,EAAE,2BAA2B,CAAC,OAAO,EAAE,OAAO,CAAC,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC;AAE/E,MAAM,MAAM,sBAAsB,CAC9B,kBAAkB,SAAS,eAAe,EAC1C,gBAAgB,SAAS,kBAAkB,IAC3C,kBAAkB,SAAS,gBAAgB,GACzC,EAAE,GACF;IAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,gBAAgB,CAAC,CAAA;CAAE,CAAC;AAE3F,MAAM,WAAW,mBAAmB,CAChC,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,gBAAgB,GAAG,EAAE,EACrB,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB;IAE5D;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC,eAAe,CAAC,CAAC;IAEjD;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,aAAa,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,KAAK,SAAS,CAAC,gBAAgB,CAAC,CAAC;IAElE;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,eAAe,CAAC,eAAe,EAAE,OAAO,CAAC,CAAC;IAEzE;;;;;OAKG;IACH,WAAW,CAAC,EAAE,YAAY,CAAC;IAE3B;;;;;OAKG;IACH,YAAY,CAAC,EAAE,eAAe,CAAC;IAE/B;;;OAGG;IACH,yBAAyB,CAAC,EAAE,MAAM,CAAC;IAEnC;;;;;;;;OAQG;IACH,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAE9D;;;;;;;OAOG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAEtE;;;;;;;;OAQG;IACH,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAE3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;;;;OAMG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAE9C;;;;;OAKG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAE9B;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IAEzB;;OAEG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IAExC;;OAEG;IACH,4BAA4B,CAAC,EAAE,MAAM,CAAC;IAEtC;;;;;;;;;;;;;;;OAeG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAE9C;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IAEzB;;;OAGG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAE/B;;;OAGG;IACH,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IAE1C,gBAAgB;IAChB,GAAG,CAAC,EAAE,GAAG,CAAC;IAEV;;;OAGG;IACH,WAAW,CAAC,EAAE,kBAAkB,CAAC;IAEjC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,iBAAiB,CAAC;IAEtC;;;OAGG;IACH,UAAU,CAAC,EAAE,cAAc,CAAC;IAE5B;;;OAGG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;CAC3C;AAED;;;;;GAKG;AACH,MAAM,WAAW,kBAAkB;IAC/B;;;;;OAKG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+DG;AACH,qBAAa,YAA
Y,CACrB,OAAO,SAAS,eAAe,GAAG,eAAe,EACjD,gBAAgB,GAAG,EAAE,EACrB,eAAe,SAAS,OAAO,GAAG,OAAO,GAAG,gBAAgB;IA2IxD,QAAQ,CAAC,MAAM;IAzInB,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAiB,mBAAmB;IAE9D;;OAEG;IACH,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAE3B;;;OAGG;IACH,WAAW,CAAC,EAAE,YAAY,CAAC;IAE3B;;;;OAIG;IACH,YAAY,CAAC,EAAE,eAAe,CAAC;IAE/B;;;OAGG;IACH,WAAW,CAAC,EAAE,WAAW,CAAC;IAE1B;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,cAAc,CAAC;IAEhC;;;OAGG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IAExC;;;OAGG;IACH,QAAQ,CAAC,MAAM,EAAE,aAAa,CAAC,OAAO,CAAC,CAA4B;IAEnE,OAAO,CAAC,sBAAsB,CAA0D;IACxF,OAAO,CAAC,gBAAgB,CAAC,CAAoD;IAE7E,IAAI,eAAe,IAAI,eAAe,CAAC,eAAe,EAAE,eAAe,CAAC,CAMvE;IAED,OAAO,UAAS;IAChB,iBAAiB,UAAS;IAE1B,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC;IAClB,SAAS,CAAC,cAAc,EAAG,cAAc,CAAC,eAAe,CAAC,CAAC;IAC3D,SAAS,CAAC,YAAY,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IACxE,SAAS,CAAC,oBAAoB,CAAC,EAAE,YAAY,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;IAChF,SAAS,CAAC,2BAA2B,EAAG,MAAM,CAAC;IAC/C,SAAS,CAAC,qBAAqB,EAAE,MAAM,CAAC;IACxC,SAAS,CAAC,iBAAiB,EAAE,MAAM,CAAC;IACpC,SAAS,CAAC,qBAAqB,EAAE,MAAM,CAAC;IACxC,SAAS,CAAC,kBAAkB,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,SAAS,CAAC,mBAAmB,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,oBAAoB,EAAE,MAAM,CAAC;IACvC,SAAS,CAAC,4BAA4B,EAAE,MAAM,CAAC;IAC/C,SAAS,CAAC,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IACxD,SAAS,CAAC,kBAAkB,EAAE,kBAAkB,CAAC;IACjD,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC;IAClC,SAAS,CAAC,qBAAqB,EAAE,qBAAqB,CAAC;IACvD,SAAS,CAAC,MAAM,EAAE,YAAY,CAAC;IAC/B,SAAS,CAAC,UAAU,EAAE,cAAc,CAAC;IACrC,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC;IAClC,SAAS,CAAC,oBAAoB,EAAE,OAAO,CAAC;IACxC,SAAS,CAAC,gBAAgB,CAAC,EAAE,sBAAsB,CAAC;IACpD,OAAO,CAAC,YAAY,CAAC,CAAU;IAE/B,OAAO,CAAC,WAAW,CAAqB;IACxC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAA0B;IAC7D,OAAO,CAAC,mBAAmB,CAA0D;IAErF,SAAS,CAAC,MAAM,CAAC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;MAyC3B;IAEF;;OAEG;gBAEC,OAAO,GAAE,mBAAmB,CAAC,OAAO,EAAE,gBAAgB,EAAE,eAAe,CAAC,GACpE,sBAAsB,CAAC,eAAe,EAAE,OAAO,CAAa,EAAE,wGAAwG;IACjK,MAAM,gBAAkC;IA6MrD;;;;;OAKG;IACH,SAAS,CAAC,YAAY,CAAC,KAAK,EAAE,KAAK,GAAG,OAAO;IAI7C;;OAEG;IACG,gBAAgB,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,uBAA4B;IAmB7E,OAAO,CAAC,iBAAiB;IAgDzB;;;;;;;;;OASG;IACG,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,MAAM,GAAG,OAAO,GAAG,cAAc,CAAC,EAAE,EAAE,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8GlH;;;;OAIG;IACH,IAAI,CAAC,OAAO,SAA6C,GAAG,IAAI;IAY1D,eAAe;IAYf,QAAQ,CAAC,KAAK,SAAS,UAAU,GAAG,UAAU,EAAE,YAAY,GAAS,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;IAKjG;;;;;;;;;;OAUG;IACG,WAAW,CACb,QAAQ,EAAE,YAAY,CAAC,CAAC,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,EAC3C,OAAO,GAAE,yBAA8B,GACxC,OAAO,CAAC,wBAAwB,CAAC;IAsCpC;;OAEG;IACG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKjG;;OAEG;IACG,UAAU,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAIrD;;OAEG;IACG,OAAO,CAAC,GAAG,IAAI,EAAE,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAKtF;;;OAGG;IACG,UAAU,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,KAAK,EAAE,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;cAoC9F,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;cAoBtB,iBAAiB,CAAC,eAAe,EAAE,eAAe,GAAG,OAAO,CAAC,IAAI,CAAC;IAUlF;;OAEG;IACH,SAAS,CAAC,sBAAsB,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM;YAQvD,6BAA6B;cAS3B,sBAAsB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,SAAS,CAAC;cAuBvE,iBAAiB;IAuCjC;;;OAGG;cACa,iBAAiB;IAyBjC;;;;OAIG;IACH,SAAS,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,YAAY,GAAG,eAAe;IAsC/E;;;OAGG;cACa,gBAAgB;IA+JhC;;;OAGG;cACa,gBAAgB,CAC5B,OAAO,EAAE,MAAM,OAAO,CAAC,OAAO,CAAC,EAC/B,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,KAAK,GAAG,MAAM,EACrB,UAAU,SAAI,EACd,OAAO,SAAI,GACZ,OAAO,CAAC,IAAI,CAAC;IAehB;;OAEG;cACa,oBAAoB;IASp
C;;OAEG;cACa,0BAA0B;YAS5B,cAAc;IAQ5B;;;OAGG;IACH,OAAO,CAAC,WAAW;IAWnB;;OAEG;cACa,4BAA4B,CACxC,KAAK,EAAE,KAAK,EACZ,eAAe,EAAE,eAAe,EAChC,MAAM,EAAE,YAAY,GAAG,eAAe,GACvC,OAAO,CAAC,IAAI,CAAC;cA4DA,oBAAoB,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC;cAStD,2BAA2B,CAAC,eAAe,EAAE,eAAe,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;IAe1G;;;;OAIG;IACH,SAAS,CAAC,oBAAoB,CAAC,KAAK,EAAE,KAAK,EAAE,UAAU,UAAQ;IAmB/D,SAAS,CAAC,oBAAoB,CAAC,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK;IAoB7D;;;;;;;OAOG;cACa,wBAAwB,IAAI,OAAO,CAAC,IAAI,CAAC;cAQzC,aAAa,CAAC,QAAQ,SAAS,CAAC,GAAG,IAAI,EAAE,GAAG,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,EAC9E,KAAK,EAAE,QAAQ,EAAE,EACjB,GAAG,IAAI,EAAE,UAAU,CAAC,QAAQ,CAAC;IASjC;;;OAGG;IACG,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;IAY/B,SAAS,CAAC,2BAA2B,CAAC,OAAO,EAAE,OAAO;YAWxC,gBAAgB;IAc9B,SAAS,CAAC,6BAA6B,CAAC,OAAO,EAAE,OAAO;CA6C3D;AAED,MAAM,WAAW,oBAAoB;IACjC,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,SAAS,CAAC,EAAE,SAAS,CAAC;CACzB;AAED,MAAM,WAAW,yBAA0B,SAAQ,yBAAyB;CAAG;AAE/E,MAAM,WAAW,wBAAyB,SAAQ,wBAAwB;CAAG;AAE7E,MAAM,WAAW,iBAAkB,SAAQ,yBAAyB;IAChE;;;;OAIG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC/B;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,iBAAiB,CAC7B,OAAO,SAAS,oBAAoB,GAAG,oBAAoB,EAC3D,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,0BAEzC"}

package/internals/basic-crawler.js
CHANGED

@@ -1,6 +1,6 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
 import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
@@ -116,11 +116,24 @@ export class BasicCrawler {
      * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool;
+    /**
+     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+     * Only available if used by the crawler.
+     */
+    proxyConfiguration;
     /**
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
     router = Router.create();
+    contextPipelineBuilder;
+    _contextPipeline;
+    get contextPipeline() {
+        if (this._contextPipeline === undefined) {
+            this._contextPipeline = this.contextPipelineBuilder();
+        }
+        return this._contextPipeline;
+    }
     running = false;
     hasFinishedBefore = false;
     log;
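
The getter builds the pipeline on first access instead of in the constructor, because a subclass constructor may still be setting up its own pipeline steps when `BasicCrawler`'s constructor runs. The same lazy-initialization pattern in isolation (a generic sketch, not crawlee API):

class Lazy<T> {
    private built?: T;

    constructor(private readonly build: () => T) {}

    get value(): T {
        // Built once, on first access - by then every subclass constructor has run.
        if (this.built === undefined) {
            this.built = this.build();
        }
        return this.built;
    }
}

const lazy = new Lazy(() => ({ createdAt: Date.now() }));
console.log(lazy.value.createdAt); // built here, not at construction time
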
@@ -138,7 +151,6 @@ export class BasicCrawler {
     statusMessageCallback;
     sessionPoolOptions;
     useSessionPool;
-    crawlingContexts = new Map();
     autoscaledPoolOptions;
     events;
     httpClient;
@@ -150,6 +162,8 @@ export class BasicCrawler {
     robotsTxtFileCache;
     _experimentWarnings = {};
     static optionsShape = {
+        contextPipelineBuilder: ow.optional.object,
+        extendContext: ow.optional.function,
         requestList: ow.optional.object.validate(validators.requestList),
         requestQueue: ow.optional.object.validate(validators.requestQueue),
         // Subclasses override this function instead of passing it
@@ -166,6 +180,7 @@ export class BasicCrawler {
         autoscaledPoolOptions: ow.optional.object,
         sessionPoolOptions: ow.optional.object,
         useSessionPool: ow.optional.boolean,
+        proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
         statusMessageLoggingInterval: ow.optional.number,
         statusMessageCallback: ow.optional.function,
         retryOnBlocked: ow.optional.boolean,
@@ -185,17 +200,45 @@ export class BasicCrawler {
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options = {},
+    constructor(options = {}, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+    config = Configuration.getGlobalConfig()) {
         this.config = config;
         ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
-        const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
+        const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
         // AutoscaledPool shorthands
         minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
         // internal
         log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
+        // Store the builder so that it can be run when the contextPipeline is needed.
+        // Invoking it immediately would cause problems with parent constructor call order.
+        this.contextPipelineBuilder = () => {
+            let contextPipeline = (options.contextPipelineBuilder?.() ??
+                ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
+            if (options.extendContext !== undefined) {
+                contextPipeline = contextPipeline.compose({
+                    action: async (context) => await options.extendContext(context),
+                });
+            }
+            contextPipeline = contextPipeline.compose({
+                action: async (context) => {
+                    const { request } = context;
+                    if (!this.requestMatchesEnqueueStrategy(request)) {
+                        // eslint-disable-next-line dot-notation
+                        const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+                        this.log.debug(message);
+                        request.noRetry = true;
+                        request.state = RequestState.SKIPPED;
+                        throw new ContextPipelineInterruptedError(message);
+                    }
+                    return context;
+                },
+            });
+            return contextPipeline;
+        };
         this.requestList = requestList;
         this.requestQueue = requestQueue;
         this.httpClient = httpClient ?? new GotScrapingHttpClient();
+        this.proxyConfiguration = proxyConfiguration;
         this.log = log;
         this.statusMessageLoggingInterval = statusMessageLoggingInterval;
         this.statusMessageCallback = statusMessageCallback;
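
Reading the builder above: the pipeline starts from `ContextPipeline.create()`, `extendContext` is composed in as an ordinary pipeline step, and a final step throws `ContextPipelineInterruptedError` to skip requests that no longer match the enqueue strategy, cancelling the rest of the pipeline without failing the request. A toy model of that mechanism (not the real `ContextPipeline` class; the merge-and-interrupt semantics are inferred from this diff):

type Step = (context: any) => Promise<object>;

class InterruptedError extends Error {}

class ToyPipeline {
    private readonly steps: Step[] = [];

    compose(step: Step): this {
        this.steps.push(step);
        return this;
    }

    async call(initial: object, consumer: (finalContext: any) => Promise<void>): Promise<void> {
        let context: any = initial;
        try {
            for (const step of this.steps) {
                // Each step's result is merged into the running context.
                context = { ...context, ...(await step(context)) };
            }
        } catch (error) {
            if (error instanceof InterruptedError) return; // skipped, not failed
            throw error;
        }
        await consumer(context);
    }
}
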
@@ -247,7 +290,6 @@ export class BasicCrawler {
             }
         }
         this.useSessionPool = useSessionPool;
-        this.crawlingContexts = new Map();
         const maxSignedInteger = 2 ** 31 - 1;
         if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
             log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
@@ -310,14 +352,6 @@ export class BasicCrawler {
     isProxyError(error) {
         return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
     }
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    async isRequestBlocked(_crawlingContext) {
-        throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
-    }
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -590,8 +624,10 @@ export class BasicCrawler {
         }
         await this._loadHandledRequestCount();
     }
-    async
-        await this.
+    async runRequestHandler(crawlingContext) {
+        await this.contextPipeline.call(crawlingContext, async (finalContext) => {
+            await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
+        });
     }
     /**
      * Handles blocked request
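
The new `runRequestHandler()` routes every request through the context pipeline and only then applies the request-handler timeout. The timeout helper comes from `@apify/timeout`; a sketch of how it behaves:

import { addTimeoutToPromise, tryCancel } from '@apify/timeout';

const result = await addTimeoutToPromise(
    async () => {
        await new Promise((resolve) => setTimeout(resolve, 50));
        tryCancel(); // cooperative cancellation point inside the handler
        return 'done';
    },
    30_000, // rejects with a TimeoutError carrying this message if exceeded
    'handler timed out',
);
console.log(result); // 'done'
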
@@ -686,11 +722,6 @@ export class BasicCrawler {
         await this.requestList.markRequestHandled(request);
         return this.requestQueue.fetchNextRequest();
     }
-    /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
-     */
-    async _cleanupContext(_crawlingContext) { }
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -737,7 +768,12 @@ export class BasicCrawler {
         tryCancel();
         if (this.useSessionPool) {
             await this._timeoutAndRetry(async () => {
-                session = await this.sessionPool.
+                session = await this.sessionPool.newSession({
+                    proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+                        request: request ?? undefined,
+                    }),
+                    maxUsageCount: 1,
+                });
             }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
         }
         tryCancel();
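
Sessions are now created per request with `maxUsageCount: 1`, and the proxy is chosen up front and attached to the session. The same calls in isolation (a sketch assuming `SessionPool.newSession()` and `session.proxyInfo` as introduced in this version; the proxy URL is a placeholder):

import { ProxyConfiguration, SessionPool } from '@crawlee/core';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://proxy.example.com:8000'],
});
const sessionPool = await SessionPool.open();

const session = await sessionPool.newSession({
    proxyInfo: await proxyConfiguration.newProxyInfo(),
    maxUsageCount: 1, // retired after a single request
});
console.log(session.id, session.proxyInfo?.url);
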
@@ -759,18 +795,15 @@ export class BasicCrawler {
         request.loadedUrl = undefined;
         const statisticsId = request.id || request.uniqueKey;
         this.stats.startJob(statisticsId);
-
-        // @ts-expect-error
-        // All missing properties (that extend CrawlingContext) are set dynamically,
-        // but TS does not know that, so otherwise it would throw when compiling.
+        const deferredCleanup = [];
         const crawlingContext = {
             id: cryptoRandomObjectId(10),
-            crawler: this,
             log: this.log,
             request,
             session,
+            proxyInfo: session?.proxyInfo,
             enqueueLinks: async (options) => {
-                return enqueueLinks({
+                return await enqueueLinks({
                     // specify the RQ first to allow overriding it
                     requestQueue: await this.getRequestQueue(),
                     robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
@@ -778,17 +811,21 @@ export class BasicCrawler {
                     ...options,
                 });
             },
-            addRequests:
+            addRequests: async (requests, options) => {
+                await this.addRequests(requests, options);
+            },
             pushData: this.pushData.bind(this),
             useState: this.useState.bind(this),
-            sendRequest: createSendRequest(this.httpClient, request, session
+            sendRequest: createSendRequest(this.httpClient, request, session),
             getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
+            registerDeferredCleanup: (cleanup) => {
+                deferredCleanup.push(cleanup);
+            },
         };
-        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
         let isRequestLocked = true;
         try {
             request.state = RequestState.REQUEST_HANDLER;
-            await
+            await this.runRequestHandler(crawlingContext);
             await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
             isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
             this.stats.finishJob(statisticsId, request.retryCount);
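
The removed `_cleanupContext()` hook is superseded by per-context `registerDeferredCleanup()`: anything that sets up the context can register a callback, and the `finally` block below awaits them all. A sketch (whether the callback's typing is exposed on the public context this way is assumed here):

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    extendContext: (context) => {
        const resource = { close: async () => console.log('released') };
        // Runs once the request finishes, whether it succeeded or failed.
        context.registerDeferredCleanup(async () => resource.close());
        return { resource };
    },
    async requestHandler({ request, log }) {
        log.info(`Handled ${request.url}`);
    },
});
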
@@ -797,7 +834,8 @@ export class BasicCrawler {
             request.state = RequestState.DONE;
             crawlingContext.session?.markGood();
         }
-        catch (
+        catch (rawError) {
+            const err = this.unwrapError(rawError);
             try {
                 request.state = RequestState.ERROR_HANDLER;
                 await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
@@ -807,24 +845,24 @@ export class BasicCrawler {
                 request.state = RequestState.DONE;
             }
             catch (secondaryError) {
-
+                const unwrappedSecondaryError = this.unwrapError(secondaryError);
+                if (!unwrappedSecondaryError.triggeredFromUserHandler &&
                     // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
-                    !(
+                    !(unwrappedSecondaryError instanceof CriticalError)) {
                     const apifySpecific = process.env.APIFY_IS_AT_HOME
                         ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
                         : '';
-                    this.log.exception(
+                    this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
                         `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
                 }
                 request.state = RequestState.ERROR;
-                throw
+                throw unwrappedSecondaryError;
             }
             // decrease the session score if the request fails (but the error handler did not throw)
             crawlingContext.session?.markBad();
         }
         finally {
-            await
-            this.crawlingContexts.delete(crawlingContext.id);
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
             // Safety net - release the lock if nobody managed to do it before
             if (isRequestLocked && source instanceof RequestProvider) {
                 try {
@@ -883,6 +921,18 @@ export class BasicCrawler {
         request.sessionRotationCount++;
         crawlingContext.session?.retire();
     }
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    unwrapError(error) {
+        if (error instanceof RequestHandlerError ||
+            error instanceof ContextPipelineInitializationError ||
+            error instanceof ContextPipelineCleanupError) {
+            return this.unwrapError(error.cause);
+        }
+        return error;
+    }
     /**
      * Handles errors thrown by user provided requestHandler()
      */
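
`unwrapError()` walks the standard `Error.cause` chain that the pipeline wrapper errors populate. The same idea in a self-contained form (`WrapperError` is illustrative, not a crawlee class):

class WrapperError extends Error {
    constructor(message: string, cause: unknown) {
        super(message, { cause }); // ES2022 error `cause` option
    }
}

function unwrap(error: unknown): unknown {
    return error instanceof WrapperError ? unwrap(error.cause) : error;
}

const userError = new Error('boom in requestHandler');
const wrapped = new WrapperError('pipeline failed', new WrapperError('handler failed', userError));
console.log(unwrap(wrapped) === userError); // true
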
@@ -895,7 +945,8 @@ export class BasicCrawler {
         const shouldRetryRequest = this._canRequestBeRetried(request, error);
         if (shouldRetryRequest) {
             await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
-            await this.errorHandler?.(crawlingContext,
+            await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
             if (error instanceof SessionError) {
                 await this._rotateSession(crawlingContext);
             }
@@ -947,7 +998,8 @@ export class BasicCrawler {
         const message = this._getMessageFromError(error, true);
         this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
         if (this.failedRequestHandler) {
-            await this.failedRequestHandler?.(crawlingContext,
+            await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
         }
     }
     /**
@@ -1013,9 +1065,7 @@ export class BasicCrawler {
      */
     async teardown() {
         this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
-
-            await this.sessionPool.teardown();
-        }
+        await this.sessionPool?.teardown();
         if (this._closeEvents) {
             await this.events.close();
         }